aboutsummaryrefslogtreecommitdiffstats
path: root/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
diff options
context:
space:
mode:
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java')
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java116
1 files changed, 64 insertions, 52 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
index ca4a2e9bb..8f73b27b5 100644
--- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
+++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
@@ -107,24 +107,26 @@ public final class FusionDictionary implements Iterable<WordProperty> {
}
/**
- * PtNode is a group of characters, with a frequency, shortcut targets, bigrams, and children
- * (Pt means Patricia Trie).
+ * PtNode is a group of characters, with probability information, shortcut targets, bigrams,
+ * and children (Pt means Patricia Trie).
*
* This is the central class of the in-memory representation. A PtNode is what can
* be seen as a traditional "trie node", except it can hold several characters at the
* same time. A PtNode essentially represents one or several characters in the middle
* of the trie tree; as such, it can be a terminal, and it can have children.
* In this in-memory representation, whether the PtNode is a terminal or not is represented
- * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other
- * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or
- * bigrams, but a non-terminal may not. Moreover, children, if present, are null.
+ * by mProbabilityInfo. The PtNode is a terminal when the mProbabilityInfo is not null and the
+ * PtNode is not a terminal when the mProbabilityInfo is null. A terminal may have non-null
+ * shortcuts and/or bigrams, but a non-terminal may not. Moreover, children, if present,
+ * are non-null.
*/
public static final class PtNode {
- public static final int NOT_A_TERMINAL = -1;
+ private static final int NOT_A_TERMINAL = -1;
final int mChars[];
ArrayList<WeightedString> mShortcutTargets;
ArrayList<WeightedString> mBigrams;
- int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
+ // null == mProbabilityInfo indicates this is not a terminal.
+ ProbabilityInfo mProbabilityInfo;
int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
PtNodeArray mChildren;
boolean mIsNotAWord; // Only a shortcut
@@ -140,11 +142,11 @@ public final class FusionDictionary implements Iterable<WordProperty> {
int mCachedAddressAfterUpdate; // The address of this PtNode (after update)
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
- final ArrayList<WeightedString> bigrams, final int frequency,
+ final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
final boolean isNotAWord, final boolean isBlacklistEntry) {
mChars = chars;
- mFrequency = frequency;
- mTerminalId = frequency;
+ mProbabilityInfo = probabilityInfo;
+ mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability;
mShortcutTargets = shortcutTargets;
mBigrams = bigrams;
mChildren = null;
@@ -153,11 +155,11 @@ public final class FusionDictionary implements Iterable<WordProperty> {
}
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
- final ArrayList<WeightedString> bigrams, final int frequency,
+ final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
final boolean isNotAWord, final boolean isBlacklistEntry,
final PtNodeArray children) {
mChars = chars;
- mFrequency = frequency;
+ mProbabilityInfo = probabilityInfo;
mShortcutTargets = shortcutTargets;
mBigrams = bigrams;
mChildren = children;
@@ -177,11 +179,15 @@ public final class FusionDictionary implements Iterable<WordProperty> {
}
public boolean isTerminal() {
- return NOT_A_TERMINAL != mFrequency;
+ return mProbabilityInfo != null;
}
- public int getFrequency() {
- return mFrequency;
+ public int getProbability() {
+ if (isTerminal()) {
+ return mProbabilityInfo.mProbability;
+ } else {
+ return NOT_A_TERMINAL;
+ }
}
public boolean getIsNotAWord() {
@@ -213,18 +219,18 @@ public final class FusionDictionary implements Iterable<WordProperty> {
}
/**
- * Adds a word to the bigram list. Updates the probability if the word already
+ * Adds a word to the bigram list. Updates the probability information if the word already
* exists.
*/
- public void addBigram(final String word, final int probability) {
+ public void addBigram(final String word, final ProbabilityInfo probabilityInfo) {
if (mBigrams == null) {
mBigrams = new ArrayList<WeightedString>();
}
WeightedString bigram = getBigram(word);
if (bigram != null) {
- bigram.setProbability(probability);
+ bigram.mProbabilityInfo = probabilityInfo;
} else {
- bigram = new WeightedString(word, probability);
+ bigram = new WeightedString(word, probabilityInfo);
mBigrams.add(bigram);
}
}
@@ -270,12 +276,11 @@ public final class FusionDictionary implements Iterable<WordProperty> {
* the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
* updated if they are higher than the existing ones.
*/
- public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets,
+ private void update(final ProbabilityInfo probabilityInfo,
+ final ArrayList<WeightedString> shortcutTargets,
final ArrayList<WeightedString> bigrams,
final boolean isNotAWord, final boolean isBlacklistEntry) {
- if (frequency > mFrequency) {
- mFrequency = frequency;
- }
+ mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo);
if (shortcutTargets != null) {
if (mShortcutTargets == null) {
mShortcutTargets = shortcutTargets;
@@ -286,8 +291,9 @@ public final class FusionDictionary implements Iterable<WordProperty> {
final WeightedString existingShortcut = getShortcut(shortcut.mWord);
if (existingShortcut == null) {
mShortcutTargets.add(shortcut);
- } else if (existingShortcut.getProbability() < shortcut.getProbability()) {
- existingShortcut.setProbability(shortcut.getProbability());
+ } else {
+ existingShortcut.mProbabilityInfo = ProbabilityInfo.max(
+ existingShortcut.mProbabilityInfo, shortcut.mProbabilityInfo);
}
}
}
@@ -302,8 +308,9 @@ public final class FusionDictionary implements Iterable<WordProperty> {
final WeightedString existingBigram = getBigram(bigram.mWord);
if (existingBigram == null) {
mBigrams.add(bigram);
- } else if (existingBigram.getProbability() < bigram.getProbability()) {
- existingBigram.setProbability(bigram.getProbability());
+ } else {
+ existingBigram.mProbabilityInfo = ProbabilityInfo.max(
+ existingBigram.mProbabilityInfo, bigram.mProbabilityInfo);
}
}
}
@@ -393,13 +400,13 @@ public final class FusionDictionary implements Iterable<WordProperty> {
* they will be added to the dictionary as necessary.
*
* @param word the word to add.
- * @param frequency the frequency of the word, in the range [0..255].
+ * @param probabilityInfo probability information of the word.
* @param shortcutTargets a list of shortcut targets for this word, or null.
* @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
*/
- public void add(final String word, final int frequency,
+ public void add(final String word, final ProbabilityInfo probabilityInfo,
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
- add(getCodePoints(word), frequency, shortcutTargets, isNotAWord,
+ add(getCodePoints(word), probabilityInfo, shortcutTargets, isNotAWord,
false /* isBlacklistEntry */);
}
@@ -412,7 +419,8 @@ public final class FusionDictionary implements Iterable<WordProperty> {
*/
public void addBlacklistEntry(final String word,
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
- add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */);
+ add(getCodePoints(word), new ProbabilityInfo(0), shortcutTargets, isNotAWord,
+ true /* isBlacklistEntry */);
}
/**
@@ -438,21 +446,22 @@ public final class FusionDictionary implements Iterable<WordProperty> {
*
* @param word0 the previous word of the context
* @param word1 the next word of the context
- * @param frequency the bigram frequency
+ * @param probabilityInfo the bigram probability info
*/
- public void setBigram(final String word0, final String word1, final int frequency) {
+ public void setBigram(final String word0, final String word1,
+ final ProbabilityInfo probabilityInfo) {
PtNode ptNode0 = findWordInTree(mRootNodeArray, word0);
if (ptNode0 != null) {
final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1);
if (ptNode1 == null) {
- add(getCodePoints(word1), 0, null, false /* isNotAWord */,
+ add(getCodePoints(word1), new ProbabilityInfo(0), null, false /* isNotAWord */,
false /* isBlacklistEntry */);
// The PtNode for the first word may have moved by the above insertion,
// if word1 and word2 share a common stem that happens not to have been
// a cutting point until now. In this case, we need to refresh ptNode.
ptNode0 = findWordInTree(mRootNodeArray, word0);
}
- ptNode0.addBigram(word1, frequency);
+ ptNode0.addBigram(word1, probabilityInfo);
} else {
throw new RuntimeException("First word of bigram not found " + word0);
}
@@ -465,15 +474,15 @@ public final class FusionDictionary implements Iterable<WordProperty> {
* an exception is thrown.
*
* @param word the word, as an int array.
- * @param frequency the frequency of the word, in the range [0..255].
+ * @param probabilityInfo the probability information of the word.
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
* @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
* @param isBlacklistEntry true if this is a blacklisted word, false otherwise
*/
- private void add(final int[] word, final int frequency,
+ private void add(final int[] word, final ProbabilityInfo probabilityInfo,
final ArrayList<WeightedString> shortcutTargets,
final boolean isNotAWord, final boolean isBlacklistEntry) {
- assert(frequency >= 0 && frequency <= 255);
+ assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY);
if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) {
MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
return;
@@ -501,7 +510,8 @@ public final class FusionDictionary implements Iterable<WordProperty> {
// No node at this point to accept the word. Create one.
final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
- shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry);
+ shortcutTargets, null /* bigrams */, probabilityInfo, isNotAWord,
+ isBlacklistEntry);
currentNodeArray.mData.add(insertionIndex, newPtNode);
if (DBG) checkStack(currentNodeArray);
} else {
@@ -511,15 +521,15 @@ public final class FusionDictionary implements Iterable<WordProperty> {
// The new word is a prefix of an existing word, but the node on which it
// should end already exists as is. Since the old PtNode was not a terminal,
// make it one by filling in its frequency and other attributes
- currentPtNode.update(frequency, shortcutTargets, null, isNotAWord,
+ currentPtNode.update(probabilityInfo, shortcutTargets, null, isNotAWord,
isBlacklistEntry);
} else {
// The new word matches the full old word and extends past it.
// We only have to create a new node and add it to the end of this.
final PtNode newNode = new PtNode(
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
- shortcutTargets, null /* bigrams */, frequency, isNotAWord,
- isBlacklistEntry);
+ shortcutTargets, null /* bigrams */, probabilityInfo,
+ isNotAWord, isBlacklistEntry);
currentPtNode.mChildren = new PtNodeArray();
currentPtNode.mChildren.mData.add(newNode);
}
@@ -527,7 +537,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
if (0 == differentCharIndex) {
// Exact same word. Update the frequency if higher. This will also add the
// new shortcuts to the existing shortcut list if it already exists.
- currentPtNode.update(frequency, shortcutTargets, null,
+ currentPtNode.update(probabilityInfo, shortcutTargets, null,
currentPtNode.mIsNotAWord && isNotAWord,
currentPtNode.mIsBlacklistEntry || isBlacklistEntry);
} else {
@@ -537,7 +547,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
final PtNode newOldWord = new PtNode(
Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
currentPtNode.mChars.length), currentPtNode.mShortcutTargets,
- currentPtNode.mBigrams, currentPtNode.mFrequency,
+ currentPtNode.mBigrams, currentPtNode.mProbabilityInfo,
currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry,
currentPtNode.mChildren);
newChildren.mData.add(newOldWord);
@@ -546,16 +556,17 @@ public final class FusionDictionary implements Iterable<WordProperty> {
if (charIndex + differentCharIndex >= word.length) {
newParent = new PtNode(
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
- shortcutTargets, null /* bigrams */, frequency,
+ shortcutTargets, null /* bigrams */, probabilityInfo,
isNotAWord, isBlacklistEntry, newChildren);
} else {
newParent = new PtNode(
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
- null /* shortcutTargets */, null /* bigrams */, -1,
- false /* isNotAWord */, false /* isBlacklistEntry */, newChildren);
+ null /* shortcutTargets */, null /* bigrams */,
+ null /* probabilityInfo */, false /* isNotAWord */,
+ false /* isBlacklistEntry */, newChildren);
final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
charIndex + differentCharIndex, word.length),
- shortcutTargets, null /* bigrams */, frequency,
+ shortcutTargets, null /* bigrams */, probabilityInfo,
isNotAWord, isBlacklistEntry);
final int addIndex = word[charIndex + differentCharIndex]
> currentPtNode.mChars[differentCharIndex] ? 1 : 0;
@@ -617,8 +628,8 @@ public final class FusionDictionary implements Iterable<WordProperty> {
private static int findInsertionIndex(final PtNodeArray nodeArray, int character) {
final ArrayList<PtNode> data = nodeArray.mData;
final PtNode reference = new PtNode(new int[] { character },
- null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */,
- false /* isBlacklistEntry */);
+ null /* shortcutTargets */, null /* bigrams */, null /* probabilityInfo */,
+ false /* isNotAWord */, false /* isBlacklistEntry */);
int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
return result >= 0 ? result : -result - 1;
}
@@ -752,8 +763,9 @@ public final class FusionDictionary implements Iterable<WordProperty> {
currentPos.length = mCurrentString.length();
mPositions.addLast(currentPos);
}
- if (currentPtNode.mFrequency >= 0) {
- return new WordProperty(mCurrentString.toString(), currentPtNode.mFrequency,
+ if (currentPtNode.isTerminal()) {
+ return new WordProperty(mCurrentString.toString(),
+ currentPtNode.mProbabilityInfo,
currentPtNode.mShortcutTargets, currentPtNode.mBigrams,
currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry);
}