diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/ExpandableDictionary.java')
-rw-r--r-- | java/src/com/android/inputmethod/latin/ExpandableDictionary.java | 308 |
1 files changed, 178 insertions, 130 deletions
diff --git a/java/src/com/android/inputmethod/latin/ExpandableDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableDictionary.java index e954c0818..97a4a1816 100644 --- a/java/src/com/android/inputmethod/latin/ExpandableDictionary.java +++ b/java/src/com/android/inputmethod/latin/ExpandableDictionary.java @@ -16,11 +16,13 @@ package com.android.inputmethod.latin; -import java.util.LinkedList; - import android.content.Context; import android.os.AsyncTask; +import com.android.inputmethod.keyboard.Keyboard; + +import java.util.LinkedList; + /** * Base class for an in-memory dictionary that can grow dynamically and can * be searched for suggestions and valid words. @@ -32,15 +34,14 @@ public class ExpandableDictionary extends Dictionary { */ protected static final int MAX_WORD_LENGTH = 32; + // Bigram frequency is a fixed point number with 1 meaning 1.2 and 255 meaning 1.8. + protected static final int BIGRAM_MAX_FREQUENCY = 255; + private Context mContext; private char[] mWordBuilder = new char[MAX_WORD_LENGTH]; private int mDicTypeId; private int mMaxDepth; private int mInputLength; - private int[] mNextLettersFrequencies; - private StringBuilder sb = new StringBuilder(MAX_WORD_LENGTH); - - private static final char QUOTE = '\''; private boolean mRequiresReload; @@ -49,53 +50,66 @@ public class ExpandableDictionary extends Dictionary { // Use this lock before touching mUpdatingDictionary & mRequiresDownload private Object mUpdatingLock = new Object(); - static class Node { - char code; - int frequency; - boolean terminal; - Node parent; - NodeArray children; - LinkedList<NextWord> ngrams; // Supports ngram + private static class Node { + char mCode; + int mFrequency; + boolean mTerminal; + Node mParent; + NodeArray mChildren; + LinkedList<NextWord> mNGrams; // Supports ngram } - static class NodeArray { - Node[] data; - int length = 0; + private static class NodeArray { + Node[] mData; + int mLength = 0; private static final int INCREMENT = 2; NodeArray() { - data = new Node[INCREMENT]; + mData = new Node[INCREMENT]; } void add(Node n) { - if (length + 1 > data.length) { - Node[] tempData = new Node[length + INCREMENT]; - if (length > 0) { - System.arraycopy(data, 0, tempData, 0, length); + if (mLength + 1 > mData.length) { + Node[] tempData = new Node[mLength + INCREMENT]; + if (mLength > 0) { + System.arraycopy(mData, 0, tempData, 0, mLength); } - data = tempData; + mData = tempData; } - data[length++] = n; + mData[mLength++] = n; } } - static class NextWord { - Node word; - NextWord nextWord; - int frequency; + private static class NextWord { + public final Node mWord; + private int mFrequency; - NextWord(Node word, int frequency) { - this.word = word; - this.frequency = frequency; + public NextWord(Node word, int frequency) { + mWord = word; + mFrequency = frequency; + } + + public int getFrequency() { + return mFrequency; } - } + public int setFrequency(int freq) { + mFrequency = freq; + return mFrequency; + } + + public int addFrequency(int add) { + mFrequency += add; + if (mFrequency > BIGRAM_MAX_FREQUENCY) mFrequency = BIGRAM_MAX_FREQUENCY; + return mFrequency; + } + } private NodeArray mRoots; private int[][] mCodes; - ExpandableDictionary(Context context, int dicTypeId) { + public ExpandableDictionary(Context context, int dicTypeId) { mContext = context; clearDictionary(); mCodes = new int[MAX_WORD_LENGTH][]; @@ -128,13 +142,14 @@ public class ExpandableDictionary extends Dictionary { /** Override to load your dictionary here, on a background thread. */ public void loadDictionaryAsync() { + // empty base implementation } - Context getContext() { + public Context getContext() { return mContext; } - - int getMaxWordLength() { + + public int getMaxWordLength() { return MAX_WORD_LENGTH; } @@ -145,40 +160,40 @@ public class ExpandableDictionary extends Dictionary { private void addWordRec(NodeArray children, final String word, final int depth, final int frequency, Node parentNode) { final int wordLength = word.length(); + if (wordLength <= depth) return; final char c = word.charAt(depth); // Does children have the current character? - final int childrenLength = children.length; + final int childrenLength = children.mLength; Node childNode = null; boolean found = false; for (int i = 0; i < childrenLength; i++) { - childNode = children.data[i]; - if (childNode.code == c) { + childNode = children.mData[i]; + if (childNode.mCode == c) { found = true; break; } } if (!found) { childNode = new Node(); - childNode.code = c; - childNode.parent = parentNode; + childNode.mCode = c; + childNode.mParent = parentNode; children.add(childNode); } if (wordLength == depth + 1) { // Terminate this word - childNode.terminal = true; - childNode.frequency = Math.max(frequency, childNode.frequency); - if (childNode.frequency > 255) childNode.frequency = 255; + childNode.mTerminal = true; + childNode.mFrequency = Math.max(frequency, childNode.mFrequency); + if (childNode.mFrequency > 255) childNode.mFrequency = 255; return; } - if (childNode.children == null) { - childNode.children = new NodeArray(); + if (childNode.mChildren == null) { + childNode.mChildren = new NodeArray(); } - addWordRec(childNode.children, word, depth + 1, frequency, childNode); + addWordRec(childNode.mChildren, word, depth + 1, frequency, childNode); } @Override - public void getWords(final WordComposer codes, final WordCallback callback, - int[] nextLettersFrequencies) { + public void getWords(final WordComposer codes, final WordCallback callback) { synchronized (mUpdatingLock) { // If we need to update, start off a background task if (mRequiresReload) startDictionaryLoadingTaskLocked(); @@ -187,7 +202,6 @@ public class ExpandableDictionary extends Dictionary { } mInputLength = codes.size(); - mNextLettersFrequencies = nextLettersFrequencies; if (mCodes.length < mInputLength) mCodes = new int[mInputLength][]; // Cache the codes so that we don't have to lookup an array list for (int i = 0; i < mInputLength; i++) { @@ -214,9 +228,20 @@ public class ExpandableDictionary extends Dictionary { /** * Returns the word's frequency or -1 if not found */ - public int getWordFrequency(CharSequence word) { + protected int getWordFrequency(CharSequence word) { + // Case-sensitive search Node node = searchNode(mRoots, word, 0, word.length()); - return (node == null) ? -1 : node.frequency; + return (node == null) ? -1 : node.mFrequency; + } + + private static int computeSkippedWordFinalFreq(int freq, int snr, int inputLength) { + // The computation itself makes sense for >= 2, but the == 2 case returns 0 + // anyway so we may as well test against 3 instead and return the constant + if (inputLength >= 3) { + return (freq * snr * (inputLength - 2)) / (inputLength - 1); + } else { + return 0; + } } /** @@ -232,16 +257,17 @@ public class ExpandableDictionary extends Dictionary { * @param completion whether the traversal is now in completion mode - meaning that we've * exhausted the input and we're looking for all possible suffixes. * @param snr current weight of the word being formed - * @param inputIndex position in the input characters. This can be off from the depth in + * @param inputIndex position in the input characters. This can be off from the depth in * case we skip over some punctuations such as apostrophe in the traversal. That is, if you type * "wouldve", it could be matching "would've", so the depth will be one more than the * inputIndex * @param callback the callback class for adding a word */ - protected void getWordsRec(NodeArray roots, final WordComposer codes, final char[] word, + // TODO: Share this routine with the native code for BinaryDictionary + protected void getWordsRec(NodeArray roots, final WordComposer codes, final char[] word, final int depth, boolean completion, int snr, int inputIndex, int skipPos, WordCallback callback) { - final int count = roots.length; + final int count = roots.mLength; final int codeSize = mInputLength; // Optimization: Prune out words that are too long compared to how much was typed. if (depth > mMaxDepth) { @@ -255,34 +281,36 @@ public class ExpandableDictionary extends Dictionary { } for (int i = 0; i < count; i++) { - final Node node = roots.data[i]; - final char c = node.code; + final Node node = roots.mData[i]; + final char c = node.mCode; final char lowerC = toLowerCase(c); - final boolean terminal = node.terminal; - final NodeArray children = node.children; - final int freq = node.frequency; + final boolean terminal = node.mTerminal; + final NodeArray children = node.mChildren; + final int freq = node.mFrequency; if (completion) { word[depth] = c; if (terminal) { - if (!callback.addWord(word, 0, depth + 1, freq * snr, mDicTypeId, - DataType.UNIGRAM)) { - return; + final int finalFreq; + if (skipPos < 0) { + finalFreq = freq * snr; + } else { + finalFreq = computeSkippedWordFinalFreq(freq, snr, mInputLength); } - // Add to frequency of next letters for predictive correction - if (mNextLettersFrequencies != null && depth >= inputIndex && skipPos < 0 - && mNextLettersFrequencies.length > word[inputIndex]) { - mNextLettersFrequencies[word[inputIndex]]++; + if (!callback.addWord(word, 0, depth + 1, finalFreq, mDicTypeId, + DataType.UNIGRAM)) { + return; } } if (children != null) { getWordsRec(children, codes, word, depth + 1, completion, snr, inputIndex, skipPos, callback); } - } else if ((c == QUOTE && currentChars[0] != QUOTE) || depth == skipPos) { + } else if ((c == Keyboard.CODE_SINGLE_QUOTE + && currentChars[0] != Keyboard.CODE_SINGLE_QUOTE) || depth == skipPos) { // Skip the ' and continue deeper word[depth] = c; if (children != null) { - getWordsRec(children, codes, word, depth + 1, completion, snr, inputIndex, + getWordsRec(children, codes, word, depth + 1, completion, snr, inputIndex, skipPos, callback); } } else { @@ -299,10 +327,16 @@ public class ExpandableDictionary extends Dictionary { if (codeSize == inputIndex + 1) { if (terminal) { - if (INCLUDE_TYPED_WORD_IF_VALID + if (INCLUDE_TYPED_WORD_IF_VALID || !same(word, depth + 1, codes.getTypedWord())) { - int finalFreq = freq * snr * addedAttenuation; - if (skipPos < 0) finalFreq *= FULL_WORD_FREQ_MULTIPLIER; + final int finalFreq; + if (skipPos < 0) { + finalFreq = freq * snr * addedAttenuation + * FULL_WORD_SCORE_MULTIPLIER; + } else { + finalFreq = computeSkippedWordFinalFreq(freq, + snr * addedAttenuation, mInputLength); + } callback.addWord(word, 0, depth + 1, finalFreq, mDicTypeId, DataType.UNIGRAM); } @@ -313,7 +347,7 @@ public class ExpandableDictionary extends Dictionary { skipPos, callback); } } else if (children != null) { - getWordsRec(children, codes, word, depth + 1, + getWordsRec(children, codes, word, depth + 1, false, snr * addedAttenuation, inputIndex + 1, skipPos, callback); } @@ -333,31 +367,33 @@ public class ExpandableDictionary extends Dictionary { /** * Adds bigrams to the in-memory trie structure that is being used to retrieve any word - * @param frequency frequency for this bigrams - * @param addFrequency if true, it adds to current frequency + * @param frequency frequency for this bigram + * @param addFrequency if true, it adds to current frequency, else it overwrites the old value * @return returns the final frequency */ private int addOrSetBigram(String word1, String word2, int frequency, boolean addFrequency) { - Node firstWord = searchWord(mRoots, word1, 0, null); + // We don't want results to be different according to case of the looked up left hand side + // word. We do want however to return the correct case for the right hand side. + // So we want to squash the case of the left hand side, and preserve that of the right + // hand side word. + Node firstWord = searchWord(mRoots, word1.toLowerCase(), 0, null); Node secondWord = searchWord(mRoots, word2, 0, null); - LinkedList<NextWord> bigram = firstWord.ngrams; + LinkedList<NextWord> bigram = firstWord.mNGrams; if (bigram == null || bigram.size() == 0) { - firstWord.ngrams = new LinkedList<NextWord>(); - bigram = firstWord.ngrams; + firstWord.mNGrams = new LinkedList<NextWord>(); + bigram = firstWord.mNGrams; } else { for (NextWord nw : bigram) { - if (nw.word == secondWord) { + if (nw.mWord == secondWord) { if (addFrequency) { - nw.frequency += frequency; + return nw.addFrequency(frequency); } else { - nw.frequency = frequency; + return nw.setFrequency(frequency); } - return nw.frequency; } } } - NextWord nw = new NextWord(secondWord, frequency); - firstWord.ngrams.add(nw); + firstWord.mNGrams.add(new NextWord(secondWord, frequency)); return frequency; } @@ -369,31 +405,31 @@ public class ExpandableDictionary extends Dictionary { final int wordLength = word.length(); final char c = word.charAt(depth); // Does children have the current character? - final int childrenLength = children.length; + final int childrenLength = children.mLength; Node childNode = null; boolean found = false; for (int i = 0; i < childrenLength; i++) { - childNode = children.data[i]; - if (childNode.code == c) { + childNode = children.mData[i]; + if (childNode.mCode == c) { found = true; break; } } if (!found) { childNode = new Node(); - childNode.code = c; - childNode.parent = parentNode; + childNode.mCode = c; + childNode.mParent = parentNode; children.add(childNode); } if (wordLength == depth + 1) { // Terminate this word - childNode.terminal = true; + childNode.mTerminal = true; return childNode; } - if (childNode.children == null) { - childNode.children = new NodeArray(); + if (childNode.mChildren == null) { + childNode.mChildren = new NodeArray(); } - return searchWord(childNode.children, word, depth + 1, childNode); + return searchWord(childNode.mChildren, word, depth + 1, childNode); } // @VisibleForTesting @@ -406,18 +442,22 @@ public class ExpandableDictionary extends Dictionary { } } - private void runReverseLookUp(final CharSequence previousWord, final WordCallback callback) { - Node prevWord = searchNode(mRoots, previousWord, 0, previousWord.length()); - if (prevWord != null && prevWord.ngrams != null) { - reverseLookUp(prevWord.ngrams, callback); + private void runBigramReverseLookUp(final CharSequence previousWord, + final WordCallback callback) { + // Search for the lowercase version of the word only, because that's where bigrams + // store their sons. + Node prevWord = searchNode(mRoots, previousWord.toString().toLowerCase(), 0, + previousWord.length()); + if (prevWord != null && prevWord.mNGrams != null) { + reverseLookUp(prevWord.mNGrams, callback); } } @Override public void getBigrams(final WordComposer codes, final CharSequence previousWord, - final WordCallback callback, int[] nextLettersFrequencies) { + final WordCallback callback) { if (!reloadDictionaryIfRequired()) { - runReverseLookUp(previousWord, callback); + runBigramReverseLookUp(previousWord, callback); } } @@ -430,10 +470,14 @@ public class ExpandableDictionary extends Dictionary { try { Thread.sleep(100); } catch (InterruptedException e) { + // } } } + // Local to reverseLookUp, but do not allocate each time. + private final char[] mLookedUpString = new char[MAX_WORD_LENGTH]; + /** * reverseLookUp retrieves the full word given a list of terminal nodes and adds those words * through callback. @@ -444,42 +488,45 @@ public class ExpandableDictionary extends Dictionary { Node node; int freq; for (NextWord nextWord : terminalNodes) { - node = nextWord.word; - freq = nextWord.frequency; - // TODO Not the best way to limit suggestion threshold - if (freq >= UserBigramDictionary.SUGGEST_THRESHOLD) { - sb.setLength(0); - do { - sb.insert(0, node.code); - node = node.parent; - } while(node != null); - - // TODO better way to feed char array? - callback.addWord(sb.toString().toCharArray(), 0, sb.length(), freq, mDicTypeId, - DataType.BIGRAM); - } + node = nextWord.mWord; + freq = nextWord.getFrequency(); + int index = MAX_WORD_LENGTH; + do { + --index; + mLookedUpString[index] = node.mCode; + node = node.mParent; + } while (node != null); + + callback.addWord(mLookedUpString, index, MAX_WORD_LENGTH - index, freq, mDicTypeId, + DataType.BIGRAM); } } /** - * Search for the terminal node of the word + * Recursively search for the terminal node of the word. + * + * One iteration takes the full word to search for and the current index of the recursion. + * + * @param children the node of the trie to search under. + * @param word the word to search for. Only read [offset..length] so there may be trailing chars + * @param offset the index in {@code word} this recursion should operate on. + * @param length the length of the input word. * @return Returns the terminal node of the word if the word exists */ private Node searchNode(final NodeArray children, final CharSequence word, final int offset, final int length) { - // TODO Consider combining with addWordRec - final int count = children.length; - char currentChar = word.charAt(offset); + final int count = children.mLength; + final char currentChar = word.charAt(offset); for (int j = 0; j < count; j++) { - final Node node = children.data[j]; - if (node.code == currentChar) { + final Node node = children.mData[j]; + if (node.mCode == currentChar) { if (offset == length - 1) { - if (node.terminal) { + if (node.mTerminal) { return node; } } else { - if (node.children != null) { - Node returnNode = searchNode(node.children, word, offset + 1, length); + if (node.mChildren != null) { + Node returnNode = searchNode(node.mChildren, word, offset + 1, length); if (returnNode != null) return returnNode; } } @@ -503,16 +550,17 @@ public class ExpandableDictionary extends Dictionary { } } - static char toLowerCase(char c) { + private static char toLowerCase(char c) { + char baseChar = c; if (c < BASE_CHARS.length) { - c = BASE_CHARS[c]; + baseChar = BASE_CHARS[c]; } - if (c >= 'A' && c <= 'Z') { - c = (char) (c | 32); - } else if (c > 127) { - c = Character.toLowerCase(c); + if (baseChar >= 'A' && baseChar <= 'Z') { + return (char)(baseChar | 32); + } else if (baseChar > 127) { + return Character.toLowerCase(baseChar); } - return c; + return baseChar; } /** @@ -521,7 +569,7 @@ public class ExpandableDictionary extends Dictionary { * if c is not a combined character, or the base character if it * is combined. */ - static final char BASE_CHARS[] = { + private static final char BASE_CHARS[] = { 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, |