diff options
-rw-r--r-- | java/src/com/android/inputmethod/latin/BinaryDictionary.java | 5 | ||||
-rw-r--r-- | java/src/com/android/inputmethod/latin/UserHistoryDictionary.java | 183 | ||||
-rw-r--r-- | java/src/com/android/inputmethod/latin/UserHistoryDictionaryBigramList.java | 91 | ||||
-rw-r--r-- | java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java | 11 | ||||
-rw-r--r-- | native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp | 6 | ||||
-rw-r--r-- | native/jni/src/bigram_dictionary.cpp | 14 | ||||
-rw-r--r-- | native/jni/src/binary_format.h | 27 | ||||
-rw-r--r-- | native/jni/src/dictionary.cpp | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary.h | 2 | ||||
-rw-r--r-- | native/jni/src/unigram_dictionary.cpp | 17 | ||||
-rw-r--r-- | native/jni/src/unigram_dictionary.h | 2 |
11 files changed, 232 insertions, 130 deletions
diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java index e18aee6ff..cb1069cfb 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java @@ -84,7 +84,7 @@ public class BinaryDictionary extends Dictionary { private native long openNative(String sourceDir, long dictOffset, long dictSize, int typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords); private native void closeNative(long dict); - private native boolean isValidWordNative(long dict, int[] word, int wordLength); + private native int getFrequencyNative(long dict, int[] word, int wordLength); private native boolean isValidBigramNative(long dict, int[] word1, int[] word2); private native int getSuggestionsNative(long dict, long proximityInfo, int[] xCoordinates, int[] yCoordinates, int[] inputCodes, int codesSize, int[] prevWordForBigrams, @@ -203,7 +203,8 @@ public class BinaryDictionary extends Dictionary { public boolean isValidWord(CharSequence word) { if (word == null) return false; int[] chars = StringUtils.toCodePointArray(word.toString()); - return isValidWordNative(mNativeDict, chars, chars.length); + final int freq = getFrequencyNative(mNativeDict, chars, chars.length); + return freq >= 0; } // TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni diff --git a/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java b/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java index fa3d1be11..d5163f2a1 100644 --- a/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java +++ b/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java @@ -30,8 +30,6 @@ import android.util.Log; import com.android.inputmethod.latin.UserHistoryForgettingCurveUtils.ForgettingCurveParams; import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; /** * Locally gathers stats about the words user types and various other signals like auto-correction @@ -39,6 +37,7 @@ import java.util.Iterator; */ public class UserHistoryDictionary extends ExpandableDictionary { private static final String TAG = "UserHistoryDictionary"; + public static final boolean DBG_SAVE_RESTORE = false; /** Any pair being typed or picked */ private static final int FREQUENCY_FOR_TYPED = 2; @@ -78,7 +77,8 @@ public class UserHistoryDictionary extends ExpandableDictionary { /** Locale for which this auto dictionary is storing words */ private String mLocale; - private HashSet<Bigram> mPendingWrites = new HashSet<Bigram>(); + private UserHistoryDictionaryBigramList mBigramList = + new UserHistoryDictionaryBigramList(); private final Object mPendingWritesLock = new Object(); private static volatile boolean sUpdatingDB = false; private final SharedPreferences mPrefs; @@ -99,35 +99,6 @@ public class UserHistoryDictionary extends ExpandableDictionary { private static DatabaseHelper sOpenHelper = null; - private static class Bigram { - public final String mWord1; - public final String mWord2; - - Bigram(String word1, String word2) { - this.mWord1 = word1; - this.mWord2 = word2; - } - - @Override - public boolean equals(Object bigram) { - if (!(bigram instanceof Bigram)) { - return false; - } - final Bigram bigram2 = (Bigram) bigram; - final boolean eq1 = - mWord1 == null ? bigram2.mWord1 == null : mWord1.equals(bigram2.mWord1); - if (!eq1) { - return false; - } - return mWord2 == null ? bigram2.mWord2 == null : mWord2.equals(bigram2.mWord2); - } - - @Override - public int hashCode() { - return (mWord1 + " " + mWord2).hashCode(); - } - } - public void setDatabaseMax(int maxHistoryBigram) { sMaxHistoryBigrams = maxHistoryBigram; } @@ -190,20 +161,17 @@ public class UserHistoryDictionary extends ExpandableDictionary { freq = super.setBigramAndGetFrequency(word1, word2, new ForgettingCurveParams()); } synchronized (mPendingWritesLock) { - final Bigram bi = new Bigram(word1, word2); - if (!mPendingWrites.contains(bi)) { - mPendingWrites.add(bi); - } + mBigramList.addBigram(word1, word2); } return freq; } public boolean cancelAddingUserHistory(String word1, String word2) { - final Bigram bi = new Bigram(word1, word2); - if (mPendingWrites.contains(bi)) { - mPendingWrites.remove(bi); - return super.removeBigram(word1, word2); + synchronized (mPendingWritesLock) { + if (mBigramList.removeBigram(word1, word2)) { + return super.removeBigram(word1, word2); + } } return false; } @@ -214,11 +182,11 @@ public class UserHistoryDictionary extends ExpandableDictionary { private void flushPendingWrites() { synchronized (mPendingWritesLock) { // Nothing pending? Return - if (mPendingWrites.isEmpty()) return; + if (mBigramList.isEmpty()) return; // Create a background thread to write the pending entries - new UpdateDbTask(sOpenHelper, mPendingWrites, mLocale, this).execute(); + new UpdateDbTask(sOpenHelper, mBigramList, mLocale, this).execute(); // Create a new map for writing new entries into while the old one is written to db - mPendingWrites = new HashSet<Bigram>(); + mBigramList = new UserHistoryDictionaryBigramList(); } } @@ -251,6 +219,9 @@ public class UserHistoryDictionary extends ExpandableDictionary { final String word1 = cursor.getString(word1Index); final String word2 = cursor.getString(word2Index); final int frequency = cursor.getInt(frequencyIndex); + if (DBG_SAVE_RESTORE) { + Log.d(TAG, "--- Load user history: " + word1 + ", " + word2); + } // Safeguard against adding really long words. Stack may overflow due // to recursive lookup if (null == word1) { @@ -259,8 +230,9 @@ public class UserHistoryDictionary extends ExpandableDictionary { && word2.length() < BinaryDictionary.MAX_WORD_LENGTH) { super.setBigramAndGetFrequency( word1, word2, new ForgettingCurveParams(frequency, now, last)); - // TODO: optimize - mPendingWrites.add(new Bigram(word1, word2)); + } + synchronized(mPendingWritesLock) { + mBigramList.addBigram(word1, word2); } cursor.moveToNext(); } @@ -339,14 +311,15 @@ public class UserHistoryDictionary extends ExpandableDictionary { * the in-memory trie. */ private static class UpdateDbTask extends AsyncTask<Void, Void, Void> { - private final HashSet<Bigram> mMap; + private final UserHistoryDictionaryBigramList mBigramList; private final DatabaseHelper mDbHelper; private final String mLocale; private final UserHistoryDictionary mUserHistoryDictionary; - public UpdateDbTask(DatabaseHelper openHelper, HashSet<Bigram> pendingWrites, + public UpdateDbTask( + DatabaseHelper openHelper, UserHistoryDictionaryBigramList pendingWrites, String locale, UserHistoryDictionary dict) { - mMap = pendingWrites; + mBigramList = pendingWrites; mLocale = locale; mDbHelper = openHelper; mUserHistoryDictionary = dict; @@ -401,67 +374,71 @@ public class UserHistoryDictionary extends ExpandableDictionary { return null; } db.execSQL("PRAGMA foreign_keys = ON;"); + final boolean addLevel0Bigram = mBigramList.size() <= sMaxHistoryBigrams; + // Write all the entries to the db - final Iterator<Bigram> iterator = mMap.iterator(); - while (iterator.hasNext()) { - // TODO: this process of making a text search for each pair each time - // is terribly inefficient. Optimize this. - final Bigram bi = iterator.next(); - - // find pair id - Cursor c = null; - try { - if (null != bi.mWord1) { - c = db.query(MAIN_TABLE_NAME, new String[] { MAIN_COLUMN_ID }, - MAIN_COLUMN_WORD1 + "=? AND " + MAIN_COLUMN_WORD2 + "=? AND " - + MAIN_COLUMN_LOCALE + "=?", - new String[] { bi.mWord1, bi.mWord2, mLocale }, null, null, - null); - } else { - c = db.query(MAIN_TABLE_NAME, new String[] { MAIN_COLUMN_ID }, - MAIN_COLUMN_WORD1 + " IS NULL AND " + MAIN_COLUMN_WORD2 + "=? AND " - + MAIN_COLUMN_LOCALE + "=?", - new String[] { bi.mWord2, mLocale }, null, null, null); - } + for (String word1 : mBigramList.keySet()) { + for (String word2 : mBigramList.getBigrams(word1)) { + // TODO: this process of making a text search for each pair each time + // is terribly inefficient. Optimize this. + // find pair id + Cursor c = null; + try { + if (null != word1) { + c = db.query(MAIN_TABLE_NAME, new String[] { MAIN_COLUMN_ID }, + MAIN_COLUMN_WORD1 + "=? AND " + MAIN_COLUMN_WORD2 + "=? AND " + + MAIN_COLUMN_LOCALE + "=?", + new String[] { word1, word2, mLocale }, null, null, + null); + } else { + c = db.query(MAIN_TABLE_NAME, new String[] { MAIN_COLUMN_ID }, + MAIN_COLUMN_WORD1 + " IS NULL AND " + MAIN_COLUMN_WORD2 + + "=? AND " + MAIN_COLUMN_LOCALE + "=?", + new String[] { word2, mLocale }, null, null, null); + } - final int pairId; - if (c.moveToFirst()) { - // existing pair - pairId = c.getInt(c.getColumnIndex(MAIN_COLUMN_ID)); - db.delete(FREQ_TABLE_NAME, FREQ_COLUMN_PAIR_ID + "=?", - new String[] { Integer.toString(pairId) }); - } else { - // new pair - Long pairIdLong = db.insert(MAIN_TABLE_NAME, null, - getContentValues(bi.mWord1, bi.mWord2, mLocale)); - pairId = pairIdLong.intValue(); - } - // insert new frequency - final int freq; - if (bi.mWord1 == null) { - freq = FREQUENCY_FOR_TYPED; - } else { - final NextWord nw = mUserHistoryDictionary.getBigramWord( - bi.mWord1, bi.mWord2); - if (nw != null) { - final int tempFreq = nw.getFcValue(); - // TODO: Check whether the word is valid or not - if (UserHistoryForgettingCurveUtils.needsToSave( - (byte)tempFreq, false)) { - freq = tempFreq; + final int pairId; + if (c.moveToFirst()) { + // existing pair + pairId = c.getInt(c.getColumnIndex(MAIN_COLUMN_ID)); + db.delete(FREQ_TABLE_NAME, FREQ_COLUMN_PAIR_ID + "=?", + new String[] { Integer.toString(pairId) }); + } else { + // new pair + Long pairIdLong = db.insert(MAIN_TABLE_NAME, null, + getContentValues(word1, word2, mLocale)); + pairId = pairIdLong.intValue(); + } + // insert new frequency + final int freq; + if (word1 == null) { + freq = FREQUENCY_FOR_TYPED; + } else { + final NextWord nw = mUserHistoryDictionary.getBigramWord(word1, word2); + if (nw != null) { + final int tempFreq = nw.getFcValue(); + // TODO: Check whether the word is valid or not + if (UserHistoryForgettingCurveUtils.needsToSave( + (byte)tempFreq, false, addLevel0Bigram)) { + freq = tempFreq; + } else { + freq = -1; + } } else { freq = -1; } - } else { - freq = -1; } - } - if (freq > 0) { - db.insert(FREQ_TABLE_NAME, null, getFrequencyContentValues(pairId, freq)); - } - } finally { - if (c != null) { - c.close(); + if (freq > 0) { + if (DBG_SAVE_RESTORE) { + Log.d(TAG, "--- Save user history: " + word1 + ", " + word2); + } + db.insert(FREQ_TABLE_NAME, null, + getFrequencyContentValues(pairId, freq)); + } + } finally { + if (c != null) { + c.close(); + } } } } diff --git a/java/src/com/android/inputmethod/latin/UserHistoryDictionaryBigramList.java b/java/src/com/android/inputmethod/latin/UserHistoryDictionaryBigramList.java new file mode 100644 index 000000000..409f921ff --- /dev/null +++ b/java/src/com/android/inputmethod/latin/UserHistoryDictionaryBigramList.java @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin; + +import android.util.Log; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +/** + * A store of bigrams which will be updated when the user history dictionary is closed + * All bigrams including stale ones in SQL DB should be stored in this class to avoid adding stale + * bigrams when we write to the SQL DB. + */ +public class UserHistoryDictionaryBigramList { + private static final String TAG = UserHistoryDictionaryBigramList.class.getSimpleName(); + private static final HashSet<String> EMPTY_STRING_SET = new HashSet<String>(); + private final HashMap<String, HashSet<String>> mBigramMap = + new HashMap<String, HashSet<String>>(); + private int mSize = 0; + + public void evictAll() { + mSize = 0; + mBigramMap.clear(); + } + + public void addBigram(String word1, String word2) { + if (UserHistoryDictionary.DBG_SAVE_RESTORE) { + Log.d(TAG, "--- add bigram: " + word1 + ", " + word2); + } + final HashSet<String> set; + if (mBigramMap.containsKey(word1)) { + set = mBigramMap.get(word1); + } else { + set = new HashSet<String>(); + mBigramMap.put(word1, set); + } + if (!set.contains(word2)) { + ++mSize; + set.add(word2); + } + } + + public int size() { + return mSize; + } + + public boolean isEmpty() { + return mBigramMap.isEmpty(); + } + + public Set<String> keySet() { + return mBigramMap.keySet(); + } + + public HashSet<String> getBigrams(String word1) { + if (!mBigramMap.containsKey(word1)) { + return EMPTY_STRING_SET; + } else { + return mBigramMap.get(word1); + } + } + + public boolean removeBigram(String word1, String word2) { + final HashSet<String> set = getBigrams(word1); + if (set.isEmpty()) { + return false; + } + if (set.contains(word2)) { + set.remove(word2); + --mSize; + return true; + } + return false; + } +} diff --git a/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java b/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java index f30fee23e..feb1d0029 100644 --- a/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java +++ b/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java @@ -162,10 +162,15 @@ public class UserHistoryForgettingCurveUtils { // TODO: isValid should be false for a word whose frequency is 0, // or that is not in the dictionary. - public static boolean needsToSave(byte fc, boolean isValid) { + /** + * Check wheather we should save the bigram to the SQL DB or not + */ + public static boolean needsToSave(byte fc, boolean isValid, boolean addLevel0Bigram) { int level = fcToLevel(fc); - if (isValid && level == 0) { - return false; + if (level == 0) { + if (isValid || !addLevel0Bigram) { + return false; + } } final int elapsedTime = fcToElapsedTime(fc); return (elapsedTime < ELAPSED_TIME_MAX - 1 || level > 0); diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index f130062a1..d10dc962e 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -173,12 +173,12 @@ static int latinime_BinaryDictionary_getBigrams(JNIEnv *env, jobject object, jlo return count; } -static jboolean latinime_BinaryDictionary_isValidWord(JNIEnv *env, jobject object, jlong dict, +static jint latinime_BinaryDictionary_getFrequency(JNIEnv *env, jobject object, jlong dict, jintArray wordArray, jint wordLength) { Dictionary *dictionary = (Dictionary*)dict; if (!dictionary) return (jboolean) false; jint *word = env->GetIntArrayElements(wordArray, 0); - jboolean result = dictionary->isValidWord(word, wordLength); + jint result = dictionary->getFrequency(word, wordLength); env->ReleaseIntArrayElements(wordArray, word, JNI_ABORT); return result; } @@ -253,7 +253,7 @@ static JNINativeMethod sMethods[] = { {"closeNative", "(J)V", (void*)latinime_BinaryDictionary_close}, {"getSuggestionsNative", "(JJ[I[I[II[IZ[C[I)I", (void*)latinime_BinaryDictionary_getSuggestions}, - {"isValidWordNative", "(J[II)Z", (void*)latinime_BinaryDictionary_isValidWord}, + {"getFrequencyNative", "(J[II)I", (void*)latinime_BinaryDictionary_getFrequency}, {"isValidBigramNative", "(J[I[I)Z", (void*)latinime_BinaryDictionary_isValidBigram}, {"getBigramsNative", "(J[II[II[C[III)I", (void*)latinime_BinaryDictionary_getBigrams}, {"calcNormalizedScoreNative", "([CI[CII)F", diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index ac2a26172..eb4bf8d1a 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -117,14 +117,22 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); uint16_t bigramBuffer[MAX_WORD_LENGTH]; + int unigramFreq; const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, - bigramBuffer); + bigramBuffer, &unigramFreq); // codesSize == 0 means we are trying to find bigram predictions. if (codesSize < 1 || checkFirstCharacter(bigramBuffer)) { - const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + const int bigramFreq = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + // Due to space constraints, the frequency for bigrams is approximate - the lower the + // unigram frequency, the worse the precision. The theoritical maximum error in + // resulting frequency is 8 - although in the practice it's never bigger than 3 or 4 + // in very bad cases. This means that sometimes, we'll see some bigrams interverted + // here, but it can't get too bad. + const int frequency = + BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreq); if (addWordBigram(bigramBuffer, length, frequency)) { ++bigramCount; } @@ -149,8 +157,8 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord, } else { pos = BinaryFormat::skipOtherCharacters(root, pos); } - pos = BinaryFormat::skipChildrenPosition(flags, pos); pos = BinaryFormat::skipFrequency(flags, pos); + pos = BinaryFormat::skipChildrenPosition(flags, pos); pos = BinaryFormat::skipShortcuts(root, flags, pos); return pos; } diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index 40f197619..51bf8ebbc 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -66,7 +66,8 @@ class BinaryFormat { static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord, const int length); static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, - uint16_t* outWord); + uint16_t* outWord, int* outUnigramFrequency); + static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); static int getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq); @@ -390,10 +391,11 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, * address: the byte position of the last chargroup of the word we are searching for (this is * what is stored as the "bigram address" in each bigram) * outword: an array to write the found word, with MAX_WORD_LENGTH size. + * outUnigramFrequency: a pointer to an int to write the frequency into. * Return value : the length of the word, of 0 if the word was not found. */ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address, - const int maxDepth, uint16_t* outWord) { + const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) { int pos = 0; int wordPos = 0; @@ -421,11 +423,12 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a // We count chars in order to avoid infinite loops if the file is broken or // if there is some other bug int charCount = maxDepth; - while (-1 != nextChar && --charCount > 0) { + while (NOT_A_CHARACTER != nextChar && --charCount > 0) { outWord[++wordPos] = nextChar; nextChar = getCharCodeAndForwardPointer(root, &pos); } } + *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos); return ++wordPos; } // We need to skip past this char group, so skip any remaining chars after the @@ -529,6 +532,16 @@ static inline int backoff(const int unigramFreq) { // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8); } +inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) { + // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the + // unigram frequency to be the median value of the 17th step from the top. A value of + // 0 for the bigram frequency represents the middle of the 16th step from the top, + // while a value of 15 represents the middle of the top step. + // See makedict.BinaryDictInputOutput for details. + const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); + return (int)(unigramFreq + (bigramFreq + 1) * stepSize); +} + // This returns a probability in log space. inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq) { @@ -537,13 +550,7 @@ inline int BinaryFormat::getProbability(const int position, const std::map<int, const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position); if (bigramFreqIt != bigramMap->end()) { const int bigramFreq = bigramFreqIt->second; - // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the - // unigram frequency to be the median value of the 17th step from the top. A value of - // 0 for the bigram frequency represents the middle of the 16th step from the top, - // while a value of 15 represents the middle of the top step. - // See makedict.BinaryDictInputOutput for details. - const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); - return (int)(unigramFreq + bigramFreq * stepSize); + return computeFrequencyForBigram(unigramFreq, bigramFreq); } else { return backoff(unigramFreq); } diff --git a/native/jni/src/dictionary.cpp b/native/jni/src/dictionary.cpp index 65d0f73a3..1fb02478b 100644 --- a/native/jni/src/dictionary.cpp +++ b/native/jni/src/dictionary.cpp @@ -55,8 +55,8 @@ Dictionary::~Dictionary() { delete mBigramDictionary; } -bool Dictionary::isValidWord(const int32_t *word, int length) { - return mUnigramDictionary->isValidWord(word, length); +int Dictionary::getFrequency(const int32_t *word, int length) { + return mUnigramDictionary->getFrequency(word, length); } bool Dictionary::isValidBigram(const int32_t *word1, int length1, const int32_t *word2, diff --git a/native/jni/src/dictionary.h b/native/jni/src/dictionary.h index 87891ee4d..9f2367904 100644 --- a/native/jni/src/dictionary.h +++ b/native/jni/src/dictionary.h @@ -52,7 +52,7 @@ class Dictionary { maxWordLength, maxBigrams); } - bool isValidWord(const int32_t *word, int length); + int getFrequency(const int32_t *word, int length); bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2); void *getDict() { return (void *)mDict; } int getDictSize() { return mDictSize; } diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index 828582848..efe9c4fe3 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -747,8 +747,21 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor return maxFreq; } -bool UnigramDictionary::isValidWord(const int32_t* const inWord, const int length) const { - return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length); +int UnigramDictionary::getFrequency(const int32_t* const inWord, const int length) const { + const uint8_t* const root = DICT_ROOT; + int pos = BinaryFormat::getTerminalPosition(root, inWord, length); + if (NOT_VALID_WORD == pos) { + return NOT_A_PROBABILITY; + } + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); + const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); + if (hasMultipleChars) { + pos = BinaryFormat::skipOtherCharacters(root, pos); + } else { + BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); + } + const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); + return unigramFreq; } // TODO: remove this function. diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h index b9233518f..b70894004 100644 --- a/native/jni/src/unigram_dictionary.h +++ b/native/jni/src/unigram_dictionary.h @@ -72,7 +72,7 @@ class UnigramDictionary { UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, const unsigned int flags); - bool isValidWord(const int32_t* const inWord, const int length) const; + int getFrequency(const int32_t* const inWord, const int length) const; int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates, const int *ycoordinates, |