aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- java/src/com/android/inputmethod/latin/BinaryDictionary.java 5
-rw-r--r-- java/src/com/android/inputmethod/latin/UserHistoryDictionary.java 183
-rw-r--r-- java/src/com/android/inputmethod/latin/UserHistoryDictionaryBigramList.java 91
-rw-r--r-- java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java 11
-rw-r--r-- native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp 6
-rw-r--r-- native/jni/src/bigram_dictionary.cpp 14
-rw-r--r-- native/jni/src/binary_format.h 27
-rw-r--r-- native/jni/src/dictionary.cpp 4
-rw-r--r-- native/jni/src/dictionary.h 2
-rw-r--r-- native/jni/src/unigram_dictionary.cpp 17
-rw-r--r-- native/jni/src/unigram_dictionary.h 2
11 files changed, 232 insertions, 130 deletions
diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java
index e18aee6ff..cb1069cfb 100644
--- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java
+++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java
@@ -84,7 +84,7 @@ public class BinaryDictionary extends Dictionary {
private native long openNative(String sourceDir, long dictOffset, long dictSize,
int typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords);
private native void closeNative(long dict);
- private native boolean isValidWordNative(long dict, int[] word, int wordLength);
+ private native int getFrequencyNative(long dict, int[] word, int wordLength);
private native boolean isValidBigramNative(long dict, int[] word1, int[] word2);
private native int getSuggestionsNative(long dict, long proximityInfo, int[] xCoordinates,
int[] yCoordinates, int[] inputCodes, int codesSize, int[] prevWordForBigrams,
@@ -203,7 +203,8 @@ public class BinaryDictionary extends Dictionary {
public boolean isValidWord(CharSequence word) {
if (word == null) return false;
int[] chars = StringUtils.toCodePointArray(word.toString());
- return isValidWordNative(mNativeDict, chars, chars.length);
+ final int freq = getFrequencyNative(mNativeDict, chars, chars.length);
+ return freq >= 0;
}
// TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni
diff --git a/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java b/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java
index fa3d1be11..d5163f2a1 100644
--- a/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java
+++ b/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java
@@ -30,8 +30,6 @@ import android.util.Log;
import com.android.inputmethod.latin.UserHistoryForgettingCurveUtils.ForgettingCurveParams;
import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
/**
* Locally gathers stats about the words user types and various other signals like auto-correction
@@ -39,6 +37,7 @@ import java.util.Iterator;
*/
public class UserHistoryDictionary extends ExpandableDictionary {
private static final String TAG = "UserHistoryDictionary";
+ public static final boolean DBG_SAVE_RESTORE = false;
/** Any pair being typed or picked */
private static final int FREQUENCY_FOR_TYPED = 2;
@@ -78,7 +77,8 @@ public class UserHistoryDictionary extends ExpandableDictionary {
/** Locale for which this auto dictionary is storing words */
private String mLocale;
- private HashSet<Bigram> mPendingWrites = new HashSet<Bigram>();
+ private UserHistoryDictionaryBigramList mBigramList =
+ new UserHistoryDictionaryBigramList();
private final Object mPendingWritesLock = new Object();
private static volatile boolean sUpdatingDB = false;
private final SharedPreferences mPrefs;
@@ -99,35 +99,6 @@ public class UserHistoryDictionary extends ExpandableDictionary {
private static DatabaseHelper sOpenHelper = null;
- private static class Bigram {
- public final String mWord1;
- public final String mWord2;
-
- Bigram(String word1, String word2) {
- this.mWord1 = word1;
- this.mWord2 = word2;
- }
-
- @Override
- public boolean equals(Object bigram) {
- if (!(bigram instanceof Bigram)) {
- return false;
- }
- final Bigram bigram2 = (Bigram) bigram;
- final boolean eq1 =
- mWord1 == null ? bigram2.mWord1 == null : mWord1.equals(bigram2.mWord1);
- if (!eq1) {
- return false;
- }
- return mWord2 == null ? bigram2.mWord2 == null : mWord2.equals(bigram2.mWord2);
- }
-
- @Override
- public int hashCode() {
- return (mWord1 + " " + mWord2).hashCode();
- }
- }
-
public void setDatabaseMax(int maxHistoryBigram) {
sMaxHistoryBigrams = maxHistoryBigram;
}
@@ -190,20 +161,17 @@ public class UserHistoryDictionary extends ExpandableDictionary {
freq = super.setBigramAndGetFrequency(word1, word2, new ForgettingCurveParams());
}
synchronized (mPendingWritesLock) {
- final Bigram bi = new Bigram(word1, word2);
- if (!mPendingWrites.contains(bi)) {
- mPendingWrites.add(bi);
- }
+ mBigramList.addBigram(word1, word2);
}
return freq;
}
public boolean cancelAddingUserHistory(String word1, String word2) {
- final Bigram bi = new Bigram(word1, word2);
- if (mPendingWrites.contains(bi)) {
- mPendingWrites.remove(bi);
- return super.removeBigram(word1, word2);
+ synchronized (mPendingWritesLock) {
+ if (mBigramList.removeBigram(word1, word2)) {
+ return super.removeBigram(word1, word2);
+ }
}
return false;
}
@@ -214,11 +182,11 @@ public class UserHistoryDictionary extends ExpandableDictionary {
private void flushPendingWrites() {
synchronized (mPendingWritesLock) {
// Nothing pending? Return
- if (mPendingWrites.isEmpty()) return;
+ if (mBigramList.isEmpty()) return;
// Create a background thread to write the pending entries
- new UpdateDbTask(sOpenHelper, mPendingWrites, mLocale, this).execute();
+ new UpdateDbTask(sOpenHelper, mBigramList, mLocale, this).execute();
// Create a new map for writing new entries into while the old one is written to db
- mPendingWrites = new HashSet<Bigram>();
+ mBigramList = new UserHistoryDictionaryBigramList();
}
}
@@ -251,6 +219,9 @@ public class UserHistoryDictionary extends ExpandableDictionary {
final String word1 = cursor.getString(word1Index);
final String word2 = cursor.getString(word2Index);
final int frequency = cursor.getInt(frequencyIndex);
+ if (DBG_SAVE_RESTORE) {
+ Log.d(TAG, "--- Load user history: " + word1 + ", " + word2);
+ }
// Safeguard against adding really long words. Stack may overflow due
// to recursive lookup
if (null == word1) {
@@ -259,8 +230,9 @@ public class UserHistoryDictionary extends ExpandableDictionary {
&& word2.length() < BinaryDictionary.MAX_WORD_LENGTH) {
super.setBigramAndGetFrequency(
word1, word2, new ForgettingCurveParams(frequency, now, last));
- // TODO: optimize
- mPendingWrites.add(new Bigram(word1, word2));
+ }
+ synchronized(mPendingWritesLock) {
+ mBigramList.addBigram(word1, word2);
}
cursor.moveToNext();
}
@@ -339,14 +311,15 @@ public class UserHistoryDictionary extends ExpandableDictionary {
* the in-memory trie.
*/
private static class UpdateDbTask extends AsyncTask<Void, Void, Void> {
- private final HashSet<Bigram> mMap;
+ private final UserHistoryDictionaryBigramList mBigramList;
private final DatabaseHelper mDbHelper;
private final String mLocale;
private final UserHistoryDictionary mUserHistoryDictionary;
- public UpdateDbTask(DatabaseHelper openHelper, HashSet<Bigram> pendingWrites,
+ public UpdateDbTask(
+ DatabaseHelper openHelper, UserHistoryDictionaryBigramList pendingWrites,
String locale, UserHistoryDictionary dict) {
- mMap = pendingWrites;
+ mBigramList = pendingWrites;
mLocale = locale;
mDbHelper = openHelper;
mUserHistoryDictionary = dict;
@@ -401,67 +374,71 @@ public class UserHistoryDictionary extends ExpandableDictionary {
return null;
}
db.execSQL("PRAGMA foreign_keys = ON;");
+ final boolean addLevel0Bigram = mBigramList.size() <= sMaxHistoryBigrams;
+
// Write all the entries to the db
- final Iterator<Bigram> iterator = mMap.iterator();
- while (iterator.hasNext()) {
- // TODO: this process of making a text search for each pair each time
- // is terribly inefficient. Optimize this.
- final Bigram bi = iterator.next();
-
- // find pair id
- Cursor c = null;
- try {
- if (null != bi.mWord1) {
- c = db.query(MAIN_TABLE_NAME, new String[] { MAIN_COLUMN_ID },
- MAIN_COLUMN_WORD1 + "=? AND " + MAIN_COLUMN_WORD2 + "=? AND "
- + MAIN_COLUMN_LOCALE + "=?",
- new String[] { bi.mWord1, bi.mWord2, mLocale }, null, null,
- null);
- } else {
- c = db.query(MAIN_TABLE_NAME, new String[] { MAIN_COLUMN_ID },
- MAIN_COLUMN_WORD1 + " IS NULL AND " + MAIN_COLUMN_WORD2 + "=? AND "
- + MAIN_COLUMN_LOCALE + "=?",
- new String[] { bi.mWord2, mLocale }, null, null, null);
- }
+ for (String word1 : mBigramList.keySet()) {
+ for (String word2 : mBigramList.getBigrams(word1)) {
+ // TODO: this process of making a text search for each pair each time
+ // is terribly inefficient. Optimize this.
+ // find pair id
+ Cursor c = null;
+ try {
+ if (null != word1) {
+ c = db.query(MAIN_TABLE_NAME, new String[] { MAIN_COLUMN_ID },
+ MAIN_COLUMN_WORD1 + "=? AND " + MAIN_COLUMN_WORD2 + "=? AND "
+ + MAIN_COLUMN_LOCALE + "=?",
+ new String[] { word1, word2, mLocale }, null, null,
+ null);
+ } else {
+ c = db.query(MAIN_TABLE_NAME, new String[] { MAIN_COLUMN_ID },
+ MAIN_COLUMN_WORD1 + " IS NULL AND " + MAIN_COLUMN_WORD2
+ + "=? AND " + MAIN_COLUMN_LOCALE + "=?",
+ new String[] { word2, mLocale }, null, null, null);
+ }
- final int pairId;
- if (c.moveToFirst()) {
- // existing pair
- pairId = c.getInt(c.getColumnIndex(MAIN_COLUMN_ID));
- db.delete(FREQ_TABLE_NAME, FREQ_COLUMN_PAIR_ID + "=?",
- new String[] { Integer.toString(pairId) });
- } else {
- // new pair
- Long pairIdLong = db.insert(MAIN_TABLE_NAME, null,
- getContentValues(bi.mWord1, bi.mWord2, mLocale));
- pairId = pairIdLong.intValue();
- }
- // insert new frequency
- final int freq;
- if (bi.mWord1 == null) {
- freq = FREQUENCY_FOR_TYPED;
- } else {
- final NextWord nw = mUserHistoryDictionary.getBigramWord(
- bi.mWord1, bi.mWord2);
- if (nw != null) {
- final int tempFreq = nw.getFcValue();
- // TODO: Check whether the word is valid or not
- if (UserHistoryForgettingCurveUtils.needsToSave(
- (byte)tempFreq, false)) {
- freq = tempFreq;
+ final int pairId;
+ if (c.moveToFirst()) {
+ // existing pair
+ pairId = c.getInt(c.getColumnIndex(MAIN_COLUMN_ID));
+ db.delete(FREQ_TABLE_NAME, FREQ_COLUMN_PAIR_ID + "=?",
+ new String[] { Integer.toString(pairId) });
+ } else {
+ // new pair
+ Long pairIdLong = db.insert(MAIN_TABLE_NAME, null,
+ getContentValues(word1, word2, mLocale));
+ pairId = pairIdLong.intValue();
+ }
+ // insert new frequency
+ final int freq;
+ if (word1 == null) {
+ freq = FREQUENCY_FOR_TYPED;
+ } else {
+ final NextWord nw = mUserHistoryDictionary.getBigramWord(word1, word2);
+ if (nw != null) {
+ final int tempFreq = nw.getFcValue();
+ // TODO: Check whether the word is valid or not
+ if (UserHistoryForgettingCurveUtils.needsToSave(
+ (byte)tempFreq, false, addLevel0Bigram)) {
+ freq = tempFreq;
+ } else {
+ freq = -1;
+ }
} else {
freq = -1;
}
- } else {
- freq = -1;
}
- }
- if (freq > 0) {
- db.insert(FREQ_TABLE_NAME, null, getFrequencyContentValues(pairId, freq));
- }
- } finally {
- if (c != null) {
- c.close();
+ if (freq > 0) {
+ if (DBG_SAVE_RESTORE) {
+ Log.d(TAG, "--- Save user history: " + word1 + ", " + word2);
+ }
+ db.insert(FREQ_TABLE_NAME, null,
+ getFrequencyContentValues(pairId, freq));
+ }
+ } finally {
+ if (c != null) {
+ c.close();
+ }
}
}
}
diff --git a/java/src/com/android/inputmethod/latin/UserHistoryDictionaryBigramList.java b/java/src/com/android/inputmethod/latin/UserHistoryDictionaryBigramList.java
new file mode 100644
index 000000000..409f921ff
--- /dev/null
+++ b/java/src/com/android/inputmethod/latin/UserHistoryDictionaryBigramList.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.inputmethod.latin;
+
+import android.util.Log;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * A store of bigrams which will be updated when the user history dictionary is closed
+ * All bigrams including stale ones in SQL DB should be stored in this class to avoid adding stale
+ * bigrams when we write to the SQL DB.
+ */
+public class UserHistoryDictionaryBigramList {
+ private static final String TAG = UserHistoryDictionaryBigramList.class.getSimpleName();
+ private static final HashSet<String> EMPTY_STRING_SET = new HashSet<String>();
+ private final HashMap<String, HashSet<String>> mBigramMap =
+ new HashMap<String, HashSet<String>>();
+ private int mSize = 0;
+
+ public void evictAll() {
+ mSize = 0;
+ mBigramMap.clear();
+ }
+
+ public void addBigram(String word1, String word2) {
+ if (UserHistoryDictionary.DBG_SAVE_RESTORE) {
+ Log.d(TAG, "--- add bigram: " + word1 + ", " + word2);
+ }
+ final HashSet<String> set;
+ if (mBigramMap.containsKey(word1)) {
+ set = mBigramMap.get(word1);
+ } else {
+ set = new HashSet<String>();
+ mBigramMap.put(word1, set);
+ }
+ if (!set.contains(word2)) {
+ ++mSize;
+ set.add(word2);
+ }
+ }
+
+ public int size() {
+ return mSize;
+ }
+
+ public boolean isEmpty() {
+ return mBigramMap.isEmpty();
+ }
+
+ public Set<String> keySet() {
+ return mBigramMap.keySet();
+ }
+
+ public HashSet<String> getBigrams(String word1) {
+ if (!mBigramMap.containsKey(word1)) {
+ return EMPTY_STRING_SET;
+ } else {
+ return mBigramMap.get(word1);
+ }
+ }
+
+ public boolean removeBigram(String word1, String word2) {
+ final HashSet<String> set = getBigrams(word1);
+ if (set.isEmpty()) {
+ return false;
+ }
+ if (set.contains(word2)) {
+ set.remove(word2);
+ --mSize;
+ return true;
+ }
+ return false;
+ }
+}
diff --git a/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java b/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java
index f30fee23e..feb1d0029 100644
--- a/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java
+++ b/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java
@@ -162,10 +162,15 @@ public class UserHistoryForgettingCurveUtils {
// TODO: isValid should be false for a word whose frequency is 0,
// or that is not in the dictionary.
- public static boolean needsToSave(byte fc, boolean isValid) {
+ /**
+ * Check whether we should save the bigram to the SQL DB or not
+ */
+ public static boolean needsToSave(byte fc, boolean isValid, boolean addLevel0Bigram) {
int level = fcToLevel(fc);
- if (isValid && level == 0) {
- return false;
+ if (level == 0) {
+ if (isValid || !addLevel0Bigram) {
+ return false;
+ }
}
final int elapsedTime = fcToElapsedTime(fc);
return (elapsedTime < ELAPSED_TIME_MAX - 1 || level > 0);
diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
index f130062a1..d10dc962e 100644
--- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
+++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
@@ -173,12 +173,12 @@ static int latinime_BinaryDictionary_getBigrams(JNIEnv *env, jobject object, jlo
return count;
}
-static jboolean latinime_BinaryDictionary_isValidWord(JNIEnv *env, jobject object, jlong dict,
+static jint latinime_BinaryDictionary_getFrequency(JNIEnv *env, jobject object, jlong dict,
jintArray wordArray, jint wordLength) {
Dictionary *dictionary = (Dictionary*)dict;
if (!dictionary) return (jboolean) false;
jint *word = env->GetIntArrayElements(wordArray, 0);
- jboolean result = dictionary->isValidWord(word, wordLength);
+ jint result = dictionary->getFrequency(word, wordLength);
env->ReleaseIntArrayElements(wordArray, word, JNI_ABORT);
return result;
}
@@ -253,7 +253,7 @@ static JNINativeMethod sMethods[] = {
{"closeNative", "(J)V", (void*)latinime_BinaryDictionary_close},
{"getSuggestionsNative", "(JJ[I[I[II[IZ[C[I)I",
(void*)latinime_BinaryDictionary_getSuggestions},
- {"isValidWordNative", "(J[II)Z", (void*)latinime_BinaryDictionary_isValidWord},
+ {"getFrequencyNative", "(J[II)I", (void*)latinime_BinaryDictionary_getFrequency},
{"isValidBigramNative", "(J[I[I)Z", (void*)latinime_BinaryDictionary_isValidBigram},
{"getBigramsNative", "(J[II[II[C[III)I", (void*)latinime_BinaryDictionary_getBigrams},
{"calcNormalizedScoreNative", "([CI[CII)F",
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp
index ac2a26172..eb4bf8d1a 100644
--- a/native/jni/src/bigram_dictionary.cpp
+++ b/native/jni/src/bigram_dictionary.cpp
@@ -117,14 +117,22 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
do {
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
uint16_t bigramBuffer[MAX_WORD_LENGTH];
+ int unigramFreq;
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
&pos);
const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH,
- bigramBuffer);
+ bigramBuffer, &unigramFreq);
// codesSize == 0 means we are trying to find bigram predictions.
if (codesSize < 1 || checkFirstCharacter(bigramBuffer)) {
- const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
+ const int bigramFreq = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
+ // Due to space constraints, the frequency for bigrams is approximate - the lower the
+ // unigram frequency, the worse the precision. The theoretical maximum error in the
+ // resulting frequency is 8 - although in practice it's never bigger than 3 or 4
+ // in very bad cases. This means that sometimes, we'll see some bigrams inverted
+ // here, but it can't get too bad.
+ const int frequency =
+ BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreq);
if (addWordBigram(bigramBuffer, length, frequency)) {
++bigramCount;
}
@@ -149,8 +157,8 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
} else {
pos = BinaryFormat::skipOtherCharacters(root, pos);
}
- pos = BinaryFormat::skipChildrenPosition(flags, pos);
pos = BinaryFormat::skipFrequency(flags, pos);
+ pos = BinaryFormat::skipChildrenPosition(flags, pos);
pos = BinaryFormat::skipShortcuts(root, flags, pos);
return pos;
}
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index 40f197619..51bf8ebbc 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -66,7 +66,8 @@ class BinaryFormat {
static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord,
const int length);
static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
- uint16_t* outWord);
+ uint16_t* outWord, int* outUnigramFrequency);
+ static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
static int getProbability(const int position, const std::map<int, int> *bigramMap,
const uint8_t *bigramFilter, const int unigramFreq);
@@ -390,10 +391,11 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
* address: the byte position of the last chargroup of the word we are searching for (this is
* what is stored as the "bigram address" in each bigram)
* outword: an array to write the found word, with MAX_WORD_LENGTH size.
+ * outUnigramFrequency: a pointer to an int to write the frequency into.
* Return value : the length of the word, of 0 if the word was not found.
*/
inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address,
- const int maxDepth, uint16_t* outWord) {
+ const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) {
int pos = 0;
int wordPos = 0;
@@ -421,11 +423,12 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
// We count chars in order to avoid infinite loops if the file is broken or
// if there is some other bug
int charCount = maxDepth;
- while (-1 != nextChar && --charCount > 0) {
+ while (NOT_A_CHARACTER != nextChar && --charCount > 0) {
outWord[++wordPos] = nextChar;
nextChar = getCharCodeAndForwardPointer(root, &pos);
}
}
+ *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos);
return ++wordPos;
}
// We need to skip past this char group, so skip any remaining chars after the
@@ -529,6 +532,16 @@ static inline int backoff(const int unigramFreq) {
// return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8);
}
+inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) {
+ // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
+ // unigram frequency to be the median value of the 17th step from the top. A value of
+ // 0 for the bigram frequency represents the middle of the 16th step from the top,
+ // while a value of 15 represents the middle of the top step.
+ // See makedict.BinaryDictInputOutput for details.
+ const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
+ return (int)(unigramFreq + (bigramFreq + 1) * stepSize);
+}
+
// This returns a probability in log space.
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
const uint8_t *bigramFilter, const int unigramFreq) {
@@ -537,13 +550,7 @@ inline int BinaryFormat::getProbability(const int position, const std::map<int,
const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
if (bigramFreqIt != bigramMap->end()) {
const int bigramFreq = bigramFreqIt->second;
- // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
- // unigram frequency to be the median value of the 17th step from the top. A value of
- // 0 for the bigram frequency represents the middle of the 16th step from the top,
- // while a value of 15 represents the middle of the top step.
- // See makedict.BinaryDictInputOutput for details.
- const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
- return (int)(unigramFreq + bigramFreq * stepSize);
+ return computeFrequencyForBigram(unigramFreq, bigramFreq);
} else {
return backoff(unigramFreq);
}
diff --git a/native/jni/src/dictionary.cpp b/native/jni/src/dictionary.cpp
index 65d0f73a3..1fb02478b 100644
--- a/native/jni/src/dictionary.cpp
+++ b/native/jni/src/dictionary.cpp
@@ -55,8 +55,8 @@ Dictionary::~Dictionary() {
delete mBigramDictionary;
}
-bool Dictionary::isValidWord(const int32_t *word, int length) {
- return mUnigramDictionary->isValidWord(word, length);
+int Dictionary::getFrequency(const int32_t *word, int length) {
+ return mUnigramDictionary->getFrequency(word, length);
}
bool Dictionary::isValidBigram(const int32_t *word1, int length1, const int32_t *word2,
diff --git a/native/jni/src/dictionary.h b/native/jni/src/dictionary.h
index 87891ee4d..9f2367904 100644
--- a/native/jni/src/dictionary.h
+++ b/native/jni/src/dictionary.h
@@ -52,7 +52,7 @@ class Dictionary {
maxWordLength, maxBigrams);
}
- bool isValidWord(const int32_t *word, int length);
+ int getFrequency(const int32_t *word, int length);
bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2);
void *getDict() { return (void *)mDict; }
int getDictSize() { return mDictSize; }
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp
index 828582848..efe9c4fe3 100644
--- a/native/jni/src/unigram_dictionary.cpp
+++ b/native/jni/src/unigram_dictionary.cpp
@@ -747,8 +747,21 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor
return maxFreq;
}
-bool UnigramDictionary::isValidWord(const int32_t* const inWord, const int length) const {
- return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);
+int UnigramDictionary::getFrequency(const int32_t* const inWord, const int length) const {
+ const uint8_t* const root = DICT_ROOT;
+ int pos = BinaryFormat::getTerminalPosition(root, inWord, length);
+ if (NOT_VALID_WORD == pos) {
+ return NOT_A_PROBABILITY;
+ }
+ const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
+ const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
+ if (hasMultipleChars) {
+ pos = BinaryFormat::skipOtherCharacters(root, pos);
+ } else {
+ BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);
+ }
+ const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
+ return unigramFreq;
}
// TODO: remove this function.
diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h
index b9233518f..b70894004 100644
--- a/native/jni/src/unigram_dictionary.h
+++ b/native/jni/src/unigram_dictionary.h
@@ -72,7 +72,7 @@ class UnigramDictionary {
UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
int fullWordMultiplier, int maxWordLength, int maxWords, const unsigned int flags);
- bool isValidWord(const int32_t* const inWord, const int length) const;
+ int getFrequency(const int32_t* const inWord, const int length) const;
int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool,
Correction *correction, const int *xcoordinates, const int *ycoordinates,