aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Keisuke Kuroyanagi <ksk@google.com> 2014-10-23 05:42:37 +0000
committer Android (Google) Code Review <android-gerrit@google.com> 2014-10-23 05:42:38 +0000
commit ade5ad1dae715081e0131e9ebd266e1e36409f1d (patch)
tree 15e30930f3c2536bc85ac565edd2b26405310a2c
parent b11fc7d423667f1cd949b50ddea3761e72d16a2c (diff)
parent 16cc3992d7468ef781753df7b4227330e0834501 (diff)
download latinime-ade5ad1dae715081e0131e9ebd266e1e36409f1d.tar.gz
latinime-ade5ad1dae715081e0131e9ebd266e1e36409f1d.tar.xz
latinime-ade5ad1dae715081e0131e9ebd266e1e36409f1d.zip
Merge "Use trigrams for personalization dict."
-rw-r--r-- java/src/com/android/inputmethod/latin/BinaryDictionary.java 18
-rw-r--r-- java/src/com/android/inputmethod/latin/DictionaryFacilitator.java 6
-rw-r--r-- java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java 18
-rw-r--r-- java/src/com/android/inputmethod/latin/PersonalizationHelperForDictionaryFacilitator.java 26
-rw-r--r-- java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java 166
-rw-r--r-- java/src/com/android/inputmethod/latin/utils/WordInputEventForPersonalization.java 117
-rw-r--r-- native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp 130
-rw-r--r-- native/jni/src/utils/jni_data_utils.h 3
-rw-r--r-- tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java 64
-rw-r--r-- tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java 58
-rw-r--r-- tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java 6
11 files changed, 267 insertions, 345 deletions
diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java
index cfe1ea6a7..dce321795 100644
--- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java
+++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java
@@ -32,8 +32,8 @@ import com.android.inputmethod.latin.settings.SettingsValuesForSuggestion;
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
import com.android.inputmethod.latin.utils.FileUtils;
import com.android.inputmethod.latin.utils.JniUtils;
-import com.android.inputmethod.latin.utils.LanguageModelParam;
import com.android.inputmethod.latin.utils.StringUtils;
+import com.android.inputmethod.latin.utils.WordInputEventForPersonalization;
import java.io.File;
import java.util.ArrayList;
@@ -205,8 +205,8 @@ public final class BinaryDictionary extends Dictionary {
private static native boolean updateEntriesForWordWithNgramContextNative(long dict,
int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray,
int[] word, boolean isValidWord, int count, int timestamp);
- private static native int addMultipleDictionaryEntriesNative(long dict,
- LanguageModelParam[] languageModelParams, int startIndex);
+ private static native int updateEntriesForInputEventsNative(long dict,
+ WordInputEventForPersonalization[] inputEvents, int startIndex);
private static native String getPropertyNative(long dict, String query);
private static native boolean isCorruptedNative(long dict);
private static native boolean migrateNative(long dict, String dictFilePath,
@@ -526,19 +526,19 @@ public final class BinaryDictionary extends Dictionary {
}
@UsedForTesting
- public void addMultipleDictionaryEntries(final LanguageModelParam[] languageModelParams) {
+ public void updateEntriesForInputEvents(final WordInputEventForPersonalization[] inputEvents) {
if (!isValidDictionary()) {
return;
}
- int processedParamCount = 0;
- while (processedParamCount < languageModelParams.length) {
+ int processedEventCount = 0;
+ while (processedEventCount < inputEvents.length) {
if (needsToRunGC(true /* mindsBlockByGC */)) {
flushWithGC();
}
- processedParamCount = addMultipleDictionaryEntriesNative(mNativeDict,
- languageModelParams, processedParamCount);
+ processedEventCount = updateEntriesForInputEventsNative(mNativeDict, inputEvents,
+ processedEventCount);
mHasUpdated = true;
- if (processedParamCount <= 0) {
+ if (processedEventCount <= 0) {
return;
}
}
diff --git a/java/src/com/android/inputmethod/latin/DictionaryFacilitator.java b/java/src/com/android/inputmethod/latin/DictionaryFacilitator.java
index 25cbabfea..5bf6bf47d 100644
--- a/java/src/com/android/inputmethod/latin/DictionaryFacilitator.java
+++ b/java/src/com/android/inputmethod/latin/DictionaryFacilitator.java
@@ -24,7 +24,7 @@ import android.view.inputmethod.InputMethodSubtype;
import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.keyboard.ProximityInfo;
-import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback;
+import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback;
import com.android.inputmethod.latin.NgramContext.WordInfo;
import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
import com.android.inputmethod.latin.personalization.ContextualDictionary;
@@ -796,8 +796,8 @@ public class DictionaryFacilitator {
public void addEntriesToPersonalizationDictionary(
final PersonalizationDataChunk personalizationDataChunk,
final SpacingAndPunctuations spacingAndPunctuations,
- final AddMultipleDictionaryEntriesCallback callback) {
- mPersonalizationHelper.addEntriesToPersonalizationDictionariesToUpdate(
+ final UpdateEntriesForInputEventsCallback callback) {
+ mPersonalizationHelper.updateEntriesOfPersonalizationDictionaries(
getMostProbableLocale(), personalizationDataChunk, spacingAndPunctuations,
callback);
}
diff --git a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java
index 97af02f21..a74ffcb26 100644
--- a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java
+++ b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java
@@ -32,7 +32,7 @@ import com.android.inputmethod.latin.utils.CombinedFormatUtils;
import com.android.inputmethod.latin.utils.DistracterFilter;
import com.android.inputmethod.latin.utils.ExecutorUtils;
import com.android.inputmethod.latin.utils.FileUtils;
-import com.android.inputmethod.latin.utils.LanguageModelParam;
+import com.android.inputmethod.latin.utils.WordInputEventForPersonalization;
import java.io.File;
import java.util.ArrayList;
@@ -447,16 +447,16 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
}, word, distracterFilter);
}
- public interface AddMultipleDictionaryEntriesCallback {
+ public interface UpdateEntriesForInputEventsCallback {
public void onFinished();
}
/**
- * Dynamically add multiple entries to the dictionary.
+ * Dynamically update entries according to input events.
*/
- public void addMultipleDictionaryEntriesDynamically(
- @Nonnull final ArrayList<LanguageModelParam> languageModelParams,
- final AddMultipleDictionaryEntriesCallback callback) {
+ public void updateEntriesForInputEvents(
+ @Nonnull final ArrayList<WordInputEventForPersonalization> inputEvents,
+ final UpdateEntriesForInputEventsCallback callback) {
reloadDictionaryIfRequired();
asyncExecuteTaskWithWriteLock(new Runnable() {
@Override
@@ -466,9 +466,9 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
if (binaryDictionary == null) {
return;
}
- binaryDictionary.addMultipleDictionaryEntries(
- languageModelParams.toArray(
- new LanguageModelParam[languageModelParams.size()]));
+ binaryDictionary.updateEntriesForInputEvents(
+ inputEvents.toArray(
+ new WordInputEventForPersonalization[inputEvents.size()]));
} finally {
if (callback != null) {
callback.onFinished();
diff --git a/java/src/com/android/inputmethod/latin/PersonalizationHelperForDictionaryFacilitator.java b/java/src/com/android/inputmethod/latin/PersonalizationHelperForDictionaryFacilitator.java
index 396d062f8..2dbab0a3f 100644
--- a/java/src/com/android/inputmethod/latin/PersonalizationHelperForDictionaryFacilitator.java
+++ b/java/src/com/android/inputmethod/latin/PersonalizationHelperForDictionaryFacilitator.java
@@ -26,14 +26,14 @@ import java.util.concurrent.atomic.AtomicInteger;
import android.content.Context;
import android.view.inputmethod.InputMethodSubtype;
-import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback;
+import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback;
import com.android.inputmethod.latin.personalization.PersonalizationDataChunk;
import com.android.inputmethod.latin.personalization.PersonalizationDictionary;
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
import com.android.inputmethod.latin.utils.DistracterFilter;
import com.android.inputmethod.latin.utils.DistracterFilterCheckingIsInDictionary;
-import com.android.inputmethod.latin.utils.LanguageModelParam;
import com.android.inputmethod.latin.utils.SubtypeLocaleUtils;
+import com.android.inputmethod.latin.utils.WordInputEventForPersonalization;
/**
* Class for managing and updating personalization dictionaries.
@@ -119,10 +119,10 @@ public class PersonalizationHelperForDictionaryFacilitator {
return personalizationDict;
}
- private void addEntriesToPersonalizationDictionariesForLocale(final Locale locale,
+ private void updateEntriesOfPersonalizationDictionariesForLocale(final Locale locale,
final PersonalizationDataChunk personalizationDataChunk,
final SpacingAndPunctuations spacingAndPunctuations,
- final AddMultipleDictionaryEntriesCallback callback) {
+ final UpdateEntriesForInputEventsCallback callback) {
final ExpandableBinaryDictionary personalizationDict =
getPersonalizationDictToUpdate(mContext, locale);
if (personalizationDict == null) {
@@ -131,25 +131,25 @@ public class PersonalizationHelperForDictionaryFacilitator {
}
return;
}
- final ArrayList<LanguageModelParam> languageModelParams =
- LanguageModelParam.createLanguageModelParamsFrom(
+ final ArrayList<WordInputEventForPersonalization> inputEvents =
+ WordInputEventForPersonalization.createInputEventFrom(
personalizationDataChunk.mTokens,
personalizationDataChunk.mTimestampInSeconds, spacingAndPunctuations,
locale, new DistracterFilterCheckingIsInDictionary(
mDistracterFilter, personalizationDict));
- if (languageModelParams == null || languageModelParams.isEmpty()) {
+ if (inputEvents == null || inputEvents.isEmpty()) {
if (callback != null) {
callback.onFinished();
}
return;
}
- personalizationDict.addMultipleDictionaryEntriesDynamically(languageModelParams, callback);
+ personalizationDict.updateEntriesForInputEvents(inputEvents, callback);
}
- public void addEntriesToPersonalizationDictionariesToUpdate(final Locale defaultLocale,
+ public void updateEntriesOfPersonalizationDictionaries(final Locale defaultLocale,
final PersonalizationDataChunk personalizationDataChunk,
final SpacingAndPunctuations spacingAndPunctuations,
- final AddMultipleDictionaryEntriesCallback callback) {
+ final UpdateEntriesForInputEventsCallback callback) {
final String language = personalizationDataChunk.mDetectedLanguage;
final HashSet<Locale> locales;
if (mIsMonolingualUser && PersonalizationDataChunk.LANGUAGE_UNKNOWN.equals(language)
@@ -165,8 +165,8 @@ public class PersonalizationHelperForDictionaryFacilitator {
return;
}
final AtomicInteger remainingTaskCount = new AtomicInteger(locales.size());
- final AddMultipleDictionaryEntriesCallback callbackForLocales =
- new AddMultipleDictionaryEntriesCallback() {
+ final UpdateEntriesForInputEventsCallback callbackForLocales =
+ new UpdateEntriesForInputEventsCallback() {
@Override
public void onFinished() {
if (remainingTaskCount.decrementAndGet() == 0) {
@@ -178,7 +178,7 @@ public class PersonalizationHelperForDictionaryFacilitator {
}
};
for (final Locale locale : locales) {
- addEntriesToPersonalizationDictionariesForLocale(locale, personalizationDataChunk,
+ updateEntriesOfPersonalizationDictionariesForLocale(locale, personalizationDataChunk,
spacingAndPunctuations, callbackForLocales);
}
}
diff --git a/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java b/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java
deleted file mode 100644
index 3e5cb33ca..000000000
--- a/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.android.inputmethod.latin.utils;
-
-import android.util.Log;
-
-import com.android.inputmethod.annotations.UsedForTesting;
-import com.android.inputmethod.latin.Dictionary;
-import com.android.inputmethod.latin.NgramContext;
-import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
-import com.android.inputmethod.latin.utils.DistracterFilter.HandlingType;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-
-// Note: this class is used as a parameter type of a native method. You should be careful when you
-// rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative().
-public final class LanguageModelParam {
- private static final String TAG = LanguageModelParam.class.getSimpleName();
- private static final boolean DEBUG = false;
- private static final boolean DEBUG_TOKEN = false;
-
- // For now, these probability values are being referred to only when we add new entries to
- // decaying dynamic binary dictionaries. When these are referred to, what matters is 0 or
- // non-0. Thus, it's not meaningful to compare 10, 100, and so on.
- // TODO: Revise the logic in ForgettingCurveUtils in native code.
- private static final int UNIGRAM_PROBABILITY_FOR_VALID_WORD = 100;
- private static final int UNIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;
- private static final int BIGRAM_PROBABILITY_FOR_VALID_WORD = 10;
- private static final int BIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;
-
- public final CharSequence mTargetWord;
- public final int[] mWord0;
- public final int[] mWord1;
- // TODO: this needs to be a list of shortcuts
- public final int[] mShortcutTarget;
- public final int mUnigramProbability;
- public final int mBigramProbability;
- public final int mShortcutProbability;
- public final boolean mIsNotAWord;
- public final boolean mIsPossiblyOffensive;
- // Time stamp in seconds.
- public final int mTimestamp;
-
- // Constructor for unigram. TODO: support shortcuts
- @UsedForTesting
- public LanguageModelParam(final CharSequence word, final int unigramProbability,
- final int timestamp) {
- this(null /* word0 */, word, unigramProbability, Dictionary.NOT_A_PROBABILITY, timestamp);
- }
-
- // Constructor for unigram and bigram.
- @UsedForTesting
- public LanguageModelParam(final CharSequence word0, final CharSequence word1,
- final int unigramProbability, final int bigramProbability,
- final int timestamp) {
- mTargetWord = word1;
- mWord0 = (word0 == null) ? null : StringUtils.toCodePointArray(word0);
- mWord1 = StringUtils.toCodePointArray(word1);
- mShortcutTarget = null;
- mUnigramProbability = unigramProbability;
- mBigramProbability = bigramProbability;
- mShortcutProbability = Dictionary.NOT_A_PROBABILITY;
- mIsNotAWord = false;
- mIsPossiblyOffensive = false;
- mTimestamp = timestamp;
- }
-
- // Process a list of words and return a list of {@link LanguageModelParam} objects.
- public static ArrayList<LanguageModelParam> createLanguageModelParamsFrom(
- final List<String> tokens, final int timestamp,
- final SpacingAndPunctuations spacingAndPunctuations, final Locale locale,
- final DistracterFilter distracterFilter) {
- final ArrayList<LanguageModelParam> languageModelParams = new ArrayList<>();
- final int N = tokens.size();
- NgramContext ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
- for (int i = 0; i < N; ++i) {
- final String tempWord = tokens.get(i);
- if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) {
- // just skip this token
- if (DEBUG_TOKEN) {
- Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\"");
- }
- continue;
- }
- if (!DictionaryInfoUtils.looksValidForDictionaryInsertion(
- tempWord, spacingAndPunctuations)) {
- if (DEBUG_TOKEN) {
- Log.d(TAG, "--- not looksValidForDictionaryInsertion: \""
- + tempWord + "\"");
- }
- // Sentence terminator found. Split.
- ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
- continue;
- }
- if (DEBUG_TOKEN) {
- Log.d(TAG, "--- word: \"" + tempWord + "\"");
- }
- final LanguageModelParam languageModelParam =
- detectWhetherVaildWordOrNotAndGetLanguageModelParam(
- ngramContext, tempWord, timestamp, locale, distracterFilter);
- if (languageModelParam == null) {
- continue;
- }
- languageModelParams.add(languageModelParam);
- ngramContext = ngramContext.getNextNgramContext(
- new NgramContext.WordInfo(tempWord));
- }
- return languageModelParams;
- }
-
- private static LanguageModelParam detectWhetherVaildWordOrNotAndGetLanguageModelParam(
- final NgramContext ngramContext, final String targetWord, final int timestamp,
- final Locale locale, final DistracterFilter distracterFilter) {
- if (locale == null) {
- return null;
- }
- final int wordHandlingType = distracterFilter.getWordHandlingType(ngramContext,
- targetWord, locale);
- final String word = HandlingType.shouldBeLowerCased(wordHandlingType) ?
- targetWord.toLowerCase(locale) : targetWord;
- if (distracterFilter.isDistracterToWordsInDictionaries(ngramContext, targetWord, locale)) {
- // The word is a distracter.
- return null;
- }
- return createAndGetLanguageModelParamOfWord(ngramContext, word, timestamp,
- !HandlingType.shouldBeHandledAsOov(wordHandlingType));
- }
-
- private static LanguageModelParam createAndGetLanguageModelParamOfWord(
- final NgramContext ngramContext, final String word, final int timestamp,
- final boolean isValidWord) {
- final int unigramProbability = isValidWord ?
- UNIGRAM_PROBABILITY_FOR_VALID_WORD : UNIGRAM_PROBABILITY_FOR_OOV_WORD;
- if (!ngramContext.isValid()) {
- if (DEBUG) {
- Log.d(TAG, "--- add unigram: current("
- + (isValidWord ? "Valid" : "OOV") + ") = " + word);
- }
- return new LanguageModelParam(word, unigramProbability, timestamp);
- }
- if (DEBUG) {
- Log.d(TAG, "--- add bigram: prev = " + ngramContext + ", current("
- + (isValidWord ? "Valid" : "OOV") + ") = " + word);
- }
- final int bigramProbability = isValidWord ?
- BIGRAM_PROBABILITY_FOR_VALID_WORD : BIGRAM_PROBABILITY_FOR_OOV_WORD;
- return new LanguageModelParam(ngramContext.getNthPrevWord(1 /* n */), word,
- unigramProbability, bigramProbability, timestamp);
- }
-}
diff --git a/java/src/com/android/inputmethod/latin/utils/WordInputEventForPersonalization.java b/java/src/com/android/inputmethod/latin/utils/WordInputEventForPersonalization.java
new file mode 100644
index 000000000..644fda57f
--- /dev/null
+++ b/java/src/com/android/inputmethod/latin/utils/WordInputEventForPersonalization.java
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.inputmethod.latin.utils;
+
+import android.util.Log;
+
+import com.android.inputmethod.annotations.UsedForTesting;
+import com.android.inputmethod.latin.Constants;
+import com.android.inputmethod.latin.NgramContext;
+import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
+import com.android.inputmethod.latin.utils.DistracterFilter.HandlingType;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+// Note: this class is used as a parameter type of a native method. You should be careful when you
+// rename this class or field name. See BinaryDictionary#updateEntriesForInputEventsNative().
+public final class WordInputEventForPersonalization {
+ private static final String TAG = WordInputEventForPersonalization.class.getSimpleName();
+ private static final boolean DEBUG_TOKEN = false;
+
+ public final int[] mTargetWord;
+ public final int mPrevWordsCount;
+ public final int[][] mPrevWordArray = new int[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM][];
+ public final boolean[] mIsPrevWordBeginningOfSentenceArray =
+ new boolean[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ public final boolean mIsValid;
+ // Time stamp in seconds.
+ public final int mTimestamp;
+
+ @UsedForTesting
+ public WordInputEventForPersonalization(final CharSequence targetWord,
+ final NgramContext ngramContext, final boolean isValid, final int timestamp) {
+ mTargetWord = StringUtils.toCodePointArray(targetWord);
+ mPrevWordsCount = ngramContext.getPrevWordCount();
+ ngramContext.outputToArray(mPrevWordArray, mIsPrevWordBeginningOfSentenceArray);
+ mIsValid = isValid;
+ mTimestamp = timestamp;
+ }
+
+ // Process a list of words and return a list of {@link WordInputEventForPersonalization}
+ // objects.
+ public static ArrayList<WordInputEventForPersonalization> createInputEventFrom(
+ final List<String> tokens, final int timestamp,
+ final SpacingAndPunctuations spacingAndPunctuations, final Locale locale,
+ final DistracterFilter distracterFilter) {
+ final ArrayList<WordInputEventForPersonalization> inputEvents = new ArrayList<>();
+ final int N = tokens.size();
+ NgramContext ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
+ for (int i = 0; i < N; ++i) {
+ final String tempWord = tokens.get(i);
+ if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) {
+ // just skip this token
+ if (DEBUG_TOKEN) {
+ Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\"");
+ }
+ continue;
+ }
+ if (!DictionaryInfoUtils.looksValidForDictionaryInsertion(
+ tempWord, spacingAndPunctuations)) {
+ if (DEBUG_TOKEN) {
+ Log.d(TAG, "--- not looksValidForDictionaryInsertion: \""
+ + tempWord + "\"");
+ }
+ // Sentence terminator found. Split.
+ // TODO: Detect whether the context is beginning-of-sentence.
+ ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
+ continue;
+ }
+ if (DEBUG_TOKEN) {
+ Log.d(TAG, "--- word: \"" + tempWord + "\"");
+ }
+ final WordInputEventForPersonalization inputEvent =
+ detectWhetherVaildWordOrNotAndGetInputEvent(
+ ngramContext, tempWord, timestamp, locale, distracterFilter);
+ if (inputEvent == null) {
+ continue;
+ }
+ inputEvents.add(inputEvent);
+ ngramContext = ngramContext.getNextNgramContext(new NgramContext.WordInfo(tempWord));
+ }
+ return inputEvents;
+ }
+
+ private static WordInputEventForPersonalization detectWhetherVaildWordOrNotAndGetInputEvent(
+ final NgramContext ngramContext, final String targetWord, final int timestamp,
+ final Locale locale, final DistracterFilter distracterFilter) {
+ if (locale == null) {
+ return null;
+ }
+ final int wordHandlingType = distracterFilter.getWordHandlingType(ngramContext,
+ targetWord, locale);
+ final String word = HandlingType.shouldBeLowerCased(wordHandlingType) ?
+ targetWord.toLowerCase(locale) : targetWord;
+ if (distracterFilter.isDistracterToWordsInDictionaries(ngramContext, targetWord, locale)) {
+ // The word is a distracter.
+ return null;
+ }
+ return new WordInputEventForPersonalization(word, ngramContext,
+ !HandlingType.shouldBeHandledAsOov(wordHandlingType), timestamp);
+ }
+}
diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
index 9239c8400..118f600bb 100644
--- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
+++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
@@ -453,98 +453,60 @@ static bool latinime_BinaryDictionary_updateEntriesForWordWithNgramContext(JNIEn
historicalInfo);
}
-// Returns how many language model params are processed.
-static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, jclass clazz,
- jlong dict, jobjectArray languageModelParams, jint startIndex) {
+// Returns how many input events are processed.
+static int latinime_BinaryDictionary_updateEntriesForInputEvents(JNIEnv *env, jclass clazz,
+ jlong dict, jobjectArray inputEvents, jint startIndex) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
if (!dictionary) {
return 0;
}
- jsize languageModelParamCount = env->GetArrayLength(languageModelParams);
- if (languageModelParamCount == 0 || startIndex >= languageModelParamCount) {
+ jsize inputEventCount = env->GetArrayLength(inputEvents);
+ if (inputEventCount == 0 || startIndex >= inputEventCount) {
return 0;
}
- jobject languageModelParam = env->GetObjectArrayElement(languageModelParams, 0);
- jclass languageModelParamClass = env->GetObjectClass(languageModelParam);
- env->DeleteLocalRef(languageModelParam);
-
- jfieldID word0FieldId = env->GetFieldID(languageModelParamClass, "mWord0", "[I");
- jfieldID word1FieldId = env->GetFieldID(languageModelParamClass, "mWord1", "[I");
- jfieldID unigramProbabilityFieldId =
- env->GetFieldID(languageModelParamClass, "mUnigramProbability", "I");
- jfieldID bigramProbabilityFieldId =
- env->GetFieldID(languageModelParamClass, "mBigramProbability", "I");
- jfieldID timestampFieldId =
- env->GetFieldID(languageModelParamClass, "mTimestamp", "I");
- jfieldID shortcutTargetFieldId =
- env->GetFieldID(languageModelParamClass, "mShortcutTarget", "[I");
- jfieldID shortcutProbabilityFieldId =
- env->GetFieldID(languageModelParamClass, "mShortcutProbability", "I");
- jfieldID isNotAWordFieldId =
- env->GetFieldID(languageModelParamClass, "mIsNotAWord", "Z");
- jfieldID isPossiblyOffensiveFieldId =
- env->GetFieldID(languageModelParamClass, "mIsPossiblyOffensive", "Z");
- env->DeleteLocalRef(languageModelParamClass);
-
- for (int i = startIndex; i < languageModelParamCount; ++i) {
- jobject languageModelParam = env->GetObjectArrayElement(languageModelParams, i);
- // languageModelParam is a set of params for word1; thus, word1 cannot be null. On the
- // other hand, word0 can be null and then it means the set of params doesn't contain bigram
- // information.
- jintArray word0 = static_cast<jintArray>(
- env->GetObjectField(languageModelParam, word0FieldId));
- jsize word0Length = word0 ? env->GetArrayLength(word0) : 0;
- int word0CodePoints[word0Length];
- if (word0) {
- env->GetIntArrayRegion(word0, 0, word0Length, word0CodePoints);
- }
- jintArray word1 = static_cast<jintArray>(
- env->GetObjectField(languageModelParam, word1FieldId));
- jsize word1Length = env->GetArrayLength(word1);
- int word1CodePoints[word1Length];
- env->GetIntArrayRegion(word1, 0, word1Length, word1CodePoints);
- jint unigramProbability = env->GetIntField(languageModelParam, unigramProbabilityFieldId);
- jint timestamp = env->GetIntField(languageModelParam, timestampFieldId);
- jboolean isNotAWord = env->GetBooleanField(languageModelParam, isNotAWordFieldId);
- jboolean isPossiblyOffensive = env->GetBooleanField(languageModelParam,
- isPossiblyOffensiveFieldId);
- jintArray shortcutTarget = static_cast<jintArray>(
- env->GetObjectField(languageModelParam, shortcutTargetFieldId));
- std::vector<UnigramProperty::ShortcutProperty> shortcuts;
- {
- std::vector<int> shortcutTargetCodePoints;
- JniDataUtils::jintarrayToVector(env, shortcutTarget, &shortcutTargetCodePoints);
- if (!shortcutTargetCodePoints.empty()) {
- jint shortcutProbability =
- env->GetIntField(languageModelParam, shortcutProbabilityFieldId);
- shortcuts.emplace_back(std::move(shortcutTargetCodePoints), shortcutProbability);
- }
- }
+ jobject inputEvent = env->GetObjectArrayElement(inputEvents, 0);
+ jclass wordInputEventClass = env->GetObjectClass(inputEvent);
+ env->DeleteLocalRef(inputEvent);
+
+ jfieldID targetWordFieldId = env->GetFieldID(wordInputEventClass, "mTargetWord", "[I");
+ jfieldID prevWordCountFieldId = env->GetFieldID(wordInputEventClass, "mPrevWordsCount", "I");
+ jfieldID prevWordArrayFieldId = env->GetFieldID(wordInputEventClass, "mPrevWordArray", "[[I");
+ jfieldID isPrevWordBoSArrayFieldId =
+ env->GetFieldID(wordInputEventClass, "mIsPrevWordBeginningOfSentenceArray", "[Z");
+ jfieldID isValidFieldId = env->GetFieldID(wordInputEventClass, "mIsValid", "Z");
+ jfieldID timestampFieldId = env->GetFieldID(wordInputEventClass, "mTimestamp", "I");
+ env->DeleteLocalRef(wordInputEventClass);
+
+ for (int i = startIndex; i < inputEventCount; ++i) {
+ jobject inputEvent = env->GetObjectArrayElement(inputEvents, i);
+ jintArray targetWord = static_cast<jintArray>(
+ env->GetObjectField(inputEvent, targetWordFieldId));
+ jsize wordLength = env->GetArrayLength(targetWord);
+ int wordCodePoints[wordLength];
+ env->GetIntArrayRegion(targetWord, 0, wordLength, wordCodePoints);
+ env->DeleteLocalRef(targetWord);
+
+ jint prevWordCount = env->GetIntField(inputEvent, prevWordCountFieldId);
+ jobjectArray prevWordArray =
+ static_cast<jobjectArray>(env->GetObjectField(inputEvent, prevWordArrayFieldId));
+ jbooleanArray isPrevWordBeginningOfSentenceArray = static_cast<jbooleanArray>(
+ env->GetObjectField(inputEvent, isPrevWordBoSArrayFieldId));
+ jboolean isValid = env->GetBooleanField(inputEvent, isValidFieldId);
+ jint timestamp = env->GetIntField(inputEvent, timestampFieldId);
+ const NgramContext ngramContext = JniDataUtils::constructNgramContext(env,
+ prevWordArray, isPrevWordBeginningOfSentenceArray, prevWordCount);
// Use 1 for count to indicate the word has inputted.
- const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
- isPossiblyOffensive, unigramProbability,
- HistoricalInfo(timestamp, 0 /* level */, 1 /* count */), std::move(shortcuts));
- dictionary->addUnigramEntry(CodePointArrayView(word1CodePoints, word1Length),
- &unigramProperty);
- if (word0) {
- jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
- // Use 1 for count to indicate the bigram has inputted.
- const NgramContext ngramContext(word0CodePoints, word0Length,
- false /* isBeginningOfSentence */);
- const NgramProperty ngramProperty(ngramContext,
- CodePointArrayView(word1CodePoints, word1Length).toVector(),
- bigramProbability, HistoricalInfo(timestamp, 0 /* level */, 1 /* count */));
- dictionary->addNgramEntry(&ngramProperty);
- }
+ dictionary->updateEntriesForWordWithNgramContext(&ngramContext,
+ CodePointArrayView(wordCodePoints, wordLength), isValid,
+ HistoricalInfo(timestamp, 0 /* level */, 1 /* count */));
if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
return i + 1;
}
- env->DeleteLocalRef(word0);
- env->DeleteLocalRef(word1);
- env->DeleteLocalRef(shortcutTarget);
- env->DeleteLocalRef(languageModelParam);
+ env->DeleteLocalRef(prevWordArray);
+ env->DeleteLocalRef(isPrevWordBeginningOfSentenceArray);
+ env->DeleteLocalRef(inputEvent);
}
- return languageModelParamCount;
+ return inputEventCount;
}
static jstring latinime_BinaryDictionary_getProperty(JNIEnv *env, jclass clazz, jlong dict,
@@ -754,10 +716,10 @@ static const JNINativeMethod sMethods[] = {
reinterpret_cast<void *>(latinime_BinaryDictionary_updateEntriesForWordWithNgramContext)
},
{
- const_cast<char *>("addMultipleDictionaryEntriesNative"),
+ const_cast<char *>("updateEntriesForInputEventsNative"),
const_cast<char *>(
- "(J[Lcom/android/inputmethod/latin/utils/LanguageModelParam;I)I"),
- reinterpret_cast<void *>(latinime_BinaryDictionary_addMultipleDictionaryEntries)
+ "(J[Lcom/android/inputmethod/latin/utils/WordInputEventForPersonalization;I)I"),
+ reinterpret_cast<void *>(latinime_BinaryDictionary_updateEntriesForInputEvents)
},
{
const_cast<char *>("getPropertyNative"),
diff --git a/native/jni/src/utils/jni_data_utils.h b/native/jni/src/utils/jni_data_utils.h
index 25cc41742..a259e1cd0 100644
--- a/native/jni/src/utils/jni_data_utils.h
+++ b/native/jni/src/utils/jni_data_utils.h
@@ -50,6 +50,7 @@ class JniDataUtils {
const jsize keyUtf8Length = env->GetStringUTFLength(keyString);
char keyChars[keyUtf8Length + 1];
env->GetStringUTFRegion(keyString, 0, env->GetStringLength(keyString), keyChars);
+ env->DeleteLocalRef(keyString);
keyChars[keyUtf8Length] = '\0';
DictionaryHeaderStructurePolicy::AttributeMap::key_type key;
HeaderReadWriteUtils::insertCharactersIntoVector(keyChars, &key);
@@ -59,6 +60,7 @@ class JniDataUtils {
const jsize valueUtf8Length = env->GetStringUTFLength(valueString);
char valueChars[valueUtf8Length + 1];
env->GetStringUTFRegion(valueString, 0, env->GetStringLength(valueString), valueChars);
+ env->DeleteLocalRef(valueString);
valueChars[valueUtf8Length] = '\0';
DictionaryHeaderStructurePolicy::AttributeMap::mapped_type value;
HeaderReadWriteUtils::insertCharactersIntoVector(valueChars, &value);
@@ -113,6 +115,7 @@ class JniDataUtils {
continue;
}
env->GetIntArrayRegion(prevWord, 0, prevWordLength, prevWordCodePoints[i]);
+ env->DeleteLocalRef(prevWord);
prevWordCodePointCount[i] = prevWordLength;
jboolean isBeginningOfSentenceBoolean = JNI_FALSE;
env->GetBooleanArrayRegion(isBeginningOfSentenceArray, i, 1 /* len */,
diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java
index 15f7568c8..8f3373cc2 100644
--- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java
+++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java
@@ -32,6 +32,7 @@ import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
import com.android.inputmethod.latin.utils.FileUtils;
import com.android.inputmethod.latin.utils.LocaleUtils;
+import com.android.inputmethod.latin.utils.WordInputEventForPersonalization;
import java.io.File;
import java.io.IOException;
@@ -39,6 +40,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
+import java.util.Map;
import java.util.Random;
import java.util.concurrent.TimeUnit;
@@ -748,4 +750,66 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
binaryDictionary.close();
}
+
+ public void testUpdateEntriesForInputEvents() {
+ for (final int formatVersion : DICT_FORMAT_VERSIONS) {
+ testUpdateEntriesForInputEvents(formatVersion);
+ }
+ }
+
+ private void testUpdateEntriesForInputEvents(final int formatVersion) {
+ setCurrentTimeForTestMode(mCurrentTime);
+ final int codePointSetSize = 20;
+ final int EVENT_COUNT = 1000;
+ final double CONTINUE_RATE = 0.9;
+ final long seed = System.currentTimeMillis();
+ final Random random = new Random(seed);
+ final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
+
+ final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
+ final ArrayList<String> unigrams = new ArrayList<>();
+ final ArrayList<Pair<String, String>> bigrams = new ArrayList<>();
+ final ArrayList<Pair<Pair<String, String>, String>> trigrams = new ArrayList<>();
+
+ final WordInputEventForPersonalization[] inputEvents =
+ new WordInputEventForPersonalization[EVENT_COUNT];
+ NgramContext ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
+ int prevWordCount = 0;
+ for (int i = 0; i < inputEvents.length; i++) {
+ final String word = CodePointUtils.generateWord(random, codePointSet);
+ inputEvents[i] = new WordInputEventForPersonalization(word, ngramContext,
+ true /* isValid */, mCurrentTime);
+ unigrams.add(word);
+ if (prevWordCount >= 2) {
+ final Pair<String, String> prevWordsPair = bigrams.get(bigrams.size() - 1);
+ trigrams.add(new Pair<>(prevWordsPair, word));
+ }
+ if (prevWordCount >= 1) {
+ bigrams.add(new Pair<>(ngramContext.getNthPrevWord(1 /* n */).toString(), word));
+ }
+ if (random.nextDouble() > CONTINUE_RATE) {
+ ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
+ prevWordCount = 0;
+ } else {
+ ngramContext = ngramContext.getNextNgramContext(new WordInfo(word));
+ prevWordCount++;
+ }
+ }
+ final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
+ binaryDictionary.updateEntriesForInputEvents(inputEvents);
+
+ for (final String word : unigrams) {
+ assertTrue(binaryDictionary.isInDictionary(word));
+ }
+ for (final Pair<String, String> bigram : bigrams) {
+ assertTrue(isValidBigram(binaryDictionary, bigram.first, bigram.second));
+ }
+ if (!supportsNgram(formatVersion)) {
+ return;
+ }
+ for (final Pair<Pair<String, String>, String> trigram : trigrams) {
+ assertTrue(isValidTrigram(binaryDictionary, trigram.first.first, trigram.first.second,
+ trigram.second));
+ }
+ }
}
diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
index 5a72e417e..0f2fa2fb1 100644
--- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
+++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
@@ -29,7 +29,6 @@ import com.android.inputmethod.latin.makedict.WeightedString;
import com.android.inputmethod.latin.makedict.WordProperty;
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
import com.android.inputmethod.latin.utils.FileUtils;
-import com.android.inputmethod.latin.utils.LanguageModelParam;
import java.io.File;
import java.io.IOException;
@@ -884,63 +883,6 @@ public class BinaryDictionaryTests extends AndroidTestCase {
}
}
- public void testAddMultipleDictionaryEntries() {
- for (final int formatVersion : DICT_FORMAT_VERSIONS) {
- testAddMultipleDictionaryEntries(formatVersion);
- }
- }
-
- private void testAddMultipleDictionaryEntries(final int formatVersion) {
- final int codePointSetSize = 20;
- final int lmParamCount = 1000;
- final double bigramContinueRate = 0.9;
- final long seed = System.currentTimeMillis();
- final Random random = new Random(seed);
- final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
-
- final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
- final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
- final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
-
- final LanguageModelParam[] languageModelParams = new LanguageModelParam[lmParamCount];
- String prevWord = null;
- for (int i = 0; i < languageModelParams.length; i++) {
- final String word = CodePointUtils.generateWord(random, codePointSet);
- final int probability = random.nextInt(0xFF);
- final int bigramProbability = probability + random.nextInt(0xFF - probability);
- unigramProbabilities.put(word, probability);
- if (prevWord == null) {
- languageModelParams[i] = new LanguageModelParam(word, probability,
- BinaryDictionary.NOT_A_VALID_TIMESTAMP);
- } else {
- languageModelParams[i] = new LanguageModelParam(prevWord, word, probability,
- bigramProbability, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
- bigramProbabilities.put(new Pair<>(prevWord, word),
- bigramProbability);
- }
- prevWord = (random.nextDouble() < bigramContinueRate) ? word : null;
- }
-
- final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
- binaryDictionary.addMultipleDictionaryEntries(languageModelParams);
-
- for (Map.Entry<String, Integer> entry : unigramProbabilities.entrySet()) {
- assertEquals((int)entry.getValue(), binaryDictionary.getFrequency(entry.getKey()));
- }
-
- for (Map.Entry<Pair<String, String>, Integer> entry : bigramProbabilities.entrySet()) {
- final String word0 = entry.getKey().first;
- final String word1 = entry.getKey().second;
- final int bigramProbability = entry.getValue();
- assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
- isValidBigram(binaryDictionary, word0, word1));
- if (canCheckBigramProbability(formatVersion)) {
- assertEquals(bigramProbability,
- getBigramProbability(binaryDictionary, word0, word1));
- }
- }
- }
-
public void testGetWordProperties() {
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
testGetWordProperties(formatVersion);
diff --git a/tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java b/tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java
index 6ccb79d76..dc6fb0075 100644
--- a/tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java
+++ b/tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java
@@ -30,7 +30,7 @@ import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.DictionaryFacilitator;
import com.android.inputmethod.latin.ExpandableBinaryDictionary;
import com.android.inputmethod.latin.RichInputMethodManager;
-import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback;
+import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback;
import com.android.inputmethod.latin.common.CodePointUtils;
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
@@ -96,8 +96,8 @@ public class PersonalizationDictionaryTests extends AndroidTestCase {
true /* inputByUser */, tokens, timeStampInSeconds, DUMMY_PACKAGE_NAME,
LOCALE_EN_US.getLanguage());
final CountDownLatch countDownLatch = new CountDownLatch(1);
- final AddMultipleDictionaryEntriesCallback callback =
- new AddMultipleDictionaryEntriesCallback() {
+ final UpdateEntriesForInputEventsCallback callback =
+ new UpdateEntriesForInputEventsCallback() {
@Override
public void onFinished() {
countDownLatch.countDown();