diff options
70 files changed, 548 insertions, 375 deletions
diff --git a/common/src/com/android/inputmethod/latin/common/ComposedData.java b/common/src/com/android/inputmethod/latin/common/ComposedData.java new file mode 100644 index 000000000..0508d49cb --- /dev/null +++ b/common/src/com/android/inputmethod/latin/common/ComposedData.java @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.common; + +/** + * An immutable class that encapsulates a snapshot of word composition data. + */ +public class ComposedData { + public final InputPointers mInputPointers; + public final boolean mIsBatchMode; + public final String mTypedWord; + public ComposedData(final InputPointers inputPointers, final boolean isBatchMode, + final String typedWord) { + mInputPointers = inputPointers; + mIsBatchMode = isBatchMode; + mTypedWord = typedWord; + } + + /** + * Copy the code points in the typed word to a destination array of ints. + * + * If the array is too small to hold the code points in the typed word, nothing is copied and + * -1 is returned. + * + * @param destination the array of ints. + * @return the number of copied code points. + */ + public int copyCodePointsExceptTrailingSingleQuotesAndReturnCodePointCount( + final int[] destination) { + // lastIndex is exclusive + final int lastIndex = mTypedWord.length() + - StringUtils.getTrailingSingleQuotesCount(mTypedWord); + if (lastIndex <= 0) { + // The string is empty or contains only single quotes. + return 0; + } + + // The following function counts the number of code points in the text range which begins + // at index 0 and extends to the character at lastIndex. + final int codePointSize = Character.codePointCount(mTypedWord, 0, lastIndex); + if (codePointSize > destination.length) { + return -1; + } + return StringUtils.copyCodePointsAndReturnCodePointCount(destination, mTypedWord, 0, + lastIndex, true /* downCase */); + } +} diff --git a/dictionaries/de_wordlist.combined.gz b/dictionaries/de_wordlist.combined.gz Binary files differindex 803211c01..92c95540c 100644 --- a/dictionaries/de_wordlist.combined.gz +++ b/dictionaries/de_wordlist.combined.gz diff --git a/dictionaries/en_GB_wordlist.combined.gz b/dictionaries/en_GB_wordlist.combined.gz Binary files differindex 1fa9b85ea..217660fc4 100644 --- a/dictionaries/en_GB_wordlist.combined.gz +++ b/dictionaries/en_GB_wordlist.combined.gz diff --git a/dictionaries/en_US_wordlist.combined.gz b/dictionaries/en_US_wordlist.combined.gz Binary files differindex 2e039ff05..8aed9c5e0 100644 --- a/dictionaries/en_US_wordlist.combined.gz +++ b/dictionaries/en_US_wordlist.combined.gz diff --git a/dictionaries/en_wordlist.combined.gz b/dictionaries/en_wordlist.combined.gz Binary files differindex e845346d6..7fe6618cf 100644 --- a/dictionaries/en_wordlist.combined.gz +++ b/dictionaries/en_wordlist.combined.gz diff --git a/dictionaries/es_wordlist.combined.gz b/dictionaries/es_wordlist.combined.gz Binary files differindex 3391e64b4..71e7309fc 100644 --- a/dictionaries/es_wordlist.combined.gz +++ b/dictionaries/es_wordlist.combined.gz diff --git a/dictionaries/fr_wordlist.combined.gz b/dictionaries/fr_wordlist.combined.gz Binary files differindex 1b9fd73f9..afe44a6d9 100644 --- a/dictionaries/fr_wordlist.combined.gz +++ b/dictionaries/fr_wordlist.combined.gz diff --git a/dictionaries/it_wordlist.combined.gz b/dictionaries/it_wordlist.combined.gz Binary files differindex 5a5cbdc7a..ed58a12c5 100644 --- a/dictionaries/it_wordlist.combined.gz +++ b/dictionaries/it_wordlist.combined.gz diff --git a/dictionaries/nl_wordlist.combined.gz b/dictionaries/nl_wordlist.combined.gz Binary files differindex 37ba8ab42..19c3a7ea8 100644 --- a/dictionaries/nl_wordlist.combined.gz +++ b/dictionaries/nl_wordlist.combined.gz diff --git a/dictionaries/pl_wordlist.combined.gz b/dictionaries/pl_wordlist.combined.gz Binary files differindex ba71a5581..2b84eecfd 100644 --- a/dictionaries/pl_wordlist.combined.gz +++ b/dictionaries/pl_wordlist.combined.gz diff --git a/dictionaries/pt_BR_wordlist.combined.gz b/dictionaries/pt_BR_wordlist.combined.gz Binary files differindex 02df1c1ee..7aac61e50 100644 --- a/dictionaries/pt_BR_wordlist.combined.gz +++ b/dictionaries/pt_BR_wordlist.combined.gz diff --git a/dictionaries/pt_PT_wordlist.combined.gz b/dictionaries/pt_PT_wordlist.combined.gz Binary files differindex bcd50ab03..5bf9a60e8 100644 --- a/dictionaries/pt_PT_wordlist.combined.gz +++ b/dictionaries/pt_PT_wordlist.combined.gz diff --git a/dictionaries/ru_wordlist.combined.gz b/dictionaries/ru_wordlist.combined.gz Binary files differindex 401ad08b0..5e9266221 100644 --- a/dictionaries/ru_wordlist.combined.gz +++ b/dictionaries/ru_wordlist.combined.gz diff --git a/dictionaries/sv_wordlist.combined.gz b/dictionaries/sv_wordlist.combined.gz Binary files differindex b6ebab320..db44ae4c4 100644 --- a/dictionaries/sv_wordlist.combined.gz +++ b/dictionaries/sv_wordlist.combined.gz diff --git a/dictionaries/tr_wordlist.combined.gz b/dictionaries/tr_wordlist.combined.gz Binary files differindex 306cea184..d3c8825b9 100644 --- a/dictionaries/tr_wordlist.combined.gz +++ b/dictionaries/tr_wordlist.combined.gz diff --git a/java/res/raw/main_de.dict b/java/res/raw/main_de.dict Binary files differindex 45b288375..c3c2cbe46 100644 --- a/java/res/raw/main_de.dict +++ b/java/res/raw/main_de.dict diff --git a/java/res/raw/main_en.dict b/java/res/raw/main_en.dict Binary files differindex 5bbb85761..b9e5bc77b 100644 --- a/java/res/raw/main_en.dict +++ b/java/res/raw/main_en.dict diff --git a/java/res/raw/main_es.dict b/java/res/raw/main_es.dict Binary files differindex fae131850..076d5aa8f 100644 --- a/java/res/raw/main_es.dict +++ b/java/res/raw/main_es.dict diff --git a/java/res/raw/main_fr.dict b/java/res/raw/main_fr.dict Binary files differindex 19532d9bf..0e8686092 100644 --- a/java/res/raw/main_fr.dict +++ b/java/res/raw/main_fr.dict diff --git a/java/res/raw/main_it.dict b/java/res/raw/main_it.dict Binary files differindex ff11b9798..609ef13b7 100644 --- a/java/res/raw/main_it.dict +++ b/java/res/raw/main_it.dict diff --git a/java/res/raw/main_pt_br.dict b/java/res/raw/main_pt_br.dict Binary files differindex 9fa50442a..c33865187 100644 --- a/java/res/raw/main_pt_br.dict +++ b/java/res/raw/main_pt_br.dict diff --git a/java/res/raw/main_ru.dict b/java/res/raw/main_ru.dict Binary files differindex 76b5f805a..d0af70730 100644 --- a/java/res/raw/main_ru.dict +++ b/java/res/raw/main_ru.dict diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java index 8e9b5c6f6..b5d0b446f 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java @@ -21,8 +21,8 @@ import android.util.Log; import android.util.SparseArray; import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.keyboard.ProximityInfo; import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; +import com.android.inputmethod.latin.common.ComposedData; import com.android.inputmethod.latin.common.Constants; import com.android.inputmethod.latin.common.InputPointers; import com.android.inputmethod.latin.common.StringUtils; @@ -262,8 +262,8 @@ public final class BinaryDictionary extends Dictionary { } @Override - public ArrayList<SuggestedWordInfo> getSuggestions(final WordComposer composer, - final NgramContext ngramContext, final ProximityInfo proximityInfo, + public ArrayList<SuggestedWordInfo> getSuggestions(final ComposedData composedData, + final NgramContext ngramContext, final long proximityInfoHandle, final SettingsValuesForSuggestion settingsValuesForSuggestion, final int sessionId, final float weightForLocale, final float[] inOutWeightOfLangModelVsSpatialModel) { @@ -274,12 +274,13 @@ public final class BinaryDictionary extends Dictionary { Arrays.fill(session.mInputCodePoints, Constants.NOT_A_CODE); ngramContext.outputToArray(session.mPrevWordCodePointArrays, session.mIsBeginningOfSentenceArray); - final InputPointers inputPointers = composer.getInputPointers(); - final boolean isGesture = composer.isBatchMode(); + final InputPointers inputPointers = composedData.mInputPointers; + final boolean isGesture = composedData.mIsBatchMode; final int inputSize; if (!isGesture) { - inputSize = composer.copyCodePointsExceptTrailingSingleQuotesAndReturnCodePointCount( - session.mInputCodePoints); + inputSize = + composedData.copyCodePointsExceptTrailingSingleQuotesAndReturnCodePointCount( + session.mInputCodePoints); if (inputSize < 0) { return null; } @@ -303,7 +304,7 @@ public final class BinaryDictionary extends Dictionary { Dictionary.NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL; } // TOOD: Pass multiple previous words information for n-gram. - getSuggestionsNative(mNativeDict, proximityInfo.getNativeProximityInfo(), + getSuggestionsNative(mNativeDict, proximityInfoHandle, getTraverseSession(sessionId).getSession(), inputPointers.getXCoordinates(), inputPointers.getYCoordinates(), inputPointers.getTimes(), inputPointers.getPointerIds(), session.mInputCodePoints, inputSize, diff --git a/java/src/com/android/inputmethod/latin/Dictionary.java b/java/src/com/android/inputmethod/latin/Dictionary.java index 28a62b283..7d7ed77e7 100644 --- a/java/src/com/android/inputmethod/latin/Dictionary.java +++ b/java/src/com/android/inputmethod/latin/Dictionary.java @@ -17,8 +17,8 @@ package com.android.inputmethod.latin; import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.keyboard.ProximityInfo; import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; +import com.android.inputmethod.latin.common.ComposedData; import com.android.inputmethod.latin.settings.SettingsValuesForSuggestion; import java.util.ArrayList; @@ -87,9 +87,9 @@ public abstract class Dictionary { /** * Searches for suggestions for a given context. - * @param composer the key sequence to match with coordinate info, as a WordComposer + * @param composedData the key sequence to match with coordinate info * @param ngramContext the context for n-gram. - * @param proximityInfo the object for key proximity. May be ignored by some implementations. + * @param proximityInfoHandle the handle for key proximity. Is ignored by some implementations. * @param settingsValuesForSuggestion the settings values used for the suggestion. * @param sessionId the session id. * @param weightForLocale the weight given to this locale, to multiply the output scores for @@ -99,8 +99,8 @@ public abstract class Dictionary { * a float array that has only one element. This can be updated when a different value is used. * @return the list of suggestions (possibly null if none) */ - abstract public ArrayList<SuggestedWordInfo> getSuggestions(final WordComposer composer, - final NgramContext ngramContext, final ProximityInfo proximityInfo, + abstract public ArrayList<SuggestedWordInfo> getSuggestions(final ComposedData composedData, + final NgramContext ngramContext, final long proximityInfoHandle, final SettingsValuesForSuggestion settingsValuesForSuggestion, final int sessionId, final float weightForLocale, final float[] inOutWeightOfLangModelVsSpatialModel); @@ -203,8 +203,8 @@ public abstract class Dictionary { } @Override - public ArrayList<SuggestedWordInfo> getSuggestions(final WordComposer composer, - final NgramContext ngramContext, final ProximityInfo proximityInfo, + public ArrayList<SuggestedWordInfo> getSuggestions(final ComposedData composedData, + final NgramContext ngramContext, final long proximityInfoHandle, final SettingsValuesForSuggestion settingsValuesForSuggestion, final int sessionId, final float weightForLocale, final float[] inOutWeightOfLangModelVsSpatialModel) { diff --git a/java/src/com/android/inputmethod/latin/DictionaryCollection.java b/java/src/com/android/inputmethod/latin/DictionaryCollection.java index a6d7205e2..96575f629 100644 --- a/java/src/com/android/inputmethod/latin/DictionaryCollection.java +++ b/java/src/com/android/inputmethod/latin/DictionaryCollection.java @@ -18,8 +18,8 @@ package com.android.inputmethod.latin; import android.util.Log; -import com.android.inputmethod.keyboard.ProximityInfo; import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; +import com.android.inputmethod.latin.common.ComposedData; import com.android.inputmethod.latin.settings.SettingsValuesForSuggestion; import java.util.ArrayList; @@ -59,8 +59,8 @@ public final class DictionaryCollection extends Dictionary { } @Override - public ArrayList<SuggestedWordInfo> getSuggestions(final WordComposer composer, - final NgramContext ngramContext, final ProximityInfo proximityInfo, + public ArrayList<SuggestedWordInfo> getSuggestions(final ComposedData composedData, + final NgramContext ngramContext, final long proximityInfoHandle, final SettingsValuesForSuggestion settingsValuesForSuggestion, final int sessionId, final float weightForLocale, final float[] inOutWeightOfLangModelVsSpatialModel) { @@ -68,15 +68,15 @@ public final class DictionaryCollection extends Dictionary { if (dictionaries.isEmpty()) return null; // To avoid creating unnecessary objects, we get the list out of the first // dictionary and add the rest to it if not null, hence the get(0) - ArrayList<SuggestedWordInfo> suggestions = dictionaries.get(0).getSuggestions(composer, - ngramContext, proximityInfo, settingsValuesForSuggestion, sessionId, + ArrayList<SuggestedWordInfo> suggestions = dictionaries.get(0).getSuggestions(composedData, + ngramContext, proximityInfoHandle, settingsValuesForSuggestion, sessionId, weightForLocale, inOutWeightOfLangModelVsSpatialModel); if (null == suggestions) suggestions = new ArrayList<>(); final int length = dictionaries.size(); for (int i = 1; i < length; ++ i) { - final ArrayList<SuggestedWordInfo> sugg = dictionaries.get(i).getSuggestions(composer, - ngramContext, proximityInfo, settingsValuesForSuggestion, sessionId, - weightForLocale, inOutWeightOfLangModelVsSpatialModel); + final ArrayList<SuggestedWordInfo> sugg = dictionaries.get(i).getSuggestions( + composedData, ngramContext, proximityInfoHandle, settingsValuesForSuggestion, + sessionId, weightForLocale, inOutWeightOfLangModelVsSpatialModel); if (null != sugg) suggestions.addAll(sugg); } return suggestions; diff --git a/java/src/com/android/inputmethod/latin/DictionaryFacilitator.java b/java/src/com/android/inputmethod/latin/DictionaryFacilitator.java index 4a22cde7b..d23639a0d 100644 --- a/java/src/com/android/inputmethod/latin/DictionaryFacilitator.java +++ b/java/src/com/android/inputmethod/latin/DictionaryFacilitator.java @@ -23,7 +23,6 @@ import android.util.Pair; import android.view.inputmethod.InputMethodSubtype; import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.keyboard.ProximityInfo; import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback; import com.android.inputmethod.latin.NgramContext.WordInfo; import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; @@ -683,7 +682,7 @@ public class DictionaryFacilitator { // TODO: Revise the way to fusion suggestion results. public SuggestionResults getSuggestionResults(final WordComposer composer, - final NgramContext ngramContext, final ProximityInfo proximityInfo, + final NgramContext ngramContext, final long proximityInfoHandle, final SettingsValuesForSuggestion settingsValuesForSuggestion, final int sessionId) { final DictionaryGroup[] dictionaryGroups = mDictionaryGroups; final SuggestionResults suggestionResults = new SuggestionResults( @@ -698,8 +697,8 @@ public class DictionaryFacilitator { ? dictionaryGroup.mWeightForGesturingInLocale : dictionaryGroup.mWeightForTypingInLocale; final ArrayList<SuggestedWordInfo> dictionarySuggestions = - dictionary.getSuggestions(composer, ngramContext, proximityInfo, - settingsValuesForSuggestion, sessionId, + dictionary.getSuggestions(composer.getComposedDataSnapshot(), ngramContext, + proximityInfoHandle, settingsValuesForSuggestion, sessionId, weightForLocale, weightOfLangModelVsSpatialModel); if (null == dictionarySuggestions) continue; suggestionResults.addAll(dictionarySuggestions); diff --git a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java index 702d1536a..bbffece57 100644 --- a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java @@ -20,8 +20,8 @@ import android.content.Context; import android.util.Log; import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.keyboard.ProximityInfo; import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; +import com.android.inputmethod.latin.common.ComposedData; import com.android.inputmethod.latin.common.Constants; import com.android.inputmethod.latin.makedict.DictionaryHeader; import com.android.inputmethod.latin.makedict.FormatSpec; @@ -480,8 +480,8 @@ abstract public class ExpandableBinaryDictionary extends Dictionary { } @Override - public ArrayList<SuggestedWordInfo> getSuggestions(final WordComposer composer, - final NgramContext ngramContext, final ProximityInfo proximityInfo, + public ArrayList<SuggestedWordInfo> getSuggestions(final ComposedData composedData, + final NgramContext ngramContext, final long proximityInfoHandle, final SettingsValuesForSuggestion settingsValuesForSuggestion, final int sessionId, final float weightForLocale, final float[] inOutWeightOfLangModelVsSpatialModel) { reloadDictionaryIfRequired(); @@ -494,9 +494,9 @@ abstract public class ExpandableBinaryDictionary extends Dictionary { return null; } final ArrayList<SuggestedWordInfo> suggestions = - mBinaryDictionary.getSuggestions(composer, ngramContext, proximityInfo, - settingsValuesForSuggestion, sessionId, weightForLocale, - inOutWeightOfLangModelVsSpatialModel); + mBinaryDictionary.getSuggestions(composedData, ngramContext, + proximityInfoHandle, settingsValuesForSuggestion, sessionId, + weightForLocale, inOutWeightOfLangModelVsSpatialModel); if (mBinaryDictionary.isCorrupted()) { Log.i(TAG, "Dictionary (" + mDictName +") is corrupted. " + "Remove and regenerate it."); diff --git a/java/src/com/android/inputmethod/latin/ReadOnlyBinaryDictionary.java b/java/src/com/android/inputmethod/latin/ReadOnlyBinaryDictionary.java index bc8bd831c..7b1a53a6e 100644 --- a/java/src/com/android/inputmethod/latin/ReadOnlyBinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/ReadOnlyBinaryDictionary.java @@ -16,8 +16,8 @@ package com.android.inputmethod.latin; -import com.android.inputmethod.keyboard.ProximityInfo; import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; +import com.android.inputmethod.latin.common.ComposedData; import com.android.inputmethod.latin.settings.SettingsValuesForSuggestion; import java.util.ArrayList; @@ -50,16 +50,16 @@ public final class ReadOnlyBinaryDictionary extends Dictionary { } @Override - public ArrayList<SuggestedWordInfo> getSuggestions(final WordComposer composer, - final NgramContext ngramContext, final ProximityInfo proximityInfo, + public ArrayList<SuggestedWordInfo> getSuggestions(final ComposedData composedData, + final NgramContext ngramContext, final long proximityInfoHandle, final SettingsValuesForSuggestion settingsValuesForSuggestion, final int sessionId, final float weightForLocale, final float[] inOutWeightOfLangModelVsSpatialModel) { if (mLock.readLock().tryLock()) { try { - return mBinaryDictionary.getSuggestions(composer, ngramContext, proximityInfo, - settingsValuesForSuggestion, sessionId, weightForLocale, - inOutWeightOfLangModelVsSpatialModel); + return mBinaryDictionary.getSuggestions(composedData, ngramContext, + proximityInfoHandle, settingsValuesForSuggestion, sessionId, + weightForLocale, inOutWeightOfLangModelVsSpatialModel); } finally { mLock.readLock().unlock(); } diff --git a/java/src/com/android/inputmethod/latin/Suggest.java b/java/src/com/android/inputmethod/latin/Suggest.java index 430f765ea..9b4619d35 100644 --- a/java/src/com/android/inputmethod/latin/Suggest.java +++ b/java/src/com/android/inputmethod/latin/Suggest.java @@ -140,8 +140,8 @@ public final class Suggest { : typedWord; final SuggestionResults suggestionResults = mDictionaryFacilitator.getSuggestionResults( - wordComposer, ngramContext, proximityInfo, settingsValuesForSuggestion, - SESSION_ID_TYPING); + wordComposer, ngramContext, proximityInfo.getNativeProximityInfo(), + settingsValuesForSuggestion, SESSION_ID_TYPING); final ArrayList<SuggestedWordInfo> suggestionsContainer = getTransformedSuggestedWordInfoList(wordComposer, suggestionResults, trailingSingleQuotesCount, @@ -247,8 +247,8 @@ public final class Suggest { final int inputStyle, final int sequenceNumber, final OnGetSuggestedWordsCallback callback) { final SuggestionResults suggestionResults = mDictionaryFacilitator.getSuggestionResults( - wordComposer, ngramContext, proximityInfo, settingsValuesForSuggestion, - SESSION_ID_GESTURE); + wordComposer, ngramContext, proximityInfo.getNativeProximityInfo(), + settingsValuesForSuggestion, SESSION_ID_GESTURE); // For transforming words that don't come from a dictionary, because it's our best bet final Locale defaultLocale = mDictionaryFacilitator.getMostProbableLocale(); final ArrayList<SuggestedWordInfo> suggestionsContainer = diff --git a/java/src/com/android/inputmethod/latin/WordComposer.java b/java/src/com/android/inputmethod/latin/WordComposer.java index 0b77f2ce3..fa55319d2 100644 --- a/java/src/com/android/inputmethod/latin/WordComposer.java +++ b/java/src/com/android/inputmethod/latin/WordComposer.java @@ -19,6 +19,7 @@ package com.android.inputmethod.latin; import com.android.inputmethod.event.CombinerChain; import com.android.inputmethod.event.Event; import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; +import com.android.inputmethod.latin.common.ComposedData; import com.android.inputmethod.latin.common.Constants; import com.android.inputmethod.latin.common.InputPointers; import com.android.inputmethod.latin.common.StringUtils; @@ -90,6 +91,10 @@ public final class WordComposer { refreshTypedWordCache(); } + public ComposedData getComposedDataSnapshot() { + return new ComposedData(getInputPointers(), isBatchMode(), mTypedWordCache.toString()); + } + /** * Restart the combiners, possibly with a new spec. * @param combiningSpec The spec string for combining. This is found in the extra value. @@ -134,38 +139,6 @@ public final class WordComposer { return mCodePointSize; } - /** - * Copy the code points in the typed word to a destination array of ints. - * - * If the array is too small to hold the code points in the typed word, nothing is copied and - * -1 is returned. - * - * @param destination the array of ints. - * @return the number of copied code points. - */ - public int copyCodePointsExceptTrailingSingleQuotesAndReturnCodePointCount( - final int[] destination) { - // This method can be called on a separate thread and mTypedWordCache can change while we - // are executing this method. - final String typedWord = mTypedWordCache.toString(); - // lastIndex is exclusive - final int lastIndex = typedWord.length() - - StringUtils.getTrailingSingleQuotesCount(typedWord); - if (lastIndex <= 0) { - // The string is empty or contains only single quotes. - return 0; - } - - // The following function counts the number of code points in the text range which begins - // at index 0 and extends to the character at lastIndex. - final int codePointSize = Character.codePointCount(typedWord, 0, lastIndex); - if (codePointSize > destination.length) { - return -1; - } - return StringUtils.copyCodePointsAndReturnCodePointCount(destination, typedWord, 0, - lastIndex, true /* downCase */); - } - public boolean isSingleLetter() { return size() == 1; } diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index 78d79ae50..5925bdc4e 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -171,14 +171,17 @@ public final class FormatSpec { // ExpandableDictionary.matchesExpectedBinaryDictFormatVersionForThisType(). public static final int VERSION2 = 2; public static final int VERSION201 = 201; + public static final int VERSION202 = 202; public static final int MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE = VERSION201; // Dictionary version used for testing. public static final int VERSION4_ONLY_FOR_TESTING = 399; public static final int VERSION401 = 401; public static final int VERSION4 = 402; public static final int VERSION4_DEV = 403; - static final int MINIMUM_SUPPORTED_VERSION = VERSION2; - static final int MAXIMUM_SUPPORTED_VERSION = VERSION4_DEV; + static final int MINIMUM_SUPPORTED_STATIC_VERSION = VERSION202; + static final int MAXIMUM_SUPPORTED_STATIC_VERSION = VERSION202; + static final int MINIMUM_SUPPORTED_DYNAMIC_VERSION = VERSION4; + static final int MAXIMUM_SUPPORTED_DYNAMIC_VERSION = VERSION4_DEV; // TODO: Make this value adaptative to content data, store it in the header, and // use it in the reading code. diff --git a/java/src/com/android/inputmethod/latin/spellcheck/AndroidSpellCheckerService.java b/java/src/com/android/inputmethod/latin/spellcheck/AndroidSpellCheckerService.java index 315e3696b..bcf7bbfdc 100644 --- a/java/src/com/android/inputmethod/latin/spellcheck/AndroidSpellCheckerService.java +++ b/java/src/com/android/inputmethod/latin/spellcheck/AndroidSpellCheckerService.java @@ -168,7 +168,8 @@ public final class AndroidSpellCheckerService extends SpellCheckerService DictionaryFacilitator dictionaryFacilitatorForLocale = mDictionaryFacilitatorCache.get(locale); return dictionaryFacilitatorForLocale.getSuggestionResults(composer, ngramContext, - proximityInfo, mSettingsValuesForSuggestion, sessionId); + proximityInfo.getNativeProximityInfo(), mSettingsValuesForSuggestion, + sessionId); } finally { if (sessionId != null) { mSessionIdPool.add(sessionId); diff --git a/java/src/com/android/inputmethod/latin/utils/CollectionUtils.java b/java/src/com/android/inputmethod/latin/utils/CollectionUtils.java index f9839eb91..01f5e1079 100644 --- a/java/src/com/android/inputmethod/latin/utils/CollectionUtils.java +++ b/java/src/com/android/inputmethod/latin/utils/CollectionUtils.java @@ -22,16 +22,27 @@ import java.util.Collection; import javax.annotation.Nonnull; import javax.annotation.Nullable; +/** + * Utility methods for working with collections. + */ public final class CollectionUtils { private CollectionUtils() { // This utility class is not publicly instantiable. } + /** + * Converts a sub-range of the given array to an ArrayList of the appropriate type. + * @param array Array to be converted. + * @param start First index inclusive to be converted. + * @param end Last index exclusive to be converted. + * @throws IllegalArgumentException if start or end are out of range or start > end. + */ @Nonnull public static <E> ArrayList<E> arrayAsList(@Nonnull final E[] array, final int start, final int end) { if (start < 0 || start > end || end > array.length) { - throw new IllegalArgumentException(); + throw new IllegalArgumentException("Invalid start: " + start + " end: " + end + + " with array.length: " + array.length); } final ArrayList<E> list = new ArrayList<>(end - start); diff --git a/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java b/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java index 4e0f5f583..8699f2ce7 100644 --- a/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java +++ b/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java @@ -36,7 +36,8 @@ public class CombinedFormatUtils { public static final String WORD_TAG = "word"; public static final String BEGINNING_OF_SENTENCE_TAG = "beginning_of_sentence"; public static final String NOT_A_WORD_TAG = "not_a_word"; - public static final String BLACKLISTED_TAG = "blacklisted"; + public static final String POSSIBLY_OFFENSIVE_TAG = "possibly_offensive"; + public static final String TRUE_VALUE = "true"; public static String formatAttributeMap(final HashMap<String, String> attributeMap) { final StringBuilder builder = new StringBuilder(); @@ -61,13 +62,13 @@ public class CombinedFormatUtils { builder.append(","); builder.append(formatProbabilityInfo(wordProperty.mProbabilityInfo)); if (wordProperty.mIsBeginningOfSentence) { - builder.append("," + BEGINNING_OF_SENTENCE_TAG + "=true"); + builder.append("," + BEGINNING_OF_SENTENCE_TAG + "=" + TRUE_VALUE); } if (wordProperty.mIsNotAWord) { - builder.append("," + NOT_A_WORD_TAG + "=true"); + builder.append("," + NOT_A_WORD_TAG + "=" + TRUE_VALUE); } if (wordProperty.mIsPossiblyOffensive) { - builder.append("," + BLACKLISTED_TAG + "=true"); + builder.append("," + POSSIBLY_OFFENSIVE_TAG + "=" + TRUE_VALUE); } builder.append("\n"); if (wordProperty.mHasShortcuts) { @@ -111,4 +112,8 @@ public class CombinedFormatUtils { } return builder.toString(); } + + public static boolean isLiteralTrue(final String value) { + return TRUE_VALUE.equalsIgnoreCase(value); + } } diff --git a/java/src/com/android/inputmethod/latin/utils/DistracterFilterCheckingExactMatchesAndSuggestions.java b/java/src/com/android/inputmethod/latin/utils/DistracterFilterCheckingExactMatchesAndSuggestions.java index 56c8249bd..9c6a94810 100644 --- a/java/src/com/android/inputmethod/latin/utils/DistracterFilterCheckingExactMatchesAndSuggestions.java +++ b/java/src/com/android/inputmethod/latin/utils/DistracterFilterCheckingExactMatchesAndSuggestions.java @@ -250,8 +250,9 @@ public class DistracterFilterCheckingExactMatchesAndSuggestions implements Distr composer.setComposingWord(codePoints, coordinates); final SuggestionResults suggestionResults; synchronized (mLock) { - suggestionResults = dictionaryFacilitator.getSuggestionResults( - composer, NgramContext.EMPTY_PREV_WORDS_INFO, keyboard.getProximityInfo(), + suggestionResults = dictionaryFacilitator.getSuggestionResults(composer, + NgramContext.EMPTY_PREV_WORDS_INFO, + keyboard.getProximityInfo().getNativeProximityInfo(), settingsValuesForSuggestion, 0 /* sessionId */); } if (suggestionResults.isEmpty()) { diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index bfe17cc4c..6a5df9d95 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -81,6 +81,9 @@ void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbabi } const WordAttributes wordAttributes = mDictStructurePolicy->getWordAttributesInContext( mPrevWordIds, targetWordId, nullptr /* multiBigramMap */); + if (wordAttributes.getProbability() == NOT_A_PROBABILITY) { + return; + } mSuggestionResults->addPrediction(targetWordCodePoints, codePointCount, wordAttributes.getProbability()); } diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp index 1e2494e92..8f07ce275 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp @@ -31,6 +31,7 @@ const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x100; const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH = NOT_AN_ERROR | MATCH_WITH_WRONG_CASE | MATCH_WITH_MISSING_ACCENT | MATCH_WITH_DIGRAPH; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_A_PERFECT_MATCH = NOT_AN_ERROR; const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.h b/native/jni/src/suggest/core/dictionary/error_type_utils.h index fd1d5fcff..e92c509fa 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.h +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.h @@ -52,6 +52,10 @@ class ErrorTypeUtils { return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0; } + static bool isPerfectMatch(const ErrorType containedErrorTypes) { + return (containedErrorTypes & ~ERRORS_TREATED_AS_A_PERFECT_MATCH) == 0; + } + static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) { return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0; @@ -73,6 +77,7 @@ class ErrorTypeUtils { DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils); static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH; + static const ErrorType ERRORS_TREATED_AS_A_PERFECT_MATCH; static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION; }; } // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/ngram_listener.h b/native/jni/src/suggest/core/dictionary/ngram_listener.h index e9b3c1aaf..2eb5e9fd1 100644 --- a/native/jni/src/suggest/core/dictionary/ngram_listener.h +++ b/native/jni/src/suggest/core/dictionary/ngram_listener.h @@ -26,6 +26,8 @@ namespace latinime { */ class NgramListener { public: + // ngramProbability is always 0 for v403 decaying dictionary. + // TODO: Remove ngramProbability. virtual void onVisitEntry(const int ngramProbability, const int targetWordId) = 0; virtual ~NgramListener() {}; diff --git a/native/jni/src/suggest/core/policy/scoring.h b/native/jni/src/suggest/core/policy/scoring.h index ce3684a1c..b9dda83ad 100644 --- a/native/jni/src/suggest/core/policy/scoring.h +++ b/native/jni/src/suggest/core/policy/scoring.h @@ -30,7 +30,7 @@ class Scoring { public: virtual int calculateFinalScore(const float compoundDistance, const int inputSize, const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit, - const bool boostExactMatches) const = 0; + const bool boostExactMatches, const bool hasProbabilityZero) const = 0; virtual void getMostProbableString(const DicTraverseSession *const traverseSession, const float weightOfLangModelVsSpatialModel, SuggestionResults *const outSuggestionResults) const = 0; diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp index 3283f6deb..74db95953 100644 --- a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp +++ b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp @@ -76,6 +76,52 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults); } +/* static */ bool SuggestionsOutputUtils::shouldBlockWord( + const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode, + const WordAttributes wordAttributes, const bool isLastWord) { + const bool currentWordExactMatch = + ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); + // When we have to block offensive words, non-exact matched offensive words should not be + // output. + const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords(); + + const bool isBlockedOffensiveWord = shouldBlockOffensiveWords && + wordAttributes.isPossiblyOffensive(); + + // This function is called in two situations: + // + // 1) At the end of a search, in which case terminalDicNode will point to the last DicNode + // of the search, and isLastWord will be true. + // "fuck" + // | + // \ terminalDicNode (isLastWord=true, currentWordExactMatch=true) + // In this case, if the current word is an exact match, we will always let the word + // through, even if the user is blocking offensive words (it's exactly what they typed!) + // + // 2) In the middle of the search, when we hit a terminal node, to decide whether or not + // to start a new search at root, to try to match the rest of the input. In this case, + // terminalDicNode will point to the terminal node we just hit, and isLastWord will be + // false. + // "fuckvthis" + // | + // \ terminalDicNode (isLastWord=false, currentWordExactMatch=true) + // + // In this case, we should NOT allow the match through (correcting "fuckthis" to "fuck this" + // when offensive words are blocked would be a bad idea). + // + // In the case of a multi-word correction where the offensive word is typed last (eg. + // for the input "allfuck"), this function will be called with isLastWord==true, but + // currentWordExactMatch==false. So we are OK in this case as well. + // "allfuck" + // | + // \ terminalDicNode (isLastWord=true, currentWordExactMatch=false) + if (isLastWord && currentWordExactMatch) { + return false; + } else { + return isBlockedOffensiveWord; + } +} + /* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode( const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel, @@ -98,24 +144,16 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; const bool isExactMatchWithIntentionalOmission = ErrorTypeUtils::isExactMatchWithIntentionalOmission( terminalDicNode->getContainedErrorTypes()); - const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); - // Heuristic: We exclude probability=0 first-char-uppercase words from exact match. - // (e.g. "AMD" and "and") - const bool isSafeExactMatch = isExactMatch - && !(wordAttributes.isPossiblyOffensive() && isFirstCharUppercase); const int outputTypeFlags = (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) - | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) + | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) | (isExactMatchWithIntentionalOmission ? Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0); - // Entries that are blacklisted or do not represent a word should not be output. const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()); - // When we have to block offensive words, non-exact matched offensive words should not be - // output. - const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords(); - const bool isBlockedOffensiveWord = blockOffensiveWords && wordAttributes.isPossiblyOffensive() - && !isSafeExactMatch; + + const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(), + terminalDicNode, wordAttributes, true /* isLastWord */); // Increase output score of top typing suggestion to ensure autocorrection. // TODO: Better integration with java side autocorrection logic. @@ -123,11 +161,11 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; compoundDistance, traverseSession->getInputSize(), terminalDicNode->getContainedErrorTypes(), (forceCommitMultiWords && terminalDicNode->hasMultipleWords()), - boostExactMatches); + boostExactMatches, wordAttributes.getProbability() == 0); // Don't output invalid or blocked offensive words. However, we still need to submit their // shortcuts if any. - if (isValidWord && !isBlockedOffensiveWord) { + if (isValidWord && !shouldBlockThisWord) { int codePoints[MAX_WORD_LENGTH]; terminalDicNode->outputResult(codePoints); const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ? diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.h b/native/jni/src/suggest/core/result/suggestions_output_utils.h index bf8497828..eca1f78b2 100644 --- a/native/jni/src/suggest/core/result/suggestions_output_utils.h +++ b/native/jni/src/suggest/core/result/suggestions_output_utils.h @@ -18,6 +18,7 @@ #define LATINIME_SUGGESTIONS_OUTPUT_UTILS #include "defines.h" +#include "suggest/core/dictionary/word_attributes.h" namespace latinime { @@ -25,11 +26,19 @@ class BinaryDictionaryShortcutIterator; class DicNode; class DicTraverseSession; class Scoring; +class SuggestOptions; class SuggestionResults; class SuggestionsOutputUtils { public: /** + * Returns true if we should block the incoming word, in the context of the user's + * preferences to include or not include possibly offensive words + */ + static bool shouldBlockWord(const SuggestOptions *const suggestOptions, + const DicNode *const terminalDicNode, const WordAttributes wordAttributes, + const bool isLastWord); + /** * Outputs the final list of suggestions (i.e., terminal nodes). */ static void outputSuggestions(const Scoring *const scoringPolicy, diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp index 68a36454e..c372d668b 100644 --- a/native/jni/src/suggest/core/suggest.cpp +++ b/native/jni/src/suggest/core/suggest.cpp @@ -416,6 +416,11 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode traverseSession->getDictionaryStructurePolicy()->getWordAttributesInContext( dicNode->getPrevWordIds(), dicNode->getWordId(), traverseSession->getMultiBigramMap()); + if (SuggestionsOutputUtils::shouldBlockWord(traverseSession->getSuggestOptions(), + dicNode, wordAttributes, false /* isLastWord */)) { + return; + } + if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode, wordAttributes.getProbability())) { return; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 44c2f443f..abc7f9906 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -134,9 +134,11 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { // same so we use them for both here. switch (mDictFormatVersion) { case FormatUtils::VERSION_2: - return FormatUtils::VERSION_2; case FormatUtils::VERSION_201: - return FormatUtils::VERSION_201; + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + return FormatUtils::UNKNOWN_VERSION; + case FormatUtils::VERSION_202: + return FormatUtils::VERSION_202; case FormatUtils::VERSION_4_ONLY_FOR_TESTING: return FormatUtils::VERSION_4_ONLY_FOR_TESTING; case FormatUtils::VERSION_4: diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp index 41a8b13b8..d69a53fce 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp @@ -111,7 +111,8 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; switch (version) { case FormatUtils::VERSION_2: case FormatUtils::VERSION_201: - // Version 2 or 201 dictionary writing is not supported. + case FormatUtils::VERSION_202: + // None of the static dictionaries (v2x) support writing return false; case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp index 08e39ce43..9455222dd 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp @@ -140,7 +140,7 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability, const PtNodeParams &ptNodeParams) const { - return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(), + return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), ptNodeParams.getProbability() == 0); } @@ -164,7 +164,7 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI } const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); - if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { + if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) { return NOT_A_PROBABILITY; } if (prevWordIds.empty()) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp index 372c9e36f..a19a384f4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp @@ -115,7 +115,8 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str switch (formatVersion) { case FormatUtils::VERSION_2: case FormatUtils::VERSION_201: - AKLOGE("Given path is a directory but the format is version 2 or 201. path: %s", path); + case FormatUtils::VERSION_202: + AKLOGE("Given path is a directory but the format is version 2xx. path: %s", path); break; case FormatUtils::VERSION_4: { return newPolicyForV4Dict<backward::v402::Ver4DictConstants, @@ -177,6 +178,9 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) { case FormatUtils::VERSION_2: case FormatUtils::VERSION_201: + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + break; + case FormatUtils::VERSION_202: return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( new PatriciaTriePolicy(std::move(mmappedBuffer))); case FormatUtils::VERSION_4_ONLY_FOR_TESTING: diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h index 585e87a24..e52706e07 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h @@ -144,17 +144,6 @@ class PtNodeParams { return PatriciaTrieReadingUtils::isTerminal(mFlags); } - AK_FORCE_INLINE bool isBlacklisted() const { - // Note: this method will be removed in the next change. - // It is used in getProbabilityOfWord and getWordAttributes for both v402 and v403. - // * getProbabilityOfWord will be changed to no longer return NOT_A_PROBABILITY - // when isBlacklisted (i.e. to only check if isNotAWord or isDeleted) - // * getWordAttributes will be changed to always return blacklisted=false and - // isPossiblyOffensive according to the function below (instead of the current - // behaviour of checking if the probability is zero) - return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags); - } - AK_FORCE_INLINE bool isPossiblyOffensive() const { return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp index 66fd18a52..59873612a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ - #include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h" #include "defines.h" @@ -317,8 +316,8 @@ const WordAttributes PatriciaTriePolicy::getWordAttributesInContext( const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability, const PtNodeParams &ptNodeParams) const { - return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(), - ptNodeParams.getProbability() == 0); + return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), + ptNodeParams.isPossiblyOffensive()); } int PatriciaTriePolicy::getProbability(const int unigramProbability, @@ -345,10 +344,9 @@ int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); const PtNodeParams ptNodeParams = mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - if (ptNodeParams.isNotAWord() || ptNodeParams.isBlacklisted()) { - // If this is not a word, or if it's a blacklisted entry, it should behave as - // having no probability outside of the suggestion process (where it should be used - // for shortcuts). + if (ptNodeParams.isNotAWord()) { + // If this is not a word, it should behave as having no probability outside of the + // suggestion process (where it should be used for shortcuts). return NOT_A_PROBABILITY; } if (!prevWordIds.empty()) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp index 05a3a6356..31b1ea696 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp @@ -19,11 +19,11 @@ #include <algorithm> #include <cstring> -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h" +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" namespace latinime { -const int LanguageModelDictContent::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1; const int LanguageModelDictContent::TRIE_MAP_BUFFER_INDEX = 0; const int LanguageModelDictContent::GLOBAL_COUNTERS_BUFFER_INDEX = 1; @@ -39,7 +39,8 @@ bool LanguageModelDictContent::runGC( } const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds, - const int wordId, const HeaderPolicy *const headerPolicy) const { + const int wordId, const bool mustMatchAllPrevWords, + const HeaderPolicy *const headerPolicy) const { int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex(); int maxPrevWordCount = 0; @@ -53,7 +54,15 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr bitmapEntryIndices[i + 1] = nextBitmapEntryIndex; } + const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); + if (mHasHistoricalInfo && unigramProbabilityEntry.getHistoricalInfo()->getCount() == 0) { + // The word should be treated as a invalid word. + return WordAttributes(); + } for (int i = maxPrevWordCount; i >= 0; --i) { + if (mustMatchAllPrevWords && prevWordIds.size() > static_cast<size_t>(i)) { + break; + } const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndices[i]); if (!result.mIsValid) { continue; @@ -62,36 +71,39 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); int probability = NOT_A_PROBABILITY; if (mHasHistoricalInfo) { - const int rawProbability = ForgettingCurveUtils::decodeProbability( - probabilityEntry.getHistoricalInfo(), headerPolicy); - if (rawProbability == NOT_A_PROBABILITY) { - // The entry should not be treated as a valid entry. - continue; - } + const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); + int contextCount = 0; if (i == 0) { // unigram - probability = rawProbability; + contextCount = mGlobalCounters.getTotalCount(); } else { const ProbabilityEntry prevWordProbabilityEntry = getNgramProbabilityEntry( prevWordIds.skip(1 /* n */).limit(i - 1), prevWordIds[0]); if (!prevWordProbabilityEntry.isValid()) { continue; } - if (prevWordProbabilityEntry.representsBeginningOfSentence()) { - probability = rawProbability; - } else { - const int prevWordRawProbability = ForgettingCurveUtils::decodeProbability( - prevWordProbabilityEntry.getHistoricalInfo(), headerPolicy); - probability = std::min(MAX_PROBABILITY - prevWordRawProbability - + rawProbability, MAX_PROBABILITY); + if (prevWordProbabilityEntry.representsBeginningOfSentence() + && historicalInfo->getCount() == 1) { + // BoS ngram requires multiple contextCount. + continue; } + contextCount = prevWordProbabilityEntry.getHistoricalInfo()->getCount(); } + const float rawProbability = + DynamicLanguageModelProbabilityUtils::computeRawProbabilityFromCounts( + historicalInfo->getCount(), contextCount, i + 1); + const int encodedRawProbability = + ProbabilityUtils::encodeRawProbability(rawProbability); + const int decayedProbability = + DynamicLanguageModelProbabilityUtils::getDecayedProbability( + encodedRawProbability, *historicalInfo); + probability = DynamicLanguageModelProbabilityUtils::backoff( + decayedProbability, i + 1 /* n */); } else { probability = probabilityEntry.getProbability(); } // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in // probabilityEntry. - const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(), unigramProbabilityEntry.isNotAWord(), unigramProbabilityEntry.isPossiblyOffensive()); @@ -167,7 +179,8 @@ void LanguageModelDictContent::exportAllNgramEntriesRelatedToWordInner( ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); if (probabilityEntry.isValid()) { const WordAttributes wordAttributes = getWordAttributes( - WordIdArrayView(*prevWordIds), wordId, headerPolicy); + WordIdArrayView(*prevWordIds), wordId, true /* mustMatchAllPrevWords */, + headerPolicy); outBummpedFullEntryInfo->emplace_back(*prevWordIds, wordId, wordAttributes, probabilityEntry); } @@ -231,7 +244,7 @@ bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView return false; } mGlobalCounters.updateMaxValueOfCounters( - updatedUnigramProbabilityEntry.getHistoricalInfo()->getCount()); + updatedNgramProbabilityEntry.getHistoricalInfo()->getCount()); if (!originalNgramProbabilityEntry.isValid()) { entryCountersToUpdate->incrementNgramCount(i + 2); } @@ -242,10 +255,9 @@ bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView const ProbabilityEntry LanguageModelDictContent::createUpdatedEntryFrom( const ProbabilityEntry &originalProbabilityEntry, const bool isValid, const HistoricalInfo historicalInfo, const HeaderPolicy *const headerPolicy) const { - const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalProbabilityEntry.getHistoricalInfo(), isValid ? - DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY, - &historicalInfo, headerPolicy); + const HistoricalInfo updatedHistoricalInfo = HistoricalInfo(historicalInfo.getTimestamp(), + 0 /* level */, originalProbabilityEntry.getHistoricalInfo()->getCount() + + historicalInfo.getCount()); if (originalProbabilityEntry.isValid()) { return ProbabilityEntry(originalProbabilityEntry.getFlags(), &updatedHistoricalInfo); } else { @@ -311,7 +323,7 @@ int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWord bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount, const HeaderPolicy *const headerPolicy, - MutableEntryCounters *const outEntryCounters) { + const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) { for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { if (prevWordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.", @@ -328,33 +340,41 @@ bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int b } continue; } - if (mHasHistoricalInfo && !probabilityEntry.representsBeginningOfSentence() - && probabilityEntry.isValid()) { - const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( - probabilityEntry.getHistoricalInfo(), headerPolicy); - if (ForgettingCurveUtils::needsToKeep(&historicalInfo, headerPolicy)) { - // Update the entry. - const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(), &historicalInfo); - if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo), - bitmapEntryIndex)) { - return false; - } - } else { + if (mHasHistoricalInfo && probabilityEntry.isValid()) { + const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo(); + if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC( + *originalHistoricalInfo)) { // Remove the entry. if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { return false; } continue; } + if (needsToHalveCounters) { + const int updatedCount = originalHistoricalInfo->getCount() / 2; + if (updatedCount == 0) { + // Remove the entry. + if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { + return false; + } + continue; + } + const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(), + originalHistoricalInfo->getLevel(), updatedCount); + const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(), + &historicalInfoToSave); + if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo), + bitmapEntryIndex)) { + return false; + } + } } - if (!probabilityEntry.representsBeginningOfSentence()) { - outEntryCounters->incrementNgramCount(prevWordCount + 1); - } + outEntryCounters->incrementNgramCount(prevWordCount + 1); if (!entry.hasNextLevelMap()) { continue; } if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(), - prevWordCount + 1, headerPolicy, outEntryCounters)) { + prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) { return false; } } @@ -408,11 +428,11 @@ bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPoli } const ProbabilityEntry probabilityEntry = ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); - const int probability = (mHasHistoricalInfo) ? - ForgettingCurveUtils::decodeProbability(probabilityEntry.getHistoricalInfo(), - headerPolicy) : probabilityEntry.getProbability(); - outEntryInfo->emplace_back(probability, - probabilityEntry.getHistoricalInfo()->getTimestamp(), + const int priority = mHasHistoricalInfo + ? DynamicLanguageModelProbabilityUtils::getPriorityToPreventFromEviction( + *probabilityEntry.getHistoricalInfo()) + : probabilityEntry.getProbability(); + outEntryInfo->emplace_back(priority, probabilityEntry.getHistoricalInfo()->getCount(), entry.key(), targetLevel, prevWordIds->data()); } return true; @@ -420,11 +440,11 @@ bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPoli bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()( const EntryInfoToTurncate &left, const EntryInfoToTurncate &right) const { - if (left.mProbability != right.mProbability) { - return left.mProbability < right.mProbability; + if (left.mPriority != right.mPriority) { + return left.mPriority < right.mPriority; } - if (left.mTimestamp != right.mTimestamp) { - return left.mTimestamp > right.mTimestamp; + if (left.mCount != right.mCount) { + return left.mCount < right.mCount; } if (left.mKey != right.mKey) { return left.mKey < right.mKey; @@ -441,10 +461,9 @@ bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()( return false; } -LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int probability, - const int timestamp, const int key, const int prevWordCount, const int *const prevWordIds) - : mProbability(probability), mTimestamp(timestamp), mKey(key), - mPrevWordCount(prevWordCount) { +LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int priority, + const int count, const int key, const int prevWordCount, const int *const prevWordIds) + : mPriority(priority), mCount(count), mKey(key), mPrevWordCount(prevWordCount) { memmove(mPrevWordIds, prevWordIds, mPrevWordCount * sizeof(mPrevWordIds[0])); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h index 5b92b96e3..9678c35f9 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h @@ -151,13 +151,14 @@ class LanguageModelDictContent { const LanguageModelDictContent *const originalContent); const WordAttributes getWordAttributes(const WordIdArrayView prevWordIds, const int wordId, - const HeaderPolicy *const headerPolicy) const; + const bool mustMatchAllPrevWords, const HeaderPolicy *const headerPolicy) const; ProbabilityEntry getProbabilityEntry(const int wordId) const { return getNgramProbabilityEntry(WordIdArrayView(), wordId); } bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) { + mGlobalCounters.addToTotalCount(probabilityEntry->getHistoricalInfo()->getCount()); return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry); } @@ -180,8 +181,15 @@ class LanguageModelDictContent { bool updateAllProbabilityEntriesForGC(const HeaderPolicy *const headerPolicy, MutableEntryCounters *const outEntryCounters) { - return updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(), - 0 /* prevWordCount */, headerPolicy, outEntryCounters); + if (!updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(), + 0 /* prevWordCount */, headerPolicy, mGlobalCounters.needsToHalveCounters(), + outEntryCounters)) { + return false; + } + if (mGlobalCounters.needsToHalveCounters()) { + mGlobalCounters.halveCounters(); + } + return true; } // entryCounts should be created by updateAllProbabilityEntries. @@ -206,11 +214,12 @@ class LanguageModelDictContent { DISALLOW_ASSIGNMENT_OPERATOR(Comparator); }; - EntryInfoToTurncate(const int probability, const int timestamp, const int key, + EntryInfoToTurncate(const int priority, const int count, const int key, const int prevWordCount, const int *const prevWordIds); - int mProbability; - int mTimestamp; + int mPriority; + // TODO: Remove. + int mCount; int mKey; int mPrevWordCount; int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; @@ -219,8 +228,6 @@ class LanguageModelDictContent { DISALLOW_DEFAULT_CONSTRUCTOR(EntryInfoToTurncate); }; - // TODO: Remove - static const int DUMMY_PROBABILITY_FOR_VALID_WORDS; static const int TRIE_MAP_BUFFER_INDEX; static const int GLOBAL_COUNTERS_BUFFER_INDEX; @@ -233,7 +240,8 @@ class LanguageModelDictContent { int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds); int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const; bool updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount, - const HeaderPolicy *const headerPolicy, MutableEntryCounters *const outEntryCounters); + const HeaderPolicy *const headerPolicy, const bool needsToHalveCounters, + MutableEntryCounters *const outEntryCounters); bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy, const int maxEntryCount, const int targetLevel, int *const outEntryCount); bool getEntryInfo(const HeaderPolicy *const headerPolicy, const int targetLevel, diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h index 9953aa425..283c2691a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h @@ -63,6 +63,10 @@ class LanguageModelDictContentGlobalCounters { mTotalCount += 1; } + void addToTotalCount(const int count) { + mTotalCount += count; + } + void updateMaxValueOfCounters(const int count) { mMaxValueOfCounters = std::max(count, mMaxValueOfCounters); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index d3de322f9..96d789f58 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -110,7 +110,7 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( return WordAttributes(); } return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, - mHeaderPolicy); + false /* mustMatchAllPrevWords */, mHeaderPolicy); } int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, @@ -118,18 +118,13 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) { return NOT_A_PROBABILITY; } - const ProbabilityEntry probabilityEntry = - mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(prevWordIds, wordId); - if (!probabilityEntry.isValid() || probabilityEntry.isBlacklisted() - || probabilityEntry.isNotAWord()) { + const WordAttributes wordAttributes = + mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, + true /* mustMatchAllPrevWords */, mHeaderPolicy); + if (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()) { return NOT_A_PROBABILITY; } - if (mHeaderPolicy->hasHistoricalInfoOfWords()) { - return ForgettingCurveUtils::decodeProbability(probabilityEntry.getHistoricalInfo(), - mHeaderPolicy); - } else { - return probabilityEntry.getProbability(); - } + return wordAttributes.getProbability(); } BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator( @@ -152,9 +147,7 @@ void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordI continue; } const int probability = probabilityEntry.hasHistoricalInfo() ? - ForgettingCurveUtils::decodeProbability( - probabilityEntry.getHistoricalInfo(), mHeaderPolicy) : - probabilityEntry.getProbability(); + 0 : probabilityEntry.getProbability(); listener->onVisitEntry(probability, entry.getWordId()); } } @@ -386,25 +379,35 @@ bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext()."); return false; } + if (!isValidWord) { + return true; + } wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); } WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); - if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID - && ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { - const UnigramProperty beginningOfSentenceUnigramProperty( - true /* representsBeginningOfSentence */, - true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, - HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); - if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), - &beginningOfSentenceUnigramProperty)) { - AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext()."); + if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { + if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { + const UnigramProperty beginningOfSentenceUnigramProperty( + true /* representsBeginningOfSentence */, + true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, + HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); + if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), + &beginningOfSentenceUnigramProperty)) { + AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext()."); + return false; + } + // Refresh word ids. + ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); + } + // Update entries for beginning of sentence. + if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord( + prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isVaild */, historicalInfo, + mHeaderPolicy, &mEntryCounters)) { return false; } - // Refresh word ids. - ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); } if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds, wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) { @@ -542,7 +545,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty( } } const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes( - WordIdArrayView(), wordId, mHeaderPolicy); + WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy); const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId); const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(), diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp index 0cffe569d..8b47147e1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp @@ -28,9 +28,11 @@ const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; /* static */ FormatUtils::FORMAT_VERSION FormatUtils::getFormatVersion(const int formatVersion) { switch (formatVersion) { case VERSION_2: - return VERSION_2; case VERSION_201: - return VERSION_201; + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + return UNKNOWN_VERSION; + case VERSION_202: + return VERSION_202; case VERSION_4_ONLY_FOR_TESTING: return VERSION_4_ONLY_FOR_TESTING; case VERSION_4: diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h index 96310086b..05bd7eb8a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h @@ -31,8 +31,12 @@ class FormatUtils { public: enum FORMAT_VERSION { // These MUST have the same values as the relevant constants in FormatSpec.java. + // TODO: Remove VERSION_2 and VERSION_201 when we: + // * Confirm that old versions of LatinIME download old-format dictionaries + // * We no longer need the corresponding constants on the Java side for dicttool VERSION_2 = 2, VERSION_201 = 201, + VERSION_202 = 202, VERSION_4_ONLY_FOR_TESTING = 399, VERSION_4 = 402, VERSION_4_DEV = 403, diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp index a6f9a8b23..856808a74 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp @@ -24,6 +24,7 @@ const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED = 120; const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f; const float ScoringParams::EXACT_MATCH_PROMOTION = 1.1f; +const float ScoringParams::PERFECT_MATCH_PROMOTION = 1.1f; const float ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH = 0.01f; const float ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH = 0.02f; const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f; diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.h b/native/jni/src/suggest/policyimpl/typing/scoring_params.h index b8f889559..6f327a370 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.h +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.h @@ -34,6 +34,7 @@ class ScoringParams { static const int THRESHOLD_SHORT_WORD_LENGTH; static const float EXACT_MATCH_PROMOTION; + static const float PERFECT_MATCH_PROMOTION; static const float CASE_ERROR_PENALTY_FOR_EXACT_MATCH; static const float ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; static const float DIGRAPH_PENALTY_FOR_EXACT_MATCH; diff --git a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h index 0240bcf54..6acd767ea 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h @@ -44,23 +44,50 @@ class TypingScoring : public Scoring { AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, const int inputSize, const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit, - const bool boostExactMatches) const { + const bool boostExactMatches, const bool hasProbabilityZero) const { const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE + static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT; float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - compoundDistance / maxDistance; if (forceCommit) { score += ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD; } - if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) { - score += ScoringParams::EXACT_MATCH_PROMOTION; - if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) { - score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH; + if (hasProbabilityZero) { + // Previously, when both legitimate 0-frequency words (such as distracters) and + // offensive words were encoded in the same way, distracters would never show up + // when the user blocked offensive words (the default setting, as well as the + // setting for regression tests). + // + // When b/11031090 was fixed and a separate encoding was used for offensive words, + // 0-frequency words would no longer be blocked when they were an "exact match" + // (where case mismatches and accent mismatches would be considered an "exact + // match"). The exact match boosting functionality meant that, for example, when + // the user typed "mt" they would be suggested the word "Mt", although they most + // probably meant to type "my". + // + // For this reason, we introduced this change, which does the following: + // * Defines the "perfect match" as a really exact match, with no room for case or + // accent mismatches + // * When the target word has probability zero (as "Mt" does, because it is a + // distracter), ONLY boost its score if it is a perfect match. + // + // By doing this, when the user types "mt", the word "Mt" will NOT be boosted, and + // they will get "my". However, if the user makes an explicit effort to type "Mt", + // we do boost the word "Mt" so that the user's input is not autocorrected to "My". + if (boostExactMatches && ErrorTypeUtils::isPerfectMatch(containedErrorTypes)) { + score += ScoringParams::PERFECT_MATCH_PROMOTION; } - if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) { - score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; - } - if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) { - score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH; + } else { + if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) { + score += ScoringParams::EXACT_MATCH_PROMOTION; + if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) { + score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH; + } + if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) { + score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; + } + if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) { + score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH; + } } } return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE); diff --git a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp index 4469dc715..86040f12c 100644 --- a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp +++ b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp @@ -108,14 +108,14 @@ TEST(LanguageModelDictContentTest, TestGetWordProbability) { languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(1), wordId, &bigramProbabilityEntry); EXPECT_EQ(bigramProbability, languageModelDictContent.getWordAttributes(prevWordIds, wordId, - nullptr /* headerPolicy */).getProbability()); + false /* mustMatchAllPrevWords */, nullptr /* headerPolicy */).getProbability()); const ProbabilityEntry trigramProbabilityEntry(flag, trigramProbability); languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(1), prevWordIds[1], &probabilityEntry); languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(2), wordId, &trigramProbabilityEntry); EXPECT_EQ(trigramProbability, languageModelDictContent.getWordAttributes(prevWordIds, wordId, - nullptr /* headerPolicy */).getProbability()); + false /* mustMatchAllPrevWords */, nullptr /* headerPolicy */).getProbability()); } } // namespace diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java index f90b266b6..b9764488e 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java @@ -59,6 +59,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { super.setUp(); mCurrentTime = 0; mDictFilesToBeDeleted.clear(); + setCurrentTimeForTestMode(mCurrentTime); } @Override @@ -71,8 +72,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { super.tearDown(); } - private static boolean supportsBeginningOfSentence(final int formatVersion) { - return formatVersion > FormatSpec.VERSION401; + private static boolean supportsCountBasedNgram(final int formatVersion) { + return formatVersion >= FormatSpec.VERSION4_DEV; } private static boolean supportsNgram(final int formatVersion) { @@ -142,19 +143,13 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { private File createEmptyDictionaryWithAttributeMapAndGetFile(final int formatVersion, final HashMap<String, String> attributeMap) { - if (formatVersion == FormatSpec.VERSION4 - || formatVersion == FormatSpec.VERSION4_ONLY_FOR_TESTING - || formatVersion == FormatSpec.VERSION4_DEV) { - try { - final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion, - attributeMap); - mDictFilesToBeDeleted.add(dictFile); - return dictFile; - } catch (final IOException e) { - fail(e.toString()); - } - } else { - fail("Dictionary format version " + formatVersion + " is not supported."); + try { + final File dictFile = createEmptyVer4DictionaryAndGetFile(formatVersion, + attributeMap); + mDictFilesToBeDeleted.add(dictFile); + return dictFile; + } catch (final IOException e) { + fail(e.toString()); } return null; } @@ -263,12 +258,10 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { onInputWord(binaryDictionary, "a", false /* isValidWord */); assertTrue(binaryDictionary.isValidWord("a")); - onInputWord(binaryDictionary, "b", true /* isValidWord */); - assertTrue(binaryDictionary.isValidWord("b")); - onInputWordWithPrevWord(binaryDictionary, "b", false /* isValidWord */, "a"); assertFalse(isValidBigram(binaryDictionary, "a", "b")); onInputWordWithPrevWord(binaryDictionary, "b", false /* isValidWord */, "a"); + assertTrue(binaryDictionary.isValidWord("b")); assertTrue(isValidBigram(binaryDictionary, "a", "b")); onInputWordWithPrevWord(binaryDictionary, "c", true /* isValidWord */, "a"); @@ -284,16 +277,12 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { return; } - onInputWordWithPrevWords(binaryDictionary, "c", false /* isValidWord */, "b", "a"); - assertFalse(isValidTrigram(binaryDictionary, "a", "b", "c")); - assertFalse(isValidBigram(binaryDictionary, "b", "c")); - onInputWordWithPrevWords(binaryDictionary, "c", false /* isValidWord */, "b", "a"); + onInputWordWithPrevWords(binaryDictionary, "c", true /* isValidWord */, "b", "a"); assertTrue(isValidTrigram(binaryDictionary, "a", "b", "c")); assertTrue(isValidBigram(binaryDictionary, "b", "c")); - - onInputWordWithPrevWords(binaryDictionary, "d", true /* isValidWord */, "b", "a"); - assertTrue(isValidTrigram(binaryDictionary, "a", "b", "d")); - assertTrue(isValidBigram(binaryDictionary, "b", "d")); + onInputWordWithPrevWords(binaryDictionary, "d", false /* isValidWord */, "c", "b"); + assertFalse(isValidTrigram(binaryDictionary, "b", "c", "d")); + assertFalse(isValidBigram(binaryDictionary, "c", "d")); onInputWordWithPrevWords(binaryDictionary, "cd", true /* isValidWord */, "b", "a"); assertTrue(isValidTrigram(binaryDictionary, "a", "b", "cd")); @@ -312,6 +301,13 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { onInputWord(binaryDictionary, "a", true /* isValidWord */); assertTrue(binaryDictionary.isValidWord("a")); forcePassingShortTime(binaryDictionary); + if (supportsCountBasedNgram(formatVersion)) { + // Count based ngram language model doesn't support decaying based on the elapsed time. + assertTrue(binaryDictionary.isValidWord("a")); + } else { + assertFalse(binaryDictionary.isValidWord("a")); + } + forcePassingLongTime(binaryDictionary); assertFalse(binaryDictionary.isValidWord("a")); onInputWord(binaryDictionary, "a", true /* isValidWord */); @@ -327,6 +323,12 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { onInputWordWithPrevWord(binaryDictionary, "b", true /* isValidWord */, "a"); assertTrue(isValidBigram(binaryDictionary, "a", "b")); forcePassingShortTime(binaryDictionary); + if (supportsCountBasedNgram(formatVersion)) { + assertTrue(isValidBigram(binaryDictionary, "a", "b")); + } else { + assertFalse(isValidBigram(binaryDictionary, "a", "b")); + } + forcePassingLongTime(binaryDictionary); assertFalse(isValidBigram(binaryDictionary, "a", "b")); onInputWord(binaryDictionary, "a", true /* isValidWord */); @@ -349,7 +351,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { onInputWordWithPrevWord(binaryDictionary, "bc", true /* isValidWord */, "ab"); onInputWordWithPrevWords(binaryDictionary, "cd", true /* isValidWord */, "bc", "ab"); assertTrue(isValidTrigram(binaryDictionary, "ab", "bc", "cd")); - forcePassingShortTime(binaryDictionary); + forcePassingLongTime(binaryDictionary); assertFalse(isValidTrigram(binaryDictionary, "ab", "bc", "cd")); onInputWord(binaryDictionary, "ab", true /* isValidWord */); @@ -540,7 +542,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { assertTrue(bigramCountBeforeGC > bigramCountAfterGC); } } - + forcePassingShortTime(binaryDictionary); assertTrue(Integer.parseInt(binaryDictionary.getPropertyForGettingStats( BinaryDictionary.BIGRAM_COUNT_QUERY)) > 0); assertTrue(Integer.parseInt(binaryDictionary.getPropertyForGettingStats( @@ -666,14 +668,17 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { assertEquals(toFormatVersion, binaryDictionary.getFormatVersion()); assertTrue(binaryDictionary.isValidWord("aaa")); assertFalse(binaryDictionary.isValidWord("bbb")); - assertTrue(binaryDictionary.getFrequency("aaa") < binaryDictionary.getFrequency("ccc")); - onInputWord(binaryDictionary, "bbb", false /* isValidWord */); - assertTrue(binaryDictionary.isValidWord("bbb")); + if (supportsCountBasedNgram(toFormatVersion)) { + assertTrue(binaryDictionary.getFrequency("aaa") < binaryDictionary.getFrequency("ccc")); + onInputWord(binaryDictionary, "bbb", false /* isValidWord */); + assertTrue(binaryDictionary.isValidWord("bbb")); + } assertTrue(isValidBigram(binaryDictionary, "aaa", "abc")); assertFalse(isValidBigram(binaryDictionary, "aaa", "bbb")); - onInputWordWithPrevWord(binaryDictionary, "bbb", false /* isValidWord */, "aaa"); - assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb")); - + if (supportsCountBasedNgram(toFormatVersion)) { + onInputWordWithPrevWord(binaryDictionary, "bbb", false /* isValidWord */, "aaa"); + assertTrue(isValidBigram(binaryDictionary, "aaa", "bbb")); + } if (supportsNgram(toFormatVersion)) { assertTrue(isValidTrigram(binaryDictionary, "aaa", "abc", "xyz")); assertFalse(isValidTrigram(binaryDictionary, "aaa", "abc", "def")); @@ -686,9 +691,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { public void testBeginningOfSentence() { for (final int formatVersion : DICT_FORMAT_VERSIONS) { - if (supportsBeginningOfSentence(formatVersion)) { - testBeginningOfSentence(formatVersion); - } + testBeginningOfSentence(formatVersion); } } @@ -716,10 +719,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { assertFalse(binaryDictionary.isValidNgram(beginningOfSentenceContext, "aaa")); assertFalse(binaryDictionary.isValidNgram(beginningOfSentenceContext, "bbb")); onInputWordWithBeginningOfSentenceContext(binaryDictionary, "aaa", true /* isValidWord */); - assertFalse(binaryDictionary.isValidNgram(beginningOfSentenceContext, "aaa")); onInputWordWithBeginningOfSentenceContext(binaryDictionary, "aaa", true /* isValidWord */); onInputWordWithBeginningOfSentenceContext(binaryDictionary, "bbb", true /* isValidWord */); - assertFalse(binaryDictionary.isValidNgram(beginningOfSentenceContext, "bbb")); onInputWordWithBeginningOfSentenceContext(binaryDictionary, "bbb", true /* isValidWord */); assertTrue(binaryDictionary.isValidNgram(beginningOfSentenceContext, "aaa")); assertTrue(binaryDictionary.isValidNgram(beginningOfSentenceContext, "bbb")); diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java index a35fa13ce..d239f8dac 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java @@ -314,14 +314,14 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { final String dictVersion = Long.toString(System.currentTimeMillis()); final String codePointTableAttribute = DictionaryHeader.CODE_POINT_TABLE_KEY; final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, - BinaryDictUtils.VERSION201_OPTIONS, getContext().getCacheDir()); + BinaryDictUtils.STATIC_OPTIONS, getContext().getCacheDir()); // Write a test dictionary final DictEncoder dictEncoder = new Ver2DictEncoder(file, Ver2DictEncoder.CODE_POINT_TABLE_ON); final FormatSpec.FormatOptions formatOptions = new FormatSpec.FormatOptions( - FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE); + FormatSpec.MINIMUM_SUPPORTED_STATIC_VERSION); final FusionDictionary sourcedict = new FusionDictionary(new PtNodeArray(), BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); addUnigrams(words.size(), sourcedict, words, null /* shortcutMap */); @@ -359,11 +359,11 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { final List<String> results = new ArrayList<>(); runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_BUFFER, - BinaryDictUtils.VERSION2_OPTIONS); + BinaryDictUtils.STATIC_OPTIONS); runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_BUFFER, - BinaryDictUtils.VERSION4_OPTIONS_WITHOUT_TIMESTAMP); + BinaryDictUtils.DYNAMIC_OPTIONS_WITHOUT_TIMESTAMP); runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_BUFFER, - BinaryDictUtils.VERSION4_OPTIONS_WITH_TIMESTAMP); + BinaryDictUtils.DYNAMIC_OPTIONS_WITH_TIMESTAMP); for (final String result : results) { Log.d(TAG, result); } @@ -373,11 +373,11 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { final List<String> results = new ArrayList<>(); runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_ARRAY, - BinaryDictUtils.VERSION2_OPTIONS); + BinaryDictUtils.STATIC_OPTIONS); runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_ARRAY, - BinaryDictUtils.VERSION4_OPTIONS_WITHOUT_TIMESTAMP); + BinaryDictUtils.DYNAMIC_OPTIONS_WITHOUT_TIMESTAMP); runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_ARRAY, - BinaryDictUtils.VERSION4_OPTIONS_WITH_TIMESTAMP); + BinaryDictUtils.DYNAMIC_OPTIONS_WITH_TIMESTAMP); for (final String result : results) { Log.d(TAG, result); @@ -501,7 +501,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { final ArrayList<String> results = new ArrayList<>(); runReadUnigramsAndBigramsTests(results, BinaryDictUtils.USE_BYTE_BUFFER, - BinaryDictUtils.VERSION2_OPTIONS); + BinaryDictUtils.STATIC_OPTIONS); for (final String result : results) { Log.d(TAG, result); @@ -512,7 +512,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { final ArrayList<String> results = new ArrayList<>(); runReadUnigramsAndBigramsTests(results, BinaryDictUtils.USE_BYTE_ARRAY, - BinaryDictUtils.VERSION2_OPTIONS); + BinaryDictUtils.STATIC_OPTIONS); for (final String result : results) { Log.d(TAG, result); @@ -623,9 +623,9 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { final ArrayList<String> results = new ArrayList<>(); runGetTerminalPositionTests(BinaryDictUtils.USE_BYTE_ARRAY, - BinaryDictUtils.VERSION2_OPTIONS); + BinaryDictUtils.STATIC_OPTIONS); runGetTerminalPositionTests(BinaryDictUtils.USE_BYTE_BUFFER, - BinaryDictUtils.VERSION2_OPTIONS); + BinaryDictUtils.STATIC_OPTIONS); for (final String result : results) { Log.d(TAG, result); @@ -633,7 +633,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { } public void testVer2DictGetWordProperty() { - final FormatOptions formatOptions = BinaryDictUtils.VERSION2_OPTIONS; + final FormatOptions formatOptions = BinaryDictUtils.STATIC_OPTIONS; final ArrayList<String> words = sWords; final HashMap<String, List<String>> shortcuts = sShortcuts; final String dictName = "testGetWordProperty"; @@ -669,7 +669,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { } public void testVer2DictIteration() { - final FormatOptions formatOptions = BinaryDictUtils.VERSION2_OPTIONS; + final FormatOptions formatOptions = BinaryDictUtils.STATIC_OPTIONS; final ArrayList<String> words = sWords; final HashMap<String, List<String>> shortcuts = sShortcuts; final SparseArray<List<Integer>> bigrams = sEmptyBigrams; diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java index 60e38250f..ce905c499 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java @@ -819,12 +819,18 @@ public class BinaryDictEncoderUtils { final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) throws IOException, UnsupportedFormatException { final int version = formatOptions.mVersion; - if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION - || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) { + if ((version >= FormatSpec.MINIMUM_SUPPORTED_STATIC_VERSION && + version <= FormatSpec.MAXIMUM_SUPPORTED_STATIC_VERSION) || ( + version >= FormatSpec.MINIMUM_SUPPORTED_DYNAMIC_VERSION && + version <= FormatSpec.MAXIMUM_SUPPORTED_DYNAMIC_VERSION)) { + // Dictionary is valid + } else { throw new UnsupportedFormatException("Requested file format version " + version - + ", but this implementation only supports versions " - + FormatSpec.MINIMUM_SUPPORTED_VERSION + " through " - + FormatSpec.MAXIMUM_SUPPORTED_VERSION); + + ", but this implementation only supports static versions " + + FormatSpec.MINIMUM_SUPPORTED_STATIC_VERSION + " through " + + FormatSpec.MAXIMUM_SUPPORTED_STATIC_VERSION + " and dynamic versions " + + FormatSpec.MINIMUM_SUPPORTED_DYNAMIC_VERSION + " through " + + FormatSpec.MAXIMUM_SUPPORTED_DYNAMIC_VERSION); } ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256); diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictUtils.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictUtils.java index 8eabf749d..9c1e4cf84 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictUtils.java +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictUtils.java @@ -28,13 +28,11 @@ public class BinaryDictUtils { public static final String TEST_DICT_FILE_EXTENSION = ".testDict"; - public static final FormatSpec.FormatOptions VERSION2_OPTIONS = - new FormatSpec.FormatOptions(FormatSpec.VERSION2); - public static final FormatSpec.FormatOptions VERSION201_OPTIONS = - new FormatSpec.FormatOptions(FormatSpec.VERSION201); - public static final FormatSpec.FormatOptions VERSION4_OPTIONS_WITHOUT_TIMESTAMP = + public static final FormatSpec.FormatOptions STATIC_OPTIONS = + new FormatSpec.FormatOptions(FormatSpec.VERSION202); + public static final FormatSpec.FormatOptions DYNAMIC_OPTIONS_WITHOUT_TIMESTAMP = new FormatSpec.FormatOptions(FormatSpec.VERSION4, false /* hasTimestamp */); - public static final FormatSpec.FormatOptions VERSION4_OPTIONS_WITH_TIMESTAMP = + public static final FormatSpec.FormatOptions DYNAMIC_OPTIONS_WITH_TIMESTAMP = new FormatSpec.FormatOptions(FormatSpec.VERSION4, true /* hasTimestamp */); public static DictionaryOptions makeDictionaryOptions(final String id, final String version, @@ -55,7 +53,8 @@ public class BinaryDictUtils { public static File getDictFile(final String name, final String version, final FormatOptions formatOptions, final File directory) { if (formatOptions.mVersion == FormatSpec.VERSION2 - || formatOptions.mVersion == FormatSpec.VERSION201) { + || formatOptions.mVersion == FormatSpec.VERSION201 + || formatOptions.mVersion == FormatSpec.VERSION202) { return new File(directory, name + "." + version + TEST_DICT_FILE_EXTENSION); } else if (formatOptions.mVersion == FormatSpec.VERSION4) { return new File(directory, name + "." + version); @@ -71,7 +70,7 @@ public class BinaryDictUtils { file.mkdir(); } return new Ver4DictEncoder(file); - } else if (formatOptions.mVersion == FormatSpec.VERSION2) { + } else if (formatOptions.mVersion == FormatSpec.VERSION202) { return new Ver2DictEncoder(file, Ver2DictEncoder.CODE_POINT_TABLE_OFF); } else { throw new RuntimeException("The format option has a wrong version : " diff --git a/tests/src/com/android/inputmethod/latin/makedict/Ver2DictDecoder.java b/tests/src/com/android/inputmethod/latin/makedict/Ver2DictDecoder.java index 457e7af8e..5c261a94d 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/Ver2DictDecoder.java +++ b/tests/src/com/android/inputmethod/latin/makedict/Ver2DictDecoder.java @@ -178,7 +178,8 @@ public class Ver2DictDecoder extends AbstractDictDecoder { throw new IOException("Cannot read the dictionary header."); } if (header.mFormatOptions.mVersion != FormatSpec.VERSION2 && - header.mFormatOptions.mVersion != FormatSpec.VERSION201) { + header.mFormatOptions.mVersion != FormatSpec.VERSION201 && + header.mFormatOptions.mVersion != FormatSpec.VERSION202) { throw new UnsupportedFormatException("File header has a wrong version : " + header.mFormatOptions.mVersion); } diff --git a/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java b/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java index 2c2152be7..b52b8c485 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java +++ b/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java @@ -124,7 +124,8 @@ public class Ver2DictEncoder implements DictEncoder { @Override public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) throws IOException, UnsupportedFormatException { - if (formatOptions.mVersion > FormatSpec.VERSION201) { + // We no longer support anything but the latest version of v2. + if (formatOptions.mVersion != FormatSpec.VERSION202) { throw new UnsupportedFormatException( "The given format options has wrong version number : " + formatOptions.mVersion); diff --git a/tests/src/com/android/inputmethod/latin/utils/CollectionUtilsTests.java b/tests/src/com/android/inputmethod/latin/utils/CollectionUtilsTests.java index a5979c3df..dc4e2e4bb 100644 --- a/tests/src/com/android/inputmethod/latin/utils/CollectionUtilsTests.java +++ b/tests/src/com/android/inputmethod/latin/utils/CollectionUtilsTests.java @@ -29,14 +29,45 @@ import java.util.Collections; @SmallTest public class CollectionUtilsTests extends AndroidTestCase { /** + * Tests that {@link CollectionUtils#arrayAsList(Object[],int,int)} fails as expected + * with some invalid inputs. + */ + public void testArrayAsListFailure() { + final String[] array = { "0", "1" }; + // Negative start + try { + CollectionUtils.arrayAsList(array, -1, 1); + fail("Failed to catch start < 0"); + } catch (final IllegalArgumentException e) { + assertEquals("Invalid start: -1 end: 1 with array.length: 2", e.getMessage()); + } + // start > end + try { + CollectionUtils.arrayAsList(array, 1, -1); + fail("Failed to catch start > end"); + } catch (final IllegalArgumentException e) { + assertEquals("Invalid start: 1 end: -1 with array.length: 2", e.getMessage()); + } + // end > array.length + try { + CollectionUtils.arrayAsList(array, 1, 3); + fail("Failed to catch end > array.length"); + } catch (final IllegalArgumentException e) { + assertEquals("Invalid start: 1 end: 3 with array.length: 2", e.getMessage()); + } + } + + /** * Tests that {@link CollectionUtils#arrayAsList(Object[],int,int)} gives the expected * results for a few valid inputs. */ public void testArrayAsList() { - final String[] array = { "0", "1", "2", "3", "4" }; final ArrayList<String> empty = new ArrayList<>(); + assertEquals(empty, CollectionUtils.arrayAsList(new String[] { }, 0, 0)); + final String[] array = { "0", "1", "2", "3", "4" }; assertEquals(empty, CollectionUtils.arrayAsList(array, 0, 0)); assertEquals(empty, CollectionUtils.arrayAsList(array, 1, 1)); + assertEquals(empty, CollectionUtils.arrayAsList(array, array.length, array.length)); final ArrayList<String> expected123 = new ArrayList<>(Arrays.asList("1", "2", "3")); assertEquals(expected123, CollectionUtils.arrayAsList(array, 1, 4)); } diff --git a/tools/dicttool/Android.mk b/tools/dicttool/Android.mk index 81c0706c1..42659253a 100644 --- a/tools/dicttool/Android.mk +++ b/tools/dicttool/Android.mk @@ -42,15 +42,11 @@ LATINIME_TESTS_SRC_DIR := $(LATINIME_LOCAL_DIR)/tests/src/com/android/inputmetho # a significant part of the dependencies are mocked in the compat/ directory, with empty or # nearly-empty implementations, for parts that we don't use in Dicttool. LATINIME_SRC_FILES_FOR_DICTTOOL := \ - event/Combiner.java \ - event/Event.java \ latin/BinaryDictionary.java \ latin/DicTraverseSession.java \ latin/Dictionary.java \ - latin/LastComposedWord.java \ latin/NgramContext.java \ latin/SuggestedWords.java \ - latin/WordComposer.java \ latin/settings/NativeSuggestOptions.java \ latin/settings/SettingsValuesForSuggestion.java \ latin/utils/BinaryDictionaryUtils.java \ diff --git a/tools/dicttool/compat/com/android/inputmethod/event/CombinerChain.java b/tools/dicttool/compat/com/android/inputmethod/event/CombinerChain.java deleted file mode 100644 index c4457a1b7..000000000 --- a/tools/dicttool/compat/com/android/inputmethod/event/CombinerChain.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.inputmethod.event; - -import java.util.ArrayList; - -/** - * Compatibility class that stands in for the combiner chain in LatinIME. - * - * This is not used by dicttool, it's just needed by the dependency chain. - */ -// TODO: there should not be a dependency to this in dicttool, so there -// should be a sensible way to separate them cleanly. -public class CombinerChain { - private StringBuilder mComposingWord; - public CombinerChain(final String initialText, final Combiner... combinerList) { - mComposingWord = new StringBuilder(initialText); - } - - public Event processEvent(final ArrayList<Event> previousEvents, final Event newEvent) { - return newEvent; - } - - public void applyProcessedEvent(final Event event) { - mComposingWord.append(event.getTextToCommit()); - } - - public CharSequence getComposingWordWithCombiningFeedback() { - return mComposingWord; - } - - public void reset() { - mComposingWord.setLength(0); - } - - public static Combiner[] createCombiners(final String spec) { - // Dicttool never uses a combiner at all, so we just return a zero-sized array. - return new Combiner[0]; - } -} diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java index 48d2e5922..955c5728c 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java @@ -98,6 +98,7 @@ public class CombinedInputOutput { String word = null; ProbabilityInfo probabilityInfo = new ProbabilityInfo(0); boolean isNotAWord = false; + boolean isPossiblyOffensive = false; ArrayList<WeightedString> bigrams = new ArrayList<>(); ArrayList<WeightedString> shortcuts = new ArrayList<>(); while (null != (line = reader.readLine())) { @@ -106,7 +107,7 @@ public class CombinedInputOutput { if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) { if (null != word) { dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, - isNotAWord, false /* isPossiblyOffensive */); + isNotAWord, isPossiblyOffensive); for (WeightedString s : bigrams) { dict.setBigram(word, s.mWord, s.mProbabilityInfo); } @@ -114,27 +115,37 @@ public class CombinedInputOutput { if (!shortcuts.isEmpty()) shortcuts = new ArrayList<>(); if (!bigrams.isEmpty()) bigrams = new ArrayList<>(); isNotAWord = false; + isPossiblyOffensive = false; for (String param : args) { final String params[] = param.split("=", 2); if (2 != params.length) throw new RuntimeException("Wrong format : " + line); - if (CombinedFormatUtils.WORD_TAG.equals(params[0])) { - word = params[1]; - } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) { - probabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]), - probabilityInfo.mTimestamp, probabilityInfo.mLevel, - probabilityInfo.mCount); - } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) { - final String[] historicalInfoParams = - params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR); - if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) { - throw new RuntimeException("Wrong format (historical info) : " + line); - } - probabilityInfo = new ProbabilityInfo(probabilityInfo.mProbability, - Integer.parseInt(historicalInfoParams[0]), - Integer.parseInt(historicalInfoParams[1]), - Integer.parseInt(historicalInfoParams[2])); - } else if (CombinedFormatUtils.NOT_A_WORD_TAG.equals(params[0])) { - isNotAWord = "true".equals(params[1]); + switch (params[0]) { + case CombinedFormatUtils.WORD_TAG: + word = params[1]; + break; + case CombinedFormatUtils.PROBABILITY_TAG: + probabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]), + probabilityInfo.mTimestamp, probabilityInfo.mLevel, + probabilityInfo.mCount); + break; + case CombinedFormatUtils.HISTORICAL_INFO_TAG: + final String[] historicalInfoParams = params[1].split( + CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR); + if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) { + throw new RuntimeException("Wrong format (historical info) : " + + line); + } + probabilityInfo = new ProbabilityInfo(probabilityInfo.mProbability, + Integer.parseInt(historicalInfoParams[0]), + Integer.parseInt(historicalInfoParams[1]), + Integer.parseInt(historicalInfoParams[2])); + break; + case CombinedFormatUtils.NOT_A_WORD_TAG: + isNotAWord = CombinedFormatUtils.isLiteralTrue(params[1]); + break; + case CombinedFormatUtils.POSSIBLY_OFFENSIVE_TAG: + isPossiblyOffensive = CombinedFormatUtils.isLiteralTrue(params[1]); + break; } } } else if (args[0].matches(CombinedFormatUtils.SHORTCUT_TAG + "=.*")) { @@ -190,7 +201,7 @@ public class CombinedInputOutput { } if (null != word) { dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord, - false /* isPossiblyOffensive */); + isPossiblyOffensive); for (WeightedString s : bigrams) { dict.setBigram(word, s.mWord, s.mProbabilityInfo); } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java index 8f9e4a3a6..6187853c8 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java @@ -120,7 +120,7 @@ public class DictionaryMaker { String inputCombined = null; String outputBinary = null; String outputCombined = null; - int outputBinaryFormatVersion = FormatSpec.VERSION201; // the default version is 201. + int outputBinaryFormatVersion = FormatSpec.VERSION202; // the default version is 202. // Don't use code point table by default. int codePointTableMode = Ver2DictEncoder.CODE_POINT_TABLE_OFF; |