diff options
-rw-r--r-- | java/res/values/donottranslate.xml | 4 | ||||
-rw-r--r-- | java/res/values/strings.xml | 6 | ||||
-rw-r--r-- | java/src/com/android/inputmethod/latin/AdditionalSubtype.java | 18 | ||||
-rw-r--r-- | java/src/com/android/inputmethod/latin/Constants.java | 19 | ||||
-rw-r--r-- | java/src/com/android/inputmethod/latin/SubtypeLocale.java | 57 | ||||
-rw-r--r-- | native/jni/src/bigram_dictionary.cpp | 16 | ||||
-rw-r--r-- | native/jni/src/bigram_dictionary.h | 6 | ||||
-rw-r--r-- | native/jni/src/binary_format.h | 11 | ||||
-rw-r--r-- | native/jni/src/defines.h | 18 | ||||
-rw-r--r-- | native/jni/src/dictionary.h | 13 | ||||
-rw-r--r-- | native/jni/src/unigram_dictionary.cpp | 53 | ||||
-rw-r--r-- | native/jni/src/unigram_dictionary.h | 30 | ||||
-rw-r--r-- | tests/src/com/android/inputmethod/latin/SubtypeLocaleTests.java | 10 |
13 files changed, 176 insertions, 85 deletions
diff --git a/java/res/values/donottranslate.xml b/java/res/values/donottranslate.xml index 4fe447e20..73aa54f7e 100644 --- a/java/res/values/donottranslate.xml +++ b/java/res/values/donottranslate.xml @@ -134,7 +134,9 @@ <item>5</item> </string-array> - <!-- Subtype locale display name exceptions --> + <!-- Subtype locale display name exceptions. + For each exception, there should be a related string resource for the display name that has + an explicit keyboard layout. The string resource name must be "subtype_with_layout_<locale>". --> <string-array name="subtype_locale_exception_keys"> <item>en_US</item> <item>en_GB</item> diff --git a/java/res/values/strings.xml b/java/res/values/strings.xml index 69fdea79f..cb458a4ed 100644 --- a/java/res/values/strings.xml +++ b/java/res/values/strings.xml @@ -252,6 +252,12 @@ <string name="subtype_en_GB">English (UK)</string> <!-- Description for English (United States) keyboard subtype [CHAR LIMIT=25] --> <string name="subtype_en_US">English (US)</string> + <!-- Description for English (United Kingdom) keyboard subtype with explicit keyboard layout [CHAR LIMIT=25] + This should be identical to subtype_en_GB aside from the trailing (%s). --> + <string name="subtype_with_layout_en_GB">English (UK) (<xliff:g id="layout">%s</xliff:g>)</string> + <!-- Description for English (United States) keyboard subtype with explicit keyboard layout [CHAR LIMIT=25] + This should be identical to subtype_en_US aside from the trailing (%s).
--> + <string name="subtype_with_layout_en_US">English (US) (<xliff:g id="layout">%s</xliff:g>)</string> <!-- Description for language agnostic keyboard subtype [CHAR LIMIT=25] --> <string name="subtype_no_language">No language</string> <!-- Description for language agnostic QWERTY keyboard subtype [CHAR LIMIT=25] --> diff --git a/java/src/com/android/inputmethod/latin/AdditionalSubtype.java b/java/src/com/android/inputmethod/latin/AdditionalSubtype.java index 06d33154f..f0076a5b6 100644 --- a/java/src/com/android/inputmethod/latin/AdditionalSubtype.java +++ b/java/src/com/android/inputmethod/latin/AdditionalSubtype.java @@ -19,7 +19,9 @@ package com.android.inputmethod.latin; import static com.android.inputmethod.latin.Constants.Subtype.KEYBOARD_MODE; import static com.android.inputmethod.latin.Constants.Subtype.ExtraValue.IS_ADDITIONAL_SUBTYPE; import static com.android.inputmethod.latin.Constants.Subtype.ExtraValue.KEYBOARD_LAYOUT_SET; +import static com.android.inputmethod.latin.Constants.Subtype.ExtraValue.UNTRANSLATABLE_STRING_IN_SUBTYPE_NAME; +import android.os.Build; import android.view.inputmethod.InputMethodSubtype; import java.util.ArrayList; @@ -40,12 +42,22 @@ public class AdditionalSubtype { public static InputMethodSubtype createAdditionalSubtype( String localeString, String keyboardLayoutSetName, String extraValue) { final String layoutExtraValue = KEYBOARD_LAYOUT_SET + "=" + keyboardLayoutSetName; - final String filteredExtraValue = StringUtils.appendToCsvIfNotExists( - IS_ADDITIONAL_SUBTYPE, extraValue); + final String layoutDisplayNameExtraValue; + if (Build.VERSION.SDK_INT >= /* JELLY_BEAN */ 15 + && SubtypeLocale.isExceptionalLocale(localeString)) { + final String layoutDisplayName = SubtypeLocale.getKeyboardLayoutSetDisplayName( + keyboardLayoutSetName); + layoutDisplayNameExtraValue = StringUtils.appendToCsvIfNotExists( + UNTRANSLATABLE_STRING_IN_SUBTYPE_NAME + "=" + layoutDisplayName, extraValue); + } else { + layoutDisplayNameExtraValue = 
extraValue; + } + final String additionalSubtypeExtraValue = StringUtils.appendToCsvIfNotExists( + IS_ADDITIONAL_SUBTYPE, layoutDisplayNameExtraValue); final int nameId = SubtypeLocale.getSubtypeNameId(localeString, keyboardLayoutSetName); return new InputMethodSubtype(nameId, R.drawable.ic_subtype_keyboard, localeString, KEYBOARD_MODE, - layoutExtraValue + "," + filteredExtraValue, false, false); + layoutExtraValue + "," + additionalSubtypeExtraValue, false, false); } public static String getPrefSubtype(InputMethodSubtype subtype) { diff --git a/java/src/com/android/inputmethod/latin/Constants.java b/java/src/com/android/inputmethod/latin/Constants.java index b205cc004..7c2284569 100644 --- a/java/src/com/android/inputmethod/latin/Constants.java +++ b/java/src/com/android/inputmethod/latin/Constants.java @@ -65,11 +65,6 @@ public final class Constants { public static final class ExtraValue { /** - * The subtype extra value used to indicate that the subtype keyboard layout set name. - */ - public static final String KEYBOARD_LAYOUT_SET = "KeyboardLayoutSet"; - - /** * The subtype extra value used to indicate that the subtype keyboard layout is capable * for typing ASCII characters. */ @@ -82,6 +77,20 @@ public final class Constants { public static final String REQ_NETWORK_CONNECTIVITY = "requireNetworkConnectivity"; /** + * The subtype extra value used to indicate that the subtype display name contains "%s" + * as a replacement mark, which should be replaced by this extra value. + * This extra value is supported on JellyBean and later. + */ + public static final String UNTRANSLATABLE_STRING_IN_SUBTYPE_NAME = + "UntranslatableReplacementStringInSubtypeName"; + + /** + * The subtype extra value used to indicate the subtype keyboard layout set name. + * This extra value is private to LatinIME.
+ */ + public static final String KEYBOARD_LAYOUT_SET = "KeyboardLayoutSet"; + + /** * The subtype extra value used to indicate that the subtype is additional subtype * that the user defined. This extra value is private to LatinIME. */ diff --git a/java/src/com/android/inputmethod/latin/SubtypeLocale.java b/java/src/com/android/inputmethod/latin/SubtypeLocale.java index 7694b56fc..ca293060a 100644 --- a/java/src/com/android/inputmethod/latin/SubtypeLocale.java +++ b/java/src/com/android/inputmethod/latin/SubtypeLocale.java @@ -17,9 +17,12 @@ package com.android.inputmethod.latin; import static com.android.inputmethod.latin.Constants.Subtype.ExtraValue.KEYBOARD_LAYOUT_SET; +import static com.android.inputmethod.latin.Constants.Subtype.ExtraValue.UNTRANSLATABLE_STRING_IN_SUBTYPE_NAME; import android.content.Context; import android.content.res.Resources; +import android.os.Build; +import android.util.Log; import android.view.inputmethod.InputMethodSubtype; import com.android.inputmethod.latin.LocaleUtils.RunInLocale; @@ -28,7 +31,7 @@ import java.util.HashMap; import java.util.Locale; public class SubtypeLocale { - private static final String TAG = SubtypeLocale.class.getSimpleName(); + static final String TAG = SubtypeLocale.class.getSimpleName(); // This class must be located in the same package as LatinIME.java. private static final String RESOURCE_PACKAGE_NAME = DictionaryFactory.class.getPackage().getName(); @@ -38,16 +41,20 @@ public class SubtypeLocale { public static final String QWERTY = "qwerty"; public static final int UNKNOWN_KEYBOARD_LAYOUT = R.string.subtype_generic; - private static Context sContext; private static String[] sPredefinedKeyboardLayoutSet; // Keyboard layout to its display name map. - private static final HashMap<String, String> sKeyboardKayoutToDisplayNameMap = + private static final HashMap<String, String> sKeyboardLayoutToDisplayNameMap = new HashMap<String, String>(); // Keyboard layout to subtype name resource id map. 
private static final HashMap<String, Integer> sKeyboardLayoutToNameIdsMap = new HashMap<String, Integer>(); + // Exceptional locale to subtype name resource id map. + private static final HashMap<String, Integer> sExceptionalLocaleToWithLayoutNameIdsMap = + new HashMap<String, Integer>(); private static final String SUBTYPE_NAME_RESOURCE_GENERIC_PREFIX = "string/subtype_generic_"; + private static final String SUBTYPE_NAME_RESOURCE_WITH_LAYOUT_PREFIX = + "string/subtype_with_layout_"; private static final String SUBTYPE_NAME_RESOURCE_NO_LANGUAGE_PREFIX = "string/subtype_no_language_"; // Exceptional locales to display name map. @@ -59,7 +66,6 @@ public class SubtypeLocale { } public static void init(Context context) { - sContext = context; final Resources res = context.getResources(); final String[] predefinedLayoutSet = res.getStringArray(R.array.predefined_layouts); @@ -68,7 +74,7 @@ public class SubtypeLocale { R.array.predefined_layout_display_names); for (int i = 0; i < predefinedLayoutSet.length; i++) { final String layoutName = predefinedLayoutSet[i]; - sKeyboardKayoutToDisplayNameMap.put(layoutName, layoutDisplayNames[i]); + sKeyboardLayoutToDisplayNameMap.put(layoutName, layoutDisplayNames[i]); final String resourceName = SUBTYPE_NAME_RESOURCE_GENERIC_PREFIX + layoutName; final int resId = res.getIdentifier(resourceName, null, RESOURCE_PACKAGE_NAME); sKeyboardLayoutToNameIdsMap.put(layoutName, resId); @@ -85,7 +91,11 @@ public class SubtypeLocale { final String[] exceptionalDisplayNames = res.getStringArray( R.array.subtype_locale_exception_values); for (int i = 0; i < exceptionalLocales.length; i++) { - sExceptionalDisplayNamesMap.put(exceptionalLocales[i], exceptionalDisplayNames[i]); + final String localeString = exceptionalLocales[i]; + sExceptionalDisplayNamesMap.put(localeString, exceptionalDisplayNames[i]); + final String resourceName = SUBTYPE_NAME_RESOURCE_WITH_LAYOUT_PREFIX + localeString; + final int resId = res.getIdentifier(resourceName, null, 
RESOURCE_PACKAGE_NAME); + sExceptionalLocaleToWithLayoutNameIdsMap.put(localeString, resId); } } @@ -93,11 +103,18 @@ public class SubtypeLocale { return sPredefinedKeyboardLayoutSet; } + public static boolean isExceptionalLocale(String localeString) { + return sExceptionalLocaleToWithLayoutNameIdsMap.containsKey(localeString); + } + private static final String getNoLanguageLayoutKey(String keyboardLayoutName) { return NO_LANGUAGE + "_" + keyboardLayoutName; } public static int getSubtypeNameId(String localeString, String keyboardLayoutName) { + if (Build.VERSION.SDK_INT >= /* JELLY_BEAN */ 15 && isExceptionalLocale(localeString)) { + return sExceptionalLocaleToWithLayoutNameIdsMap.get(localeString); + } final String key = localeString.equals(NO_LANGUAGE) ? getNoLanguageLayoutKey(keyboardLayoutName) : keyboardLayoutName; @@ -129,16 +146,26 @@ public class SubtypeLocale { // en_US azerty T English (US) (AZERTY) // zz azerty T No language (AZERTY) in system locale - public static String getSubtypeDisplayName(InputMethodSubtype subtype, Resources res) { - // TODO: Remove this check when InputMethodManager.getLastInputMethodSubtype is - // fixed. - if (!ImfUtils.checkIfSubtypeBelongsToThisIme(sContext, subtype)) return ""; - final String language = getSubtypeLocaleDisplayName(subtype.getLocale()); + public static String getSubtypeDisplayName(final InputMethodSubtype subtype, Resources res) { + final String replacementString = (Build.VERSION.SDK_INT >= /* JELLY_BEAN */ 15 + && subtype.containsExtraValueKey(UNTRANSLATABLE_STRING_IN_SUBTYPE_NAME)) + ? 
subtype.getExtraValueOf(UNTRANSLATABLE_STRING_IN_SUBTYPE_NAME) + : getSubtypeLocaleDisplayName(subtype.getLocale()); final int nameResId = subtype.getNameResId(); final RunInLocale<String> getSubtypeName = new RunInLocale<String>() { @Override protected String job(Resources res) { - return res.getString(nameResId, language); + try { + return res.getString(nameResId, replacementString); + } catch (Resources.NotFoundException e) { + // TODO: Remove this catch when InputMethodManager.getCurrentInputMethodSubtype + // is fixed. + Log.w(TAG, "Unknown subtype: mode=" + subtype.getMode() + + " locale=" + subtype.getLocale() + + " extra=" + subtype.getExtraValue() + + "\n" + Utils.getStackTrace()); + return ""; + } } }; final Locale locale = isNoLanguage(subtype) @@ -158,7 +185,11 @@ public class SubtypeLocale { public static String getKeyboardLayoutSetDisplayName(InputMethodSubtype subtype) { final String layoutName = getKeyboardLayoutSetName(subtype); - return sKeyboardKayoutToDisplayNameMap.get(layoutName); + return getKeyboardLayoutSetDisplayName(layoutName); + } + + public static String getKeyboardLayoutSetDisplayName(String layoutName) { + return sKeyboardLayoutToDisplayNameMap.get(layoutName); } public static String getKeyboardLayoutSetName(InputMethodSubtype subtype) { diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index 67f96281d..220b340d1 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -153,8 +153,19 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord, return pos; } -void BigramDictionary::fillBigramAddressToFrequencyMap(const int32_t *prevWord, - const int prevWordLength, std::map<int, int> *map) { +static inline void setInFilter(uint8_t *filter, const int position) { + const unsigned int bucket = position % BIGRAM_FILTER_MODULO; + filter[bucket >> 3] |= (1 << (bucket & 0x7)); +} + +static inline bool isInFilter(uint8_t *filter, const int 
position) { + const unsigned int bucket = position % BIGRAM_FILTER_MODULO; + return filter[bucket >> 3] & (1 << (bucket & 0x7)); +} + +void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, + const int prevWordLength, std::map<int, int> *map, uint8_t *filter) { + memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); const uint8_t* const root = DICT; int pos = getBigramListPositionForWord(prevWord, prevWordLength); if (0 == pos) return; @@ -166,6 +177,7 @@ void BigramDictionary::fillBigramAddressToFrequencyMap(const int32_t *prevWord, const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); (*map)[bigramPos] = frequency; + setInFilter(filter, bigramPos); } while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); } diff --git a/native/jni/src/bigram_dictionary.h b/native/jni/src/bigram_dictionary.h index b1233215b..7328d5828 100644 --- a/native/jni/src/bigram_dictionary.h +++ b/native/jni/src/bigram_dictionary.h @@ -20,6 +20,8 @@ #include <map> #include <stdint.h> +#include "defines.h" + namespace latinime { class Dictionary; @@ -29,8 +31,8 @@ class BigramDictionary { int getBigrams(const int32_t *word, int length, int *codes, int codesSize, unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams); int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength); - void fillBigramAddressToFrequencyMap(const int32_t *prevWord, const int prevWordLength, - std::map<int, int> *map); + void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength, + std::map<int, int> *map, uint8_t *filter); ~BigramDictionary(); private: bool addWordBigram(unsigned short *word, int length, int frequency); diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index d5d67c108..71ade48a3 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -66,7 +66,8 @@ class BinaryFormat { const int 
length); static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, uint16_t* outWord); - static int getProbability(const int bigramListPosition, const int unigramFreq); + static int getProbability(const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, + const int unigramFreq); // Flags for special processing // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or @@ -519,9 +520,11 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a } // This should probably return a probability in log space. -inline int BinaryFormat::getProbability(const int bigramListPosition, const int unigramFreq) { - // TODO: use the bigram list position to get the bigram probability. If the bigram - // is not found, use the unigram frequency. +inline int BinaryFormat::getProbability(const std::map<int, int> *bigramMap, + const uint8_t *bigramFilter, const int unigramFreq) { + // TODO: use the bigram filter for fast rejection, then the bigram map for lookup + // to get the bigram probability. If the bigram is not found, use the unigram frequency. + // Don't forget that they can be null. // TODO: if the unigram frequency is used, compute the actual probability return unigramFreq; } diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index c99f8a8b2..cb3dbb115 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -241,6 +241,24 @@ static inline void prof_out(void) { #define MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION 3 #define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3 +// Size, in bytes, of the bloom filter index for bigrams +// 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k, +// where k is the number of hash functions, n the number of bigrams, and m the number of +// bits we can test. 
+// At the moment 100 is the maximum number of bigrams for a word with the current +// dictionaries, so n = 100. 1024 buckets give us m = 1024. +// With 1 hash function, our false positive rate is about 9.3%, which should be enough for +// our uses since we are only using this to increase average performance. For the record, +// k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%, +// and m = 4096 gives 2.4%. +#define BIGRAM_FILTER_BYTE_SIZE 128 +// Must be smaller than BIGRAM_FILTER_BYTE_SIZE * 8, and preferably prime. 1021 is the largest +// prime under 128 * 8. +#define BIGRAM_FILTER_MODULO 1021 +#if BIGRAM_FILTER_BYTE_SIZE * 8 < BIGRAM_FILTER_MODULO +#error "BIGRAM_FILTER_MODULO is larger than BIGRAM_FILTER_BYTE_SIZE" +#endif + template<typename T> inline T min(T a, T b) { return a < b ? a : b; } template<typename T> inline T max(T a, T b) { return a > b ? a : b; } diff --git a/native/jni/src/dictionary.h b/native/jni/src/dictionary.h index a2b0491c5..bce86d1ad 100644 --- a/native/jni/src/dictionary.h +++ b/native/jni/src/dictionary.h @@ -37,16 +37,13 @@ class Dictionary { int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates, int *codes, int codesSize, const int32_t* prevWordChars, const int prevWordLength, bool useFullEditDistance, unsigned short *outWords, int *frequencies) { - // bigramListPosition is, as an int, the offset of the bigram list in the file. - // If none, it's zero. - const int bigramListPosition = !prevWordChars ? 
0 - : mBigramDictionary->getBigramListPositionForWord(prevWordChars, prevWordLength); std::map<int, int> bigramMap; - mBigramDictionary->fillBigramAddressToFrequencyMap(prevWordChars, prevWordLength, - &bigramMap); + uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE]; + mBigramDictionary->fillBigramAddressToFrequencyMapAndFilter(prevWordChars, + prevWordLength, &bigramMap, bigramFilter); return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool, - mCorrection, xcoordinates, ycoordinates, codes, codesSize, bigramListPosition, - useFullEditDistance, outWords, frequencies); + mCorrection, xcoordinates, ycoordinates, codes, codesSize, &bigramMap, + bigramFilter, useFullEditDistance, outWords, frequencies); } int getBigrams(const int32_t *word, int length, int *codes, int codesSize, diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index 0c759d438..2e5468dd7 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -98,7 +98,7 @@ int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, cons void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codesBuffer, int *xCoordinatesBuffer, int *yCoordinatesBuffer, - const int codesBufferSize, const int bigramListPosition, + const int codesBufferSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const bool useFullEditDistance, const int *codesSrc, const int codesRemain, const int currentDepth, int *codesDest, Correction *correction, WordsPriorityQueuePool *queuePool, @@ -128,7 +128,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit replacementCodePoint; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, - bigramListPosition, useFullEditDistance, codesSrc + i + 1, + 
bigramMap, bigramFilter, useFullEditDistance, codesSrc + i + 1, codesRemain - i - 1, currentDepth + 1, codesDest + i, correction, queuePool, digraphs, digraphsSize); @@ -138,7 +138,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit memcpy(codesDest + i, codesSrc + i, BYTES_IN_ONE_CHAR); getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, - bigramListPosition, useFullEditDistance, codesSrc + i, codesRemain - i, + bigramMap, bigramFilter, useFullEditDistance, codesSrc + i, codesRemain - i, currentDepth + 1, codesDest + i, correction, queuePool, digraphs, digraphsSize); return; @@ -161,16 +161,18 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit } getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer, - startIndex + codesRemain, bigramListPosition, useFullEditDistance, correction, + startIndex + codesRemain, bigramMap, bigramFilter, useFullEditDistance, correction, queuePool); } -// bigramListPosition is the offset in the file to the list of bigrams for the previous word. 
+// bigramMap contains the association <bigram address> -> <bigram frequency> +// bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter +// in bigram_dictionary.cpp int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, - const int bigramListPosition, const bool useFullEditDistance, unsigned short *outWords, - int *frequencies) { + const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, unsigned short *outWords, int *frequencies) { queuePool->clearAll(); Correction* masterCorrection = correction; @@ -180,7 +182,7 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int xCoordinatesBuffer[codesSize]; int yCoordinatesBuffer[codesSize]; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, - xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramListPosition, + xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter, useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool, GERMAN_UMLAUT_DIGRAPHS, sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0])); @@ -189,13 +191,13 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int xCoordinatesBuffer[codesSize]; int yCoordinatesBuffer[codesSize]; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, - xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramListPosition, + xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter, useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool, FRENCH_LIGATURES_DIGRAPHS, sizeof(FRENCH_LIGATURES_DIGRAPHS) / sizeof(FRENCH_LIGATURES_DIGRAPHS[0])); } else { // Normal processing getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, 
codes, codesSize, - bigramListPosition, useFullEditDistance, masterCorrection, queuePool); + bigramMap, bigramFilter, useFullEditDistance, masterCorrection, queuePool); } PROF_START(20); @@ -228,15 +230,15 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const int inputLength, const int bigramListPosition, const bool useFullEditDistance, - Correction *correction, WordsPriorityQueuePool *queuePool) { + const int inputLength, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool) { PROF_OPEN; PROF_START(0); PROF_END(0); PROF_START(1); - getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramListPosition, + getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramMap, bigramFilter, useFullEditDistance, inputLength, correction, queuePool); PROF_END(1); @@ -308,15 +310,16 @@ static const char SPACE = ' '; void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const int bigramListPosition, const bool useFullEditDistance, const int inputLength, + const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool) { initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); - getSuggestionCandidates(useFullEditDistance, inputLength, bigramListPosition, correction, + getSuggestionCandidates(useFullEditDistance, inputLength, bigramMap, bigramFilter, correction, queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); } void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, - const int 
inputLength, const int bigramListPosition, + const int inputLength, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction, WordsPriorityQueuePool *queuePool, const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) { // TODO: Remove setCorrectionParams @@ -337,7 +340,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, int firstChildPos; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, - bigramListPosition, correction, &childCount, &firstChildPos, &siblingPos, + bigramMap, bigramFilter, correction, &childCount, &firstChildPos, &siblingPos, queuePool, currentWordIndex); // Update next sibling pos correction->setTreeSiblingPos(outputIndex, siblingPos); @@ -432,8 +435,8 @@ bool UnigramDictionary::getSubStringSuggestion( queuePool->clearSubQueue(currentWordIndex); // TODO: pass the bigram list for substring suggestion getSuggestionCandidates(useFullEditDistance, inputWordLength, - 0 /* bigramListPosition */, correction, queuePool, false /* doAutoCompletion */, - MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); + 0 /* bigramMap */, 0 /* bigramFilter */, correction, queuePool, + false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); if (DEBUG_DICT) { if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) { AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength); @@ -763,9 +766,9 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any // given level, as output into newCount when traversing this level's parent. 
inline bool UnigramDictionary::processCurrentNode(const int initialPos, - const int bigramListPosition, Correction *correction, int *newCount, - int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, - const int currentWordIndex) { + const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction, + int *newCount, int *newChildrenPosition, int *nextSiblingPosition, + WordsPriorityQueuePool *queuePool, const int currentWordIndex) { if (DEBUG_DICT) { correction->checkState(); } @@ -846,9 +849,9 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); - // The bigramListPosition is the offset in the file of the bigrams for the previous word, - // or zero if we don't know of any bigrams for it. - const int probability = BinaryFormat::getProbability(bigramListPosition, unigramFreq); + // bigramMap contains the bigram frequencies indexed by addresses for fast lookup. + // bigramFilter is a bloom filter of said addresses for even faster rejection.
+ const int probability = BinaryFormat::getProbability(bigramMap, bigramFilter, unigramFreq); onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h index 0cc59bac8..b9233518f 100644 --- a/native/jni/src/unigram_dictionary.h +++ b/native/jni/src/unigram_dictionary.h @@ -17,6 +17,7 @@ #ifndef LATINIME_UNIGRAM_DICTIONARY_H #define LATINIME_UNIGRAM_DICTIONARY_H +#include <map> #include <stdint.h> #include "correction.h" #include "correction_state.h" @@ -75,32 +76,36 @@ class UnigramDictionary { int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates, const int *ycoordinates, - const int *codes, const int codesSize, const int bigramListPosition, - const bool useFullEditDistance, unsigned short *outWords, int *frequencies); + const int *codes, const int codesSize, const std::map<int, int> *bigramMap, + const uint8_t *bigramFilter, const bool useFullEditDistance, unsigned short *outWords, + int *frequencies); virtual ~UnigramDictionary(); private: void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int inputLength, - const int bigramListPosition, const bool useFullEditDistance, Correction *correction, + const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool); int getDigraphReplacement(const int *codes, const int i, const int codesSize, const digraph_t* const digraphs, const unsigned int digraphsSize) const; void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, int *xCoordinatesBuffer, int *yCoordinatesBuffer, 
const int codesBufferSize, - const int bigramListPosition, const bool useFullEditDistance, const int* codesSrc, - const int codesRemain, const int currentDepth, int* codesDest, Correction *correction, + const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, const int* codesSrc, const int codesRemain, + const int currentDepth, int* codesDest, Correction *correction, WordsPriorityQueuePool* queuePool, const digraph_t* const digraphs, const unsigned int digraphsSize); void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, Correction *correction); void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const int bigramListPosition, - const bool useFullEditDistance, const int inputLength, Correction *correction, - WordsPriorityQueuePool* queuePool); + const int *ycoordinates, const int *codes, const std::map<int, int> *bigramMap, + const uint8_t *bigramFilter, const bool useFullEditDistance, const int inputLength, + Correction *correction, WordsPriorityQueuePool* queuePool); void getSuggestionCandidates( - const bool useFullEditDistance, const int inputLength, const int bigramListPosition, + const bool useFullEditDistance, const int inputLength, + const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction, WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors, const int currentWordIndex); void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, @@ -114,9 +119,10 @@ class UnigramDictionary { bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); // Process a node by considering proximity, missing and excessive character - bool processCurrentNode(const int initialPos, const int bigramListPosition, - Correction *correction, int *newCount, int 
*newChildPosition, int *nextSiblingPosition, - WordsPriorityQueuePool *queuePool, const int currentWordIndex); + bool processCurrentNode(const int initialPos, const std::map<int, int> *bigramMap, + const uint8_t *bigramFilter, Correction *correction, int *newCount, + int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, + const int currentWordIndex); int getMostFrequentWordLike(const int startInputIndex, const int inputLength, ProximityInfo *proximityInfo, unsigned short *word); int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, diff --git a/tests/src/com/android/inputmethod/latin/SubtypeLocaleTests.java b/tests/src/com/android/inputmethod/latin/SubtypeLocaleTests.java index 8863bcf47..c70c2fde5 100644 --- a/tests/src/com/android/inputmethod/latin/SubtypeLocaleTests.java +++ b/tests/src/com/android/inputmethod/latin/SubtypeLocaleTests.java @@ -30,14 +30,12 @@ public class SubtypeLocaleTests extends AndroidTestCase { // Locale to subtypes list. 
private final ArrayList<InputMethodSubtype> mSubtypesList = new ArrayList<InputMethodSubtype>(); - private Context mContext; private Resources mRes; @Override protected void setUp() throws Exception { super.setUp(); final Context context = getContext(); - mContext = context; mRes = context.getResources(); SubtypeLocale.init(context); } @@ -124,10 +122,6 @@ public class SubtypeLocaleTests extends AndroidTestCase { final InputMethodSubtype ZZ_AZERTY = AdditionalSubtype.createAdditionalSubtype( SubtypeLocale.NO_LANGUAGE, "azerty", null); - ImfUtils.setAdditionalInputMethodSubtypes(mContext, new InputMethodSubtype[] { - DE_QWERTY, FR_QWERTZ, US_AZERTY, ZZ_AZERTY - }); - final RunInLocale<Void> tests = new RunInLocale<Void>() { @Override protected Void job(Resources res) { @@ -191,10 +185,6 @@ public class SubtypeLocaleTests extends AndroidTestCase { final InputMethodSubtype ZZ_AZERTY = AdditionalSubtype.createAdditionalSubtype( SubtypeLocale.NO_LANGUAGE, "azerty", null); - ImfUtils.setAdditionalInputMethodSubtypes(mContext, new InputMethodSubtype[] { - DE_QWERTY, FR_QWERTZ, US_AZERTY, ZZ_AZERTY - }); - final RunInLocale<Void> tests = new RunInLocale<Void>() { @Override protected Void job(Resources res) { |