diff options
Diffstat (limited to 'native')
-rw-r--r-- | native/Android.mk | 5 | ||||
-rw-r--r-- | native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp | 1 | ||||
-rw-r--r-- | native/src/bigram_dictionary.cpp | 29 | ||||
-rw-r--r-- | native/src/defines.h | 10 | ||||
-rw-r--r-- | native/src/dictionary.cpp | 39 | ||||
-rw-r--r-- | native/src/dictionary.h | 1 | ||||
-rw-r--r-- | native/src/proximity_info.h | 6 | ||||
-rw-r--r-- | native/src/unigram_dictionary.cpp | 300 | ||||
-rw-r--r-- | native/src/unigram_dictionary.h | 22 |
9 files changed, 291 insertions, 122 deletions
diff --git a/native/Android.mk b/native/Android.mk index c8342e31f..4727b1e39 100644 --- a/native/Android.mk +++ b/native/Android.mk @@ -3,6 +3,11 @@ include $(CLEAR_VARS) LOCAL_C_INCLUDES += $(LOCAL_PATH)/src +LOCAL_CFLAGS += -Werror -Wall + +# To suppress compiler warnings for unused variables/functions used for debug features etc. +LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function + LOCAL_SRC_FILES := \ jni/com_android_inputmethod_keyboard_ProximityInfo.cpp \ jni/com_android_inputmethod_latin_BinaryDictionary.cpp \ diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 555a522eb..4b61c1414 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -131,7 +131,6 @@ static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jobject object, Dictionary *dictionary = (Dictionary*)dict; if (!dictionary) return 0; ProximityInfo *pInfo = (ProximityInfo*)proximityInfo; - if (!pInfo) return 0; int *xCoordinates = env->GetIntArrayElements(xCoordinatesArray, NULL); int *yCoordinates = env->GetIntArrayElements(yCoordinatesArray, NULL); diff --git a/native/src/bigram_dictionary.cpp b/native/src/bigram_dictionary.cpp index 5ec310f07..4bc436537 100644 --- a/native/src/bigram_dictionary.cpp +++ b/native/src/bigram_dictionary.cpp @@ -30,8 +30,10 @@ BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion), HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) { - if (DEBUG_DICT) LOGI("BigramDictionary - constructor"); - if (DEBUG_DICT) LOGI("Has Bigram : %d", hasBigram); + if (DEBUG_DICT) { + LOGI("BigramDictionary - constructor"); + LOGI("Has Bigram : %d", hasBigram); + } } BigramDictionary::~BigramDictionary() { @@ -54,7 +56,9 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ } insertAt++; } - if (DEBUG_DICT) LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); + if (DEBUG_DICT) { + LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); + } if (insertAt < mMaxBigrams) { memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]), (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]), @@ -68,7 +72,9 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ *dest++ = *word++; } *dest = 0; // NULL terminate - if (DEBUG_DICT) LOGI("Bigram: Added word at %d", insertAt); + if (DEBUG_DICT) { + LOGI("Bigram: Added word at %d", insertAt); + } return true; } return false; @@ -105,9 +111,10 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i mMaxBigrams = maxBigrams; if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) { - int pos = mParentDictionary->isValidWordRec( - DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength); - if (DEBUG_DICT) LOGI("Pos -> %d", pos); + int pos = mParentDictionary->isValidWord(prevWord, prevWordLength); + if (DEBUG_DICT) { + LOGI("Pos -> %d", pos); + } if (pos < 0) { return 0; } @@ -151,7 +158,9 @@ void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequenc } pos = followDownBranchAddress; // pos start at count int count = DICT[pos] & 0xFF; - if (DEBUG_DICT) LOGI("count - %d",count); + if (DEBUG_DICT) { + LOGI("count - %d",count); + } pos++; for (int i = 0; i < count; i++) { // pos at data @@ -225,7 +234,9 @@ void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequenc } depth++; if (followDownBranchAddress == 0) { - if (DEBUG_DICT) LOGI("ERROR!!! Cannot find bigram!!"); + if (DEBUG_DICT) { + LOGI("ERROR!!! Cannot find bigram!!"); + } break; } } diff --git a/native/src/defines.h b/native/src/defines.h index 00cbb6c9e..0a3240507 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -77,8 +77,8 @@ static void prof_out(void) { } #else // FLAG_DBG -#define LOGE -#define LOGI +#define LOGE(fmt, ...) +#define LOGI(fmt, ...) #define DEBUG_DICT false #define DEBUG_DICT_FULL false #define DEBUG_SHOW_FOUND_WORD false @@ -138,12 +138,14 @@ static void prof_out(void) { #define SUGGEST_WORDS_WITH_SPACE_PROXIMITY true // The following "rate"s are used as a multiplier before dividing by 100, so they are in percent. -#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 70 -#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80 +#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80 +#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X 12 +#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 67 #define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75 #define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75 #define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120 +#define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java // This is only used for the size of array. Not to be used in c functions. diff --git a/native/src/dictionary.cpp b/native/src/dictionary.cpp index d69cb2a53..a49769bdb 100644 --- a/native/src/dictionary.cpp +++ b/native/src/dictionary.cpp @@ -53,45 +53,8 @@ bool Dictionary::hasBigram() { return ((mDict[1] & 0xFF) == 1); } -// TODO: use uint16_t instead of unsigned short bool Dictionary::isValidWord(unsigned short *word, int length) { - if (IS_LATEST_DICT_VERSION) { - return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD); - } else { - return (isValidWordRec(0, word, 0, length) != NOT_VALID_WORD); - } + return mUnigramDictionary->isValidWord(word, length); } -int Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) { - // returns address of bigram data of that word - // return -99 if not found - - int count = Dictionary::getCount(mDict, &pos); - unsigned short currentChar = (unsigned short) word[offset]; - for (int j = 0; j < count; j++) { - unsigned short c = Dictionary::getChar(mDict, &pos); - int terminal = Dictionary::getTerminal(mDict, &pos); - int childPos = Dictionary::getAddress(mDict, &pos); - if (c == currentChar) { - if (offset == length - 1) { - if (terminal) { - return (pos+1); - } - } else { - if (childPos != 0) { - int t = isValidWordRec(childPos, word, offset + 1, length); - if (t > 0) { - return t; - } - } - } - } - if (terminal) { - Dictionary::getFreq(mDict, IS_LATEST_DICT_VERSION, &pos); - } - // There could be two instances of each alphabet - upper and lower case. So continue - // looking ... - } - return NOT_VALID_WORD; -} } // namespace latinime diff --git a/native/src/dictionary.h b/native/src/dictionary.h index 13b2a2816..3b360d93a 100644 --- a/native/src/dictionary.h +++ b/native/src/dictionary.h @@ -43,7 +43,6 @@ public: } bool isValidWord(unsigned short *word, int length); - int isValidWordRec(int pos, unsigned short *word, int offset, int length); void *getDict() { return (void *)mDict; } int getDictSize() { return mDictSize; } int getMmapFd() { return mMmapFd; } diff --git a/native/src/proximity_info.h b/native/src/proximity_info.h index 0f1201866..c2062e8c5 100644 --- a/native/src/proximity_info.h +++ b/native/src/proximity_info.h @@ -32,13 +32,13 @@ public: bool hasSpaceProximity(const int x, const int y) const; private: int getStartIndexFromCoordinates(const int x, const int y) const; - const int CELL_WIDTH; - const int CELL_HEIGHT; + const int MAX_PROXIMITY_CHARS_SIZE; const int KEYBOARD_WIDTH; const int KEYBOARD_HEIGHT; const int GRID_WIDTH; const int GRID_HEIGHT; - const int MAX_PROXIMITY_CHARS_SIZE; + const int CELL_WIDTH; + const int CELL_HEIGHT; uint32_t *mProximityCharsArray; }; }; // namespace latinime diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 30fbaeae1..7bcdbb498 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -16,8 +16,6 @@ */ #include <assert.h> -#include <fcntl.h> -#include <stdio.h> #include <string.h> #define LOG_TAG "LatinIME: unigram_dictionary.cpp" @@ -34,16 +32,20 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = { 'o', 'e' }, { 'u', 'e' } }; -UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier, +// TODO: check the header +UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion) - : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), + : DICT_ROOT(streamStart), + MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion), TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0), BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)), MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) { - if (DEBUG_DICT) LOGI("UnigramDictionary - constructor"); + if (DEBUG_DICT) { + LOGI("UnigramDictionary - constructor"); + } } UnigramDictionary::~UnigramDictionary() {} @@ -183,7 +185,9 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, // Suggestion with missing character if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) { for (int i = 0; i < codesSize; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest missing characters %d", i); + } getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH); } } @@ -194,7 +198,9 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) { for (int i = 0; i < codesSize; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest excessive characters %d", i); + } getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH); } } @@ -205,7 +211,9 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, // Only suggest words that length is mInputLength if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) { for (int i = 0; i < codesSize; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest transposed characters %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest transposed characters %d", i); + } getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1); } } @@ -216,22 +224,27 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) { for (int i = 1; i < codesSize; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest missing space characters %d", i); + } getMissingSpaceWords(mInputLength, i); } } PROF_END(5); PROF_START(6); - if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY) { + if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) { // The first and last "mistyped spaces" are taken care of by excessive character handling for (int i = 1; i < codesSize - 1; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest words with proximity space %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest words with proximity space %d", i); + } const int x = xcoordinates[i]; const int y = ycoordinates[i]; - if (DEBUG_PROXIMITY_INFO) + if (DEBUG_PROXIMITY_INFO) { LOGI("Input[%d] x = %d, y = %d, has space proximity = %d", i, x, y, proximityInfo->hasSpaceProximity(x, y)); + } if (proximityInfo->hasSpaceProximity(x, y)) { getMistypedSpaceWords(mInputLength, i); } @@ -242,7 +255,9 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, void UnigramDictionary::initSuggestions(const int *codes, const int codesSize, unsigned short *outWords, int *frequencies) { - if (DEBUG_DICT) LOGI("initSuggest"); + if (DEBUG_DICT) { + LOGI("initSuggest"); + } mFrequencies = frequencies; mOutputChars = outWords; mInputCodes = codes; @@ -250,8 +265,7 @@ void UnigramDictionary::initSuggestions(const int *codes, const int codesSize, mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; } -void UnigramDictionary::registerNextLetter( - unsigned short c, int *nextLetters, int nextLettersSize) { +static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) { if (c < nextLettersSize) { nextLetters[c]++; } @@ -266,15 +280,17 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) LOGI("Found word = %s, freq = %d", s, frequency); } if (length > MAX_WORD_LENGTH) { - if (DEBUG_DICT) LOGI("Exceeded max word length."); + if (DEBUG_DICT) { + LOGI("Exceeded max word length."); + } return false; } // Find the right insertion point int insertAt = 0; while (insertAt < MAX_WORDS) { - if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency - && length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) { + // TODO: How should we sort words with the same frequency? + if (frequency > mFrequencies[insertAt]) { break; } insertAt++; @@ -283,7 +299,7 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) if (DEBUG_DICT) { char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = word[i]; - LOGI("Added word = %s, freq = %d", s, frequency); + LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX); } memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]), (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]), @@ -297,13 +313,15 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) *dest++ = *word++; } *dest = 0; // NULL terminate - if (DEBUG_DICT) LOGI("Added word at %d", insertAt); + if (DEBUG_DICT) { + LOGI("Added word at %d", insertAt); + } return true; } return false; } -unsigned short UnigramDictionary::toBaseLowerCase(unsigned short c) { +static inline unsigned short toBaseLowerCase(unsigned short c) { if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { c = BASE_CHARS[c]; } @@ -344,7 +362,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, } int rootPosition = ROOT_POS; // Get the number of child of root, then increment the position - int childCount = Dictionary::getCount(DICT, &rootPosition); + int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); int depth = 0; mStackChildCount[0] = childCount; @@ -353,6 +371,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, mStackInputIndex[0] = 0; mStackDiffs[0] = 0; mStackSiblingPos[0] = rootPosition; + mStackOutputIndex[0] = 0; // Depth first search while (depth >= 0) { @@ -363,14 +382,15 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, int inputIndex = mStackInputIndex[depth]; int diffs = mStackDiffs[depth]; int siblingPos = mStackSiblingPos[depth]; + int outputIndex = mStackOutputIndex[depth]; int firstChildPos; // depth will never be greater than maxDepth because in that case, // needsToTraverseChildrenNodes should be false - const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, + const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex, maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos, excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs, - &siblingPos); + &siblingPos, &outputIndex); // Update next sibling pos mStackSiblingPos[depth] = siblingPos; if (needsToTraverseChildrenNodes) { @@ -382,6 +402,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, mStackInputIndex[depth] = inputIndex; mStackDiffs[depth] = diffs; mStackSiblingPos[depth] = firstChildPos; + mStackOutputIndex[depth] = outputIndex; } } else { // Goes to parent sibling node @@ -390,17 +411,105 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, } } -inline static void multiplyRate(const int rate, int *freq) { - if (rate > 1000000) { - *freq = (*freq / 100) * rate; +static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; +static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { + return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); +} + +static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; +inline static void multiplyIntCapped(const int multiplier, int *base) { + const int temp = *base; + if (temp != S_INT_MAX) { + // Branch if multiplier == 2 for the optimization + if (multiplier == 2) { + *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; + } else { + const int tempRetval = temp * multiplier; + *base = tempRetval >= temp ? tempRetval : S_INT_MAX; + } + } +} + +inline static int powerIntCapped(const int base, const int n) { + if (base == 2) { + return n < 31 ? 1 << n : S_INT_MAX; } else { - *freq = *freq * rate / 100; + int ret = base; + for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); + return ret; + } +} + +inline static void multiplyRate(const int rate, int *freq) { + if (*freq != S_INT_MAX) { + if (*freq > 1000000) { + *freq /= 100; + multiplyIntCapped(rate, freq); + } else { + multiplyIntCapped(rate, freq); + *freq /= 100; + } + } +} + +inline static int calcFreqForSplitTwoWords( + const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength, + const int firstFreq, const int secondFreq, const bool isSpaceProximity) { + if (firstWordLength == 0 || secondWordLength == 0) { + return 0; + } + const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); + int tempFirstFreq = firstFreq; + multiplyRate(firstDemotionRate, &tempFirstFreq); + + const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); + int tempSecondFreq = secondFreq; + multiplyRate(secondDemotionRate, &tempSecondFreq); + + const int totalLength = firstWordLength + secondWordLength; + + // Promote pairFreq with multiplying by 2, because the word length is the same as the typed + // length. + int totalFreq = tempFirstFreq + tempSecondFreq; + + // This is a workaround to try offsetting the not-enough-demotion which will be done in + // calcNormalizedScore in Utils.java. + // In calcNormalizedScore the score will be demoted by (1 - 1 / length) + // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by + // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) + const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); + multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); + + // At this moment, totalFreq is calculated by the following formula: + // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) + // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) + + multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); + + // This is another workaround to offset the demotion which will be done in + // calcNormalizedScore in Utils.java. + // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote + // the same amount because we already have adjusted the synthetic freq of this "missing or + // mistyped space" suggestion candidate above in this method. + const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); + multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); + + if (isSpaceProximity) { + // A word pair with one space proximity correction + if (DEBUG_DICT) { + LOGI("Found a word pair with space proximity correction."); + } + multiplyIntCapped(typedLetterMultiplier, &totalFreq); + multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); } + + multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + return totalFreq; } bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos, - const int secondWordLength) { + const int secondWordLength, const bool isSpaceProximity) { if (inputLength >= MAX_WORD_LENGTH) return false; if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) @@ -409,7 +518,9 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, // Allocating variable length array on stack unsigned short word[newWordLength]; const int firstFreq = getBestWordFreq(firstWordStartPos, firstWordLength, mWord); - if (DEBUG_DICT) LOGI("First freq: %d", firstFreq); + if (DEBUG_DICT) { + LOGI("First freq: %d", firstFreq); + } if (firstFreq <= 0) return false; for (int i = 0; i < firstWordLength; ++i) { @@ -417,7 +528,9 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, } const int secondFreq = getBestWordFreq(secondWordStartPos, secondWordLength, mWord); - if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq); + if (DEBUG_DICT) { + LOGI("Second freq: %d", secondFreq); + } if (secondFreq <= 0) return false; word[firstWordLength] = SPACE; @@ -425,22 +538,25 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, word[i] = mWord[i - firstWordLength - 1]; } - int pairFreq = ((firstFreq + secondFreq) / 2); - for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER; - multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &pairFreq); + int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength, + secondWordLength, firstFreq, secondFreq, isSpaceProximity); + if (DEBUG_DICT) { + LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength, + TYPED_LETTER_MULTIPLIER); + } addWord(word, newWordLength, pairFreq); return true; } bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) { return getSplitTwoWordsSuggestion( - inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos); + inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false); } bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) { return getSplitTwoWordsSuggestion( inputLength, 0, spaceProximityPos, spaceProximityPos + 1, - inputLength - spaceProximityPos - 1); + inputLength - spaceProximityPos - 1, true); } // Keep this for comparing spec to new getWords @@ -448,7 +564,7 @@ void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int excessivePos, const int transposedPos,int *nextLetters, const int nextLettersSize) { int initialPosition = initialPos; - const int count = Dictionary::getCount(DICT, &initialPosition); + const int count = Dictionary::getCount(DICT_ROOT, &initialPosition); getWordsRec(count, initialPosition, 0, min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH), mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters, @@ -469,12 +585,13 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons int newInputIndex; int newDiffs; int newSiblingPos; + int newOutputIndex; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos, excessivePos, transposedPos, nextLetters, nextLettersSize, &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate, - &newInputIndex, &newDiffs, &newSiblingPos); + &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex); siblingPos = newSiblingPos; if (needsToTraverseChildrenNodes) { @@ -485,19 +602,21 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons } } -static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; -static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { - return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); -} inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth, const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos, const int freq, const bool sameLength) const { // TODO: Demote by edit distance int finalFreq = freq * matchWeight; if (skipPos >= 0) { - if (mInputLength >= 3) { - multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE * - (mInputLength - 2) / (mInputLength - 1), &finalFreq); + if (mInputLength >= 2) { + const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE + * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) + / (10 * mInputLength + - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); + if (DEBUG_DICT_FULL) { + LOGI("Demotion rate for missing character is %d.", demotionRate); + } + multiplyRate(demotionRate, &finalFreq); } else { finalFreq = 0; } @@ -511,17 +630,30 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int } } int lengthFreq = TYPED_LETTER_MULTIPLIER; - for (int i = 0; i < depth; ++i) lengthFreq *= TYPED_LETTER_MULTIPLIER; + multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq); if (lengthFreq == matchWeight) { + // Full exact match if (depth > 1) { - if (DEBUG_DICT) LOGI("Found full matched word."); + if (DEBUG_DICT) { + LOGI("Found full matched word."); + } multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); } if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) { finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); } + } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) { + // A word with proximity corrections + if (DEBUG_DICT) { + LOGI("Found one proximity correction."); + } + multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq); + multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); } - if (sameLength) finalFreq *= FULL_WORD_MULTIPLIER; + if (DEBUG_DICT) { + LOGI("calc: %d, %d", depth, sameLength); + } + if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq); return finalFreq; } @@ -625,7 +757,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth const int diffs, const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs, - int *nextSiblingPosition) { + int *nextSiblingPosition, int *nextOutputIndex) { if (DEBUG_DICT) { int inputCount = 0; if (skipPos >= 0) ++inputCount; @@ -641,8 +773,9 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex; - *nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c, - &childPosition, &terminal, &freq); + *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos, + &c, &childPosition, &terminal, &freq); + *nextOutputIndex = depth + 1; const bool needsToTraverseChildrenNodes = childPosition != 0; @@ -674,7 +807,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth // If inputIndex is greater than mInputLength, that means there is no // proximity chars. So, we don't need to check proximity. if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { - matchWeight = matchWeight * TYPED_LETTER_MULTIPLIER; + multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight); } bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2); @@ -700,7 +833,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth *newTraverseAllNodes = true; } // get the count of nodes and increment childAddress. - *newCount = Dictionary::getCount(DICT, &childPosition); + *newCount = Dictionary::getCount(DICT_ROOT, &childPosition); *newChildPosition = childPosition; if (DEBUG_DICT) assert(needsToTraverseChildrenNodes); return needsToTraverseChildrenNodes; @@ -709,7 +842,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word) { int pos = ROOT_POS; - int count = Dictionary::getCount(DICT, &pos); + int count = Dictionary::getCount(DICT_ROOT, &pos); int maxFreq = 0; int depth = 0; unsigned short newWord[MAX_WORD_LENGTH_INTERNAL]; @@ -765,10 +898,12 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh const int inputIndex = startInputIndex + depth; const int *currentChars = getInputCharsAt(inputIndex); unsigned short c; - *siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c, - newChildPosition, newTerminal, newFreq); + *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos, + &c, newChildPosition, newTerminal, newFreq); const unsigned int inputC = currentChars[0]; - if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX); + if (DEBUG_DICT) { + assert(inputC <= U_SHORT_MAX); + } const unsigned short baseLowerC = toBaseLowerCase(c); const bool matched = (inputC == baseLowerC || inputC == c); const bool hasChild = *newChildPosition != 0; @@ -776,10 +911,12 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh word[depth] = c; if (DEBUG_DICT && DEBUG_NODE) { LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq); - if (*newTerminal) LOGI("Terminal %d", *newFreq); + if (*newTerminal) { + LOGI("Terminal %d", *newFreq); + } } if (hasChild) { - *newCount = Dictionary::getCount(DICT, newChildPosition); + *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition); return true; } else { return false; @@ -791,4 +928,49 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh return false; } } + +// TODO: use uint32_t instead of unsigned short +bool UnigramDictionary::isValidWord(unsigned short *word, int length) { + if (IS_LATEST_DICT_VERSION) { + return (getFrequency(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD); + } else { + return (getFrequency(0, word, 0, length) != NOT_VALID_WORD); + } +} + + +// Require strict exact match. +int UnigramDictionary::getFrequency(int pos, unsigned short *word, int offset, int length) const { + // returns address of bigram data of that word + // return -99 if not found + + int count = Dictionary::getCount(DICT_ROOT, &pos); + unsigned short currentChar = (unsigned short) word[offset]; + for (int j = 0; j < count; j++) { + unsigned short c = Dictionary::getChar(DICT_ROOT, &pos); + int terminal = Dictionary::getTerminal(DICT_ROOT, &pos); + int childPos = Dictionary::getAddress(DICT_ROOT, &pos); + if (c == currentChar) { + if (offset == length - 1) { + if (terminal) { + return (pos+1); + } + } else { + if (childPos != 0) { + int t = getFrequency(childPos, word, offset + 1, length); + if (t > 0) { + return t; + } + } + } + } + if (terminal) { + Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos); + } + // There could be two instances of each alphabet - upper and lower case. So continue + // looking ... + } + return NOT_VALID_WORD; +} + } // namespace latinime diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index 3d3007ce0..b8e4914fa 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -17,9 +17,14 @@ #ifndef LATINIME_UNIGRAM_DICTIONARY_H #define LATINIME_UNIGRAM_DICTIONARY_H +#include <stdint.h> #include "defines.h" #include "proximity_info.h" +#ifndef NULL +#define NULL 0 +#endif + namespace latinime { class UnigramDictionary { @@ -31,8 +36,10 @@ class UnigramDictionary { } ProximityType; public: - UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier, - int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion); + UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, + int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, + const bool isLatestDictVersion); + bool isValidWord(unsigned short *word, int length); int getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, const int flags, unsigned short *outWords, int *frequencies); @@ -52,6 +59,7 @@ private: void getSuggestionCandidates(const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, const int nextLettersSize, const int maxDepth); + int getFrequency(int pos, unsigned short *word, int offset, int length) const; void getVersionNumber(); bool checkIfDictVersionIsLatest(); int getAddress(int *pos); @@ -59,21 +67,19 @@ private: int wideStrLen(unsigned short *str); bool sameAsTyped(unsigned short *word, int length); bool addWord(unsigned short *word, int length, int frequency); - unsigned short toBaseLowerCase(unsigned short c); void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs, const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, const int nextLettersSize); bool getSplitTwoWordsSuggestion(const int inputLength, const int firstWordStartPos, const int firstWordLength, - const int secondWordStartPos, const int secondWordLength); + const int secondWordStartPos, const int secondWordLength, const bool isSpaceProximity); bool getMissingSpaceWords(const int inputLength, const int missingSpacePos); bool getMistypedSpaceWords(const int inputLength, const int spaceProximityPos); // Keep getWordsOld for comparing performance between getWords and getWordsOld void getWordsOld(const int initialPos, const int inputLength, const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, const int nextLettersSize); - void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize); int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos, const int excessivePos, const int transposedPos, const int freq, const bool sameLength) const; @@ -94,7 +100,7 @@ private: const int diffs, const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs, - int *nextSiblingPosition); + int *nextSiblingPosition, int *nextOutputIndex); int getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word); // Process a node by considering missing space bool processCurrentNodeForExactMatch(const int firstChildPos, @@ -104,7 +110,8 @@ private: inline const int* getInputCharsAt(const int index) const { return mInputCodes + (index * MAX_PROXIMITY_CHARS); } - const unsigned char *DICT; + + const uint8_t* const DICT_ROOT; const int MAX_WORD_LENGTH; const int MAX_WORDS; const int MAX_PROXIMITY_CHARS; @@ -138,6 +145,7 @@ private: int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL]; int mStackDiffs[MAX_WORD_LENGTH_INTERNAL]; int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL]; + int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL]; int mNextLettersFrequency[NEXT_LETTERS_SIZE]; }; |