diff options
Diffstat (limited to 'native/src/unigram_dictionary.cpp')
-rw-r--r-- | native/src/unigram_dictionary.cpp | 300 |
1 files changed, 241 insertions, 59 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 30fbaeae1..7bcdbb498 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -16,8 +16,6 @@ */ #include <assert.h> -#include <fcntl.h> -#include <stdio.h> #include <string.h> #define LOG_TAG "LatinIME: unigram_dictionary.cpp" @@ -34,16 +32,20 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = { 'o', 'e' }, { 'u', 'e' } }; -UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier, +// TODO: check the header +UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion) - : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), + : DICT_ROOT(streamStart), + MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion), TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0), BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)), MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) { - if (DEBUG_DICT) LOGI("UnigramDictionary - constructor"); + if (DEBUG_DICT) { + LOGI("UnigramDictionary - constructor"); + } } UnigramDictionary::~UnigramDictionary() {} @@ -183,7 +185,9 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, // Suggestion with missing character if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) { for (int i = 0; i < codesSize; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest missing characters %d", i); + } getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH); } } @@ -194,7 +198,9 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) { for (int i = 0; i < codesSize; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest excessive characters %d", i); + } getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH); } } @@ -205,7 +211,9 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, // Only suggest words that length is mInputLength if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) { for (int i = 0; i < codesSize; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest transposed characters %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest transposed characters %d", i); + } getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1); } } @@ -216,22 +224,27 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) { for (int i = 1; i < codesSize; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest missing space characters %d", i); + } getMissingSpaceWords(mInputLength, i); } } PROF_END(5); PROF_START(6); - if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY) { + if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) { // The first and last "mistyped spaces" are taken care of by excessive character handling for (int i = 1; i < codesSize - 1; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest words with proximity space %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest words with proximity space %d", i); + } const int x = xcoordinates[i]; const int y = ycoordinates[i]; - if (DEBUG_PROXIMITY_INFO) + if (DEBUG_PROXIMITY_INFO) { LOGI("Input[%d] x = %d, y = %d, has space proximity = %d", i, x, y, proximityInfo->hasSpaceProximity(x, y)); + } if (proximityInfo->hasSpaceProximity(x, y)) { getMistypedSpaceWords(mInputLength, i); } @@ -242,7 +255,9 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, void UnigramDictionary::initSuggestions(const int *codes, const int codesSize, unsigned short *outWords, int *frequencies) { - if (DEBUG_DICT) LOGI("initSuggest"); + if (DEBUG_DICT) { + LOGI("initSuggest"); + } mFrequencies = frequencies; mOutputChars = outWords; mInputCodes = codes; @@ -250,8 +265,7 @@ void UnigramDictionary::initSuggestions(const int *codes, const int codesSize, mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; } -void UnigramDictionary::registerNextLetter( - unsigned short c, int *nextLetters, int nextLettersSize) { +static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) { if (c < nextLettersSize) { nextLetters[c]++; } @@ -266,15 +280,17 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) LOGI("Found word = %s, freq = %d", s, frequency); } if (length > MAX_WORD_LENGTH) { - if (DEBUG_DICT) LOGI("Exceeded max word length."); + if (DEBUG_DICT) { + LOGI("Exceeded max word length."); + } return false; } // Find the right insertion point int insertAt = 0; while (insertAt < MAX_WORDS) { - if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency - && length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) { + // TODO: How should we sort words with the same frequency? + if (frequency > mFrequencies[insertAt]) { break; } insertAt++; @@ -283,7 +299,7 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) if (DEBUG_DICT) { char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = word[i]; - LOGI("Added word = %s, freq = %d", s, frequency); + LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX); } memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]), (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]), @@ -297,13 +313,15 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) *dest++ = *word++; } *dest = 0; // NULL terminate - if (DEBUG_DICT) LOGI("Added word at %d", insertAt); + if (DEBUG_DICT) { + LOGI("Added word at %d", insertAt); + } return true; } return false; } -unsigned short UnigramDictionary::toBaseLowerCase(unsigned short c) { +static inline unsigned short toBaseLowerCase(unsigned short c) { if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { c = BASE_CHARS[c]; } @@ -344,7 +362,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, } int rootPosition = ROOT_POS; // Get the number of child of root, then increment the position - int childCount = Dictionary::getCount(DICT, &rootPosition); + int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); int depth = 0; mStackChildCount[0] = childCount; @@ -353,6 +371,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, mStackInputIndex[0] = 0; mStackDiffs[0] = 0; mStackSiblingPos[0] = rootPosition; + mStackOutputIndex[0] = 0; // Depth first search while (depth >= 0) { @@ -363,14 +382,15 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, int inputIndex = mStackInputIndex[depth]; int diffs = mStackDiffs[depth]; int siblingPos = mStackSiblingPos[depth]; + int outputIndex = mStackOutputIndex[depth]; int firstChildPos; // depth will never be greater than maxDepth because in that case, // needsToTraverseChildrenNodes should be false - const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, + const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex, maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos, excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs, - &siblingPos); + &siblingPos, &outputIndex); // Update next sibling pos mStackSiblingPos[depth] = siblingPos; if (needsToTraverseChildrenNodes) { @@ -382,6 +402,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, mStackInputIndex[depth] = inputIndex; mStackDiffs[depth] = diffs; mStackSiblingPos[depth] = firstChildPos; + mStackOutputIndex[depth] = outputIndex; } } else { // Goes to parent sibling node @@ -390,17 +411,105 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, } } -inline static void multiplyRate(const int rate, int *freq) { - if (rate > 1000000) { - *freq = (*freq / 100) * rate; +static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; +static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { + return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); +} + +static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; +inline static void multiplyIntCapped(const int multiplier, int *base) { + const int temp = *base; + if (temp != S_INT_MAX) { + // Branch if multiplier == 2 for the optimization + if (multiplier == 2) { + *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; + } else { + const int tempRetval = temp * multiplier; + *base = tempRetval >= temp ? tempRetval : S_INT_MAX; + } + } +} + +inline static int powerIntCapped(const int base, const int n) { + if (base == 2) { + return n < 31 ? 1 << n : S_INT_MAX; } else { - *freq = *freq * rate / 100; + int ret = base; + for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); + return ret; + } +} + +inline static void multiplyRate(const int rate, int *freq) { + if (*freq != S_INT_MAX) { + if (*freq > 1000000) { + *freq /= 100; + multiplyIntCapped(rate, freq); + } else { + multiplyIntCapped(rate, freq); + *freq /= 100; + } + } +} + +inline static int calcFreqForSplitTwoWords( + const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength, + const int firstFreq, const int secondFreq, const bool isSpaceProximity) { + if (firstWordLength == 0 || secondWordLength == 0) { + return 0; + } + const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); + int tempFirstFreq = firstFreq; + multiplyRate(firstDemotionRate, &tempFirstFreq); + + const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); + int tempSecondFreq = secondFreq; + multiplyRate(secondDemotionRate, &tempSecondFreq); + + const int totalLength = firstWordLength + secondWordLength; + + // Promote pairFreq with multiplying by 2, because the word length is the same as the typed + // length. + int totalFreq = tempFirstFreq + tempSecondFreq; + + // This is a workaround to try offsetting the not-enough-demotion which will be done in + // calcNormalizedScore in Utils.java. + // In calcNormalizedScore the score will be demoted by (1 - 1 / length) + // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by + // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) + const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); + multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); + + // At this moment, totalFreq is calculated by the following formula: + // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) + // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) + + multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); + + // This is another workaround to offset the demotion which will be done in + // calcNormalizedScore in Utils.java. + // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote + // the same amount because we already have adjusted the synthetic freq of this "missing or + // mistyped space" suggestion candidate above in this method. + const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); + multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); + + if (isSpaceProximity) { + // A word pair with one space proximity correction + if (DEBUG_DICT) { + LOGI("Found a word pair with space proximity correction."); + } + multiplyIntCapped(typedLetterMultiplier, &totalFreq); + multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); } + + multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + return totalFreq; } bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos, - const int secondWordLength) { + const int secondWordLength, const bool isSpaceProximity) { if (inputLength >= MAX_WORD_LENGTH) return false; if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) @@ -409,7 +518,9 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, // Allocating variable length array on stack unsigned short word[newWordLength]; const int firstFreq = getBestWordFreq(firstWordStartPos, firstWordLength, mWord); - if (DEBUG_DICT) LOGI("First freq: %d", firstFreq); + if (DEBUG_DICT) { + LOGI("First freq: %d", firstFreq); + } if (firstFreq <= 0) return false; for (int i = 0; i < firstWordLength; ++i) { @@ -417,7 +528,9 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, } const int secondFreq = getBestWordFreq(secondWordStartPos, secondWordLength, mWord); - if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq); + if (DEBUG_DICT) { + LOGI("Second freq: %d", secondFreq); + } if (secondFreq <= 0) return false; word[firstWordLength] = SPACE; @@ -425,22 +538,25 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, word[i] = mWord[i - firstWordLength - 1]; } - int pairFreq = ((firstFreq + secondFreq) / 2); - for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER; - multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &pairFreq); + int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength, + secondWordLength, firstFreq, secondFreq, isSpaceProximity); + if (DEBUG_DICT) { + LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength, + TYPED_LETTER_MULTIPLIER); + } addWord(word, newWordLength, pairFreq); return true; } bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) { return getSplitTwoWordsSuggestion( - inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos); + inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false); } bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) { return getSplitTwoWordsSuggestion( inputLength, 0, spaceProximityPos, spaceProximityPos + 1, - inputLength - spaceProximityPos - 1); + inputLength - spaceProximityPos - 1, true); } // Keep this for comparing spec to new getWords @@ -448,7 +564,7 @@ void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int excessivePos, const int transposedPos,int *nextLetters, const int nextLettersSize) { int initialPosition = initialPos; - const int count = Dictionary::getCount(DICT, &initialPosition); + const int count = Dictionary::getCount(DICT_ROOT, &initialPosition); getWordsRec(count, initialPosition, 0, min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH), mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters, @@ -469,12 +585,13 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons int newInputIndex; int newDiffs; int newSiblingPos; + int newOutputIndex; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos, excessivePos, transposedPos, nextLetters, nextLettersSize, &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate, - &newInputIndex, &newDiffs, &newSiblingPos); + &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex); siblingPos = newSiblingPos; if (needsToTraverseChildrenNodes) { @@ -485,19 +602,21 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons } } -static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; -static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { - return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); -} inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth, const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos, const int freq, const bool sameLength) const { // TODO: Demote by edit distance int finalFreq = freq * matchWeight; if (skipPos >= 0) { - if (mInputLength >= 3) { - multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE * - (mInputLength - 2) / (mInputLength - 1), &finalFreq); + if (mInputLength >= 2) { + const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE + * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) + / (10 * mInputLength + - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); + if (DEBUG_DICT_FULL) { + LOGI("Demotion rate for missing character is %d.", demotionRate); + } + multiplyRate(demotionRate, &finalFreq); } else { finalFreq = 0; } @@ -511,17 +630,30 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int } } int lengthFreq = TYPED_LETTER_MULTIPLIER; - for (int i = 0; i < depth; ++i) lengthFreq *= TYPED_LETTER_MULTIPLIER; + multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq); if (lengthFreq == matchWeight) { + // Full exact match if (depth > 1) { - if (DEBUG_DICT) LOGI("Found full matched word."); + if (DEBUG_DICT) { + LOGI("Found full matched word."); + } multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); } if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) { finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); } + } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) { + // A word with proximity corrections + if (DEBUG_DICT) { + LOGI("Found one proximity correction."); + } + multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq); + multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); } - if (sameLength) finalFreq *= FULL_WORD_MULTIPLIER; + if (DEBUG_DICT) { + LOGI("calc: %d, %d", depth, sameLength); + } + if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq); return finalFreq; } @@ -625,7 +757,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth const int diffs, const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs, - int *nextSiblingPosition) { + int *nextSiblingPosition, int *nextOutputIndex) { if (DEBUG_DICT) { int inputCount = 0; if (skipPos >= 0) ++inputCount; @@ -641,8 +773,9 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex; - *nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c, - &childPosition, &terminal, &freq); + *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos, + &c, &childPosition, &terminal, &freq); + *nextOutputIndex = depth + 1; const bool needsToTraverseChildrenNodes = childPosition != 0; @@ -674,7 +807,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth // If inputIndex is greater than mInputLength, that means there is no // proximity chars. So, we don't need to check proximity. if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { - matchWeight = matchWeight * TYPED_LETTER_MULTIPLIER; + multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight); } bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2); @@ -700,7 +833,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth *newTraverseAllNodes = true; } // get the count of nodes and increment childAddress. - *newCount = Dictionary::getCount(DICT, &childPosition); + *newCount = Dictionary::getCount(DICT_ROOT, &childPosition); *newChildPosition = childPosition; if (DEBUG_DICT) assert(needsToTraverseChildrenNodes); return needsToTraverseChildrenNodes; @@ -709,7 +842,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word) { int pos = ROOT_POS; - int count = Dictionary::getCount(DICT, &pos); + int count = Dictionary::getCount(DICT_ROOT, &pos); int maxFreq = 0; int depth = 0; unsigned short newWord[MAX_WORD_LENGTH_INTERNAL]; @@ -765,10 +898,12 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh const int inputIndex = startInputIndex + depth; const int *currentChars = getInputCharsAt(inputIndex); unsigned short c; - *siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c, - newChildPosition, newTerminal, newFreq); + *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos, + &c, newChildPosition, newTerminal, newFreq); const unsigned int inputC = currentChars[0]; - if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX); + if (DEBUG_DICT) { + assert(inputC <= U_SHORT_MAX); + } const unsigned short baseLowerC = toBaseLowerCase(c); const bool matched = (inputC == baseLowerC || inputC == c); const bool hasChild = *newChildPosition != 0; @@ -776,10 +911,12 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh word[depth] = c; if (DEBUG_DICT && DEBUG_NODE) { LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq); - if (*newTerminal) LOGI("Terminal %d", *newFreq); + if (*newTerminal) { + LOGI("Terminal %d", *newFreq); + } } if (hasChild) { - *newCount = Dictionary::getCount(DICT, newChildPosition); + *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition); return true; } else { return false; @@ -791,4 +928,49 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh return false; } } + +// TODO: use uint32_t instead of unsigned short +bool UnigramDictionary::isValidWord(unsigned short *word, int length) { + if (IS_LATEST_DICT_VERSION) { + return (getFrequency(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD); + } else { + return (getFrequency(0, word, 0, length) != NOT_VALID_WORD); + } +} + + +// Require strict exact match. +int UnigramDictionary::getFrequency(int pos, unsigned short *word, int offset, int length) const { + // returns address of bigram data of that word + // return -99 if not found + + int count = Dictionary::getCount(DICT_ROOT, &pos); + unsigned short currentChar = (unsigned short) word[offset]; + for (int j = 0; j < count; j++) { + unsigned short c = Dictionary::getChar(DICT_ROOT, &pos); + int terminal = Dictionary::getTerminal(DICT_ROOT, &pos); + int childPos = Dictionary::getAddress(DICT_ROOT, &pos); + if (c == currentChar) { + if (offset == length - 1) { + if (terminal) { + return (pos+1); + } + } else { + if (childPos != 0) { + int t = getFrequency(childPos, word, offset + 1, length); + if (t > 0) { + return t; + } + } + } + } + if (terminal) { + Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos); + } + // There could be two instances of each alphabet - upper and lower case. So continue + // looking ... + } + return NOT_VALID_WORD; +} + } // namespace latinime |