diff options
Diffstat (limited to 'native/src/unigram_dictionary.cpp')
-rw-r--r-- | native/src/unigram_dictionary.cpp | 838 |
1 files changed, 123 insertions, 715 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index afa8bc545..bbfaea454 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -24,9 +24,7 @@ #include "dictionary.h" #include "unigram_dictionary.h" -#ifdef NEW_DICTIONARY_FORMAT #include "binary_format.h" -#endif // NEW_DICTIONARY_FORMAT namespace latinime { @@ -39,28 +37,23 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion) -#ifndef NEW_DICTIONARY_FORMAT - : DICT_ROOT(streamStart), -#else // NEW_DICTIONARY_FORMAT : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE), -#endif // NEW_DICTIONARY_FORMAT MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion), TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), -#ifndef NEW_DICTIONARY_FORMAT - ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0), -#else // NEW_DICTIONARY_FORMAT // TODO : remove this variable. ROOT_POS(0), -#endif // NEW_DICTIONARY_FORMAT BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)), MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) { if (DEBUG_DICT) { LOGI("UnigramDictionary - constructor"); } + mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier); } -UnigramDictionary::~UnigramDictionary() {} +UnigramDictionary::~UnigramDictionary() { + delete mCorrection; +} static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize, const int MAX_PROXIMITY_CHARS) { @@ -174,12 +167,6 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *x LOGI("%s %i", s, mFrequencies[j]); #endif } - LOGI("Next letters: "); - for (int k = 0; k < NEXT_LETTERS_SIZE; k++) { - if (mNextLettersFrequency[k] > 0) { - LOGI("%c = %d,", k, mNextLettersFrequency[k]); - } - } } PROF_END(20); PROF_CLOSE; @@ -196,11 +183,12 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, proximityInfo, xcoordinates, ycoordinates, codes, codesSize, outWords, frequencies); if (DEBUG_DICT) assert(codesSize == mInputLength); - const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); + const int maxDepth = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); + mCorrection->initCorrection(mProximityInfo, mInputLength, maxDepth); PROF_END(0); PROF_START(1); - getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH); + getSuggestionCandidates(-1, -1, -1); PROF_END(1); PROF_START(2); @@ -210,7 +198,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, if (DEBUG_DICT) { LOGI("--- Suggest missing characters %d", i); } - getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH); + getSuggestionCandidates(i, -1, -1); } } PROF_END(2); @@ -223,7 +211,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, if (DEBUG_DICT) { LOGI("--- Suggest excessive characters %d", i); } - getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH); + getSuggestionCandidates(-1, i, -1); } } PROF_END(3); @@ -236,7 +224,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, if (DEBUG_DICT) { LOGI("--- Suggest transposed characters %d", i); } - getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1); + getSuggestionCandidates(-1, -1, i); } } PROF_END(4); @@ -249,7 +237,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, if (DEBUG_DICT) { LOGI("--- Suggest missing space characters %d", i); } - getMissingSpaceWords(mInputLength, i); + getMissingSpaceWords(mInputLength, i, mCorrection); } } PROF_END(5); @@ -268,7 +256,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, i, x, y, proximityInfo->hasSpaceProximity(x, y)); } if (proximityInfo->hasSpaceProximity(x, y)) { - getMistypedSpaceWords(mInputLength, i); + getMistypedSpaceWords(mInputLength, i, mCorrection); } } } @@ -284,7 +272,6 @@ void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int mFrequencies = frequencies; mOutputChars = outWords; mInputLength = codesSize; - mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; proximityInfo->setInputParams(codes, codesSize); mProximityInfo = proximityInfo; } @@ -354,70 +341,43 @@ static const char QUOTE = '\''; static const char SPACE = ' '; void UnigramDictionary::getSuggestionCandidates(const int skipPos, - const int excessivePos, const int transposedPos, int *nextLetters, - const int nextLettersSize, const int maxDepth) { + const int excessivePos, const int transposedPos) { if (DEBUG_DICT) { - LOGI("getSuggestionCandidates %d", maxDepth); assert(transposedPos + 1 < mInputLength); assert(excessivePos < mInputLength); assert(missingPos < mInputLength); } + mCorrection->setCorrectionParams(skipPos, excessivePos, transposedPos, + -1 /* spaceProximityPos */, -1 /* missingSpacePos */); int rootPosition = ROOT_POS; // Get the number of children of root, then increment the position int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); - int depth = 0; + int outputIndex = 0; - mStackChildCount[0] = childCount; - mStackTraverseAll[0] = (mInputLength <= 0); - mStackNodeFreq[0] = 1; - mStackInputIndex[0] = 0; - mStackDiffs[0] = 0; - mStackSiblingPos[0] = rootPosition; - mStackOutputIndex[0] = 0; + mCorrection->initCorrectionState(rootPosition, childCount, (mInputLength <= 0)); // Depth first search - while (depth >= 0) { - if (mStackChildCount[depth] > 0) { - --mStackChildCount[depth]; - bool traverseAllNodes = mStackTraverseAll[depth]; - int matchWeight = mStackNodeFreq[depth]; - int inputIndex = mStackInputIndex[depth]; - int diffs = mStackDiffs[depth]; - int siblingPos = mStackSiblingPos[depth]; - int outputIndex = mStackOutputIndex[depth]; + while (outputIndex >= 0) { + if (mCorrection->initProcessState(outputIndex)) { + int siblingPos = mCorrection->getTreeSiblingPos(outputIndex); int firstChildPos; - // depth will never be greater than maxDepth because in that case, - // needsToTraverseChildrenNodes should be false - const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex, - maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos, - excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount, - &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs, - &siblingPos, &outputIndex); + + const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, + mCorrection, &childCount, &firstChildPos, &siblingPos); // Update next sibling pos - mStackSiblingPos[depth] = siblingPos; + mCorrection->setTreeSiblingPos(outputIndex, siblingPos); + if (needsToTraverseChildrenNodes) { // Goes to child node - ++depth; - mStackChildCount[depth] = childCount; - mStackTraverseAll[depth] = traverseAllNodes; - mStackNodeFreq[depth] = matchWeight; - mStackInputIndex[depth] = inputIndex; - mStackDiffs[depth] = diffs; - mStackSiblingPos[depth] = firstChildPos; - mStackOutputIndex[depth] = outputIndex; + outputIndex = mCorrection->goDownTree(outputIndex, childCount, firstChildPos); } } else { // Goes to parent sibling node - --depth; + outputIndex = mCorrection->getTreeParentIndex(outputIndex); } } } -static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; -static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { - return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); -} - static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; inline static void multiplyIntCapped(const int multiplier, int *base) { const int temp = *base; @@ -432,149 +392,18 @@ inline static void multiplyIntCapped(const int multiplier, int *base) { } } -inline static int powerIntCapped(const int base, const int n) { - if (base == 2) { - return n < 31 ? 1 << n : S_INT_MAX; - } else { - int ret = base; - for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); - return ret; - } +void UnigramDictionary::getMissingSpaceWords( + const int inputLength, const int missingSpacePos, Correction *correction) { + correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, + -1 /* transposedPos */, -1 /* spaceProximityPos */, missingSpacePos); + getSplitTwoWordsSuggestion(inputLength, correction); } -inline static void multiplyRate(const int rate, int *freq) { - if (*freq != S_INT_MAX) { - if (*freq > 1000000) { - *freq /= 100; - multiplyIntCapped(rate, freq); - } else { - multiplyIntCapped(rate, freq); - *freq /= 100; - } - } -} - -inline static int calcFreqForSplitTwoWords( - const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength, - const int firstFreq, const int secondFreq, const bool isSpaceProximity) { - if (firstWordLength == 0 || secondWordLength == 0) { - return 0; - } - const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); - int tempFirstFreq = firstFreq; - multiplyRate(firstDemotionRate, &tempFirstFreq); - - const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); - int tempSecondFreq = secondFreq; - multiplyRate(secondDemotionRate, &tempSecondFreq); - - const int totalLength = firstWordLength + secondWordLength; - - // Promote pairFreq with multiplying by 2, because the word length is the same as the typed - // length. - int totalFreq = tempFirstFreq + tempSecondFreq; - - // This is a workaround to try offsetting the not-enough-demotion which will be done in - // calcNormalizedScore in Utils.java. - // In calcNormalizedScore the score will be demoted by (1 - 1 / length) - // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by - // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) - const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); - multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); - - // At this moment, totalFreq is calculated by the following formula: - // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) - // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) - - multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); - - // This is another workaround to offset the demotion which will be done in - // calcNormalizedScore in Utils.java. - // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote - // the same amount because we already have adjusted the synthetic freq of this "missing or - // mistyped space" suggestion candidate above in this method. - const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); - multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); - - if (isSpaceProximity) { - // A word pair with one space proximity correction - if (DEBUG_DICT) { - LOGI("Found a word pair with space proximity correction."); - } - multiplyIntCapped(typedLetterMultiplier, &totalFreq); - multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); - } - - multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); - return totalFreq; -} - -bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) { - return getSplitTwoWordsSuggestion( - inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false); -} - -bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) { - return getSplitTwoWordsSuggestion( - inputLength, 0, spaceProximityPos, spaceProximityPos + 1, - inputLength - spaceProximityPos - 1, true); -} - -inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth, - const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos, - const int freq, const bool sameLength) const { - // TODO: Demote by edit distance - int finalFreq = freq * matchWeight; - if (skipPos >= 0) { - if (mInputLength >= 2) { - const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE - * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) - / (10 * mInputLength - - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); - if (DEBUG_DICT_FULL) { - LOGI("Demotion rate for missing character is %d.", demotionRate); - } - multiplyRate(demotionRate, &finalFreq); - } else { - finalFreq = 0; - } - } - if (transposedPos >= 0) multiplyRate( - WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq); - if (excessivePos >= 0) { - multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); - if (!mProximityInfo->existsAdjacentProximityChars(inputIndex)) { - // If an excessive character is not adjacent to the left char or the right char, - // we will demote this word. - multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq); - } - } - int lengthFreq = TYPED_LETTER_MULTIPLIER; - multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq); - if (lengthFreq == matchWeight) { - // Full exact match - if (depth > 1) { - if (DEBUG_DICT) { - LOGI("Found full matched word."); - } - multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); - } - if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) { - finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); - } - } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) { - // A word with proximity corrections - if (DEBUG_DICT) { - LOGI("Found one proximity correction."); - } - multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq); - multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); - } - if (DEBUG_DICT) { - LOGI("calc: %d, %d", depth, sameLength); - } - if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq); - return finalFreq; +void UnigramDictionary::getMistypedSpaceWords( + const int inputLength, const int spaceProximityPos, Correction *correction) { + correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, + -1 /* transposedPos */, spaceProximityPos, -1 /* missingSpacePos */); + getSplitTwoWordsSuggestion(inputLength, correction); } inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c, @@ -584,35 +413,38 @@ inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c, return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth; } - -inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth, - const uint8_t* const root, const uint8_t flags, const int pos, - const int inputIndex, const int matchWeight, const int skipPos, - const int excessivePos, const int transposedPos, const int freq, const bool sameLength, - int* nextLetters, const int nextLettersSize) { - - const bool isSameAsTyped = sameLength ? mProximityInfo->sameAsTyped(word, depth + 1) : false; - if (isSameAsTyped) return; - - if (depth >= MIN_SUGGEST_DEPTH) { - const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos, - excessivePos, transposedPos, freq, sameLength); - if (!isSameAsTyped) - addWord(word, depth + 1, finalFreq); +inline void UnigramDictionary::onTerminal(const int freq, Correction *correction) { + int wordLength; + unsigned short* wordPointer; + const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); + if (finalFreq >= 0) { + addWord(wordPointer, wordLength, finalFreq); } +} - if (sameLength && depth >= mInputLength && skipPos < 0) { - registerNextLetter(word[mInputLength], nextLetters, nextLettersSize); +void UnigramDictionary::getSplitTwoWordsSuggestion( + const int inputLength, Correction* correction) { + const int spaceProximityPos = correction->getSpaceProximityPos(); + const int missingSpacePos = correction->getMissingSpacePos(); + if (DEBUG_DICT) { + int inputCount = 0; + if (spaceProximityPos >= 0) ++inputCount; + if (missingSpacePos >= 0) ++inputCount; + assert(inputCount <= 1); } -} + const bool isSpaceProximity = spaceProximityPos >= 0; + const int firstWordStartPos = 0; + const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos; + const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; + const int secondWordLength = isSpaceProximity + ? (inputLength - spaceProximityPos - 1) + : (inputLength - missingSpacePos); -bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, - const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos, - const int secondWordLength, const bool isSpaceProximity) { - if (inputLength >= MAX_WORD_LENGTH) return false; + if (inputLength >= MAX_WORD_LENGTH) return; if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) - return false; + return; + const int newWordLength = firstWordLength + secondWordLength + 1; // Allocating variable length array on stack unsigned short word[newWordLength]; @@ -620,7 +452,7 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, if (DEBUG_DICT) { LOGI("First freq: %d", firstFreq); } - if (firstFreq <= 0) return false; + if (firstFreq <= 0) return; for (int i = 0; i < firstWordLength; ++i) { word[i] = mWord[i]; @@ -630,299 +462,21 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, if (DEBUG_DICT) { LOGI("Second freq: %d", secondFreq); } - if (secondFreq <= 0) return false; + if (secondFreq <= 0) return; word[firstWordLength] = SPACE; for (int i = (firstWordLength + 1); i < newWordLength; ++i) { word[i] = mWord[i - firstWordLength - 1]; } - int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength, - secondWordLength, firstFreq, secondFreq, isSpaceProximity); + const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq); if (DEBUG_DICT) { - LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength, - TYPED_LETTER_MULTIPLIER); + LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); } addWord(word, newWordLength, pairFreq); - return true; -} - -#ifndef NEW_DICTIONARY_FORMAT -// The following functions will be entirely replaced with new implementations. -void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos, - const int excessivePos, const int transposedPos,int *nextLetters, - const int nextLettersSize) { - int initialPosition = initialPos; - const int count = Dictionary::getCount(DICT_ROOT, &initialPosition); - getWordsRec(count, initialPosition, 0, - min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH), - mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters, - nextLettersSize); -} - -void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth, - const int maxDepth, const bool traverseAllNodes, const int matchWeight, - const int inputIndex, const int diffs, const int skipPos, const int excessivePos, - const int transposedPos, int *nextLetters, const int nextLettersSize) { - int siblingPos = pos; - for (int i = 0; i < childrenCount; ++i) { - int newCount; - int newChildPosition; - bool newTraverseAllNodes; - int newMatchRate; - int newInputIndex; - int newDiffs; - int newSiblingPos; - int newOutputIndex; - const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth, - traverseAllNodes, matchWeight, inputIndex, diffs, - skipPos, excessivePos, transposedPos, - nextLetters, nextLettersSize, - &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate, - &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex); - siblingPos = newSiblingPos; - - if (needsToTraverseChildrenNodes) { - getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes, - newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos, - nextLetters, nextLettersSize); - } - } -} - -inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, - const int inputLength, unsigned short *word) { - int pos = ROOT_POS; - int count = Dictionary::getCount(DICT_ROOT, &pos); - int maxFreq = 0; - int depth = 0; - unsigned short newWord[MAX_WORD_LENGTH_INTERNAL]; - bool terminal = false; - - mStackChildCount[0] = count; - mStackSiblingPos[0] = pos; - - while (depth >= 0) { - if (mStackChildCount[depth] > 0) { - --mStackChildCount[depth]; - int firstChildPos; - int newFreq; - int siblingPos = mStackSiblingPos[depth]; - const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos, - startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq, - &siblingPos); - mStackSiblingPos[depth] = siblingPos; - if (depth == (inputLength - 1)) { - // Traverse sibling node - if (terminal) { - if (newFreq > maxFreq) { - for (int i = 0; i < inputLength; ++i) word[i] = newWord[i]; - if (DEBUG_DICT && DEBUG_NODE) { -#ifdef FLAG_DBG - char s[inputLength + 1]; - for (int i = 0; i < inputLength; ++i) s[i] = word[i]; - s[inputLength] = 0; - LOGI("New missing space word found: %d > %d (%s), %d, %d", - newFreq, maxFreq, s, inputLength, depth); -#endif - } - maxFreq = newFreq; - } - } - } else if (needsToTraverseChildrenNodes) { - // Traverse children nodes - ++depth; - mStackChildCount[depth] = count; - mStackSiblingPos[depth] = firstChildPos; - } - } else { - // Traverse parent node - --depth; - } - } - - word[inputLength] = 0; - return maxFreq; + return; } -inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos, - const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition, - int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) { - const int inputIndex = startInputIndex + depth; - unsigned short c; - *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos, - &c, newChildPosition, newTerminal, newFreq); - const unsigned int inputC = mProximityInfo->getPrimaryCharAt(inputIndex); - if (DEBUG_DICT) { - assert(inputC <= U_SHORT_MAX); - } - const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); - const bool matched = (inputC == baseLowerC || inputC == c); - const bool hasChild = *newChildPosition != 0; - if (matched) { - word[depth] = c; - if (DEBUG_DICT && DEBUG_NODE) { - LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq); - if (*newTerminal) { - LOGI("Terminal %d", *newFreq); - } - } - if (hasChild) { - *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition); - return true; - } else { - return false; - } - } else { - // If this node is not user typed character, this method treats this word as unmatched. - // Thus newTerminal shouldn't be true. - *newTerminal = false; - return false; - } -} - -// TODO: use uint32_t instead of unsigned short -bool UnigramDictionary::isValidWord(unsigned short *word, int length) { - if (IS_LATEST_DICT_VERSION) { - return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD); - } else { - return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD); - } -} - - -// Require strict exact match. -int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset, - int length) const { - // returns address of bigram data of that word - // return -99 if not found - - int count = Dictionary::getCount(DICT_ROOT, &pos); - unsigned short currentChar = (unsigned short) word[offset]; - for (int j = 0; j < count; j++) { - unsigned short c = Dictionary::getChar(DICT_ROOT, &pos); - int terminal = Dictionary::getTerminal(DICT_ROOT, &pos); - int childPos = Dictionary::getAddress(DICT_ROOT, &pos); - if (c == currentChar) { - if (offset == length - 1) { - if (terminal) { - return (pos+1); - } - } else { - if (childPos != 0) { - int t = getBigramPosition(childPos, word, offset + 1, length); - if (t > 0) { - return t; - } - } - } - } - if (terminal) { - Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos); - } - // There could be two instances of each alphabet - upper and lower case. So continue - // looking ... - } - return NOT_VALID_WORD; -} - -// The following functions will be modified. -inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth, - const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex, - const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos, - int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition, - bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs, - int *nextSiblingPosition, int *nextOutputIndex) { - if (DEBUG_DICT) { - int inputCount = 0; - if (skipPos >= 0) ++inputCount; - if (excessivePos >= 0) ++inputCount; - if (transposedPos >= 0) ++inputCount; - assert(inputCount <= 1); - } - unsigned short c; - int childPosition; - bool terminal; - int freq; - bool isSameAsUserTypedLength = false; - - const int pos = initialPos; - const int depth = initialDepth; - const int traverseAllNodes = initialTraverseAllNodes; - const int diffs = initialDiffs; - - const uint8_t flags = 0; // No flags for now - - if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex; - - *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos, - &c, &childPosition, &terminal, &freq); - *nextOutputIndex = depth + 1; - - const bool needsToTraverseChildrenNodes = childPosition != 0; - - // If we are only doing traverseAllNodes, no need to look at the typed characters. - if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) { - mWord[depth] = c; - if (traverseAllNodes && terminal) { - onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, - excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize); - } - if (!needsToTraverseChildrenNodes) return false; - *newTraverseAllNodes = traverseAllNodes; - *newMatchRate = matchWeight; - *newDiffs = diffs; - *newInputIndex = inputIndex; - } else { - int inputIndexForProximity = inputIndex; - - if (transposedPos >= 0) { - if (inputIndex == transposedPos) ++inputIndexForProximity; - if (inputIndex == (transposedPos + 1)) --inputIndexForProximity; - } - - ProximityInfo::ProximityType matchedProximityCharId = mProximityInfo->getMatchedProximityId( - inputIndexForProximity, c, skipPos, excessivePos, transposedPos); - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) return false; - mWord[depth] = c; - // If inputIndex is greater than mInputLength, that means there is no - // proximity chars. So, we don't need to check proximity. - if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { - multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight); - } - bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 - || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2); - if (isSameAsUserTypedLength && terminal) { - onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, - excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize); - } - if (!needsToTraverseChildrenNodes) return false; - // Start traversing all nodes after the index exceeds the user typed length - *newTraverseAllNodes = isSameAsUserTypedLength; - *newMatchRate = matchWeight; - *newDiffs = diffs - + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); - *newInputIndex = inputIndex + 1; - } - // Optimization: Prune out words that are too long compared to how much was typed. - if (depth >= maxDepth || *newDiffs > mMaxEditDistance) { - return false; - } - - // If inputIndex is greater than mInputLength, that means there are no proximity chars. - // TODO: Check if this can be isSameAsUserTypedLength only. - if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) { - *newTraverseAllNodes = true; - } - // get the count of nodes and increment childAddress. - *newCount = Dictionary::getCount(DICT_ROOT, &childPosition); - *newChildPosition = childPosition; - if (DEBUG_DICT) assert(needsToTraverseChildrenNodes); - return needsToTraverseChildrenNodes; -} - -#else // NEW_DICTIONARY_FORMAT - // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous // interface. inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, @@ -1055,83 +609,8 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor return maxFreq; } -// This function gets the frequency of the exact matching word in the dictionary. -// If no match is found, it returns -1. -int UnigramDictionary::getFrequency(const uint16_t* const inWord, const int length) const { - int pos = 0; - int wordPos = 0; - const uint8_t* const root = DICT_ROOT; - - while (true) { - // If we already traversed the tree further than the word is long, there means - // there was no match (or we would have found it). - if (wordPos > length) return -1; - int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); - const uint16_t wChar = inWord[wordPos]; - while (true) { - // If there are no more character groups in this node, it means we could not - // find a matching character for this depth, therefore there is no match. - if (0 >= charGroupCount) return -1; - const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - if (character == wChar) { - // This is the correct node. Only one character group may start with the same - // char within a node, so either we found our match in this node, or there is - // no match and we can return -1. So we will check all the characters in this - // character group indeed does match. - if (FLAG_HAS_MULTIPLE_CHARS & flags) { - character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - while (NOT_A_CHARACTER != character) { - ++wordPos; - // If we shoot the length of the word we search for, or if we find a single - // character that does not match, as explained above, it means the word is - // not in the dictionary (by virtue of this chargroup being the only one to - // match the word on the first character, but not matching the whole word). - if (wordPos > length) return -1; - if (inWord[wordPos] != character) return -1; - character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - } - } - // If we come here we know that so far, we do match. Either we are on a terminal - // and we match the length, in which case we found it, or we traverse children. - // If we don't match the length AND don't have children, then a word in the - // dictionary fully matches a prefix of the searched word but not the full word. - ++wordPos; - if (FLAG_IS_TERMINAL & flags) { - if (wordPos == length) { - return BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); - } - pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos); - } - if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) - return -1; - // We have children and we are still shorter than the word we are searching for, so - // we need to traverse children. Put the pointer on the children position, and - // break - pos = BinaryFormat::readChildrenPosition(root, flags, pos); - break; - } else { - // This chargroup does not match, so skip the remaining part and go to the next. - if (FLAG_HAS_MULTIPLE_CHARS & flags) { - pos = BinaryFormat::skipOtherCharacters(root, pos); - } - pos = BinaryFormat::skipFrequency(flags, pos); - pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); - } - --charGroupCount; - } - } -} - bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const { - return -1 != getFrequency(inWord, length); -} - -int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize, - unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams, - int maxAlternatives) { - // TODO: add implementation. - return 0; + return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length); } // TODO: remove this function. @@ -1154,23 +633,13 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // there aren't any more nodes at this level, it merely returns the address of the first byte after // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any // given level, as output into newCount when traversing this level's parent. -inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth, - const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex, - const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos, - int *nextLetters, const int nextLettersSize, int *newCount, int *newChildrenPosition, - bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs, - int *nextSiblingPosition, int *newOutputIndex) { +inline bool UnigramDictionary::processCurrentNode(const int initialPos, + Correction *correction, int *newCount, + int *newChildrenPosition, int *nextSiblingPosition) { if (DEBUG_DICT) { - int inputCount = 0; - if (skipPos >= 0) ++inputCount; - if (excessivePos >= 0) ++inputCount; - if (transposedPos >= 0) ++inputCount; - assert(inputCount <= 1); + correction->checkState(); } int pos = initialPos; - int depth = initialDepth; - int traverseAllNodes = initialTraverseAllNodes; - int diffs = initialDiffs; // Flags contain the following information: // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits: @@ -1182,6 +651,9 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos); const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); + const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags)); + + bool needsToInvokeOnTerminal = false; // This gets only ONE character from the stream. Next there will be: // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node @@ -1207,101 +679,21 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in const bool isLastChar = (NOT_A_CHARACTER == nextc); // If there are more chars in this nodes, then this virtual node is not a terminal. // If we are on the last char, this virtual node is a terminal if this node is. - const bool isTerminal = isLastChar && (0 != (FLAG_IS_TERMINAL & flags)); - // If there are more chars in this node, then this virtual node has children. - // If we are on the last char, this virtual node has children if this node has. - const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags); - - // This has to be done for each virtual char (this forwards the "inputIndex" which - // is the index in the user-inputted chars, as read by proximity chars. - if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex; - if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) { - mWord[depth] = c; - if (traverseAllNodes && isTerminal) { - // The frequency should be here, because we come here only if this is actually - // a terminal node, and we are on its last char. - const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); - onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, - excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize); - } - if (!hasChildren) { - // If we don't have children here, that means we finished processing all - // characters of this node (we are on the last virtual node), AND we are in - // traverseAllNodes mode, which means we are searching for *completions*. We - // should skip the frequency if we have a terminal, and report the position - // of the next sibling. We don't have to return other values because we are - // returning false, as in "don't traverse children". - if (isTerminal) pos = BinaryFormat::skipFrequency(flags, pos); - *nextSiblingPosition = - BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); - return false; - } - } else { - int inputIndexForProximity = inputIndex; - - if (transposedPos >= 0) { - if (inputIndex == transposedPos) ++inputIndexForProximity; - if (inputIndex == (transposedPos + 1)) --inputIndexForProximity; - } - - int matchedProximityCharId = mProximityInfo->getMatchedProximityId( - inputIndexForProximity, c, skipPos, excessivePos, transposedPos); - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { - // We found that this is an unrelated character, so we should give up traversing - // this node and its children entirely. - // However we may not be on the last virtual node yet so we skip the remaining - // characters in this node, the frequency if it's there, read the next sibling - // position to output it, then return false. - // We don't have to output other values because we return false, as in - // "don't traverse children". - if (!isLastChar) { - pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); - } - pos = BinaryFormat::skipFrequency(flags, pos); - *nextSiblingPosition = - BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); - return false; - } - mWord[depth] = c; - // If inputIndex is greater than mInputLength, that means there is no - // proximity chars. So, we don't need to check proximity. - if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { - multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight); - } - const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 - || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2); - if (isSameAsUserTypedLength && isTerminal) { - const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); - onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, - excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize); - } - // This character matched the typed character (enough to traverse the node at least) - // so we just evaluated it. Now we should evaluate this virtual node's children - that - // is, if it has any. If it has no children, we're done here - so we skip the end of - // the node, output the siblings position, and return false "don't traverse children". - // Note that !hasChildren implies isLastChar, so we know we don't have to skip any - // remaining char in this group for there can't be any. - if (!hasChildren) { - pos = BinaryFormat::skipFrequency(flags, pos); - *nextSiblingPosition = - BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); - return false; - } - // Start traversing all nodes after the index exceeds the user typed length - traverseAllNodes = isSameAsUserTypedLength; - diffs = diffs - + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); - // Finally, we are ready to go to the next character, the next "virtual node". - // We should advance the input index. - // We do this in this branch of the 'if traverseAllNodes' because we are still matching - // characters to input; the other branch is not matching them but searching for - // completions, this is why it does not have to do it. - ++inputIndex; - } - // Optimization: Prune out words that are too long compared to how much was typed. - if (depth >= maxDepth || diffs > mMaxEditDistance) { - // We are giving up parsing this node and its children. Skip the rest of the node, - // output the sibling position, and return that we don't want to traverse children. + const bool isTerminal = isLastChar && isTerminalNode; + + Correction::CorrectionType stateType = correction->processCharAndCalcState( + c, isTerminal); + if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL + || stateType == Correction::ON_TERMINAL) { + needsToInvokeOnTerminal = true; + } else if (stateType == Correction::UNRELATED) { + // We found that this is an unrelated character, so we should give up traversing + // this node and its children entirely. + // However we may not be on the last virtual node yet so we skip the remaining + // characters in this node, the frequency if it's there, read the next sibling + // position to output it, then return false. + // We don't have to output other values because we return false, as in + // "don't traverse children". if (!isLastChar) { pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); } @@ -1315,23 +707,41 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in // will take care of prefetching the next. If we finally found our last char, nextc will // contain NOT_A_CHARACTER. c = nextc; - // Also, the next char is one "virtual node" depth more than this char. - ++depth; } while (NOT_A_CHARACTER != c); - // If inputIndex is greater than mInputLength, that means there are no proximity chars. - // Here, that's all we are interested in so we don't need to check for isSameAsUserTypedLength. - if (mInputLength <= *newInputIndex) { - traverseAllNodes = true; - } + if (isTerminalNode) { + if (needsToInvokeOnTerminal) { + // The frequency should be here, because we come here only if this is actually + // a terminal node, and we are on its last char. + const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); + onTerminal(freq, mCorrection); + } + + // If there are more chars in this node, then this virtual node has children. + // If we are on the last char, this virtual node has children if this node has. + const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); + + // This character matched the typed character (enough to traverse the node at least) + // so we just evaluated it. Now we should evaluate this virtual node's children - that + // is, if it has any. If it has no children, we're done here - so we skip the end of + // the node, output the siblings position, and return false "don't traverse children". + // Note that !hasChildren implies isLastChar, so we know we don't have to skip any + // remaining char in this group for there can't be any. + if (!hasChildren) { + pos = BinaryFormat::skipFrequency(flags, pos); + *nextSiblingPosition = + BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + return false; + } - // All the output values that are purely computation by this function are held in local - // variables. Output them to the caller. - *newTraverseAllNodes = traverseAllNodes; - *newMatchRate = matchWeight; - *newDiffs = diffs; - *newInputIndex = inputIndex; - *newOutputIndex = depth; + // Optimization: Prune out words that are too long compared to how much was typed. + if (correction->needsToPrune()) { + pos = BinaryFormat::skipFrequency(flags, pos); + *nextSiblingPosition = + BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + return false; + } + } // Now we finished processing this node, and we want to traverse children. If there are no // children, we can't come here. @@ -1351,6 +761,4 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in return true; } -#endif // NEW_DICTIONARY_FORMAT - } // namespace latinime |