diff options
Diffstat (limited to 'native/src/unigram_dictionary.cpp')
-rw-r--r-- | native/src/unigram_dictionary.cpp | 202 |
1 files changed, 89 insertions, 113 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index e998ee486..6a8973761 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -159,19 +159,26 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, } PROF_START(20); + if (DEBUG_DICT) { + double ns = queuePool->getMasterQueue()->getHighestNormalizedScore( + proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); + ns += 0; + AKLOGI("Max normalized score = %f", ns); + } const int suggestedWordsCount = queuePool->getMasterQueue()->outputSuggestions(frequencies, outWords); if (DEBUG_DICT) { + double ns = queuePool->getMasterQueue()->getHighestNormalizedScore( + proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); + ns += 0; AKLOGI("Returning %d words", suggestedWordsCount); /// Print the returned words for (int j = 0; j < suggestedWordsCount; ++j) { -#ifdef FLAG_DBG short unsigned int* w = outWords + j * MAX_WORD_LENGTH; char s[MAX_WORD_LENGTH]; for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; AKLOGI("%s %i", s, frequencies[j]); -#endif } } PROF_END(20); @@ -205,6 +212,13 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_START(4); // Note: This line is intentionally left blank + bool hasAutoCorrectionCandidate = false; + WordsPriorityQueue* masterQueue = queuePool->getMasterQueue(); + if (masterQueue->size() > 0) { + double nsForMaster = masterQueue->getHighestNormalizedScore( + proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0); + hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD); + } PROF_END(4); PROF_START(5); @@ -216,7 +230,8 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, AKLOGI("--- Suggest missing space characters %d", i); } getMissingSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes, - useFullEditDistance, inputLength, i, correction, queuePool); + useFullEditDistance, inputLength, i, correction, queuePool, + hasAutoCorrectionCandidate); } } PROF_END(5); @@ -236,7 +251,8 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, } if (proximityInfo->hasSpaceProximity(x, y)) { getMistypedSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes, - useFullEditDistance, inputLength, i, correction, queuePool); + useFullEditDistance, inputLength, i, correction, queuePool, + hasAutoCorrectionCandidate); } } } @@ -281,12 +297,12 @@ void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool) { initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool, - true /* doAutoCompletion */, DEFAULT_MAX_ERRORS); + true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); } void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool, - const bool doAutoCompletion, const int maxErrors) { + const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) { // TODO: Remove setCorrectionParams correction->setCorrectionParams(0, 0, 0, -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, @@ -305,7 +321,8 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, int firstChildPos; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, - correction, &childCount, &firstChildPos, &siblingPos, queuePool); + correction, &childCount, &firstChildPos, &siblingPos, queuePool, + currentWordIndex); // Update next sibling pos correction->setTreeSiblingPos(outputIndex, siblingPos); @@ -323,31 +340,32 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, void UnigramDictionary::getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, Correction *correction, - WordsPriorityQueuePool* queuePool) { + WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) { getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, missingSpacePos, -1/* spaceProximityPos */, - correction, queuePool); + correction, queuePool, hasAutoCorrectionCandidate); } void UnigramDictionary::getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, Correction *correction, - WordsPriorityQueuePool* queuePool) { + WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) { getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, -1 /* missingSpacePos */, spaceProximityPos, - correction, queuePool); + correction, queuePool, hasAutoCorrectionCandidate); } inline void UnigramDictionary::onTerminal(const int freq, const TerminalAttributes& terminalAttributes, Correction *correction, - WordsPriorityQueuePool *queuePool, const bool addToMasterQueue) { + WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, + const int currentWordIndex) { const int inputIndex = correction->getInputIndex(); const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT; int wordLength; unsigned short* wordPointer; - if (addToMasterQueue) { + if ((currentWordIndex == 1) && addToMasterQueue) { WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); if (finalFreq != NOT_A_FREQUENCY) { @@ -376,9 +394,14 @@ inline void UnigramDictionary::onTerminal(const int freq, // We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH // or more length. if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) { - // TODO: Check the validity of "inputIndex == wordLength" - //if (addToSubQueue && inputIndex == wordLength) { - WordsPriorityQueue *subQueue = queuePool->getSubQueue1(inputIndex); + WordsPriorityQueue *subQueue; + if (currentWordIndex == 1) { + subQueue = queuePool->getSubQueue1(inputIndex); + } else if (currentWordIndex == 2) { + subQueue = queuePool->getSubQueue2(inputIndex); + } else { + return; + } const int finalFreq = correction->getFinalFreqForSubQueue(freq, &wordPointer, &wordLength, inputIndex); addWord(wordPointer, wordLength, finalFreq, subQueue); @@ -388,17 +411,21 @@ inline void UnigramDictionary::onTerminal(const int freq, void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, - const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) { + const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool, + const bool hasAutoCorrectionCandidate) { if (inputLength >= MAX_WORD_LENGTH) return; if (DEBUG_DICT) { int inputCount = 0; if (spaceProximityPos >= 0) ++inputCount; if (missingSpacePos >= 0) ++inputCount; assert(inputCount <= 1); + // MAX_PROXIMITY_CHARS_SIZE in ProximityInfo.java should be 16 + assert(MAX_PROXIMITY_CHARS == 16); } + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, + inputLength, correction); WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); - const bool isSpaceProximity = spaceProximityPos >= 0; // First word @@ -411,26 +438,22 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo if (firstFreq > 0) { firstOutputWordLength = firstInputWordLength; firstOutputWord = mWord; - } else { - if (masterQueue->size() > 0) { - double nsForMaster = masterQueue->getHighestNormalizedScore( - proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0); - if (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD) { - // Do nothing if the highest suggestion exceeds the threshold. - return; - } - } + } else if (!hasAutoCorrectionCandidate) { WordsPriorityQueue* firstWordQueue = queuePool->getSubQueue1(firstInputWordLength); - if (firstWordQueue->size() < 1) { + if (!firstWordQueue || firstWordQueue->size() < 1) { return; } int score = 0; const double ns = firstWordQueue->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), firstInputWordLength, &firstOutputWord, &score, &firstOutputWordLength); + if (DEBUG_DICT) { + AKLOGI("NS1 = %f, Score = %d", ns, score); + } // Two words correction won't be done if the score of the first word doesn't exceed the // threshold. - if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD) { + if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD + || firstOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) { return; } firstFreq = score >> (firstOutputWordLength @@ -456,14 +479,6 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo outputWord[firstOutputWordLength] = SPACE; outputWordLength = firstOutputWordLength + 1; - //const int outputWordLength = firstOutputWordLength + secondWordLength + 1; - // Space proximity preparation - //WordsPriorityQueue *subQueue = queuePool->getSubQueue1(); - //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstOutputWordLength, - //subQueue, correction); - //getSuggestionCandidates(useFullEditDistance, firstOutputWordLength, correction, subQueue, - //false, MAX_ERRORS_FOR_TWO_WORDS); - // Second word const int secondInputWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) @@ -478,9 +493,42 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo if (secondFreq > 0) { secondOutputWordLength = secondInputWordLength; secondOutputWord = mWord; + } else if (!hasAutoCorrectionCandidate) { + const int offset = secondInputWordStartPos; + initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset], + codes + offset * MAX_PROXIMITY_CHARS, secondInputWordLength, correction); + queuePool->clearSubQueue2(); + getSuggestionCandidates(useFullEditDistance, secondInputWordLength, correction, + queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, SECOND_WORD_INDEX); + if (DEBUG_DICT) { + AKLOGI("Dump second word candidates %d", secondInputWordLength); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + queuePool->getSubQueue2(i)->dumpTopWord(); + } + } + WordsPriorityQueue* secondWordQueue = queuePool->getSubQueue2(secondInputWordLength); + if (!secondWordQueue || secondWordQueue->size() < 1) { + return; + } + int score = 0; + const double ns = secondWordQueue->getHighestNormalizedScore( + proximityInfo->getPrimaryInputWord(), secondInputWordLength, + &secondOutputWord, &score, &secondOutputWordLength); + if (DEBUG_DICT) { + AKLOGI("NS2 = %f, Score = %d", ns, score); + } + // Two words correction won't be done if the score of the first word doesn't exceed the + // threshold. + if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD + || secondOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) { + return; + } + secondFreq = score >> (secondOutputWordLength + + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER); } if (DEBUG_DICT) { + DUMP_WORD(secondOutputWord, secondOutputWordLength); AKLOGI("Second freq: %d", secondFreq); } @@ -509,80 +557,6 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo return; } -void UnigramDictionary::getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo, - const int *xcoordinates, const int *ycoordinates, const int *codes, - const bool useFullEditDistance, const int inputLength, const int missingSpacePos, - const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) { - WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); - - if (DEBUG_DICT) { - int inputCount = 0; - if (spaceProximityPos >= 0) ++inputCount; - if (missingSpacePos >= 0) ++inputCount; - assert(inputCount <= 1); - } - const bool isSpaceProximity = spaceProximityPos >= 0; - const int firstWordStartPos = 0; - const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos; - const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; - const int secondWordLength = isSpaceProximity - ? (inputLength - spaceProximityPos - 1) - : (inputLength - missingSpacePos); - - if (inputLength >= MAX_WORD_LENGTH) return; - if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos - || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) - return; - - const int newWordLength = firstWordLength + secondWordLength + 1; - - - // Space proximity preparation - //WordsPriorityQueue *subQueue = queuePool->getSubQueue1(); - //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue, - //correction); - //getSuggestionCandidates(useFullEditDistance, firstWordLength, correction, subQueue, false, - //MAX_ERRORS_FOR_TWO_WORDS); - - // Allocating variable length array on stack - unsigned short word[newWordLength]; - const int firstFreq = getMostFrequentWordLike( - firstWordStartPos, firstWordLength, proximityInfo, mWord); - if (DEBUG_DICT) { - AKLOGI("First freq: %d", firstFreq); - } - if (firstFreq <= 0) return; - - for (int i = 0; i < firstWordLength; ++i) { - word[i] = mWord[i]; - } - - const int secondFreq = getMostFrequentWordLike( - secondWordStartPos, secondWordLength, proximityInfo, mWord); - if (DEBUG_DICT) { - AKLOGI("Second freq: %d", secondFreq); - } - if (secondFreq <= 0) return; - - word[firstWordLength] = SPACE; - for (int i = (firstWordLength + 1); i < newWordLength; ++i) { - word[i] = mWord[i - firstWordLength - 1]; - } - - // TODO: Remove initSuggestions and correction->setCorrectionParams - initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); - - correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, - -1 /* transposedPos */, spaceProximityPos, missingSpacePos, - useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS); - const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word); - if (DEBUG_DICT) { - AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); - } - addWord(word, newWordLength, pairFreq, masterQueue); - return; -} - // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous // interface. inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, @@ -742,7 +716,8 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // given level, as output into newCount when traversing this level's parent. inline bool UnigramDictionary::processCurrentNode(const int initialPos, Correction *correction, int *newCount, - int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool) { + int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, + const int currentWordIndex) { if (DEBUG_DICT) { correction->checkState(); } @@ -823,7 +798,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); - onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal); + onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, + currentWordIndex); // If there are more chars in this node, then this virtual node has children. // If we are on the last char, this virtual node has children if this node has. |