diff options
Diffstat (limited to 'native/src/unigram_dictionary.cpp')
-rw-r--r-- | native/src/unigram_dictionary.cpp | 188 |
1 files changed, 147 insertions, 41 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index e95e03ce5..8be95bc40 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -47,7 +47,7 @@ UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typed BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)), MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) { if (DEBUG_DICT) { - LOGI("UnigramDictionary - constructor"); + AKLOGI("UnigramDictionary - constructor"); } } @@ -163,14 +163,14 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, queuePool->getMasterQueue()->outputSuggestions(frequencies, outWords); if (DEBUG_DICT) { - LOGI("Returning %d words", suggestedWordsCount); + AKLOGI("Returning %d words", suggestedWordsCount); /// Print the returned words for (int j = 0; j < suggestedWordsCount; ++j) { #ifdef FLAG_DBG short unsigned int* w = outWords + j * MAX_WORD_LENGTH; char s[MAX_WORD_LENGTH]; for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; - LOGI("%s %i", s, frequencies[j]); + AKLOGI("%s %i", s, frequencies[j]); #endif } } @@ -186,7 +186,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_OPEN; PROF_START(0); - // Note: This line is intentionally left blank + queuePool->clearAll(); PROF_END(0); PROF_START(1); @@ -213,7 +213,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) { for (int i = 1; i < inputLength; ++i) { if (DEBUG_DICT) { - LOGI("--- Suggest missing space characters %d", i); + AKLOGI("--- Suggest missing space characters %d", i); } getMissingSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, i, correction, queuePool); @@ -226,12 +226,12 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, // The first and last "mistyped spaces" are taken care of by excessive character handling for (int i = 1; i < inputLength - 1; ++i) { if (DEBUG_DICT) { - LOGI("--- Suggest words with proximity space %d", i); + AKLOGI("--- Suggest words with proximity space %d", i); } const int x = xcoordinates[i]; const int y = ycoordinates[i]; if (DEBUG_PROXIMITY_INFO) { - LOGI("Input[%d] x = %d, y = %d, has space proximity = %d", + AKLOGI("Input[%d] x = %d, y = %d, has space proximity = %d", i, x, y, proximityInfo->hasSpaceProximity(x, y)); } if (proximityInfo->hasSpaceProximity(x, y)) { @@ -241,18 +241,33 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, } } PROF_END(6); + if (DEBUG_DICT) { + queuePool->dumpSubQueue1TopSuggestions(); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + WordsPriorityQueue* queue = queuePool->getSubQueue1(i); + if (queue->size() > 0) { + WordsPriorityQueue::SuggestedWord* sw = queue->top(); + const int score = sw->mScore; + const unsigned short* word = sw->mWord; + const int wordLength = sw->mWordLength; + double ns = Correction::RankingAlgorithm::calcNormalizedScore( + proximityInfo->getPrimaryInputWord(), i, word, wordLength, score); + ns += 0; + AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns, + (ns > TWO_WORDS_CORRECTION_THRESHOLD)); + DUMP_WORD(proximityInfo->getPrimaryInputWord(), i); + DUMP_WORD(word, wordLength); + } + } + } } void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates, - const int *yCoordinates, const int *codes, const int inputLength, - WordsPriorityQueue *queue, Correction *correction) { + const int *yCoordinates, const int *codes, const int inputLength, Correction *correction) { if (DEBUG_DICT) { - LOGI("initSuggest"); + AKLOGI("initSuggest"); } proximityInfo->setInputParams(codes, inputLength, xCoordinates, yCoordinates); - if (queue) { - queue->clear(); - } const int maxDepth = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); correction->initCorrection(proximityInfo, inputLength, maxDepth); } @@ -264,15 +279,13 @@ void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool) { - WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); - initSuggestions( - proximityInfo, xcoordinates, ycoordinates, codes, inputLength, masterQueue, correction); - getSuggestionCandidates(useFullEditDistance, inputLength, correction, masterQueue, + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); + getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS); } void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, - const int inputLength, Correction *correction, WordsPriorityQueue *queue, + const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool, const bool doAutoCompletion, const int maxErrors) { // TODO: Remove setCorrectionParams correction->setCorrectionParams(0, 0, 0, @@ -280,7 +293,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, doAutoCompletion, maxErrors); int rootPosition = ROOT_POS; // Get the number of children of root, then increment the position - int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); + int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition); int outputIndex = 0; correction->initCorrectionState(rootPosition, childCount, (inputLength <= 0)); @@ -292,7 +305,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, int firstChildPos; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, - correction, &childCount, &firstChildPos, &siblingPos, queue); + correction, &childCount, &firstChildPos, &siblingPos, queuePool); // Update next sibling pos correction->setTreeSiblingPos(outputIndex, siblingPos); @@ -327,14 +340,34 @@ void UnigramDictionary::getMistypedSpaceWords(ProximityInfo *proximityInfo, cons inline void UnigramDictionary::onTerminal(const int freq, const TerminalAttributes& terminalAttributes, Correction *correction, - WordsPriorityQueue *queue) { + WordsPriorityQueuePool *queuePool, const bool addToMasterQueue) { + const int inputIndex = correction->getInputIndex(); + const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT; + if (!addToMasterQueue && !addToSubQueue) { + return; + } + WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); + WordsPriorityQueue *subQueue = queuePool->getSubQueue1(inputIndex); int wordLength; unsigned short* wordPointer; const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); - if (finalFreq >= 0) { + if (finalFreq != NOT_A_FREQUENCY) { if (!terminalAttributes.isShortcutOnly()) { - addWord(wordPointer, wordLength, finalFreq, queue); + if (addToMasterQueue) { + addWord(wordPointer, wordLength, finalFreq, masterQueue); + } + // TODO: Check the validity of "inputIndex == wordLength" + //if (addToSubQueue && inputIndex == wordLength) { + if (addToSubQueue) { + addWord(wordPointer, wordLength, finalFreq, subQueue); + } + } + // Please note that the shortcut candidates will be added to the master queue only. + if (!addToMasterQueue) { + return; } + + // From here, below is the code to add shortcut candidates. TerminalAttributes::ShortcutIterator iterator = terminalAttributes.getShortcutIterator(); while (iterator.hasNextShortcutTarget()) { // TODO: addWord only supports weak ordering, meaning we have no means to control the @@ -345,7 +378,7 @@ inline void UnigramDictionary::onTerminal(const int freq, uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; const int shortcutTargetStringLength = iterator.getNextShortcutTarget( MAX_WORD_LENGTH_INTERNAL, shortcutTarget); - addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, queue); + addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue); } } } @@ -390,7 +423,81 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo const int firstFreq = getMostFrequentWordLike( firstWordStartPos, firstWordLength, proximityInfo, mWord); if (DEBUG_DICT) { - LOGI("First freq: %d", firstFreq); + AKLOGI("First freq: %d", firstFreq); + } + if (firstFreq <= 0) return; + + for (int i = 0; i < firstWordLength; ++i) { + word[i] = mWord[i]; + } + + const int secondFreq = getMostFrequentWordLike( + secondWordStartPos, secondWordLength, proximityInfo, mWord); + if (DEBUG_DICT) { + AKLOGI("Second freq: %d", secondFreq); + } + if (secondFreq <= 0) return; + + word[firstWordLength] = SPACE; + for (int i = (firstWordLength + 1); i < newWordLength; ++i) { + word[i] = mWord[i - firstWordLength - 1]; + } + + // TODO: Remove initSuggestions and correction->setCorrectionParams + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); + + correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, + -1 /* transposedPos */, spaceProximityPos, missingSpacePos, + useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS); + const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word); + if (DEBUG_DICT) { + AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); + } + addWord(word, newWordLength, pairFreq, masterQueue); + return; +} + +void UnigramDictionary::getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, const int missingSpacePos, + const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) { + WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); + + if (DEBUG_DICT) { + int inputCount = 0; + if (spaceProximityPos >= 0) ++inputCount; + if (missingSpacePos >= 0) ++inputCount; + assert(inputCount <= 1); + } + const bool isSpaceProximity = spaceProximityPos >= 0; + const int firstWordStartPos = 0; + const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos; + const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; + const int secondWordLength = isSpaceProximity + ? (inputLength - spaceProximityPos - 1) + : (inputLength - missingSpacePos); + + if (inputLength >= MAX_WORD_LENGTH) return; + if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos + || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) + return; + + const int newWordLength = firstWordLength + secondWordLength + 1; + + + // Space proximity preparation + //WordsPriorityQueue *subQueue = queuePool->getSubQueue1(); + //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue, + //correction); + //getSuggestionCandidates(useFullEditDistance, firstWordLength, correction, subQueue, false, + //MAX_ERRORS_FOR_TWO_WORDS); + + // Allocating variable length array on stack + unsigned short word[newWordLength]; + const int firstFreq = getMostFrequentWordLike( + firstWordStartPos, firstWordLength, proximityInfo, mWord); + if (DEBUG_DICT) { + AKLOGI("First freq: %d", firstFreq); } if (firstFreq <= 0) return; @@ -401,7 +508,7 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo const int secondFreq = getMostFrequentWordLike( secondWordStartPos, secondWordLength, proximityInfo, mWord); if (DEBUG_DICT) { - LOGI("Second freq: %d", secondFreq); + AKLOGI("Second freq: %d", secondFreq); } if (secondFreq <= 0) return; @@ -411,15 +518,14 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo } // TODO: Remove initSuggestions and correction->setCorrectionParams - initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, - 0 /* do not clear queue */, correction); + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, -1 /* transposedPos */, spaceProximityPos, missingSpacePos, useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS); const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word); if (DEBUG_DICT) { - LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); + AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); } addWord(word, newWordLength, pairFreq, masterQueue); return; @@ -507,9 +613,10 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor int maxFreq = -1; const uint8_t* const root = DICT_ROOT; - mStackChildCount[0] = root[0]; + int startPos = 0; + mStackChildCount[0] = BinaryFormat::getGroupCountAndForwardPointer(root, &startPos); mStackInputIndex[0] = 0; - mStackSiblingPos[0] = 1; + mStackSiblingPos[0] = startPos; while (depth >= 0) { const int charGroupCount = mStackChildCount[depth]; int pos = mStackSiblingPos[depth]; @@ -583,7 +690,7 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // given level, as output into newCount when traversing this level's parent. inline bool UnigramDictionary::processCurrentNode(const int initialPos, Correction *correction, int *newCount, - int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueue *queue) { + int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool) { if (DEBUG_DICT) { correction->checkState(); } @@ -658,14 +765,13 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, } while (NOT_A_CHARACTER != c); if (isTerminalNode) { - if (needsToInvokeOnTerminal) { - // The frequency should be here, because we come here only if this is actually - // a terminal node, and we are on its last char. - const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); - TerminalAttributes terminalAttributes(DICT_ROOT, flags, - BinaryFormat::skipFrequency(flags, pos)); - onTerminal(freq, terminalAttributes, correction, queue); - } + // The frequency should be here, because we come here only if this is actually + // a terminal node, and we are on its last char. + const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); + const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); + const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); + TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); + onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal); // If there are more chars in this node, then this virtual node has children. // If we are on the last char, this virtual node has children if this node has. @@ -690,7 +796,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); if (DEBUG_DICT_FULL) { - LOGI("Traversing was pruned."); + AKLOGI("Traversing was pruned."); } return false; } |