diff options
Diffstat (limited to 'native/jni/src/unigram_dictionary.cpp')
-rw-r--r-- | native/jni/src/unigram_dictionary.cpp | 93 |
1 files changed, 52 insertions, 41 deletions
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index ab8570e6f..0c759d438 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -98,7 +98,8 @@ int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, cons void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codesBuffer, int *xCoordinatesBuffer, int *yCoordinatesBuffer, - const int codesBufferSize, const bool useFullEditDistance, const int *codesSrc, + const int codesBufferSize, const int bigramListPosition, + const bool useFullEditDistance, const int *codesSrc, const int codesRemain, const int currentDepth, int *codesDest, Correction *correction, WordsPriorityQueuePool *queuePool, const digraph_t* const digraphs, const unsigned int digraphsSize) { @@ -127,8 +128,8 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit replacementCodePoint; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, - useFullEditDistance, codesSrc + i + 1, codesRemain - i - 1, - currentDepth + 1, codesDest + i, correction, + bigramListPosition, useFullEditDistance, codesSrc + i + 1, + codesRemain - i - 1, currentDepth + 1, codesDest + i, correction, queuePool, digraphs, digraphsSize); // Copy the second char of the digraph in place, then continue processing on @@ -137,9 +138,9 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit memcpy(codesDest + i, codesSrc + i, BYTES_IN_ONE_CHAR); getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, - useFullEditDistance, codesSrc + i, codesRemain - i, currentDepth + 1, - codesDest + i, correction, queuePool, - digraphs, digraphsSize); + bigramListPosition, useFullEditDistance, codesSrc + i, codesRemain - i, + currentDepth + 1, codesDest + i, correction, queuePool, digraphs, + digraphsSize); return; } } @@ -160,14 +161,16 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit } getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer, - startIndex + codesRemain, useFullEditDistance, correction, + startIndex + codesRemain, bigramListPosition, useFullEditDistance, correction, queuePool); } +// bigramListPosition is the offset in the file to the list of bigrams for the previous word. int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, - const bool useFullEditDistance, unsigned short *outWords, int *frequencies) { + const int bigramListPosition, const bool useFullEditDistance, unsigned short *outWords, + int *frequencies) { queuePool->clearAll(); Correction* masterCorrection = correction; @@ -177,8 +180,8 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int xCoordinatesBuffer[codesSize]; int yCoordinatesBuffer[codesSize]; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, - xCoordinatesBuffer, yCoordinatesBuffer, - codesSize, useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, + xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramListPosition, + useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool, GERMAN_UMLAUT_DIGRAPHS, sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0])); } else if (BinaryFormat::REQUIRES_FRENCH_LIGATURES_PROCESSING & FLAGS) { @@ -186,13 +189,13 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int xCoordinatesBuffer[codesSize]; int yCoordinatesBuffer[codesSize]; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, - xCoordinatesBuffer, yCoordinatesBuffer, - codesSize, useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, + xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramListPosition, + useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool, FRENCH_LIGATURES_DIGRAPHS, sizeof(FRENCH_LIGATURES_DIGRAPHS) / sizeof(FRENCH_LIGATURES_DIGRAPHS[0])); } else { // Normal processing getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, - useFullEditDistance, masterCorrection, queuePool); + bigramListPosition, useFullEditDistance, masterCorrection, queuePool); } PROF_START(20); @@ -225,16 +228,16 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const int inputLength, const bool useFullEditDistance, Correction *correction, - WordsPriorityQueuePool *queuePool) { + const int inputLength, const int bigramListPosition, const bool useFullEditDistance, + Correction *correction, WordsPriorityQueuePool *queuePool) { PROF_OPEN; PROF_START(0); PROF_END(0); PROF_START(1); - getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, - inputLength, correction, queuePool); + getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramListPosition, + useFullEditDistance, inputLength, correction, queuePool); PROF_END(1); PROF_START(2); @@ -305,15 +308,16 @@ static const char SPACE = ' '; void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const bool useFullEditDistance, const int inputLength, Correction *correction, - WordsPriorityQueuePool *queuePool) { + const int bigramListPosition, const bool useFullEditDistance, const int inputLength, + Correction *correction, WordsPriorityQueuePool *queuePool) { initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); - getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool, - true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); + getSuggestionCandidates(useFullEditDistance, inputLength, bigramListPosition, correction, + queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); } void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, - const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool, + const int inputLength, const int bigramListPosition, + Correction *correction, WordsPriorityQueuePool *queuePool, const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) { // TODO: Remove setCorrectionParams correction->setCorrectionParams(0, 0, 0, @@ -333,8 +337,8 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, int firstChildPos; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, - correction, &childCount, &firstChildPos, &siblingPos, queuePool, - currentWordIndex); + bigramListPosition, correction, &childCount, &firstChildPos, &siblingPos, + queuePool, currentWordIndex); // Update next sibling pos correction->setTreeSiblingPos(outputIndex, siblingPos); @@ -349,7 +353,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, } } -inline void UnigramDictionary::onTerminal(const int freq, +inline void UnigramDictionary::onTerminal(const int probability, const TerminalAttributes& terminalAttributes, Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, const int currentWordIndex) { @@ -361,26 +365,28 @@ inline void UnigramDictionary::onTerminal(const int freq, if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) { WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); - const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); - if (finalFreq != NOT_A_FREQUENCY) { - addWord(wordPointer, wordLength, finalFreq, masterQueue); + const int finalProbability = + correction->getFinalProbability(probability, &wordPointer, &wordLength); + if (finalProbability != NOT_A_PROBABILITY) { + addWord(wordPointer, wordLength, finalProbability, masterQueue); - const int shortcutFreq = finalFreq > 0 ? finalFreq - 1 : 0; + const int shortcutProbability = finalProbability > 0 ? finalProbability - 1 : 0; // Please note that the shortcut candidates will be added to the master queue only. TerminalAttributes::ShortcutIterator iterator = terminalAttributes.getShortcutIterator(); while (iterator.hasNextShortcutTarget()) { // TODO: addWord only supports weak ordering, meaning we have no means // to control the order of the shortcuts relative to one another or to the word. - // We need to either modulate the frequency of each shortcut according - // to its own shortcut frequency or to make the queue + // We need to either modulate the probability of each shortcut according + // to its own shortcut probability or to make the queue // so that the insert order is protected inside the queue for words // with the same score. For the moment we use -1 to make sure the shortcut will // never be in front of the word. uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; const int shortcutTargetStringLength = iterator.getNextShortcutTarget( MAX_WORD_LENGTH_INTERNAL, shortcutTarget); - addWord(shortcutTarget, shortcutTargetStringLength, shortcutFreq, masterQueue); + addWord(shortcutTarget, shortcutTargetStringLength, shortcutProbability, + masterQueue); } } } @@ -393,9 +399,9 @@ inline void UnigramDictionary::onTerminal(const int freq, if (!subQueue) { return; } - const int finalFreq = correction->getFinalFreqForSubQueue(freq, &wordPointer, &wordLength, - inputIndex); - addWord(wordPointer, wordLength, finalFreq, subQueue); + const int finalProbability = correction->getFinalProbabilityForSubQueue( + probability, &wordPointer, &wordLength, inputIndex); + addWord(wordPointer, wordLength, finalProbability, subQueue); } } @@ -424,8 +430,10 @@ bool UnigramDictionary::getSubStringSuggestion( initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset], codes + offset, inputWordLength, correction); queuePool->clearSubQueue(currentWordIndex); - getSuggestionCandidates(useFullEditDistance, inputWordLength, correction, - queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); + // TODO: pass the bigram list for substring suggestion + getSuggestionCandidates(useFullEditDistance, inputWordLength, + 0 /* bigramListPosition */, correction, queuePool, false /* doAutoCompletion */, + MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); if (DEBUG_DICT) { if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) { AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength); @@ -730,7 +738,7 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor return maxFreq; } -bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const { +bool UnigramDictionary::isValidWord(const int32_t* const inWord, const int length) const { return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length); } @@ -755,7 +763,7 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any // given level, as output into newCount when traversing this level's parent. inline bool UnigramDictionary::processCurrentNode(const int initialPos, - Correction *correction, int *newCount, + const int bigramListPosition, Correction *correction, int *newCount, int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, const int currentWordIndex) { if (DEBUG_DICT) { @@ -834,11 +842,14 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, if (isTerminalNode) { // The frequency should be here, because we come here only if this is actually // a terminal node, and we are on its last char. - const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); + const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); - onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, + // The bigramListPosition is the offset in the file of the bigrams for the previous word, + // or zero if we don't know of any bigrams for it. + const int probability = BinaryFormat::getProbability(bigramListPosition, unigramFreq); + onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); // If there are more chars in this node, then this virtual node has children. |