diff options
Diffstat (limited to 'native/jni/src')
-rw-r--r-- | native/jni/src/bigram_dictionary.cpp | 4 | ||||
-rw-r--r-- | native/jni/src/bigram_dictionary.h | 4 | ||||
-rw-r--r-- | native/jni/src/binary_format.h | 15 | ||||
-rw-r--r-- | native/jni/src/correction.cpp | 26 | ||||
-rw-r--r-- | native/jni/src/correction.h | 14 | ||||
-rw-r--r-- | native/jni/src/defines.h | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary.cpp | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary.h | 12 | ||||
-rw-r--r-- | native/jni/src/unigram_dictionary.cpp | 93 | ||||
-rw-r--r-- | native/jni/src/unigram_dictionary.h | 29 |
10 files changed, 113 insertions, 88 deletions
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index 320b0af68..927381fdb 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -96,7 +96,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ * and the bigrams are used to boost unigram result scores, it makes little sense to * reduce their scope to the ones that match the first letter. */ -int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, +int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, int *codes, int codesSize, unsigned short *bigramChars, int *bigramFreq, int maxWordLength, int maxBigrams) { // TODO: remove unused arguments, and refrain from storing stuff in members of this class @@ -134,7 +134,7 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i // Returns a pointer to the start of the bigram list. // If the word is not found or has no bigrams, this function returns 0. int BigramDictionary::getBigramListForWord(const uint8_t* const root, - const unsigned short *prevWord, const int prevWordLength) { + const int32_t *prevWord, const int prevWordLength) { int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength); if (NOT_VALID_WORD == pos) return 0; diff --git a/native/jni/src/bigram_dictionary.h b/native/jni/src/bigram_dictionary.h index 1612131c4..07e47f059 100644 --- a/native/jni/src/bigram_dictionary.h +++ b/native/jni/src/bigram_dictionary.h @@ -25,10 +25,10 @@ class Dictionary; class BigramDictionary { public: BigramDictionary(const unsigned char *dict, int maxWordLength, Dictionary *parentDictionary); - int getBigrams(unsigned short *word, int length, int *codes, int codesSize, + int getBigrams(const int32_t *word, int length, int *codes, int codesSize, unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams); int getBigramListForWord(const uint8_t* const root, - const unsigned short *prevWord, const int prevWordLength); + const int32_t *prevWord, const int prevWordLength); ~BigramDictionary(); private: bool addWordBigram(unsigned short *word, int length, int frequency); diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index f59302460..d5d67c108 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -62,10 +62,11 @@ class BinaryFormat { static bool hasChildrenInFlags(const uint8_t flags); static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags, int *pos); - static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord, + static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord, const int length); static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, uint16_t* outWord); + static int getProbability(const int bigramListPosition, const int unigramFreq); // Flags for special processing // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or @@ -304,7 +305,7 @@ inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* con // This function gets the byte position of the last chargroup of the exact matching word in the // dictionary. If no match is found, it returns NOT_VALID_WORD. inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, - const uint16_t* const inWord, const int length) { + const int32_t* const inWord, const int length) { int pos = 0; int wordPos = 0; @@ -313,7 +314,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, // there was no match (or we would have found it). if (wordPos > length) return NOT_VALID_WORD; int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); - const uint16_t wChar = inWord[wordPos]; + const int32_t wChar = inWord[wordPos]; while (true) { // If there are no more character groups in this node, it means we could not // find a matching character for this depth, therefore there is no match. @@ -517,6 +518,14 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a return 0; } +// This should probably return a probability in log space. +inline int BinaryFormat::getProbability(const int bigramListPosition, const int unigramFreq) { + // TODO: use the bigram list position to get the bigram probability. If the bigram + // is not found, use the unigram frequency. + // TODO: if the unigram frequency is used, compute the actual probability + return unigramFreq; +} + } // namespace latinime #endif // LATINIME_BINARY_FORMAT_H diff --git a/native/jni/src/correction.cpp b/native/jni/src/correction.cpp index 087219ed4..376e9a10e 100644 --- a/native/jni/src/correction.cpp +++ b/native/jni/src/correction.cpp @@ -165,28 +165,28 @@ int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wo wordCount, this, isSpaceProximity, word); } -int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) { - return getFinalFreqInternal(freq, word, wordLength, mInputLength); +int Correction::getFinalProbability(const int probability, unsigned short **word, int *wordLength) { + return getFinalProbabilityInternal(probability, word, wordLength, mInputLength); } -int Correction::getFinalFreqForSubQueue(const int freq, unsigned short **word, int *wordLength, - const int inputLength) { - return getFinalFreqInternal(freq, word, wordLength, inputLength); +int Correction::getFinalProbabilityForSubQueue(const int probability, unsigned short **word, + int *wordLength, const int inputLength) { + return getFinalProbabilityInternal(probability, word, wordLength, inputLength); } -int Correction::getFinalFreqInternal(const int freq, unsigned short **word, int *wordLength, - const int inputLength) { +int Correction::getFinalProbabilityInternal(const int probability, unsigned short **word, + int *wordLength, const int inputLength) { const int outputIndex = mTerminalOutputIndex; const int inputIndex = mTerminalInputIndex; *wordLength = outputIndex + 1; if (outputIndex < MIN_SUGGEST_DEPTH) { - return NOT_A_FREQUENCY; + return NOT_A_PROBABILITY; } *word = mWord; - int finalFreq = Correction::RankingAlgorithm::calculateFinalFreq( - inputIndex, outputIndex, freq, mEditDistanceTable, this, inputLength); - return finalFreq; + int finalProbability= Correction::RankingAlgorithm::calculateFinalProbability( + inputIndex, outputIndex, probability, mEditDistanceTable, this, inputLength); + return finalProbability; } bool Correction::initProcessState(const int outputIndex) { @@ -649,8 +649,8 @@ inline static bool isUpperCase(unsigned short c) { ////////////////////// /* static */ -int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex, - const int freq, int* editDistanceTable, const Correction* correction, +int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex, + const int outputIndex, const int freq, int* editDistanceTable, const Correction* correction, const int inputLength) { const int excessivePos = correction->getExcessivePos(); const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; diff --git a/native/jni/src/correction.h b/native/jni/src/correction.h index ee55c9604..1b4e4bf4e 100644 --- a/native/jni/src/correction.h +++ b/native/jni/src/correction.h @@ -132,9 +132,9 @@ class Correction { int getFreqForSplitMultipleWords( const int *freqArray, const int *wordLengthArray, const int wordCount, const bool isSpaceProximity, const unsigned short *word); - int getFinalFreq(const int freq, unsigned short **word, int* wordLength); - int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength, - const int inputLength); + int getFinalProbability(const int probability, unsigned short **word, int* wordLength); + int getFinalProbabilityForSubQueue(const int probability, unsigned short **word, + int* wordLength, const int inputLength); CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal); @@ -156,8 +156,8 @@ class Correction { class RankingAlgorithm { public: - static int calculateFinalFreq(const int inputIndex, const int depth, - const int freq, int *editDistanceTable, const Correction* correction, + static int calculateFinalProbability(const int inputIndex, const int depth, + const int probability, int *editDistanceTable, const Correction* correction, const int inputLength); static int calcFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray, const int wordCount, const Correction* correction, const bool isSpaceProximity, @@ -182,8 +182,8 @@ class Correction { const int32_t c, const bool isTerminal, const bool inputIndexIncremented); inline CorrectionType processUnrelatedCorrectionType(); inline void addCharToCurrentWord(const int32_t c); - inline int getFinalFreqInternal(const int freq, unsigned short **word, int* wordLength, - const int inputLength); + inline int getFinalProbabilityInternal(const int probability, unsigned short **word, + int* wordLength, const int inputLength); const int TYPED_LETTER_MULTIPLIER; const int FULL_WORD_MULTIPLIER; diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index e882c3714..c99f8a8b2 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -172,7 +172,7 @@ static inline void prof_out(void) { #define PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO -3 #define ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO -4 #define NOT_AN_INDEX -1 -#define NOT_A_FREQUENCY -1 +#define NOT_A_PROBABILITY -1 #define KEYCODE_SPACE ' ' diff --git a/native/jni/src/dictionary.cpp b/native/jni/src/dictionary.cpp index 90ec207f0..9dc207223 100644 --- a/native/jni/src/dictionary.cpp +++ b/native/jni/src/dictionary.cpp @@ -54,7 +54,7 @@ Dictionary::~Dictionary() { delete mBigramDictionary; } -bool Dictionary::isValidWord(unsigned short *word, int length) { +bool Dictionary::isValidWord(const int32_t *word, int length) { return mUnigramDictionary->isValidWord(word, length); } diff --git a/native/jni/src/dictionary.h b/native/jni/src/dictionary.h index 66a5c2150..5b9ddb3e9 100644 --- a/native/jni/src/dictionary.h +++ b/native/jni/src/dictionary.h @@ -35,18 +35,22 @@ class Dictionary { int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates, int *codes, int codesSize, bool useFullEditDistance, unsigned short *outWords, int *frequencies) { + // bigramListPosition is, as an int, the offset of the bigram list in the file. + // If none, it's zero. + // TODO: get this from the bigram dictionary instance + const int bigramListPosition = 0; return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool, - mCorrection, xcoordinates, ycoordinates, codes, - codesSize, useFullEditDistance, outWords, frequencies); + mCorrection, xcoordinates, ycoordinates, codes, codesSize, bigramListPosition, + useFullEditDistance, outWords, frequencies); } - int getBigrams(unsigned short *word, int length, int *codes, int codesSize, + int getBigrams(const int32_t *word, int length, int *codes, int codesSize, unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams) { return mBigramDictionary->getBigrams(word, length, codes, codesSize, outWords, frequencies, maxWordLength, maxBigrams); } - bool isValidWord(unsigned short *word, int length); + bool isValidWord(const int32_t *word, int length); void *getDict() { return (void *)mDict; } int getDictSize() { return mDictSize; } int getMmapFd() { return mMmapFd; } diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index ab8570e6f..0c759d438 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -98,7 +98,8 @@ int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, cons void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codesBuffer, int *xCoordinatesBuffer, int *yCoordinatesBuffer, - const int codesBufferSize, const bool useFullEditDistance, const int *codesSrc, + const int codesBufferSize, const int bigramListPosition, + const bool useFullEditDistance, const int *codesSrc, const int codesRemain, const int currentDepth, int *codesDest, Correction *correction, WordsPriorityQueuePool *queuePool, const digraph_t* const digraphs, const unsigned int digraphsSize) { @@ -127,8 +128,8 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit replacementCodePoint; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, - useFullEditDistance, codesSrc + i + 1, codesRemain - i - 1, - currentDepth + 1, codesDest + i, correction, + bigramListPosition, useFullEditDistance, codesSrc + i + 1, + codesRemain - i - 1, currentDepth + 1, codesDest + i, correction, queuePool, digraphs, digraphsSize); // Copy the second char of the digraph in place, then continue processing on @@ -137,9 +138,9 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit memcpy(codesDest + i, codesSrc + i, BYTES_IN_ONE_CHAR); getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, - useFullEditDistance, codesSrc + i, codesRemain - i, currentDepth + 1, - codesDest + i, correction, queuePool, - digraphs, digraphsSize); + bigramListPosition, useFullEditDistance, codesSrc + i, codesRemain - i, + currentDepth + 1, codesDest + i, correction, queuePool, digraphs, + digraphsSize); return; } } @@ -160,14 +161,16 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit } getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer, - startIndex + codesRemain, useFullEditDistance, correction, + startIndex + codesRemain, bigramListPosition, useFullEditDistance, correction, queuePool); } +// bigramListPosition is the offset in the file to the list of bigrams for the previous word. int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, - const bool useFullEditDistance, unsigned short *outWords, int *frequencies) { + const int bigramListPosition, const bool useFullEditDistance, unsigned short *outWords, + int *frequencies) { queuePool->clearAll(); Correction* masterCorrection = correction; @@ -177,8 +180,8 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int xCoordinatesBuffer[codesSize]; int yCoordinatesBuffer[codesSize]; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, - xCoordinatesBuffer, yCoordinatesBuffer, - codesSize, useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, + xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramListPosition, + useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool, GERMAN_UMLAUT_DIGRAPHS, sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0])); } else if (BinaryFormat::REQUIRES_FRENCH_LIGATURES_PROCESSING & FLAGS) { @@ -186,13 +189,13 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int xCoordinatesBuffer[codesSize]; int yCoordinatesBuffer[codesSize]; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, - xCoordinatesBuffer, yCoordinatesBuffer, - codesSize, useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, + xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramListPosition, + useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool, FRENCH_LIGATURES_DIGRAPHS, sizeof(FRENCH_LIGATURES_DIGRAPHS) / sizeof(FRENCH_LIGATURES_DIGRAPHS[0])); } else { // Normal processing getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, - useFullEditDistance, masterCorrection, queuePool); + bigramListPosition, useFullEditDistance, masterCorrection, queuePool); } PROF_START(20); @@ -225,16 +228,16 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const int inputLength, const bool useFullEditDistance, Correction *correction, - WordsPriorityQueuePool *queuePool) { + const int inputLength, const int bigramListPosition, const bool useFullEditDistance, + Correction *correction, WordsPriorityQueuePool *queuePool) { PROF_OPEN; PROF_START(0); PROF_END(0); PROF_START(1); - getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, - inputLength, correction, queuePool); + getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramListPosition, + useFullEditDistance, inputLength, correction, queuePool); PROF_END(1); PROF_START(2); @@ -305,15 +308,16 @@ static const char SPACE = ' '; void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const bool useFullEditDistance, const int inputLength, Correction *correction, - WordsPriorityQueuePool *queuePool) { + const int bigramListPosition, const bool useFullEditDistance, const int inputLength, + Correction *correction, WordsPriorityQueuePool *queuePool) { initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); - getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool, - true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); + getSuggestionCandidates(useFullEditDistance, inputLength, bigramListPosition, correction, + queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); } void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, - const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool, + const int inputLength, const int bigramListPosition, + Correction *correction, WordsPriorityQueuePool *queuePool, const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) { // TODO: Remove setCorrectionParams correction->setCorrectionParams(0, 0, 0, @@ -333,8 +337,8 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, int firstChildPos; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, - correction, &childCount, &firstChildPos, &siblingPos, queuePool, - currentWordIndex); + bigramListPosition, correction, &childCount, &firstChildPos, &siblingPos, + queuePool, currentWordIndex); // Update next sibling pos correction->setTreeSiblingPos(outputIndex, siblingPos); @@ -349,7 +353,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, } } -inline void UnigramDictionary::onTerminal(const int freq, +inline void UnigramDictionary::onTerminal(const int probability, const TerminalAttributes& terminalAttributes, Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, const int currentWordIndex) { @@ -361,26 +365,28 @@ inline void UnigramDictionary::onTerminal(const int freq, if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) { WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); - const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); - if (finalFreq != NOT_A_FREQUENCY) { - addWord(wordPointer, wordLength, finalFreq, masterQueue); + const int finalProbability = + correction->getFinalProbability(probability, &wordPointer, &wordLength); + if (finalProbability != NOT_A_PROBABILITY) { + addWord(wordPointer, wordLength, finalProbability, masterQueue); - const int shortcutFreq = finalFreq > 0 ? finalFreq - 1 : 0; + const int shortcutProbability = finalProbability > 0 ? finalProbability - 1 : 0; // Please note that the shortcut candidates will be added to the master queue only. TerminalAttributes::ShortcutIterator iterator = terminalAttributes.getShortcutIterator(); while (iterator.hasNextShortcutTarget()) { // TODO: addWord only supports weak ordering, meaning we have no means // to control the order of the shortcuts relative to one another or to the word. - // We need to either modulate the frequency of each shortcut according - // to its own shortcut frequency or to make the queue + // We need to either modulate the probability of each shortcut according + // to its own shortcut probability or to make the queue // so that the insert order is protected inside the queue for words // with the same score. For the moment we use -1 to make sure the shortcut will // never be in front of the word. uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; const int shortcutTargetStringLength = iterator.getNextShortcutTarget( MAX_WORD_LENGTH_INTERNAL, shortcutTarget); - addWord(shortcutTarget, shortcutTargetStringLength, shortcutFreq, masterQueue); + addWord(shortcutTarget, shortcutTargetStringLength, shortcutProbability, + masterQueue); } } } @@ -393,9 +399,9 @@ inline void UnigramDictionary::onTerminal(const int freq, if (!subQueue) { return; } - const int finalFreq = correction->getFinalFreqForSubQueue(freq, &wordPointer, &wordLength, - inputIndex); - addWord(wordPointer, wordLength, finalFreq, subQueue); + const int finalProbability = correction->getFinalProbabilityForSubQueue( + probability, &wordPointer, &wordLength, inputIndex); + addWord(wordPointer, wordLength, finalProbability, subQueue); } } @@ -424,8 +430,10 @@ bool UnigramDictionary::getSubStringSuggestion( initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset], codes + offset, inputWordLength, correction); queuePool->clearSubQueue(currentWordIndex); - getSuggestionCandidates(useFullEditDistance, inputWordLength, correction, - queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); + // TODO: pass the bigram list for substring suggestion + getSuggestionCandidates(useFullEditDistance, inputWordLength, + 0 /* bigramListPosition */, correction, queuePool, false /* doAutoCompletion */, + MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); if (DEBUG_DICT) { if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) { AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength); @@ -730,7 +738,7 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor return maxFreq; } -bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const { +bool UnigramDictionary::isValidWord(const int32_t* const inWord, const int length) const { return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length); } @@ -755,7 +763,7 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any // given level, as output into newCount when traversing this level's parent. inline bool UnigramDictionary::processCurrentNode(const int initialPos, - Correction *correction, int *newCount, + const int bigramListPosition, Correction *correction, int *newCount, int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, const int currentWordIndex) { if (DEBUG_DICT) { @@ -834,11 +842,14 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, if (isTerminalNode) { // The frequency should be here, because we come here only if this is actually // a terminal node, and we are on its last char. - const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); + const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); - onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, + // The bigramListPosition is the offset in the file of the bigrams for the previous word, + // or zero if we don't know of any bigrams for it. + const int probability = BinaryFormat::getProbability(bigramListPosition, unigramFreq); + onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); // If there are more chars in this node, then this virtual node has children. diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h index 4479cd94e..0cc59bac8 100644 --- a/native/jni/src/unigram_dictionary.h +++ b/native/jni/src/unigram_dictionary.h @@ -71,37 +71,38 @@ class UnigramDictionary { UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, const unsigned int flags); - bool isValidWord(const uint16_t* const inWord, const int length) const; + bool isValidWord(const int32_t* const inWord, const int length) const; int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, - Correction *correction, const int *xcoordinates, - const int *ycoordinates, const int *codes, const int codesSize, + Correction *correction, const int *xcoordinates, const int *ycoordinates, + const int *codes, const int codesSize, const int bigramListPosition, const bool useFullEditDistance, unsigned short *outWords, int *frequencies); virtual ~UnigramDictionary(); private: void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int inputLength, - const bool useFullEditDistance, Correction *correction, + const int bigramListPosition, const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool); int getDigraphReplacement(const int *codes, const int i, const int codesSize, const digraph_t* const digraphs, const unsigned int digraphsSize) const; void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, - int *xCoordinatesBuffer, int *yCoordinatesBuffer, - const int codesBufferSize, const bool useFullEditDistance, const int* codesSrc, + int *xCoordinatesBuffer, int *yCoordinatesBuffer, const int codesBufferSize, + const int bigramListPosition, const bool useFullEditDistance, const int* codesSrc, const int codesRemain, const int currentDepth, int* codesDest, Correction *correction, WordsPriorityQueuePool* queuePool, const digraph_t* const digraphs, const unsigned int digraphsSize); void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, Correction *correction); void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const bool useFullEditDistance, - const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool); - void getSuggestionCandidates( + const int *ycoordinates, const int *codes, const int bigramListPosition, const bool useFullEditDistance, const int inputLength, Correction *correction, - WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors, - const int currentWordIndex); + WordsPriorityQueuePool* queuePool); + void getSuggestionCandidates( + const bool useFullEditDistance, const int inputLength, const int bigramListPosition, + Correction *correction, WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, + const int maxErrors, const int currentWordIndex); void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, @@ -113,9 +114,9 @@ class UnigramDictionary { bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); // Process a node by considering proximity, missing and excessive character - bool processCurrentNode(const int initialPos, Correction *correction, int *newCount, - int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, - const int currentWordIndex); + bool processCurrentNode(const int initialPos, const int bigramListPosition, + Correction *correction, int *newCount, int *newChildPosition, int *nextSiblingPosition, + WordsPriorityQueuePool *queuePool, const int currentWordIndex); int getMostFrequentWordLike(const int startInputIndex, const int inputLength, ProximityInfo *proximityInfo, unsigned short *word); int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, |