diff options
Diffstat (limited to 'native/jni/src/unigram_dictionary.cpp')
-rw-r--r-- | native/jni/src/unigram_dictionary.cpp | 72 |
1 files changed, 41 insertions, 31 deletions
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index 1133256c4..5820a1d0e 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -18,13 +18,15 @@ #define LOG_TAG "LatinIME: unigram_dictionary.cpp" -#include "char_utils.h" #include "defines.h" +#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/digraph_utils.h" +#include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/dictionary/terminal_attributes.h" #include "suggest/core/layout/proximity_info.h" +#include "utils/char_utils.h" #include "unigram_dictionary.h" #include "words_priority_queue.h" #include "words_priority_queue_pool.h" @@ -32,8 +34,9 @@ namespace latinime { // TODO: check the header -UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, const unsigned int dictFlags) - : DICT_ROOT(streamStart), ROOT_POS(0), +UnigramDictionary::UnigramDictionary( + const BinaryDictionaryInfo *const binaryDicitonaryInfo, const uint8_t dictFlags) + : mBinaryDicitonaryInfo(binaryDicitonaryInfo), MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH), DICT_FLAGS(dictFlags) { if (DEBUG_DICT) { AKLOGI("UnigramDictionary - constructor"); @@ -315,9 +318,10 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, correction->setCorrectionParams(0, 0, 0, -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, doAutoCompletion, maxErrors); - int rootPosition = ROOT_POS; + int rootPosition = mBinaryDicitonaryInfo->getRootPosition(); // Get the number of children of root, then increment the position - int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition); + int childCount = BinaryFormat::getGroupCountAndForwardPointer( + mBinaryDicitonaryInfo->getDictRoot(), &rootPosition); int outputIndex = 0; correction->initCorrectionState(rootPosition, childCount, (inputSize <= 0)); @@ -696,8 +700,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); int pos = startPos; int codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); - int baseChar = toBaseLowerCase(codePoint); - const int wChar = toBaseLowerCase(inWord[startInputIndex]); + int baseChar = CharUtils::toBaseLowerCase(codePoint); + const int wChar = CharUtils::toBaseLowerCase(inWord[startInputIndex]); if (baseChar != wChar) { *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos; @@ -709,8 +713,9 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, if (hasMultipleChars) { codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); while (NOT_A_CODE_POINT != codePoint) { - baseChar = toBaseLowerCase(codePoint); - if (inputIndex + 1 >= inputSize || toBaseLowerCase(inWord[++inputIndex]) != baseChar) { + baseChar = CharUtils::toBaseLowerCase(codePoint); + if (inputIndex + 1 >= inputSize + || CharUtils::toBaseLowerCase(inWord[++inputIndex]) != baseChar) { *outPos = BinaryFormat::skipOtherCharacters(root, pos); *outInputIndex = startInputIndex; return false; @@ -746,7 +751,7 @@ int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, con int newWord[MAX_WORD_LENGTH]; int depth = 0; int maxFreq = -1; - const uint8_t *const root = DICT_ROOT; + const uint8_t *const root = mBinaryDicitonaryInfo->getDictRoot(); int stackChildCount[MAX_WORD_LENGTH]; int stackInputIndex[MAX_WORD_LENGTH]; int stackSiblingPos[MAX_WORD_LENGTH]; @@ -805,7 +810,7 @@ int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, con } int UnigramDictionary::getProbability(const int *const inWord, const int length) const { - const uint8_t *const root = DICT_ROOT; + const uint8_t *const root = mBinaryDicitonaryInfo->getDictRoot(); int pos = BinaryFormat::getTerminalPosition(root, inWord, length, false /* forceLowerCaseSearch */); if (NOT_VALID_WORD == pos) { @@ -822,7 +827,7 @@ int UnigramDictionary::getProbability(const int *const inWord, const int length) if (hasMultipleChars) { pos = BinaryFormat::skipOtherCharacters(root, pos); } else { - BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); + BinaryFormat::getCodePointAndForwardPointer(root, &pos); } const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(root, pos); return unigramProbability; @@ -864,7 +869,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not. // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children) // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not - const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos); + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer( + mBinaryDicitonaryInfo->getDictRoot(), &pos); const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags)); @@ -875,7 +881,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // else if FLAG_IS_TERMINAL: the probability // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address // Note that you can't have a node that both is not a terminal and has no children. - int c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); + int c = BinaryFormat::getCodePointAndForwardPointer( + mBinaryDicitonaryInfo->getDictRoot(), &pos); ASSERT(NOT_A_CODE_POINT != c); // We are going to loop through each character and make it look like it's a different @@ -889,8 +896,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // We prefetch the next char. If 'c' is the last char of this node, we will have // NOT_A_CODE_POINT in the next char. From this we can decide whether this virtual node // should behave as a terminal or not and whether we have children. - const int nextc = hasMultipleChars - ? BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CODE_POINT; + const int nextc = hasMultipleChars ? BinaryFormat::getCodePointAndForwardPointer( + mBinaryDicitonaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT; const bool isLastChar = (NOT_A_CODE_POINT == nextc); // If there are more chars in this nodes, then this virtual node is not a terminal. // If we are on the last char, this virtual node is a terminal if this node is. @@ -910,11 +917,11 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // We don't have to output other values because we return false, as in // "don't traverse children". if (!isLastChar) { - pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); + pos = BinaryFormat::skipOtherCharacters(mBinaryDicitonaryInfo->getDictRoot(), pos); } pos = BinaryFormat::skipProbability(flags, pos); - *nextSiblingPosition = - BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes( + mBinaryDicitonaryInfo->getDictRoot(), flags, pos); return false; } @@ -927,15 +934,15 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, if (isTerminalNode) { // The probability should be here, because we come here only if this is actually // a terminal node, and we are on its last char. - const int unigramProbability = - BinaryFormat::readProbabilityWithoutMovingPointer(DICT_ROOT, pos); + const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer( + mBinaryDicitonaryInfo->getDictRoot(), pos); const int childrenAddressPos = BinaryFormat::skipProbability(flags, pos); const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); - TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); + TerminalAttributes terminalAttributes(mBinaryDicitonaryInfo, flags, attributesPos); // bigramMap contains the bigram frequencies indexed by addresses for fast lookup. // bigramFilter is a bloom filter of said frequencies for even faster rejection. - const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter, - unigramProbability); + const int probability = ProbabilityUtils::getProbability( + initialPos, bigramMap, bigramFilter, unigramProbability); onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); @@ -951,16 +958,16 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // remaining char in this group for there can't be any. if (!hasChildren) { pos = BinaryFormat::skipProbability(flags, pos); - *nextSiblingPosition = - BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes( + mBinaryDicitonaryInfo->getDictRoot(), flags, pos); return false; } // Optimization: Prune out words that are too long compared to how much was typed. if (correction->needsToPrune()) { pos = BinaryFormat::skipProbability(flags, pos); - *nextSiblingPosition = - BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes( + mBinaryDicitonaryInfo->getDictRoot(), flags, pos); if (DEBUG_DICT_FULL) { AKLOGI("Traversing was pruned."); } @@ -979,9 +986,12 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // Once this is read, we still need to output the number of nodes in the immediate children of // this node, so we read and output it before returning true, as in "please traverse children". pos = BinaryFormat::skipProbability(flags, pos); - int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos); - *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); - *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos); + int childrenPos = BinaryFormat::readChildrenPosition( + mBinaryDicitonaryInfo->getDictRoot(), flags, pos); + *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes( + mBinaryDicitonaryInfo->getDictRoot(), flags, pos); + *newCount = BinaryFormat::getGroupCountAndForwardPointer( + mBinaryDicitonaryInfo->getDictRoot(), &childrenPos); *newChildrenPosition = childrenPos; return true; } |