diff options
Diffstat (limited to 'native/jni/src')
12 files changed, 64 insertions, 69 deletions
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index 607a74400..974bb483b 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -292,7 +292,7 @@ static inline void prof_out(void) { // of the binary dictionary where a {key,value} string pair scheme is used. #define LARGEST_INT_DIGIT_COUNT 11 -#define NOT_VALID_WORD (-99) +#define NOT_A_VALID_WORD_POS (-99) #define NOT_A_CODE_POINT (-1) #define NOT_A_DISTANCE (-1) #define NOT_A_COORDINATE (-1) diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h index 973da67e4..696be0aeb 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node.h +++ b/native/jni/src/suggest/core/dicnode/dic_node.h @@ -112,7 +112,7 @@ class DicNode { mIsUsed = true; mIsCachedForNextSuggestion = false; mDicNodeProperties.init( - NOT_A_DICT_POS, rootGroupPos, NOT_A_DICT_POS /* attributesPos */, + NOT_A_VALID_WORD_POS /* pos */, rootGroupPos, NOT_A_DICT_POS /* attributesPos */, NOT_A_CODE_POINT /* nodeCodePoint */, NOT_A_PROBABILITY /* probability */, false /* isTerminal */, true /* hasChildren */, false /* isBlacklistedOrNotAWord */, 0 /* depth */, 0 /* terminalDepth */); @@ -125,7 +125,7 @@ class DicNode { mIsUsed = true; mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; mDicNodeProperties.init( - NOT_A_DICT_POS, rootGroupPos, NOT_A_DICT_POS /* attributesPos */, + NOT_A_VALID_WORD_POS /* pos */, rootGroupPos, NOT_A_DICT_POS /* attributesPos */, NOT_A_CODE_POINT /* nodeCodePoint */, NOT_A_PROBABILITY /* probability */, false /* isTerminal */, true /* hasChildren */, false /* isBlacklistedOrNotAWord */, 0 /* depth */, 0 /* terminalDepth */); @@ -231,7 +231,7 @@ class DicNode { } bool isFirstWord() const { - return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos() == NOT_VALID_WORD; + return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos() == NOT_A_VALID_WORD_POS; } bool isCompletion(const int inputSize) const { diff --git a/native/jni/src/suggest/core/dicnode/dic_node_state_prevword.h b/native/jni/src/suggest/core/dicnode/dic_node_state_prevword.h index c3968c090..5854f4f6e 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_state_prevword.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_state_prevword.h @@ -29,7 +29,7 @@ class DicNodeStatePrevWord { public: AK_FORCE_INLINE DicNodeStatePrevWord() : mPrevWordCount(0), mPrevWordLength(0), mPrevWordStart(0), mPrevWordProbability(0), - mPrevWordNodePos(0) { + mPrevWordNodePos(NOT_A_VALID_WORD_POS) { memset(mPrevWord, 0, sizeof(mPrevWord)); memset(mPrevSpacePositions, 0, sizeof(mPrevSpacePositions)); } @@ -41,7 +41,7 @@ class DicNodeStatePrevWord { mPrevWordCount = 0; mPrevWordStart = 0; mPrevWordProbability = -1; - mPrevWordNodePos = NOT_VALID_WORD; + mPrevWordNodePos = NOT_A_VALID_WORD_POS; memset(mPrevSpacePositions, 0, sizeof(mPrevSpacePositions)); } diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index 6c7f6667a..67fbc1a38 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -179,8 +179,9 @@ namespace latinime { const int unigramProbability = node->getProbability(); const int wordPos = node->getPos(); const int prevWordPos = node->getPrevWordPos(); - if (NOT_VALID_WORD == wordPos || NOT_VALID_WORD == prevWordPos) { - // Note: Normally wordPos comes from the dictionary and should never equal NOT_VALID_WORD. + if (NOT_A_VALID_WORD_POS == wordPos || NOT_A_VALID_WORD_POS == prevWordPos) { + // Note: Normally wordPos comes from the dictionary and should never equal + // NOT_A_VALID_WORD_POS. return ProbabilityUtils::backoff(unigramProbability); } if (multiBigramMap) { diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp index ff304d2b2..748430233 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp @@ -123,9 +123,10 @@ int BigramDictionary::getPredictions(const int *prevWord, int prevWordLength, in for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos); bigramsIt.hasNext(); /* no-op */) { bigramsIt.next(); - const int length = BinaryFormat::getWordAtAddress( - mBinaryDictionaryInfo->getDictRoot(), bigramsIt.getBigramPos(), - MAX_WORD_LENGTH, bigramBuffer, &unigramProbability); + const int length = mBinaryDictionaryInfo->getStructurePolicy()-> + getCodePointsAndProbabilityAndReturnCodePointCount( + mBinaryDictionaryInfo, bigramsIt.getBigramPos(), MAX_WORD_LENGTH, + bigramBuffer, &unigramProbability); // inputSize == 0 means we are trying to find bigram predictions. if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) { @@ -152,19 +153,9 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in if (0 >= prevWordLength) return 0; int pos = mBinaryDictionaryInfo->getStructurePolicy()->getTerminalNodePositionOfWord( mBinaryDictionaryInfo, prevWord, prevWordLength, forceLowerCaseSearch); - if (NOT_VALID_WORD == pos) return 0; - const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); - const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0; - if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) { - BinaryFormat::getCodePointAndForwardPointer(root, &pos); - } else { - pos = BinaryFormat::skipOtherCharacters(root, pos); - } - pos = BinaryFormat::skipProbability(flags, pos); - pos = BinaryFormat::skipChildrenPosition(flags, pos); - pos = BinaryFormat::skipShortcuts(root, flags, pos); - return pos; + if (NOT_A_VALID_WORD_POS == pos) return 0; + return BinaryFormat::getBigramListPositionForWordPosition( + mBinaryDictionaryInfo->getDictRoot(), pos); } bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) const { @@ -190,7 +181,7 @@ bool BigramDictionary::isValidBigram(const int *word0, int length0, const int *w if (0 == pos) return false; int nextWordPos = mBinaryDictionaryInfo->getStructurePolicy()->getTerminalNodePositionOfWord( mBinaryDictionaryInfo, word1, length1, false /* forceLowerCaseSearch */); - if (NOT_VALID_WORD == nextWordPos) return false; + if (NOT_A_VALID_WORD_POS == nextWordPos) return false; for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos); bigramsIt.hasNext(); /* no-op */) { diff --git a/native/jni/src/suggest/core/dictionary/binary_format.h b/native/jni/src/suggest/core/dictionary/binary_format.h index 9557d8ce7..6a5afd12e 100644 --- a/native/jni/src/suggest/core/dictionary/binary_format.h +++ b/native/jni/src/suggest/core/dictionary/binary_format.h @@ -71,8 +71,9 @@ class BinaryFormat { static bool hasChildrenInFlags(const uint8_t flags); static int getTerminalPosition(const uint8_t *const root, const int *const inWord, const int length, const bool forceLowerCaseSearch); - static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, - int *outWord, int *outUnigramProbability); + static int getCodePointsAndProbabilityAndReturnCodePointCount( + const uint8_t *const root, const int nodePos, const int maxCodePointCount, + int *outCodePoints, int *outUnigramProbability); static int getBigramListPositionForWordPosition(const uint8_t *const root, int position); private: @@ -254,7 +255,7 @@ inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { } // This function gets the byte position of the last chargroup of the exact matching word in the -// dictionary. If no match is found, it returns NOT_VALID_WORD. +// dictionary. If no match is found, it returns NOT_A_VALID_WORD_POS. AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, const int *const inWord, const int length, const bool forceLowerCaseSearch) { int pos = 0; @@ -263,22 +264,22 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, while (true) { // If we already traversed the tree further than the word is long, there means // there was no match (or we would have found it). - if (wordPos >= length) return NOT_VALID_WORD; + if (wordPos >= length) return NOT_A_VALID_WORD_POS; int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); const int wChar = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[wordPos]) : inWord[wordPos]; while (true) { // If there are no more character groups in this node, it means we could not // find a matching character for this depth, therefore there is no match. - if (0 >= charGroupCount) return NOT_VALID_WORD; + if (0 >= charGroupCount) return NOT_A_VALID_WORD_POS; const int charGroupPos = pos; const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); int character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); if (character == wChar) { // This is the correct node. Only one character group may start with the same // char within a node, so either we found our match in this node, or there is - // no match and we can return NOT_VALID_WORD. So we will check all the characters - // in this character group indeed does match. + // no match and we can return NOT_A_VALID_WORD_POS. So we will check all the + // characters in this character group indeed does match. if (FLAG_HAS_MULTIPLE_CHARS & flags) { character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); while (NOT_A_CODE_POINT != character) { @@ -287,8 +288,8 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, // character that does not match, as explained above, it means the word is // not in the dictionary (by virtue of this chargroup being the only one to // match the word on the first character, but not matching the whole word). - if (wordPos >= length) return NOT_VALID_WORD; - if (inWord[wordPos] != character) return NOT_VALID_WORD; + if (wordPos >= length) return NOT_A_VALID_WORD_POS; + if (inWord[wordPos] != character) return NOT_A_VALID_WORD_POS; character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); } } @@ -304,7 +305,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, pos = BinaryFormat::skipProbability(FLAG_IS_TERMINAL, pos); } if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) { - return NOT_VALID_WORD; + return NOT_A_VALID_WORD_POS; } // We have children and we are still shorter than the word we are searching for, so // we need to traverse children. Put the pointer on the children position, and @@ -342,8 +343,9 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, * outUnigramProbability: a pointer to an int to write the probability into. * Return value : the length of the word, of 0 if the word was not found. */ -AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address, - const int maxDepth, int *outWord, int *outUnigramProbability) { +AK_FORCE_INLINE int BinaryFormat::getCodePointsAndProbabilityAndReturnCodePointCount( + const uint8_t *const root, const int nodePos, + const int maxCodePointCount, int *outCodePoints, int *outUnigramProbability) { int pos = 0; int wordPos = 0; @@ -353,7 +355,7 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co // The only reason we count nodes is because we want to reduce the probability of infinite // looping in case there is a bug. Since we know there is an upper bound to the depth we are // supposed to traverse, it does not hurt to count iterations. - for (int loopCount = maxDepth; loopCount > 0; --loopCount) { + for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) { int lastCandidateGroupPos = 0; // Let's loop through char groups in this node searching for either the terminal // or one of its ascendants. @@ -362,17 +364,17 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co const int startPos = pos; const uint8_t flags = getFlagsAndForwardPointer(root, &pos); const int character = getCodePointAndForwardPointer(root, &pos); - if (address == startPos) { + if (nodePos == startPos) { // We found the address. Copy the rest of the word in the buffer and return // the length. - outWord[wordPos] = character; + outCodePoints[wordPos] = character; if (FLAG_HAS_MULTIPLE_CHARS & flags) { int nextChar = getCodePointAndForwardPointer(root, &pos); // We count chars in order to avoid infinite loops if the file is broken or // if there is some other bug - int charCount = maxDepth; + int charCount = maxCodePointCount; while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { - outWord[++wordPos] = nextChar; + outCodePoints[++wordPos] = nextChar; nextChar = getCodePointAndForwardPointer(root, &pos); } } @@ -399,7 +401,7 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co if (hasChildren) { // Here comes the tricky part. First, read the children position. const int childrenPos = readChildrenPosition(root, flags, pos); - if (childrenPos > address) { + if (childrenPos > nodePos) { // If the children pos is greater than address, it means the previous chargroup, // which address is stored in lastCandidateGroupPos, was the right one. found = true; @@ -429,12 +431,12 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co const int lastChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos); // We copy all the characters in this group to the buffer - outWord[wordPos] = lastChar; + outCodePoints[wordPos] = lastChar; if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) { int nextChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos); - int charCount = maxDepth; + int charCount = maxCodePointCount; while (-1 != nextChar && --charCount > 0) { - outWord[++wordPos] = nextChar; + outCodePoints[++wordPos] = nextChar; nextChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos); } } @@ -472,7 +474,7 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co AK_FORCE_INLINE int BinaryFormat::getBigramListPositionForWordPosition( const uint8_t *const root, int position) { - if (NOT_VALID_WORD == position) return 0; + if (NOT_A_VALID_WORD_POS == position) return 0; const uint8_t flags = getFlagsAndForwardPointer(root, &position); if (!(flags & FLAG_HAS_BIGRAMS)) return 0; if (flags & FLAG_HAS_MULTIPLE_CHARS) { diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index f520a75b1..52e635975 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -89,7 +89,7 @@ int Dictionary::getProbability(const int *word, int length) const { mBinaryDictionaryInfo.getStructurePolicy(); int pos = structurePolicy->getTerminalNodePositionOfWord(&mBinaryDictionaryInfo, word, length, false /* forceLowerCaseSearch */); - if (NOT_VALID_WORD == pos) { + if (NOT_A_VALID_WORD_POS == pos) { return NOT_A_PROBABILITY; } return structurePolicy->getUnigramProbability(&mBinaryDictionaryInfo, pos); diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_policy.h index ab42c13b4..48ba5b8c2 100644 --- a/native/jni/src/suggest/core/policy/dictionary_structure_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_structure_policy.h @@ -50,8 +50,9 @@ class DictionaryStructurePolicy { const BinaryDictionaryInfo *const binaryDictionaryInfo, const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const = 0; - virtual void getWordAtPosition(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int terminalNodePos, const int maxDepth, int *const outWord, + virtual int getCodePointsAndProbabilityAndReturnCodePointCount( + const BinaryDictionaryInfo *const binaryDictionaryInfo, + const int nodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const = 0; virtual int getTerminalNodePositionOfWord( diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.cpp b/native/jni/src/suggest/core/session/dic_traverse_session.cpp index 774d6074e..7651b19a0 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.cpp +++ b/native/jni/src/suggest/core/session/dic_traverse_session.cpp @@ -18,10 +18,8 @@ #include "defines.h" #include "jni.h" -#include "suggest/core/dicnode/dic_node_utils.h" #include "suggest/core/dictionary/binary_dictionary_header.h" #include "suggest/core/dictionary/binary_dictionary_info.h" -#include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/dictionary.h" namespace latinime { @@ -29,23 +27,22 @@ namespace latinime { void DicTraverseSession::init(const Dictionary *const dictionary, const int *prevWord, int prevWordLength, const SuggestOptions *const suggestOptions) { mDictionary = dictionary; - mMultiWordCostMultiplier = mDictionary->getBinaryDictionaryInfo() - ->getHeader()->getMultiWordCostMultiplier(); + const BinaryDictionaryInfo *const binaryDictionaryInfo = + mDictionary->getBinaryDictionaryInfo(); + mMultiWordCostMultiplier = binaryDictionaryInfo->getHeader()->getMultiWordCostMultiplier(); mSuggestOptions = suggestOptions; if (!prevWord) { - mPrevWordPos = NOT_VALID_WORD; + mPrevWordPos = NOT_A_VALID_WORD_POS; return; } // TODO: merge following similar calls to getTerminalPosition into one case-insensitive call. - mPrevWordPos = BinaryFormat::getTerminalPosition( - dictionary->getBinaryDictionaryInfo()->getDictRoot(), prevWord, - prevWordLength, false /* forceLowerCaseSearch */); - if (mPrevWordPos == NOT_VALID_WORD) { + mPrevWordPos = binaryDictionaryInfo->getStructurePolicy()->getTerminalNodePositionOfWord( + binaryDictionaryInfo, prevWord, prevWordLength, false /* forceLowerCaseSearch */); + if (mPrevWordPos == NOT_A_VALID_WORD_POS) { // Check bigrams for lower-cased previous word if original was not found. Useful for // auto-capitalized words like "The [current_word]". - mPrevWordPos = BinaryFormat::getTerminalPosition( - dictionary->getBinaryDictionaryInfo()->getDictRoot(), prevWord, - prevWordLength, true /* forceLowerCaseSearch */); + mPrevWordPos = binaryDictionaryInfo->getStructurePolicy()->getTerminalNodePositionOfWord( + binaryDictionaryInfo, prevWord, prevWordLength, true /* forceLowerCaseSearch */); } } diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.h b/native/jni/src/suggest/core/session/dic_traverse_session.h index f95a0b23d..de57e041a 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.h +++ b/native/jni/src/suggest/core/session/dic_traverse_session.h @@ -55,7 +55,7 @@ class DicTraverseSession { } AK_FORCE_INLINE DicTraverseSession(JNIEnv *env, jstring localeStr) - : mPrevWordPos(NOT_VALID_WORD), mProximityInfo(0), + : mPrevWordPos(NOT_A_VALID_WORD_POS), mProximityInfo(0), mDictionary(0), mSuggestOptions(0), mDicNodesCache(), mMultiBigramMap(), mInputSize(0), mPartiallyCommited(false), mMaxPointerCount(1), mMultiWordCostMultiplier(1.0f) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp index c995af98a..c807fb7c9 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp @@ -33,11 +33,13 @@ void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, // TODO: Move children creating methods form DicNodeUtils. } -void PatriciaTriePolicy::getWordAtPosition(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int terminalNodePos, const int maxDepth, int *const outWord, +int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( + const BinaryDictionaryInfo *const binaryDictionaryInfo, + const int nodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const { - BinaryFormat::getWordAtAddress(binaryDictionaryInfo->getDictRoot(), terminalNodePos, - maxDepth, outWord, outUnigramProbability); + return BinaryFormat::getCodePointsAndProbabilityAndReturnCodePointCount( + binaryDictionaryInfo->getDictRoot(), nodePos, + maxCodePointCount, outCodePoints, outUnigramProbability); } int PatriciaTriePolicy::getTerminalNodePositionOfWord( diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h index 9b9338145..0a16e414a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h @@ -36,8 +36,9 @@ class PatriciaTriePolicy : public DictionaryStructurePolicy { const BinaryDictionaryInfo *const binaryDictionaryInfo, const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const; - void getWordAtPosition(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int terminalNodePos, const int maxDepth, int *const outWord, + int getCodePointsAndProbabilityAndReturnCodePointCount( + const BinaryDictionaryInfo *const binaryDictionaryInfo, + const int terminalNodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const; int getTerminalNodePositionOfWord( |