diff options
Diffstat (limited to 'native/jni/src')
21 files changed, 404 insertions, 211 deletions
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index e349aedb1..cb6681456 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -270,6 +270,7 @@ static inline void prof_out(void) { #define NOT_A_COORDINATE (-1) #define NOT_AN_INDEX (-1) #define NOT_A_PROBABILITY (-1) +#define NOT_A_DICT_POS (S_INT_MIN) #define KEYCODE_SPACE ' ' #define KEYCODE_SINGLE_QUOTE '\'' diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h index c700b01ca..52db8e9c7 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node.h +++ b/native/jni/src/suggest/core/dicnode/dic_node.h @@ -97,7 +97,6 @@ class DicNode { DicNode &operator=(const DicNode &dicNode); virtual ~DicNode() {} - // TODO: minimize arguments by looking binary_format // Init for copy void initByCopy(const DicNode *dicNode) { mIsUsed = true; @@ -107,14 +106,15 @@ class DicNode { PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); } - // TODO: minimize arguments by looking binary_format // Init for root with prevWordNodePos which is used for bigram - void initAsRoot(const int pos, const int childrenPos, const int childrenCount, - const int prevWordNodePos) { + void initAsRoot(const int rootGroupPos, const int prevWordNodePos) { mIsUsed = true; mIsCachedForNextSuggestion = false; mDicNodeProperties.init( - pos, 0, childrenPos, 0, 0, 0, childrenCount, 0, 0, false, false, true, 0, 0); + NOT_A_DICT_POS, 0 /* flags */, rootGroupPos, NOT_A_DICT_POS /* attributesPos */, + NOT_A_CODE_POINT /* nodeCodePoint */, NOT_A_PROBABILITY /* probability */, + false /* isTerminal */, true /* hasChildren */, 0 /* depth */, + 0 /* terminalDepth */); mDicNodeState.init(prevWordNodePos); PROF_NODE_RESET(mProfiler); } @@ -128,14 +128,15 @@ class DicNode { PROF_NODE_COPY(&parentNode->mProfiler, mProfiler); } - // TODO: minimize arguments by looking binary_format // Init for root with previous word - void initAsRootWithPreviousWord(DicNode *dicNode, const int pos, const 
int childrenPos, - const int childrenCount) { + void initAsRootWithPreviousWord(DicNode *dicNode, const int rootGroupPos) { mIsUsed = true; mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; mDicNodeProperties.init( - pos, 0, childrenPos, 0, 0, 0, childrenCount, 0, 0, false, false, true, 0, 0); + NOT_A_DICT_POS, 0 /* flags */, rootGroupPos, NOT_A_DICT_POS /* attributesPos */, + NOT_A_CODE_POINT /* nodeCodePoint */, NOT_A_PROBABILITY /* probability */, + false /* isTerminal */, true /* hasChildren */, 0 /* depth */, + 0 /* terminalDepth */); // TODO: Move to dicNodeState? mDicNodeState.mDicNodeStateOutput.init(); // reset for next word mDicNodeState.mDicNodeStateInput.init( @@ -157,19 +158,18 @@ class DicNode { // TODO: minimize arguments by looking binary_format void initAsChild(DicNode *dicNode, const int pos, const uint8_t flags, const int childrenPos, - const int attributesPos, const int siblingPos, const int nodeCodePoint, - const int childrenCount, const int probability, const int bigramProbability, - const bool isTerminal, const bool hasMultipleChars, const bool hasChildren, - const uint16_t additionalSubwordLength, const int *additionalSubword) { + const int attributesPos, const int probability, const bool isTerminal, + const bool hasChildren, const uint16_t mergedNodeCodePointCount, + const int *const mergedNodeCodePoints) { mIsUsed = true; uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1); mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; const uint16_t newLeavingDepth = static_cast<uint16_t>( - dicNode->mDicNodeProperties.getLeavingDepth() + additionalSubwordLength); - mDicNodeProperties.init(pos, flags, childrenPos, attributesPos, siblingPos, nodeCodePoint, - childrenCount, probability, bigramProbability, isTerminal, hasMultipleChars, - hasChildren, newDepth, newLeavingDepth); - mDicNodeState.init(&dicNode->mDicNodeState, additionalSubwordLength, additionalSubword); + 
dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount); + mDicNodeProperties.init(pos, flags, childrenPos, attributesPos, mergedNodeCodePoints[0], + probability, isTerminal, hasChildren, newDepth, newLeavingDepth); + mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount, + mergedNodeCodePoints); PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); } @@ -193,8 +193,8 @@ class DicNode { } bool isLeavingNode() const { - ASSERT(getNodeCodePointCount() <= getLeavingDepth()); - return getNodeCodePointCount() == getLeavingDepth(); + ASSERT(getNodeCodePointCount() <= mDicNodeProperties.getLeavingDepth()); + return getNodeCodePointCount() == mDicNodeProperties.getLeavingDepth(); } AK_FORCE_INLINE bool isFirstLetter() const { @@ -256,12 +256,6 @@ class DicNode { return mDicNodeProperties.getChildrenPos(); } - // Used in DicNodeUtils - int getChildrenCount() const { - return mDicNodeProperties.getChildrenCount(); - } - - // Used in DicNodeUtils int getProbability() const { return mDicNodeProperties.getProbability(); } @@ -280,10 +274,6 @@ class DicNode { return !(currentDepth > 0 && (currentDepth != 1 || prevWordLen != 1)); } - uint16_t getLeavingDepth() const { - return mDicNodeProperties.getLeavingDepth(); - } - bool isTotalInputSizeExceedingLimit() const { const int prevWordsLen = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(); const int currentWordDepth = getNodeCodePointCount(); @@ -370,7 +360,7 @@ class DicNode { } AK_FORCE_INLINE const int *getOutputWordBuf() const { - return mDicNodeState.mDicNodeStateOutput.mWordBuf; + return mDicNodeState.mDicNodeStateOutput.mCodePointsBuf; } int getPrevCodePointG(int pointerId) const { diff --git a/native/jni/src/suggest/core/dicnode/dic_node_properties.h b/native/jni/src/suggest/core/dicnode/dic_node_properties.h index d2f87c10b..7e8aa4979 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_properties.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_properties.h @@ -27,37 +27,31 @@ 
namespace latinime { /** * Node for traversing the lexicon trie. */ +// TODO: Introduce a dictionary node class which has attribute members required to understand the +// dictionary structure. class DicNodeProperties { public: AK_FORCE_INLINE DicNodeProperties() - : mPos(0), mFlags(0), mChildrenPos(0), mAttributesPos(0), mSiblingPos(0), - mChildrenCount(0), mProbability(0), mBigramProbability(0), mNodeCodePoint(0), - mDepth(0), mLeavingDepth(0), mIsTerminal(false), mHasMultipleChars(false), - mHasChildren(false) { - } + : mPos(0), mFlags(0), mChildrenPos(0), mAttributesPos(0), mProbability(0), + mNodeCodePoint(0), mDepth(0), mLeavingDepth(0), mIsTerminal(false), + mHasChildren(false) {} virtual ~DicNodeProperties() {} // Should be called only once per DicNode is initialized. void init(const int pos, const uint8_t flags, const int childrenPos, const int attributesPos, - const int siblingPos, const int nodeCodePoint, const int childrenCount, - const int probability, const int bigramProbability, const bool isTerminal, - const bool hasMultipleChars, const bool hasChildren, const uint16_t depth, - const uint16_t terminalDepth) { + const int nodeCodePoint, const int probability, const bool isTerminal, + const bool hasChildren, const uint16_t depth, const uint16_t leavingDepth) { mPos = pos; mFlags = flags; mChildrenPos = childrenPos; mAttributesPos = attributesPos; - mSiblingPos = siblingPos; mNodeCodePoint = nodeCodePoint; - mChildrenCount = childrenCount; mProbability = probability; - mBigramProbability = bigramProbability; mIsTerminal = isTerminal; - mHasMultipleChars = hasMultipleChars; mHasChildren = hasChildren; mDepth = depth; - mLeavingDepth = terminalDepth; + mLeavingDepth = leavingDepth; } // Init for copy @@ -66,13 +60,9 @@ class DicNodeProperties { mFlags = nodeProp->mFlags; mChildrenPos = nodeProp->mChildrenPos; mAttributesPos = nodeProp->mAttributesPos; - mSiblingPos = nodeProp->mSiblingPos; mNodeCodePoint = nodeProp->mNodeCodePoint; - mChildrenCount = 
nodeProp->mChildrenCount; mProbability = nodeProp->mProbability; - mBigramProbability = nodeProp->mBigramProbability; mIsTerminal = nodeProp->mIsTerminal; - mHasMultipleChars = nodeProp->mHasMultipleChars; mHasChildren = nodeProp->mHasChildren; mDepth = nodeProp->mDepth; mLeavingDepth = nodeProp->mLeavingDepth; @@ -84,13 +74,9 @@ class DicNodeProperties { mFlags = nodeProp->mFlags; mChildrenPos = nodeProp->mChildrenPos; mAttributesPos = nodeProp->mAttributesPos; - mSiblingPos = nodeProp->mSiblingPos; mNodeCodePoint = codePoint; // Overwrite the node char of a passing child - mChildrenCount = nodeProp->mChildrenCount; mProbability = nodeProp->mProbability; - mBigramProbability = nodeProp->mBigramProbability; mIsTerminal = nodeProp->mIsTerminal; - mHasMultipleChars = nodeProp->mHasMultipleChars; mHasChildren = nodeProp->mHasChildren; mDepth = nodeProp->mDepth + 1; // Increment the depth of a passing child mLeavingDepth = nodeProp->mLeavingDepth; @@ -112,10 +98,6 @@ class DicNodeProperties { return mAttributesPos; } - int getChildrenCount() const { - return mChildrenCount; - } - int getProbability() const { return mProbability; } @@ -137,12 +119,8 @@ class DicNodeProperties { return mIsTerminal; } - bool hasMultipleChars() const { - return mHasMultipleChars; - } - bool hasChildren() const { - return mChildrenCount > 0 || mDepth != mLeavingDepth; + return mHasChildren || mDepth != mLeavingDepth; } bool hasBlacklistedOrNotAWordFlag() const { @@ -153,25 +131,15 @@ class DicNodeProperties { // Caution!!! 
// Use a default copy constructor and an assign operator because shallow copies are ok // for this class - - // Not used - int getSiblingPos() const { - return mSiblingPos; - } - int mPos; uint8_t mFlags; int mChildrenPos; int mAttributesPos; - int mSiblingPos; - int mChildrenCount; int mProbability; - int mBigramProbability; // not used for now int mNodeCodePoint; uint16_t mDepth; uint16_t mLeavingDepth; bool mIsTerminal; - bool mHasMultipleChars; bool mHasChildren; }; } // namespace latinime diff --git a/native/jni/src/suggest/core/dicnode/dic_node_state.h b/native/jni/src/suggest/core/dicnode/dic_node_state.h index d35e7d79f..b1b6266f2 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_state.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_state.h @@ -55,11 +55,12 @@ class DicNodeState { mDicNodeStateScoring.init(&src->mDicNodeStateScoring); } - // Init by copy and adding subword - void init(const DicNodeState *const src, const uint16_t additionalSubwordLength, - const int *const additionalSubword) { + // Init by copy and adding merged node code points. 
+ void init(const DicNodeState *const src, const uint16_t mergedNodeCodePointCount, + const int *const mergedNodeCodePoints) { init(src); - mDicNodeStateOutput.addSubword(additionalSubwordLength, additionalSubword); + mDicNodeStateOutput.addMergedNodeCodePoints( + mergedNodeCodePointCount, mergedNodeCodePoints); } private: diff --git a/native/jni/src/suggest/core/dicnode/dic_node_state_output.h b/native/jni/src/suggest/core/dicnode/dic_node_state_output.h index 1d4f50a06..45c7f5cf9 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_state_output.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_state_output.h @@ -26,50 +26,52 @@ namespace latinime { class DicNodeStateOutput { public: - DicNodeStateOutput() : mOutputtedLength(0) { + DicNodeStateOutput() : mOutputtedCodePointCount(0) { init(); } virtual ~DicNodeStateOutput() {} void init() { - mOutputtedLength = 0; - mWordBuf[0] = 0; + mOutputtedCodePointCount = 0; + mCodePointsBuf[0] = 0; } void init(const DicNodeStateOutput *const stateOutput) { - memcpy(mWordBuf, stateOutput->mWordBuf, - stateOutput->mOutputtedLength * sizeof(mWordBuf[0])); - mOutputtedLength = stateOutput->mOutputtedLength; - if (mOutputtedLength < MAX_WORD_LENGTH) { - mWordBuf[mOutputtedLength] = 0; + memcpy(mCodePointsBuf, stateOutput->mCodePointsBuf, + stateOutput->mOutputtedCodePointCount * sizeof(mCodePointsBuf[0])); + mOutputtedCodePointCount = stateOutput->mOutputtedCodePointCount; + if (mOutputtedCodePointCount < MAX_WORD_LENGTH) { + mCodePointsBuf[mOutputtedCodePointCount] = 0; } } - void addSubword(const uint16_t additionalSubwordLength, const int *const additionalSubword) { - if (additionalSubword) { - memcpy(&mWordBuf[mOutputtedLength], additionalSubword, - additionalSubwordLength * sizeof(mWordBuf[0])); - mOutputtedLength = static_cast<uint16_t>(mOutputtedLength + additionalSubwordLength); - if (mOutputtedLength < MAX_WORD_LENGTH) { - mWordBuf[mOutputtedLength] = 0; + void addMergedNodeCodePoints(const uint16_t 
mergedNodeCodePointCount, + const int *const mergedNodeCodePoints) { + if (mergedNodeCodePoints) { + memcpy(&mCodePointsBuf[mOutputtedCodePointCount], mergedNodeCodePoints, + mergedNodeCodePointCount * sizeof(mCodePointsBuf[0])); + mOutputtedCodePointCount = static_cast<uint16_t>( + mOutputtedCodePointCount + mergedNodeCodePointCount); + if (mOutputtedCodePointCount < MAX_WORD_LENGTH) { + mCodePointsBuf[mOutputtedCodePointCount] = 0; } } } // TODO: Remove - int getCodePointAt(const int id) const { - return mWordBuf[id]; + int getCodePointAt(const int index) const { + return mCodePointsBuf[index]; } // TODO: Move to private - int mWordBuf[MAX_WORD_LENGTH]; + int mCodePointsBuf[MAX_WORD_LENGTH]; private: // Caution!!! // Use a default copy constructor and an assign operator because shallow copies are ok // for this class - uint16_t mOutputtedLength; + uint16_t mOutputtedCodePointCount; }; } // namespace latinime #endif // LATINIME_DIC_NODE_STATE_OUTPUT_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index f0f26c72b..9bf7eceb5 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -26,6 +26,7 @@ #include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/layout/proximity_info.h" #include "suggest/core/layout/proximity_info_state.h" +#include "suggest/core/policy/dictionary_structure_policy.h" #include "utils/char_utils.h" namespace latinime { @@ -36,23 +37,15 @@ namespace latinime { /* static */ void DicNodeUtils::initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int prevWordNodePos, DicNode *const newRootNode) { - int curPos = binaryDictionaryInfo->getRootPosition(); - const int pos = curPos; - const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer( - binaryDictionaryInfo->getDictRoot(), &curPos); - const int childrenPos = curPos; - 
newRootNode->initAsRoot(pos, childrenPos, childrenCount, prevWordNodePos); + newRootNode->initAsRoot(binaryDictionaryInfo->getStructurePolicy()->getRootPosition(), + prevWordNodePos); } /*static */ void DicNodeUtils::initAsRootWithPreviousWord( const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNode *const prevWordLastNode, DicNode *const newRootNode) { - int curPos = binaryDictionaryInfo->getRootPosition(); - const int pos = curPos; - const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer( - binaryDictionaryInfo->getDictRoot(), &curPos); - const int childrenPos = curPos; - newRootNode->initAsRootWithPreviousWord(prevWordLastNode, pos, childrenPos, childrenCount); + newRootNode->initAsRootWithPreviousWord( + prevWordLastNode, binaryDictionaryInfo->getStructurePolicy()->getRootPosition()); } /* static */ void DicNodeUtils::initByCopy(DicNode *srcNode, DicNode *destNode) { @@ -76,7 +69,7 @@ namespace latinime { } /* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos, - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalDepth, + const BinaryDictionaryInfo *const binaryDictionaryInfo, const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { @@ -86,15 +79,15 @@ namespace latinime { const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags)); const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); + const bool hasShortcuts = (0 != (BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS & flags)); int codePoint = BinaryFormat::getCodePointAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &pos); ASSERT(NOT_A_CODE_POINT != codePoint); - const int nodeCodePoint = codePoint; // TODO: optimize this - int additionalWordBuf[MAX_WORD_LENGTH]; - uint16_t 
additionalSubwordLength = 0; - additionalWordBuf[additionalSubwordLength++] = codePoint; + int mergedNodeCodePoints[MAX_WORD_LENGTH]; + uint16_t mergedNodeCodePointCount = 0; + mergedNodeCodePoints[mergedNodeCodePointCount++] = codePoint; do { const int nextCodePoint = hasMultipleChars @@ -102,31 +95,29 @@ namespace latinime { binaryDictionaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT; const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint); if (!isLastChar) { - additionalWordBuf[additionalSubwordLength++] = nextCodePoint; + mergedNodeCodePoints[mergedNodeCodePointCount++] = nextCodePoint; } codePoint = nextCodePoint; } while (NOT_A_CODE_POINT != codePoint); const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer( - binaryDictionaryInfo->getDictRoot(), pos) : -1; + binaryDictionaryInfo->getDictRoot(), pos) : NOT_A_PROBABILITY; pos = BinaryFormat::skipProbability(flags, pos); int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition( - binaryDictionaryInfo->getDictRoot(), flags, pos) : 0; - const int attributesPos = BinaryFormat::skipChildrenPosition(flags, pos); + binaryDictionaryInfo->getDictRoot(), flags, pos) : NOT_A_DICT_POS; + const int attributesPos = + hasShortcuts ? BinaryFormat::skipChildrenPosition(flags, pos) : NOT_A_DICT_POS; const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes( binaryDictionaryInfo->getDictRoot(), flags, pos); - if (isDicNodeFilteredOut(nodeCodePoint, pInfo, codePointsFilter)) { + if (isDicNodeFilteredOut(mergedNodeCodePoints[0], pInfo, codePointsFilter)) { return siblingPos; } - if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, nodeCodePoint)) { + if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, mergedNodeCodePoints[0])) { return siblingPos; } - const int childrenCount = hasChildren ? 
BinaryFormat::getGroupCountAndForwardPointer( - binaryDictionaryInfo->getDictRoot(), &childrenPos) : 0; - childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, siblingPos, - nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal, - hasMultipleChars, hasChildren, additionalSubwordLength, additionalWordBuf); + childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, + probability, isTerminal, hasChildren, mergedNodeCodePointCount, mergedNodeCodePoints); return siblingPos; } @@ -163,13 +154,16 @@ namespace latinime { const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { - const int terminalDepth = dicNode->getLeavingDepth(); - const int childCount = dicNode->getChildrenCount(); + if (!dicNode->hasChildren()) { + return; + } int nextPos = dicNode->getChildrenPos(); + const int childCount = BinaryFormat::getGroupCountAndForwardPointer( + binaryDictionaryInfo->getDictRoot(), &nextPos); for (int i = 0; i < childCount; i++) { const int filterSize = codePointsFilter ? codePointsFilter->size() : 0; nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo, - terminalDepth, pInfoState, pointIndex, exactOnly, codePointsFilter, pInfo, + pInfoState, pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes); if (!pInfo && filterSize > 0 && childDicNodes->exceeds(filterSize)) { // All code points have been found. 
diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.h b/native/jni/src/suggest/core/dicnode/dic_node_utils.h index e198d6181..d526975ce 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.h @@ -72,7 +72,7 @@ class DicNodeUtils { const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes); static int createAndGetLeavingChildNode(DicNode *dicNode, int pos, - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalDepth, + const BinaryDictionaryInfo *const binaryDictionaryInfo, const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes); diff --git a/native/jni/src/suggest/core/dicnode/dic_node_vector.h b/native/jni/src/suggest/core/dicnode/dic_node_vector.h index e23c411f0..9641cc19c 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_vector.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_vector.h @@ -63,16 +63,13 @@ class DicNodeVector { } void pushLeavingChild(DicNode *dicNode, const int pos, const uint8_t flags, - const int childrenPos, const int attributesPos, const int siblingPos, - const int nodeCodePoint, const int childrenCount, const int probability, - const int bigramProbability, const bool isTerminal, const bool hasMultipleChars, - const bool hasChildren, const uint16_t additionalSubwordLength, - const int *additionalSubword) { + const int childrenPos, const int attributesPos, const int probability, + const bool isTerminal, const bool hasChildren, const uint16_t mergedNodeCodePointCount, + const int *const mergedNodeCodePoints) { ASSERT(!mLock); mDicNodes.push_back(mEmptyNode); - mDicNodes.back().initAsChild(dicNode, pos, flags, childrenPos, attributesPos, siblingPos, - nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal, - 
hasMultipleChars, hasChildren, additionalSubwordLength, additionalSubword); + mDicNodes.back().initAsChild(dicNode, pos, flags, childrenPos, attributesPos, probability, + isTerminal, hasChildren, mergedNodeCodePointCount, mergedNodeCodePoints); } DicNode *operator[](const int id) { diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp index 242a9bdd6..ff304d2b2 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp @@ -150,11 +150,10 @@ int BigramDictionary::getPredictions(const int *prevWord, int prevWordLength, in int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const { if (0 >= prevWordLength) return 0; - const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); - int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength, - forceLowerCaseSearch); - + int pos = mBinaryDictionaryInfo->getStructurePolicy()->getTerminalNodePositionOfWord( + mBinaryDictionaryInfo, prevWord, prevWordLength, forceLowerCaseSearch); if (NOT_VALID_WORD == pos) return 0; + const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0; if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) { @@ -189,8 +188,8 @@ bool BigramDictionary::isValidBigram(const int *word0, int length0, const int *w int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams if (0 == pos) return false; - int nextWordPos = BinaryFormat::getTerminalPosition(mBinaryDictionaryInfo->getDictRoot(), - word1, length1, false /* forceLowerCaseSearch */); + int nextWordPos = 
mBinaryDictionaryInfo->getStructurePolicy()->getTerminalNodePositionOfWord( + mBinaryDictionaryInfo, word1, length1, false /* forceLowerCaseSearch */); if (NOT_VALID_WORD == nextWordPos) return false; for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos); diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.cpp b/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.cpp index 737df63c7..bbb4ca3f0 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.cpp @@ -22,7 +22,7 @@ namespace latinime { * Dictionary size */ // Any file smaller than this is not a dictionary. -const int BinaryDictionaryFormat::DICTIONARY_MINIMUM_SIZE = 4; +const int BinaryDictionaryFormatUtils::DICTIONARY_MINIMUM_SIZE = 4; /** * Format versions @@ -30,17 +30,18 @@ const int BinaryDictionaryFormat::DICTIONARY_MINIMUM_SIZE = 4; // Originally, format version 1 had a 16-bit magic number, then the version number `01' // then options that must be 0. Hence the first 32-bits of the format are always as follow // and it's okay to consider them a magic number as a whole. -const uint32_t BinaryDictionaryFormat::FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100; +const uint32_t BinaryDictionaryFormatUtils::FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100; // The versions of Latin IME that only handle format version 1 only test for the magic // number, so we had to change it so that version 2 files would be rejected by older // implementations. On this occasion, we made the magic number 32 bits long. 
-const uint32_t BinaryDictionaryFormat::FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; +const uint32_t BinaryDictionaryFormatUtils::FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; // Magic number (4 bytes), version (2 bytes), options (2 bytes), header size (4 bytes) = 12 -const int BinaryDictionaryFormat::FORMAT_VERSION_2_MINIMUM_SIZE = 12; +const int BinaryDictionaryFormatUtils::FORMAT_VERSION_2_MINIMUM_SIZE = 12; -/* static */ BinaryDictionaryFormat::FORMAT_VERSION BinaryDictionaryFormat::detectFormatVersion( - const uint8_t *const dict, const int dictSize) { +/* static */ BinaryDictionaryFormatUtils::FORMAT_VERSION + BinaryDictionaryFormatUtils::detectFormatVersion(const uint8_t *const dict, + const int dictSize) { // The magic number is stored big-endian. // If the dictionary is less than 4 bytes, we can't even read the magic number, so we don't // understand this format. diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.h index c0fd56111..33618b9f0 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.h +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.h @@ -31,7 +31,7 @@ namespace latinime { * reading methods and utility methods for various purposes. * On the other hand, this file deals with only about dictionary format version. 
*/ -class BinaryDictionaryFormat { +class BinaryDictionaryFormatUtils { public: // TODO: Remove obsolete version logic enum FORMAT_VERSION { @@ -43,7 +43,7 @@ class BinaryDictionaryFormat { static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize); private: - DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryFormat); + DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryFormatUtils); static const int DICTIONARY_MINIMUM_SIZE; static const uint32_t FORMAT_VERSION_1_MAGIC_NUMBER; diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_header.cpp b/native/jni/src/suggest/core/dictionary/binary_dictionary_header.cpp index 04bb81f71..91c643a5f 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_header.cpp +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_header.cpp @@ -29,12 +29,12 @@ const float BinaryDictionaryHeader::MULTI_WORD_COST_MULTIPLIER_SCALE = 100.0f; BinaryDictionaryHeader::BinaryDictionaryHeader( const BinaryDictionaryInfo *const binaryDictionaryInfo) : mBinaryDictionaryInfo(binaryDictionaryInfo), - mDictionaryFlags(BinaryDictionaryHeaderReader::getFlags(binaryDictionaryInfo)), - mSize(BinaryDictionaryHeaderReader::getHeaderSize(binaryDictionaryInfo)), + mDictionaryFlags(BinaryDictionaryHeaderReadingUtils::getFlags(binaryDictionaryInfo)), + mSize(BinaryDictionaryHeaderReadingUtils::getHeaderSize(binaryDictionaryInfo)), mMultiWordCostMultiplier(readMultiWordCostMultiplier()) {} float BinaryDictionaryHeader::readMultiWordCostMultiplier() const { - const int headerValue = BinaryDictionaryHeaderReader::readHeaderValueInt( + const int headerValue = BinaryDictionaryHeaderReadingUtils::readHeaderValueInt( mBinaryDictionaryInfo, MULTIPLE_WORDS_DEMOTION_RATE_KEY); if (headerValue == S_INT_MIN) { // not found diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_header.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_header.h index 9db000362..6dba0b266 100644 --- 
a/native/jni/src/suggest/core/dictionary/binary_dictionary_header.h +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_header.h @@ -37,15 +37,16 @@ class BinaryDictionaryHeader { } AK_FORCE_INLINE bool supportsDynamicUpdate() const { - return BinaryDictionaryHeaderReader::supportsDynamicUpdate(mDictionaryFlags); + return BinaryDictionaryHeaderReadingUtils::supportsDynamicUpdate(mDictionaryFlags); } AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { - return BinaryDictionaryHeaderReader::requiresGermanUmlautProcessing(mDictionaryFlags); + return BinaryDictionaryHeaderReadingUtils::requiresGermanUmlautProcessing(mDictionaryFlags); } AK_FORCE_INLINE bool requiresFrenchLigatureProcessing() const { - return BinaryDictionaryHeaderReader::requiresFrenchLigatureProcessing(mDictionaryFlags); + return BinaryDictionaryHeaderReadingUtils::requiresFrenchLigatureProcessing( + mDictionaryFlags); } AK_FORCE_INLINE float getMultiWordCostMultiplier() const { @@ -60,7 +61,7 @@ class BinaryDictionaryHeader { static const float MULTI_WORD_COST_MULTIPLIER_SCALE; const BinaryDictionaryInfo *const mBinaryDictionaryInfo; - const BinaryDictionaryHeaderReader::DictionaryFlags mDictionaryFlags; + const BinaryDictionaryHeaderReadingUtils::DictionaryFlags mDictionaryFlags; const int mSize; const float mMultiWordCostMultiplier; diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.cpp b/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.cpp index c09a78f03..2c9593144 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.cpp @@ -24,32 +24,33 @@ namespace latinime { -const int BinaryDictionaryHeaderReader::MAX_OPTION_KEY_LENGTH = 256; +const int BinaryDictionaryHeaderReadingUtils::MAX_OPTION_KEY_LENGTH = 256; -const int BinaryDictionaryHeaderReader::FORMAT_VERSION_1_HEADER_SIZE = 5; +const 
int BinaryDictionaryHeaderReadingUtils::FORMAT_VERSION_1_HEADER_SIZE = 5; -const int BinaryDictionaryHeaderReader::VERSION_2_MAGIC_NUMBER_SIZE = 4; -const int BinaryDictionaryHeaderReader::VERSION_2_DICTIONARY_VERSION_SIZE = 2; -const int BinaryDictionaryHeaderReader::VERSION_2_DICTIONARY_FLAG_SIZE = 2; -const int BinaryDictionaryHeaderReader::VERSION_2_DICTIONARY_HEADER_SIZE_SIZE = 4; +const int BinaryDictionaryHeaderReadingUtils::VERSION_2_MAGIC_NUMBER_SIZE = 4; +const int BinaryDictionaryHeaderReadingUtils::VERSION_2_DICTIONARY_VERSION_SIZE = 2; +const int BinaryDictionaryHeaderReadingUtils::VERSION_2_DICTIONARY_FLAG_SIZE = 2; +const int BinaryDictionaryHeaderReadingUtils::VERSION_2_DICTIONARY_HEADER_SIZE_SIZE = 4; -const BinaryDictionaryHeaderReader::DictionaryFlags BinaryDictionaryHeaderReader::NO_FLAGS = 0; +const BinaryDictionaryHeaderReadingUtils::DictionaryFlags + BinaryDictionaryHeaderReadingUtils::NO_FLAGS = 0; // Flags for special processing // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or // something very bad (like, the apocalypse) will happen. Please update both at the same time. 
-const BinaryDictionaryHeaderReader::DictionaryFlags - BinaryDictionaryHeaderReader::GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; -const BinaryDictionaryHeaderReader::DictionaryFlags - BinaryDictionaryHeaderReader::SUPPORTS_DYNAMIC_UPDATE_FLAG = 0x2; -const BinaryDictionaryHeaderReader::DictionaryFlags - BinaryDictionaryHeaderReader::FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; +const BinaryDictionaryHeaderReadingUtils::DictionaryFlags + BinaryDictionaryHeaderReadingUtils::GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; +const BinaryDictionaryHeaderReadingUtils::DictionaryFlags + BinaryDictionaryHeaderReadingUtils::SUPPORTS_DYNAMIC_UPDATE_FLAG = 0x2; +const BinaryDictionaryHeaderReadingUtils::DictionaryFlags + BinaryDictionaryHeaderReadingUtils::FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; -/* static */ int BinaryDictionaryHeaderReader::getHeaderSize( +/* static */ int BinaryDictionaryHeaderReadingUtils::getHeaderSize( const BinaryDictionaryInfo *const binaryDictionaryInfo) { switch (binaryDictionaryInfo->getFormat()) { - case BinaryDictionaryFormat::VERSION_1: + case BinaryDictionaryFormatUtils::VERSION_1: return FORMAT_VERSION_1_HEADER_SIZE; - case BinaryDictionaryFormat::VERSION_2: + case BinaryDictionaryFormatUtils::VERSION_2: // See the format of the header in the comment in // BinaryDictionaryFormatUtils::detectFormatVersion() return ByteArrayUtils::readUint32(binaryDictionaryInfo->getDictBuf(), @@ -60,12 +61,13 @@ const BinaryDictionaryHeaderReader::DictionaryFlags } } -/* static */ BinaryDictionaryHeaderReader::DictionaryFlags BinaryDictionaryHeaderReader::getFlags( - const BinaryDictionaryInfo *const binaryDictionaryInfo) { +/* static */ BinaryDictionaryHeaderReadingUtils::DictionaryFlags + BinaryDictionaryHeaderReadingUtils::getFlags( + const BinaryDictionaryInfo *const binaryDictionaryInfo) { switch (binaryDictionaryInfo->getFormat()) { - case BinaryDictionaryFormat::VERSION_1: + case BinaryDictionaryFormatUtils::VERSION_1: return NO_FLAGS; - case BinaryDictionaryFormat::VERSION_2: + 
case BinaryDictionaryFormatUtils::VERSION_2: return ByteArrayUtils::readUint16(binaryDictionaryInfo->getDictBuf(), VERSION_2_MAGIC_NUMBER_SIZE + VERSION_2_DICTIONARY_VERSION_SIZE); default: @@ -74,7 +76,7 @@ const BinaryDictionaryHeaderReader::DictionaryFlags } // Returns if the key is found or not and reads the found value into outValue. -/* static */ bool BinaryDictionaryHeaderReader::readHeaderValue( +/* static */ bool BinaryDictionaryHeaderReadingUtils::readHeaderValue( const BinaryDictionaryInfo *const binaryDictionaryInfo, const char *const key, int *outValue, const int outValueSize) { if (outValueSize <= 0 || !hasHeaderAttributes(binaryDictionaryInfo->getFormat())) { @@ -97,7 +99,7 @@ const BinaryDictionaryHeaderReader::DictionaryFlags return false; } -/* static */ int BinaryDictionaryHeaderReader::readHeaderValueInt( +/* static */ int BinaryDictionaryHeaderReadingUtils::readHeaderValueInt( const BinaryDictionaryInfo *const binaryDictionaryInfo, const char *const key) { const int bufferSize = LARGEST_INT_DIGIT_COUNT; int intBuffer[bufferSize]; diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.h index 6e9dca73c..49ed2b9cc 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.h +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.h @@ -26,7 +26,7 @@ namespace latinime { class BinaryDictionaryInfo; -class BinaryDictionaryHeaderReader { +class BinaryDictionaryHeaderReadingUtils { public: typedef uint16_t DictionaryFlags; @@ -49,10 +49,10 @@ class BinaryDictionaryHeaderReader { } static AK_FORCE_INLINE bool hasHeaderAttributes( - const BinaryDictionaryFormat::FORMAT_VERSION format) { + const BinaryDictionaryFormatUtils::FORMAT_VERSION format) { // Only format 2 and above have header attributes as {key,value} string pairs. 
switch (format) { - case BinaryDictionaryFormat::VERSION_2: + case BinaryDictionaryFormatUtils::VERSION_2: return true; break; default: @@ -61,9 +61,9 @@ class BinaryDictionaryHeaderReader { } static AK_FORCE_INLINE int getHeaderOptionsPosition( - const BinaryDictionaryFormat::FORMAT_VERSION format) { + const BinaryDictionaryFormatUtils::FORMAT_VERSION format) { switch (format) { - case BinaryDictionaryFormat::VERSION_2: + case BinaryDictionaryFormatUtils::VERSION_2: return VERSION_2_MAGIC_NUMBER_SIZE + VERSION_2_DICTIONARY_VERSION_SIZE + VERSION_2_DICTIONARY_FLAG_SIZE + VERSION_2_DICTIONARY_HEADER_SIZE_SIZE; break; @@ -80,7 +80,7 @@ class BinaryDictionaryHeaderReader { const BinaryDictionaryInfo *const binaryDictionaryInfo, const char *const key); private: - DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryHeaderReader); + DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryHeaderReadingUtils); static const int FORMAT_VERSION_1_HEADER_SIZE; diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h index e0b583588..7cb31440a 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h @@ -22,19 +22,21 @@ #include "defines.h" #include "suggest/core/dictionary/binary_dictionary_format_utils.h" #include "suggest/core/dictionary/binary_dictionary_header.h" +#include "suggest/policyimpl/dictionary/dictionary_structure_policy_factory.h" namespace latinime { -class BinaryDictionaryHeader; - class BinaryDictionaryInfo { public: BinaryDictionaryInfo(const uint8_t *const dictBuf, const int dictSize, const int mmapFd, const int dictBufOffset, const bool isUpdatable) : mDictBuf(dictBuf), mDictSize(dictSize), mMmapFd(mmapFd), mDictBufOffset(dictBufOffset), mIsUpdatable(isUpdatable), - mDictionaryFormat(BinaryDictionaryFormat::detectFormatVersion(mDictBuf, mDictSize)), - mDictionaryHeader(this), mDictRoot(mDictBuf + 
mDictionaryHeader.getSize()) {} + mDictionaryFormat(BinaryDictionaryFormatUtils::detectFormatVersion( + mDictBuf, mDictSize)), + mDictionaryHeader(this), mDictRoot(mDictBuf + mDictionaryHeader.getSize()), + mStructurePolicy(DictionaryStructurePolicyFactory::getDictionaryStructurePolicy( + mDictionaryFormat)) {} AK_FORCE_INLINE const uint8_t *getDictBuf() const { return mDictBuf; @@ -56,14 +58,10 @@ class BinaryDictionaryInfo { return mDictRoot; } - AK_FORCE_INLINE BinaryDictionaryFormat::FORMAT_VERSION getFormat() const { + AK_FORCE_INLINE BinaryDictionaryFormatUtils::FORMAT_VERSION getFormat() const { return mDictionaryFormat; } - AK_FORCE_INLINE int getRootPosition() const { - return 0; - } - AK_FORCE_INLINE const BinaryDictionaryHeader *getHeader() const { return &mDictionaryHeader; } @@ -74,6 +72,10 @@ class BinaryDictionaryInfo { return mIsUpdatable && isUpdatableDictionaryFormat; } + AK_FORCE_INLINE const DictionaryStructurePolicy *getStructurePolicy() const { + return mStructurePolicy; + } + private: DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryInfo); @@ -82,9 +84,10 @@ class BinaryDictionaryInfo { const int mMmapFd; const int mDictBufOffset; const bool mIsUpdatable; - const BinaryDictionaryFormat::FORMAT_VERSION mDictionaryFormat; + const BinaryDictionaryFormatUtils::FORMAT_VERSION mDictionaryFormat; const BinaryDictionaryHeader mDictionaryHeader; const uint8_t *const mDictRoot; + const DictionaryStructurePolicy *const mStructurePolicy; }; } #endif /* LATINIME_BINARY_DICTIONARY_INFO_H */ diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index 51f23dc55..675b54972 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -83,27 +83,14 @@ int Dictionary::getBigrams(const int *word, int length, int *inputCodePoints, in } int Dictionary::getProbability(const int *word, int length) const { - const uint8_t *const root = 
mBinaryDictionaryInfo.getDictRoot(); - int pos = BinaryFormat::getTerminalPosition(root, word, length, + const DictionaryStructurePolicy *const structurePolicy = + mBinaryDictionaryInfo.getStructurePolicy(); + int pos = structurePolicy->getTerminalNodePositionOfWord(&mBinaryDictionaryInfo, word, length, false /* forceLowerCaseSearch */); if (NOT_VALID_WORD == pos) { return NOT_A_PROBABILITY; } - const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) { - // If this is not a word, or if it's a blacklisted entry, it should behave as - // having no probability outside of the suggestion process (where it should be used - // for shortcuts). - return NOT_A_PROBABILITY; - } - const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); - if (hasMultipleChars) { - pos = BinaryFormat::skipOtherCharacters(root, pos); - } else { - BinaryFormat::getCodePointAndForwardPointer(root, &pos); - } - const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(root, pos); - return unigramProbability; + return structurePolicy->getUnigramProbability(&mBinaryDictionaryInfo, pos); } bool Dictionary::isValidBigram(const int *word0, int length0, const int *word1, int length1) const { diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_policy.h new file mode 100644 index 000000000..ab42c13b4 --- /dev/null +++ b/native/jni/src/suggest/core/policy/dictionary_structure_policy.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_STRUCTURE_POLICY_H +#define LATINIME_DICTIONARY_STRUCTURE_POLICY_H + +#include "defines.h" + +namespace latinime { + +class BinaryDictionaryInfo; +class DicNode; +class DicNodeVector; + +/* + * This class abstracts the structure of dictionaries. + * Implement this policy to support additional dictionaries. + */ +class DictionaryStructurePolicy { + public: + // This provides a filtering method for filtering new nodes. + class NodeFilter { + public: + virtual bool isFilteredOut(const int codePoint) const = 0; + + protected: + NodeFilter() {} + virtual ~NodeFilter() {} + + private: + DISALLOW_COPY_AND_ASSIGN(NodeFilter); + }; + + virtual int getRootPosition() const = 0; + + virtual void createAndGetAllChildNodes(const DicNode *const dicNode, + const BinaryDictionaryInfo *const binaryDictionaryInfo, + const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const = 0; + + virtual void getWordAtPosition(const BinaryDictionaryInfo *const binaryDictionaryInfo, + const int terminalNodePos, const int maxDepth, int *const outWord, + int *const outUnigramProbability) const = 0; + + virtual int getTerminalNodePositionOfWord( + const BinaryDictionaryInfo *const binaryDictionaryInfo, const int *const inWord, + const int length, const bool forceLowerCaseSearch) const = 0; + + virtual int getUnigramProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo, + const int nodePos) const = 0; + + protected: + DictionaryStructurePolicy() {} + virtual ~DictionaryStructurePolicy() {} + + private: +
DISALLOW_COPY_AND_ASSIGN(DictionaryStructurePolicy); +}; +} // namespace latinime +#endif /* LATINIME_DICTIONARY_STRUCTURE_POLICY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_policy_factory.h b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_policy_factory.h new file mode 100644 index 000000000..5070651cb --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_policy_factory.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_DICTIONARY_STRUCTURE_POLICY_FACTORY_H +#define LATINIME_DICTIONARY_STRUCTURE_POLICY_FACTORY_H + +#include "defines.h" +#include "suggest/core/dictionary/binary_dictionary_format_utils.h" +#include "suggest/policyimpl/dictionary/patricia_trie_policy.h" + +namespace latinime { + +class DictionaryStructurePolicy; + +class DictionaryStructurePolicyFactory { + public: + static const DictionaryStructurePolicy *getDictionaryStructurePolicy( + const BinaryDictionaryFormatUtils::FORMAT_VERSION dictionaryFormat) { + switch (dictionaryFormat) { + case BinaryDictionaryFormatUtils::VERSION_1: + // Fall through + case BinaryDictionaryFormatUtils::VERSION_2: + return PatriciaTriePolicy::getInstance(); + default: + ASSERT(false); + return 0; + } + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryStructurePolicyFactory); +}; +} // namespace latinime +#endif // LATINIME_DICTIONARY_STRUCTURE_POLICY_FACTORY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp new file mode 100644 index 000000000..c995af98a --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#include "suggest/policyimpl/dictionary/patricia_trie_policy.h" + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/dictionary/binary_dictionary_info.h" +#include "suggest/core/dictionary/binary_format.h" + +namespace latinime { + +const PatriciaTriePolicy PatriciaTriePolicy::sInstance; + +void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, + const BinaryDictionaryInfo *const binaryDictionaryInfo, + const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const { + // TODO: Move children creating methods from DicNodeUtils. +} + +void PatriciaTriePolicy::getWordAtPosition(const BinaryDictionaryInfo *const binaryDictionaryInfo, + const int terminalNodePos, const int maxDepth, int *const outWord, + int *const outUnigramProbability) const { + BinaryFormat::getWordAtAddress(binaryDictionaryInfo->getDictRoot(), terminalNodePos, + maxDepth, outWord, outUnigramProbability); +} + +int PatriciaTriePolicy::getTerminalNodePositionOfWord( + const BinaryDictionaryInfo *const binaryDictionaryInfo, const int *const inWord, + const int length, const bool forceLowerCaseSearch) const { + return BinaryFormat::getTerminalPosition(binaryDictionaryInfo->getDictRoot(), inWord, + length, forceLowerCaseSearch); +} + +int PatriciaTriePolicy::getUnigramProbability( + const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos) const { + const uint8_t *const root = binaryDictionaryInfo->getDictRoot(); + int pos = nodePos; + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); + if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) { + // If this is not a word, or if it's a blacklisted entry, it should behave as + // having no probability outside of the suggestion process (where it should be used + // for shortcuts).
+ return NOT_A_PROBABILITY; + } + const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); + if (hasMultipleChars) { + pos = BinaryFormat::skipOtherCharacters(root, pos); + } else { + BinaryFormat::getCodePointAndForwardPointer(root, &pos); + } + return BinaryFormat::readProbabilityWithoutMovingPointer(root, pos); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h new file mode 100644 index 000000000..9b9338145 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_PATRICIA_TRIE_POLICY_H +#define LATINIME_PATRICIA_TRIE_POLICY_H + +#include "defines.h" +#include "suggest/core/policy/dictionary_structure_policy.h" + +namespace latinime { + +class PatriciaTriePolicy : public DictionaryStructurePolicy { + public: + static AK_FORCE_INLINE const PatriciaTriePolicy *getInstance() { + return &sInstance; + } + + AK_FORCE_INLINE int getRootPosition() const { + return 0; + } + + void createAndGetAllChildNodes(const DicNode *const dicNode, + const BinaryDictionaryInfo *const binaryDictionaryInfo, + const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const; + + void getWordAtPosition(const BinaryDictionaryInfo *const binaryDictionaryInfo, + const int terminalNodePos, const int maxDepth, int *const outWord, + int *const outUnigramProbability) const; + + int getTerminalNodePositionOfWord( + const BinaryDictionaryInfo *const binaryDictionaryInfo, const int *const inWord, + const int length, const bool forceLowerCaseSearch) const; + + int getUnigramProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo, + const int nodePos) const; + + private: + DISALLOW_COPY_AND_ASSIGN(PatriciaTriePolicy); + static const PatriciaTriePolicy sInstance; + + PatriciaTriePolicy() {} + ~PatriciaTriePolicy() {} +}; +} // namespace latinime +#endif // LATINIME_PATRICIA_TRIE_POLICY_H |