diff options
Diffstat (limited to 'native/jni/src')
143 files changed, 8757 insertions, 4329 deletions
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index 742e388e4..22cc4c02b 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -87,12 +87,21 @@ AK_FORCE_INLINE static int intArrayToCharArray(const int *const source, const in } #if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG) +#if defined(__ANDROID__) #include <android/log.h> +#endif // defined(__ANDROID__) #ifndef LOG_TAG #define LOG_TAG "LatinIME: " #endif // LOG_TAG + +#if defined(HOST_TOOL) +#include <stdio.h> +#define AKLOGE(fmt, ...) printf(fmt "\n", ##__VA_ARGS__) +#define AKLOGI(fmt, ...) printf(fmt "\n", ##__VA_ARGS__) +#else // defined(HOST_TOOL) #define AKLOGE(fmt, ...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, fmt, ##__VA_ARGS__) #define AKLOGI(fmt, ...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, fmt, ##__VA_ARGS__) +#endif // defined(HOST_TOOL) #define DUMP_RESULT(words, frequencies) do { dumpResult(words, frequencies); } while (0) #define DUMP_WORD(word, length) do { dumpWord(word, length); } while (0) @@ -298,10 +307,11 @@ static inline void prof_out(void) { #define NOT_AN_INDEX (-1) #define NOT_A_PROBABILITY (-1) #define NOT_A_DICT_POS (S_INT_MIN) +#define NOT_A_TIMESTAMP (-1) // A special value to mean the first word confidence makes no sense in this case, // e.g. this is not a multi-word suggestion. -#define NOT_A_FIRST_WORD_CONFIDENCE (S_INT_MAX) +#define NOT_A_FIRST_WORD_CONFIDENCE (S_INT_MIN) // How high the confidence needs to be for us to auto-commit. Arbitrary. // This needs to be the same as CONFIDENCE_FOR_AUTO_COMMIT in BinaryDictionary.java #define CONFIDENCE_FOR_AUTO_COMMIT (1000000) @@ -341,12 +351,21 @@ template<typename T> AK_FORCE_INLINE const T &max(const T &a, const T &b) { retu #define INPUTLENGTH_FOR_DEBUG (-1) #define MIN_OUTPUT_INDEX_FOR_DEBUG (-1) -#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&); \ +#define DISALLOW_DEFAULT_CONSTRUCTOR(TypeName) \ + TypeName() + +#define DISALLOW_COPY_CONSTRUCTOR(TypeName) \ + TypeName(const TypeName&) + +#define DISALLOW_ASSIGNMENT_OPERATOR(TypeName) \ void operator=(const TypeName&) +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + DISALLOW_COPY_CONSTRUCTOR(TypeName); \ + DISALLOW_ASSIGNMENT_OPERATOR(TypeName) + #define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ - TypeName(); \ + DISALLOW_DEFAULT_CONSTRUCTOR(TypeName); \ DISALLOW_COPY_AND_ASSIGN(TypeName) // Used as a return value for character comparison @@ -392,24 +411,4 @@ typedef enum { // Create new word with space substitution CT_NEW_WORD_SPACE_SUBSTITUTION, } CorrectionType; - -// ErrorType is mainly decided by CorrectionType but it is also depending on if -// the correction has really been performed or not. -typedef enum { - // Substitution, omission and transposition - ET_EDIT_CORRECTION, - // Proximity error - ET_PROXIMITY_CORRECTION, - // Completion - ET_COMPLETION, - // New word - // TODO: Remove. - // A new word error should be an edit correction error or a proximity correction error. - ET_NEW_WORD, - // Treat error as an intentional omission when the CorrectionType is omission and the node can - // be intentional omission. - ET_INTENTIONAL_OMISSION, - // Not treated as an error. Tracked for checking exact match - ET_NOT_AN_ERROR -} ErrorType; #endif // LATINIME_DEFINES_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h index 49cfdecac..558667eb0 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node.h +++ b/native/jni/src/suggest/core/dicnode/dic_node.h @@ -23,6 +23,7 @@ #include "suggest/core/dicnode/internal/dic_node_state.h" #include "suggest/core/dicnode/internal/dic_node_properties.h" #include "suggest/core/dictionary/digraph_utils.h" +#include "suggest/core/dictionary/error_type_utils.h" #include "utils/char_utils.h" #if DEBUG_DICT @@ -99,7 +100,7 @@ class DicNode { virtual ~DicNode() {} // Init for copy - void initByCopy(const DicNode *dicNode) { + void initByCopy(const DicNode *const dicNode) { mIsUsed = true; mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; mDicNodeProperties.init(&dicNode->mDicNodeProperties); @@ -107,25 +108,25 @@ class DicNode { PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); } - // Init for root with prevWordNodePos which is used for bigram - void initAsRoot(const int rootGroupPos, const int prevWordNodePos) { + // Init for root with prevWordPtNodePos which is used for bigram + void initAsRoot(const int rootPtNodeArrayPos, const int prevWordPtNodePos) { mIsUsed = true; mIsCachedForNextSuggestion = false; mDicNodeProperties.init( - NOT_A_DICT_POS /* pos */, rootGroupPos, NOT_A_CODE_POINT /* nodeCodePoint */, + NOT_A_DICT_POS /* pos */, rootPtNodeArrayPos, NOT_A_CODE_POINT /* nodeCodePoint */, NOT_A_PROBABILITY /* probability */, false /* isTerminal */, true /* hasChildren */, false /* isBlacklistedOrNotAWord */, 0 /* depth */, 0 /* terminalDepth */); - mDicNodeState.init(prevWordNodePos); + mDicNodeState.init(prevWordPtNodePos); PROF_NODE_RESET(mProfiler); } // Init for root with previous word - void initAsRootWithPreviousWord(DicNode *dicNode, const int rootGroupPos) { + void initAsRootWithPreviousWord(const DicNode *const dicNode, const int rootPtNodeArrayPos) { mIsUsed = true; mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; mDicNodeProperties.init( - NOT_A_DICT_POS /* pos */, rootGroupPos, NOT_A_CODE_POINT /* nodeCodePoint */, + NOT_A_DICT_POS /* pos */, rootPtNodeArrayPos, NOT_A_CODE_POINT /* nodeCodePoint */, NOT_A_PROBABILITY /* probability */, false /* isTerminal */, true /* hasChildren */, false /* isBlacklistedOrNotAWord */, 0 /* depth */, 0 /* terminalDepth */); @@ -138,7 +139,7 @@ class DicNode { mDicNodeState.mDicNodeStatePrevWord.init( dicNode->mDicNodeState.mDicNodeStatePrevWord.getPrevWordCount() + 1, dicNode->mDicNodeProperties.getProbability(), - dicNode->mDicNodeProperties.getPos(), + dicNode->mDicNodeProperties.getPtNodePos(), dicNode->mDicNodeState.mDicNodeStatePrevWord.mPrevWord, dicNode->mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(), dicNode->getOutputWordBuf(), @@ -148,26 +149,27 @@ class DicNode { PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); } - void initAsPassingChild(DicNode *parentNode) { + void initAsPassingChild(DicNode *parentDicNode) { mIsUsed = true; - mIsCachedForNextSuggestion = parentNode->mIsCachedForNextSuggestion; - const int c = parentNode->getNodeTypedCodePoint(); - mDicNodeProperties.init(&parentNode->mDicNodeProperties, c); - mDicNodeState.init(&parentNode->mDicNodeState); - PROF_NODE_COPY(&parentNode->mProfiler, mProfiler); + mIsCachedForNextSuggestion = parentDicNode->mIsCachedForNextSuggestion; + const int parentCodePoint = parentDicNode->getNodeTypedCodePoint(); + mDicNodeProperties.init(&parentDicNode->mDicNodeProperties, parentCodePoint); + mDicNodeState.init(&parentDicNode->mDicNodeState); + PROF_NODE_COPY(&parentDicNode->mProfiler, mProfiler); } - void initAsChild(const DicNode *const dicNode, const int pos, const int childrenPos, - const int probability, const bool isTerminal, const bool hasChildren, - const bool isBlacklistedOrNotAWord, const uint16_t mergedNodeCodePointCount, - const int *const mergedNodeCodePoints) { + void initAsChild(const DicNode *const dicNode, const int ptNodePos, + const int childrenPtNodeArrayPos, const int probability, const bool isTerminal, + const bool hasChildren, const bool isBlacklistedOrNotAWord, + const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) { mIsUsed = true; uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1); mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; const uint16_t newLeavingDepth = static_cast<uint16_t>( dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount); - mDicNodeProperties.init(pos, childrenPos, mergedNodeCodePoints[0], probability, - isTerminal, hasChildren, isBlacklistedOrNotAWord, newDepth, newLeavingDepth); + mDicNodeProperties.init(ptNodePos, childrenPtNodeArrayPos, mergedNodeCodePoints[0], + probability, isTerminal, hasChildren, isBlacklistedOrNotAWord, newDepth, + newLeavingDepth); mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount, mergedNodeCodePoints); PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); @@ -234,7 +236,7 @@ class DicNode { } bool isFirstWord() const { - return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos() == NOT_A_DICT_POS; + return mDicNodeState.mDicNodeStatePrevWord.getPrevWordPtNodePos() == NOT_A_DICT_POS; } bool isCompletion(const int inputSize) const { @@ -246,29 +248,30 @@ class DicNode { } // Used to get bigram probability in DicNodeUtils - int getPos() const { - return mDicNodeProperties.getPos(); + int getPtNodePos() const { + return mDicNodeProperties.getPtNodePos(); } // Used to get bigram probability in DicNodeUtils - int getPrevWordPos() const { - return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos(); + int getPrevWordTerminalPtNodePos() const { + return mDicNodeState.mDicNodeStatePrevWord.getPrevWordPtNodePos(); } // Used in DicNodeUtils - int getChildrenPos() const { - return mDicNodeProperties.getChildrenPos(); + int getChildrenPtNodeArrayPos() const { + return mDicNodeProperties.getChildrenPtNodeArrayPos(); } int getProbability() const { return mDicNodeProperties.getProbability(); } - AK_FORCE_INLINE bool isTerminalWordNode() const { - const bool isTerminalNodes = mDicNodeProperties.isTerminal(); - const int currentNodeDepth = getNodeCodePointCount(); - const int terminalNodeDepth = mDicNodeProperties.getLeavingDepth(); - return isTerminalNodes && currentNodeDepth > 0 && currentNodeDepth == terminalNodeDepth; + AK_FORCE_INLINE bool isTerminalDicNode() const { + const bool isTerminalPtNode = mDicNodeProperties.isTerminal(); + const int currentDicNodeDepth = getNodeCodePointCount(); + const int terminalDicNodeDepth = mDicNodeProperties.getLeavingDepth(); + return isTerminalPtNode && currentDicNodeDepth > 0 + && currentDicNodeDepth == terminalDicNodeDepth; } bool shouldBeFilteredBySafetyNetForBigram() const { @@ -278,6 +281,13 @@ class DicNode { return !(currentDepth > 0 && (currentDepth != 1 || prevWordLen != 1)); } + bool hasMatchedOrProximityCodePoints() const { + // This DicNode does not have matched or proximity code points when all code points have + // been handled as edit corrections so far. + return mDicNodeState.mDicNodeStateScoring.getEditCorrectionCount() + < getNodeCodePointCount(); + } + bool isTotalInputSizeExceedingLimit() const { const int prevWordsLen = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(); const int currentWordDepth = getNodeCodePointCount(); @@ -374,8 +384,8 @@ class DicNode { } // Used to commit input partially - int getPrevWordNodePos() const { - return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos(); + int getPrevWordPtNodePos() const { + return mDicNodeState.mDicNodeStatePrevWord.getPrevWordPtNodePos(); } AK_FORCE_INLINE const int *getOutputWordBuf() const { @@ -410,7 +420,7 @@ class DicNode { // TODO: Remove once touch path is merged into ProximityInfoState // Note: Returned codepoint may be a digraph codepoint if the node is in a composite glyph. int getNodeCodePoint() const { - const int codePoint = mDicNodeProperties.getNodeCodePoint(); + const int codePoint = mDicNodeProperties.getDicNodeCodePoint(); const DigraphUtils::DigraphCodePointIndex digraphIndex = mDicNodeState.mDicNodeStateScoring.getDigraphIndex(); if (digraphIndex == DigraphUtils::NOT_A_DIGRAPH_INDEX) { @@ -423,8 +433,8 @@ class DicNode { // Utils for cost calculation // //////////////////////////////// AK_FORCE_INLINE bool isSameNodeCodePoint(const DicNode *const dicNode) const { - return mDicNodeProperties.getNodeCodePoint() - == dicNode->mDicNodeProperties.getNodeCodePoint(); + return mDicNodeProperties.getDicNodeCodePoint() + == dicNode->mDicNodeProperties.getDicNodeCodePoint(); } // TODO: remove @@ -484,8 +494,8 @@ class DicNode { mDicNodeState.mDicNodeStateScoring.advanceDigraphIndex(); } - bool isExactMatch() const { - return mDicNodeState.mDicNodeStateScoring.isExactMatch(); + ErrorTypeUtils::ErrorType getContainedErrorTypes() const { + return mDicNodeState.mDicNodeStateScoring.getContainedErrorTypes(); } bool isBlacklistedOrNotAWord() const { @@ -526,8 +536,8 @@ class DicNode { return false; } // Promote exact matches to prevent them from being pruned. - const bool leftExactMatch = isExactMatch(); - const bool rightExactMatch = right->isExactMatch(); + const bool leftExactMatch = ErrorTypeUtils::isExactMatch(getContainedErrorTypes()); + const bool rightExactMatch = ErrorTypeUtils::isExactMatch(right->getContainedErrorTypes()); if (leftExactMatch != rightExactMatch) { return leftExactMatch; } @@ -574,7 +584,8 @@ class DicNode { // Caveat: Must not be called outside Weighting // This restriction is guaranteed by "friend" AK_FORCE_INLINE void addCost(const float spatialCost, const float languageCost, - const bool doNormalization, const int inputSize, const ErrorType errorType) { + const bool doNormalization, const int inputSize, + const ErrorTypeUtils::ErrorType errorType) { if (DEBUG_GEO_FULL) { LOGI_SHOW_ADD_COST_PROP; } diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index ec65114c7..71bcab6cb 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -22,7 +22,6 @@ #include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/core/dictionary/multi_bigram_map.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "utils/char_utils.h" namespace latinime { @@ -32,19 +31,20 @@ namespace latinime { /* static */ void DicNodeUtils::initAsRoot( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const int prevWordNodePos, DicNode *const newRootNode) { - newRootNode->initAsRoot(dictionaryStructurePolicy->getRootPosition(), prevWordNodePos); + const int prevWordPtNodePos, DicNode *const newRootDicNode) { + newRootDicNode->initAsRoot(dictionaryStructurePolicy->getRootPosition(), prevWordPtNodePos); } /*static */ void DicNodeUtils::initAsRootWithPreviousWord( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - DicNode *const prevWordLastNode, DicNode *const newRootNode) { - newRootNode->initAsRootWithPreviousWord( - prevWordLastNode, dictionaryStructurePolicy->getRootPosition()); + const DicNode *const prevWordLastDicNode, DicNode *const newRootDicNode) { + newRootDicNode->initAsRootWithPreviousWord( + prevWordLastDicNode, dictionaryStructurePolicy->getRootPosition()); } -/* static */ void DicNodeUtils::initByCopy(DicNode *srcNode, DicNode *destNode) { - destNode->initByCopy(srcNode); +/* static */ void DicNodeUtils::initByCopy(const DicNode *const srcDicNode, + DicNode *const destDicNode) { + destDicNode->initByCopy(srcDicNode); } /////////////////////////////////// @@ -52,14 +52,14 @@ namespace latinime { /////////////////////////////////// /* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - DicNodeVector *childDicNodes) { + DicNodeVector *const childDicNodes) { if (dicNode->isTotalInputSizeExceedingLimit()) { return; } if (!dicNode->isLeavingNode()) { childDicNodes->pushPassingChild(dicNode); } else { - dictionaryStructurePolicy->createAndGetAllChildNodes(dicNode, childDicNodes); + dictionaryStructurePolicy->createAndGetAllChildDicNodes(dicNode, childDicNodes); } } @@ -71,11 +71,11 @@ namespace latinime { */ /* static */ float DicNodeUtils::getBigramNodeImprobability( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const DicNode *const node, MultiBigramMap *multiBigramMap) { - if (node->hasMultipleWords() && !node->isValidMultipleWordSuggestion()) { + const DicNode *const dicNode, MultiBigramMap *const multiBigramMap) { + if (dicNode->hasMultipleWords() && !dicNode->isValidMultipleWordSuggestion()) { return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); } - const int probability = getBigramNodeProbability(dictionaryStructurePolicy, node, + const int probability = getBigramNodeProbability(dictionaryStructurePolicy, dicNode, multiBigramMap); // TODO: This equation to calculate the improbability looks unreasonable. Investigate this. const float cost = static_cast<float>(MAX_PROBABILITY - probability) @@ -85,19 +85,19 @@ namespace latinime { /* static */ int DicNodeUtils::getBigramNodeProbability( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const DicNode *const node, MultiBigramMap *multiBigramMap) { - const int unigramProbability = node->getProbability(); - const int wordPos = node->getPos(); - const int prevWordPos = node->getPrevWordPos(); - if (NOT_A_DICT_POS == wordPos || NOT_A_DICT_POS == prevWordPos) { + const DicNode *const dicNode, MultiBigramMap *const multiBigramMap) { + const int unigramProbability = dicNode->getProbability(); + const int ptNodePos = dicNode->getPtNodePos(); + const int prevWordTerminalPtNodePos = dicNode->getPrevWordTerminalPtNodePos(); + if (NOT_A_DICT_POS == ptNodePos || NOT_A_DICT_POS == prevWordTerminalPtNodePos) { // Note: Normally wordPos comes from the dictionary and should never equal // NOT_A_VALID_WORD_POS. return dictionaryStructurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); } if (multiBigramMap) { - return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, prevWordPos, - wordPos, unigramProbability); + return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, + prevWordTerminalPtNodePos, ptNodePos, unigramProbability); } return dictionaryStructurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); @@ -109,7 +109,7 @@ namespace latinime { // TODO: Move to char_utils? /* static */ int DicNodeUtils::appendTwoWords(const int *const src0, const int16_t length0, - const int *const src1, const int16_t length1, int *dest) { + const int *const src1, const int16_t length1, int *const dest) { int actualLength0 = 0; for (int i = 0; i < length0; ++i) { if (src0[i] == 0) { @@ -118,7 +118,7 @@ namespace latinime { actualLength0 = i + 1; } actualLength0 = min(actualLength0, MAX_WORD_LENGTH); - memcpy(dest, src0, actualLength0 * sizeof(dest[0])); + memmove(dest, src0, actualLength0 * sizeof(dest[0])); if (!src1 || length1 == 0) { return actualLength0; } @@ -130,7 +130,7 @@ namespace latinime { actualLength1 = i + 1; } actualLength1 = min(actualLength1, MAX_WORD_LENGTH - actualLength0); - memcpy(&dest[actualLength0], src1, actualLength1 * sizeof(dest[0])); + memmove(&dest[actualLength0], src1, actualLength1 * sizeof(dest[0])); return actualLength0 + actualLength1; } } // namespace latinime diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.h b/native/jni/src/suggest/core/dicnode/dic_node_utils.h index 3fb351a61..3f1514a52 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.h @@ -31,20 +31,20 @@ class MultiBigramMap; class DicNodeUtils { public: static int appendTwoWords(const int *src0, const int16_t length0, const int *src1, - const int16_t length1, int *dest); + const int16_t length1, int *const dest); static void initAsRoot( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const int prevWordNodePos, DicNode *newRootNode); + const int prevWordPtNodePos, DicNode *const newRootDicNode); static void initAsRootWithPreviousWord( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - DicNode *prevWordLastNode, DicNode *newRootNode); - static void initByCopy(DicNode *srcNode, DicNode *destNode); + const DicNode *const prevWordLastDicNode, DicNode *const newRootDicNode); + static void initByCopy(const DicNode *const srcDicNode, DicNode *const destDicNode); static void getAllChildDicNodes(DicNode *dicNode, const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, DicNodeVector *childDicNodes); static float getBigramNodeImprobability( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const DicNode *const node, MultiBigramMap *const multiBigramMap); + const DicNode *const dicNode, MultiBigramMap *const multiBigramMap); private: DISALLOW_IMPLICIT_CONSTRUCTORS(DicNodeUtils); @@ -53,7 +53,7 @@ class DicNodeUtils { static int getBigramNodeProbability( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const DicNode *const node, MultiBigramMap *multiBigramMap); + const DicNode *const dicNode, MultiBigramMap *const multiBigramMap); }; } // namespace latinime #endif // LATINIME_DIC_NODE_UTILS_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_vector.h b/native/jni/src/suggest/core/dicnode/dic_node_vector.h index 42addae8d..9364e7751 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_vector.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_vector.h @@ -62,14 +62,14 @@ class DicNodeVector { mDicNodes.back().initAsPassingChild(dicNode); } - void pushLeavingChild(const DicNode *const dicNode, const int pos, const int childrenPos, - const int probability, const bool isTerminal, const bool hasChildren, - const bool isBlacklistedOrNotAWord, const uint16_t mergedNodeCodePointCount, - const int *const mergedNodeCodePoints) { + void pushLeavingChild(const DicNode *const dicNode, const int ptNodePos, + const int childrenPtNodeArrayPos, const int probability, const bool isTerminal, + const bool hasChildren, const bool isBlacklistedOrNotAWord, + const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) { ASSERT(!mLock); mDicNodes.push_back(mEmptyNode); - mDicNodes.back().initAsChild(dicNode, pos, childrenPos, probability, isTerminal, - hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount, + mDicNodes.back().initAsChild(dicNode, ptNodePos, childrenPtNodeArrayPos, probability, + isTerminal, hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount, mergedNodeCodePoints); } diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h index 9e0f62ceb..c41a7243a 100644 --- a/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h +++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h @@ -24,15 +24,14 @@ namespace latinime { /** - * Node for traversing the lexicon trie. + * PtNode information related to the DicNode from the lexicon trie. */ -// TODO: Introduce a dictionary node class which has attribute members required to understand the -// dictionary structure. class DicNodeProperties { public: AK_FORCE_INLINE DicNodeProperties() - : mPos(0), mChildrenPos(0), mProbability(0), mNodeCodePoint(0), mIsTerminal(false), - mHasChildren(false), mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0) {} + : mPtNodePos(0), mChildrenPtNodeArrayPos(0), mProbability(0), mDicNodeCodePoint(0), + mIsTerminal(false), mHasChildrenPtNodes(false), mIsBlacklistedOrNotAWord(false), + mDepth(0), mLeavingDepth(0) {} virtual ~DicNodeProperties() {} @@ -40,57 +39,57 @@ class DicNodeProperties { void init(const int pos, const int childrenPos, const int nodeCodePoint, const int probability, const bool isTerminal, const bool hasChildren, const bool isBlacklistedOrNotAWord, const uint16_t depth, const uint16_t leavingDepth) { - mPos = pos; - mChildrenPos = childrenPos; - mNodeCodePoint = nodeCodePoint; + mPtNodePos = pos; + mChildrenPtNodeArrayPos = childrenPos; + mDicNodeCodePoint = nodeCodePoint; mProbability = probability; mIsTerminal = isTerminal; - mHasChildren = hasChildren; + mHasChildrenPtNodes = hasChildren; mIsBlacklistedOrNotAWord = isBlacklistedOrNotAWord; mDepth = depth; mLeavingDepth = leavingDepth; } // Init for copy - void init(const DicNodeProperties *const nodeProp) { - mPos = nodeProp->mPos; - mChildrenPos = nodeProp->mChildrenPos; - mNodeCodePoint = nodeProp->mNodeCodePoint; - mProbability = nodeProp->mProbability; - mIsTerminal = nodeProp->mIsTerminal; - mHasChildren = nodeProp->mHasChildren; - mIsBlacklistedOrNotAWord = nodeProp->mIsBlacklistedOrNotAWord; - mDepth = nodeProp->mDepth; - mLeavingDepth = nodeProp->mLeavingDepth; + void init(const DicNodeProperties *const dicNodeProp) { + mPtNodePos = dicNodeProp->mPtNodePos; + mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos; + mDicNodeCodePoint = dicNodeProp->mDicNodeCodePoint; + mProbability = dicNodeProp->mProbability; + mIsTerminal = dicNodeProp->mIsTerminal; + mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes; + mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord; + mDepth = dicNodeProp->mDepth; + mLeavingDepth = dicNodeProp->mLeavingDepth; } // Init as passing child - void init(const DicNodeProperties *const nodeProp, const int codePoint) { - mPos = nodeProp->mPos; - mChildrenPos = nodeProp->mChildrenPos; - mNodeCodePoint = codePoint; // Overwrite the node char of a passing child - mProbability = nodeProp->mProbability; - mIsTerminal = nodeProp->mIsTerminal; - mHasChildren = nodeProp->mHasChildren; - mIsBlacklistedOrNotAWord = nodeProp->mIsBlacklistedOrNotAWord; - mDepth = nodeProp->mDepth + 1; // Increment the depth of a passing child - mLeavingDepth = nodeProp->mLeavingDepth; + void init(const DicNodeProperties *const dicNodeProp, const int codePoint) { + mPtNodePos = dicNodeProp->mPtNodePos; + mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos; + mDicNodeCodePoint = codePoint; // Overwrite the node char of a passing child + mProbability = dicNodeProp->mProbability; + mIsTerminal = dicNodeProp->mIsTerminal; + mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes; + mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord; + mDepth = dicNodeProp->mDepth + 1; // Increment the depth of a passing child + mLeavingDepth = dicNodeProp->mLeavingDepth; } - int getPos() const { - return mPos; + int getPtNodePos() const { + return mPtNodePos; } - int getChildrenPos() const { - return mChildrenPos; + int getChildrenPtNodeArrayPos() const { + return mChildrenPtNodeArrayPos; } int getProbability() const { return mProbability; } - int getNodeCodePoint() const { - return mNodeCodePoint; + int getDicNodeCodePoint() const { + return mDicNodeCodePoint; } uint16_t getDepth() const { @@ -107,7 +106,7 @@ class DicNodeProperties { } bool hasChildren() const { - return mHasChildren || mDepth != mLeavingDepth; + return mHasChildrenPtNodes || mDepth != mLeavingDepth; } bool isBlacklistedOrNotAWord() const { @@ -118,12 +117,12 @@ class DicNodeProperties { // Caution!!! // Use a default copy constructor and an assign operator because shallow copies are ok // for this class - int mPos; - int mChildrenPos; + int mPtNodePos; + int mChildrenPtNodeArrayPos; int mProbability; - int mNodeCodePoint; + int mDicNodeCodePoint; bool mIsTerminal; - bool mHasChildren; + bool mHasChildrenPtNodes; bool mIsBlacklistedOrNotAWord; uint16_t mDepth; uint16_t mLeavingDepth; diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_output.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_output.h index 74eb5dfe7..fc6851099 100644 --- a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_output.h +++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_output.h @@ -17,7 +17,7 @@ #ifndef LATINIME_DIC_NODE_STATE_OUTPUT_H #define LATINIME_DIC_NODE_STATE_OUTPUT_H -#include <cstring> // for memcpy() +#include <cstring> // for memmove() #include <stdint.h> #include "defines.h" @@ -38,7 +38,7 @@ class DicNodeStateOutput { } void init(const DicNodeStateOutput *const stateOutput) { - memcpy(mCodePointsBuf, stateOutput->mCodePointsBuf, + memmove(mCodePointsBuf, stateOutput->mCodePointsBuf, stateOutput->mOutputtedCodePointCount * sizeof(mCodePointsBuf[0])); mOutputtedCodePointCount = stateOutput->mOutputtedCodePointCount; if (mOutputtedCodePointCount < MAX_WORD_LENGTH) { @@ -51,7 +51,7 @@ class DicNodeStateOutput { if (mergedNodeCodePoints) { const int additionalCodePointCount = min(static_cast<int>(mergedNodeCodePointCount), MAX_WORD_LENGTH - mOutputtedCodePointCount); - memcpy(&mCodePointsBuf[mOutputtedCodePointCount], mergedNodeCodePoints, + memmove(&mCodePointsBuf[mOutputtedCodePointCount], mergedNodeCodePoints, additionalCodePointCount * sizeof(mCodePointsBuf[0])); mOutputtedCodePointCount = static_cast<uint16_t>( mOutputtedCodePointCount + mergedNodeCodePointCount); diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_prevword.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_prevword.h index b8986203d..e7108d976 100644 --- a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_prevword.h +++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_prevword.h @@ -17,7 +17,7 @@ #ifndef LATINIME_DIC_NODE_STATE_PREVWORD_H #define LATINIME_DIC_NODE_STATE_PREVWORD_H -#include <cstring> // for memset() +#include <cstring> // for memset() and memmove() #include <stdint.h> #include "defines.h" @@ -30,7 +30,7 @@ class DicNodeStatePrevWord { public: AK_FORCE_INLINE DicNodeStatePrevWord() : mPrevWordCount(0), mPrevWordLength(0), mPrevWordStart(0), mPrevWordProbability(0), - mPrevWordNodePos(NOT_A_DICT_POS), mSecondWordFirstInputIndex(NOT_AN_INDEX) { + mPrevWordPtNodePos(NOT_A_DICT_POS), mSecondWordFirstInputIndex(NOT_AN_INDEX) { memset(mPrevWord, 0, sizeof(mPrevWord)); } @@ -41,7 +41,7 @@ class DicNodeStatePrevWord { mPrevWordCount = 0; mPrevWordStart = 0; mPrevWordProbability = -1; - mPrevWordNodePos = NOT_A_DICT_POS; + mPrevWordPtNodePos = NOT_A_DICT_POS; mSecondWordFirstInputIndex = NOT_AN_INDEX; } @@ -50,7 +50,7 @@ class DicNodeStatePrevWord { mPrevWordCount = 0; mPrevWordStart = 0; mPrevWordProbability = -1; - mPrevWordNodePos = prevWordNodePos; + mPrevWordPtNodePos = prevWordNodePos; mSecondWordFirstInputIndex = NOT_AN_INDEX; } @@ -60,9 +60,9 @@ class DicNodeStatePrevWord { mPrevWordCount = prevWord->mPrevWordCount; mPrevWordStart = prevWord->mPrevWordStart; mPrevWordProbability = prevWord->mPrevWordProbability; - mPrevWordNodePos = prevWord->mPrevWordNodePos; + mPrevWordPtNodePos = prevWord->mPrevWordPtNodePos; mSecondWordFirstInputIndex = prevWord->mSecondWordFirstInputIndex; - memcpy(mPrevWord, prevWord->mPrevWord, prevWord->mPrevWordLength * sizeof(mPrevWord[0])); + memmove(mPrevWord, prevWord->mPrevWord, prevWord->mPrevWordLength * sizeof(mPrevWord[0])); } void init(const int16_t prevWordCount, const int16_t prevWordProbability, @@ -71,7 +71,7 @@ class DicNodeStatePrevWord { const int prevWordSecondWordFirstInputIndex, const int lastInputIndex) { mPrevWordCount = min(prevWordCount, static_cast<int16_t>(MAX_RESULTS)); mPrevWordProbability = prevWordProbability; - mPrevWordNodePos = prevWordNodePos; + mPrevWordPtNodePos = prevWordNodePos; int twoWordsLen = DicNodeUtils::appendTwoWords(src0, length0, src1, length1, mPrevWord); if (twoWordsLen >= MAX_WORD_LENGTH) { @@ -116,8 +116,8 @@ class DicNodeStatePrevWord { return mPrevWordStart; } - int getPrevWordNodePos() const { - return mPrevWordNodePos; + int getPrevWordPtNodePos() const { + return mPrevWordPtNodePos; } int getPrevWordCodePointAt(const int id) const { @@ -147,7 +147,7 @@ class DicNodeStatePrevWord { int16_t mPrevWordLength; int16_t mPrevWordStart; int16_t mPrevWordProbability; - int mPrevWordNodePos; + int mPrevWordPtNodePos; int mSecondWordFirstInputIndex; }; } // namespace latinime diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h index 3c85d0e9d..11c201e52 100644 --- a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h +++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h @@ -21,6 +21,7 @@ #include "defines.h" #include "suggest/core/dictionary/digraph_utils.h" +#include "suggest/core/dictionary/error_type_utils.h" namespace latinime { @@ -31,7 +32,7 @@ class DicNodeStateScoring { mDigraphIndex(DigraphUtils::NOT_A_DIGRAPH_INDEX), mEditCorrectionCount(0), mProximityCorrectionCount(0), mNormalizedCompoundDistance(0.0f), mSpatialDistance(0.0f), mLanguageDistance(0.0f), - mRawLength(0.0f), mExactMatch(true), + mRawLength(0.0f), mContainedErrorTypes(ErrorTypeUtils::NOT_AN_ERROR), mNormalizedCompoundDistanceAfterFirstWord(MAX_VALUE_FOR_WEIGHTING) { } @@ -47,7 +48,7 @@ class DicNodeStateScoring { mDoubleLetterLevel = NOT_A_DOUBLE_LETTER; mDigraphIndex = DigraphUtils::NOT_A_DIGRAPH_INDEX; mNormalizedCompoundDistanceAfterFirstWord = MAX_VALUE_FOR_WEIGHTING; - mExactMatch = true; + mContainedErrorTypes = ErrorTypeUtils::NOT_AN_ERROR; } AK_FORCE_INLINE void init(const DicNodeStateScoring *const scoring) { @@ -59,34 +60,21 @@ class DicNodeStateScoring { mRawLength = scoring->mRawLength; mDoubleLetterLevel = scoring->mDoubleLetterLevel; mDigraphIndex = scoring->mDigraphIndex; - mExactMatch = scoring->mExactMatch; + mContainedErrorTypes = scoring->mContainedErrorTypes; mNormalizedCompoundDistanceAfterFirstWord = scoring->mNormalizedCompoundDistanceAfterFirstWord; } void addCost(const float spatialCost, const float languageCost, const bool doNormalization, - const int inputSize, const int totalInputIndex, const ErrorType errorType) { + const int inputSize, const int totalInputIndex, + const ErrorTypeUtils::ErrorType errorType) { addDistance(spatialCost, languageCost, doNormalization, inputSize, totalInputIndex); - switch (errorType) { - case ET_EDIT_CORRECTION: - ++mEditCorrectionCount; - mExactMatch = false; - break; - case ET_PROXIMITY_CORRECTION: - ++mProximityCorrectionCount; - mExactMatch = false; - break; - case ET_COMPLETION: - mExactMatch = false; - break; - case ET_NEW_WORD: - mExactMatch = false; - break; - case ET_INTENTIONAL_OMISSION: - mExactMatch = false; - break; - case ET_NOT_AN_ERROR: - break; + mContainedErrorTypes = mContainedErrorTypes | errorType; + if (ErrorTypeUtils::isEditCorrectionError(errorType)) { + ++mEditCorrectionCount; + } + if (ErrorTypeUtils::isProximityCorrectionError(errorType)) { + ++mProximityCorrectionCount; } } @@ -181,8 +169,8 @@ class DicNodeStateScoring { } } - bool isExactMatch() const { - return mExactMatch; + ErrorTypeUtils::ErrorType getContainedErrorTypes() const { + return mContainedErrorTypes; } private: @@ -199,7 +187,8 @@ class DicNodeStateScoring { float mSpatialDistance; float mLanguageDistance; float mRawLength; - bool mExactMatch; + // All accumulated error types so far + ErrorTypeUtils::ErrorType mContainedErrorTypes; float mNormalizedCompoundDistanceAfterFirstWord; AK_FORCE_INLINE void addDistance(float spatialDistance, float languageDistance, diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp index 71f4ef6ea..d0b96b0fe 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp @@ -41,6 +41,9 @@ BigramDictionary::~BigramDictionary() { void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability, int *bigramCodePoints, int *outputTypes) const { + if (length >= MAX_WORD_LENGTH) { + length = MAX_WORD_LENGTH - 1; + } word[length] = 0; if (DEBUG_DICT_FULL) { #ifdef FLAG_DBG @@ -66,14 +69,17 @@ void BigramDictionary::addWordBigram(int *word, int length, int probability, int if (insertAt >= MAX_RESULTS) { return; } - memmove(bigramProbability + (insertAt + 1), - bigramProbability + insertAt, + // Shift result buffers to insert the new entry. + memmove(bigramProbability + (insertAt + 1), bigramProbability + insertAt, (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0])); - bigramProbability[insertAt] = probability; - outputTypes[insertAt] = Dictionary::KIND_PREDICTION; + memmove(outputTypes + (insertAt + 1), outputTypes + insertAt, + (MAX_RESULTS - insertAt - 1) * sizeof(outputTypes[0])); memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH, bigramCodePoints + insertAt * MAX_WORD_LENGTH, (MAX_RESULTS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH); + // Put the result. + bigramProbability[insertAt] = probability; + outputTypes[insertAt] = Dictionary::KIND_PREDICTION; int *dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH; while (length--) { *dest++ = *word++; @@ -144,7 +150,7 @@ int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLeng int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const { if (0 >= prevWordLength) return NOT_A_DICT_POS; - int pos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(prevWord, prevWordLength, + int pos = mDictionaryStructurePolicy->getTerminalPtNodePositionOfWord(prevWord, prevWordLength, forceLowerCaseSearch); if (NOT_A_DICT_POS == pos) return NOT_A_DICT_POS; return mDictionaryStructurePolicy->getBigramsPositionOfPtNode(pos); @@ -155,7 +161,7 @@ int BigramDictionary::getBigramProbability(const int *word0, int length0, const int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams if (NOT_A_DICT_POS == pos) return NOT_A_PROBABILITY; - int nextWordPos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(word1, length1, + int nextWordPos = mDictionaryStructurePolicy->getTerminalPtNodePositionOfWord(word1, length1, false /* forceLowerCaseSearch */); if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY; @@ -163,7 +169,8 @@ int BigramDictionary::getBigramProbability(const int *word0, int length0, const mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos); while (bigramsIt.hasNext()) { bigramsIt.next(); - if (bigramsIt.getBigramPos() == nextWordPos) { + if (bigramsIt.getBigramPos() == nextWordPos + && bigramsIt.getProbability() != NOT_A_PROBABILITY) { return mDictionaryStructurePolicy->getProbability( mDictionaryStructurePolicy->getUnigramProbabilityOfPtNode(nextWordPos), bigramsIt.getProbability()); diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.h b/native/jni/src/suggest/core/dictionary/bloom_filter.h index 5205456a8..e22c3ae5c 100644 --- a/native/jni/src/suggest/core/dictionary/bloom_filter.h +++ b/native/jni/src/suggest/core/dictionary/bloom_filter.h @@ -17,6 +17,7 @@ #ifndef LATINIME_BLOOM_FILTER_H #define LATINIME_BLOOM_FILTER_H +#include <cstring> #include <stdint.h> #include "defines.h" @@ -35,6 +36,7 @@ class BloomFilter { public: BloomFilter() { ASSERT(BIGRAM_FILTER_BYTE_SIZE * 8 >= BIGRAM_FILTER_MODULO); + memset(mFilter, 0, sizeof(mFilter)); } // TODO: uint32_t position @@ -50,6 +52,8 @@ class BloomFilter { } private: + DISALLOW_ASSIGNMENT_OPERATOR(BloomFilter); + // Size, in bytes, of the bloom filter index for bigrams // 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k, // where k is the number of hash functions, n the number of bigrams, and m the number of diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index 59ead1894..035232f7a 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -21,74 +21,70 @@ #include <stdint.h> #include "defines.h" -#include "suggest/core/dictionary/bigram_dictionary.h" #include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/session/dic_traverse_session.h" #include "suggest/core/suggest.h" #include "suggest/core/suggest_options.h" #include "suggest/policyimpl/gesture/gesture_suggest_policy_factory.h" #include "suggest/policyimpl/typing/typing_suggest_policy_factory.h" #include "utils/log_utils.h" +#include "utils/time_keeper.h" namespace latinime { const int Dictionary::HEADER_ATTRIBUTE_BUFFER_SIZE = 32; -Dictionary::Dictionary(JNIEnv *env, - DictionaryStructureWithBufferPolicy *const dictionaryStructureWithBufferPolicy) +Dictionary::Dictionary(JNIEnv *env, const DictionaryStructureWithBufferPolicy::StructurePolicyPtr + &dictionaryStructureWithBufferPolicy) : mDictionaryStructureWithBufferPolicy(dictionaryStructureWithBufferPolicy), - mBigramDictionary(new BigramDictionary(mDictionaryStructureWithBufferPolicy)), + mBigramDictionary(new BigramDictionary(mDictionaryStructureWithBufferPolicy.get())), mGestureSuggest(new Suggest(GestureSuggestPolicyFactory::getGestureSuggestPolicy())), mTypingSuggest(new Suggest(TypingSuggestPolicyFactory::getTypingSuggestPolicy())) { logDictionaryInfo(env); } -Dictionary::~Dictionary() { - delete mBigramDictionary; - delete mGestureSuggest; - delete mTypingSuggest; - delete mDictionaryStructureWithBufferPolicy; -} - int Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession, int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints, int inputSize, int *prevWordCodePoints, int prevWordLength, int commitPoint, - const SuggestOptions *const suggestOptions, int *outWords, int *frequencies, + const SuggestOptions *const suggestOptions, int *outWords, int *outputScores, int *spaceIndices, int *outputTypes, int *outputAutoCommitFirstWordConfidence) const { + TimeKeeper::setCurrentTime(); int result = 0; if (suggestOptions->isGesture()) { DicTraverseSession::initSessionInstance( traverseSession, this, prevWordCodePoints, prevWordLength, suggestOptions); - result = mGestureSuggest->getSuggestions(proximityInfo, traverseSession, xcoordinates, + result = mGestureSuggest.get()->getSuggestions(proximityInfo, traverseSession, xcoordinates, ycoordinates, times, pointerIds, inputCodePoints, inputSize, commitPoint, outWords, - frequencies, spaceIndices, outputTypes, outputAutoCommitFirstWordConfidence); + outputScores, spaceIndices, outputTypes, outputAutoCommitFirstWordConfidence); if (DEBUG_DICT) { - DUMP_RESULT(outWords, frequencies); + DUMP_RESULT(outWords, outputScores); } return result; } else { DicTraverseSession::initSessionInstance( traverseSession, this, prevWordCodePoints, prevWordLength, suggestOptions); - result = mTypingSuggest->getSuggestions(proximityInfo, traverseSession, xcoordinates, + result = mTypingSuggest.get()->getSuggestions(proximityInfo, traverseSession, xcoordinates, ycoordinates, times, pointerIds, inputCodePoints, inputSize, commitPoint, - outWords, frequencies, spaceIndices, outputTypes, + outWords, outputScores, spaceIndices, outputTypes, outputAutoCommitFirstWordConfidence); if (DEBUG_DICT) { - DUMP_RESULT(outWords, frequencies); + DUMP_RESULT(outWords, outputScores); } return result; } } -int Dictionary::getBigrams(const int *word, int length, int *outWords, int *frequencies, +int Dictionary::getBigrams(const int *word, int length, int *outWords, int *outputScores, int *outputTypes) const { + TimeKeeper::setCurrentTime(); if (length <= 0) return 0; - return mBigramDictionary->getPredictions(word, length, outWords, frequencies, outputTypes); + return mBigramDictionary.get()->getPredictions(word, length, outWords, outputScores, + outputTypes); } int Dictionary::getProbability(const int *word, int length) const { - int pos = getDictionaryStructurePolicy()->getTerminalNodePositionOfWord(word, length, + TimeKeeper::setCurrentTime(); + int pos = getDictionaryStructurePolicy()->getTerminalPtNodePositionOfWord(word, length, false /* forceLowerCaseSearch */); if (NOT_A_DICT_POS == pos) { return NOT_A_PROBABILITY; @@ -98,39 +94,66 @@ int Dictionary::getProbability(const int *word, int length) const { int Dictionary::getBigramProbability(const int *word0, int length0, const int *word1, int length1) const { - return mBigramDictionary->getBigramProbability(word0, length0, word1, length1); + TimeKeeper::setCurrentTime(); + return mBigramDictionary.get()->getBigramProbability(word0, length0, word1, length1); } -void Dictionary::addUnigramWord(const int *const word, const int length, const int probability) { - mDictionaryStructureWithBufferPolicy->addUnigramWord(word, length, probability); +void Dictionary::addUnigramWord(const int *const word, const int length, const int probability, + const int *const shortcutTargetCodePoints, const int shortcutLength, + const int shortcutProbability, const bool isNotAWord, const bool isBlacklisted, + const int timestamp) { + TimeKeeper::setCurrentTime(); + mDictionaryStructureWithBufferPolicy.get()->addUnigramWord(word, length, probability, + shortcutTargetCodePoints, shortcutLength, shortcutProbability, isNotAWord, + isBlacklisted, timestamp); } void Dictionary::addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability) { - mDictionaryStructureWithBufferPolicy->addBigramWords(word0, length0, word1, length1, - probability); + const int length1, const int probability, const int timestamp) { + TimeKeeper::setCurrentTime(); + mDictionaryStructureWithBufferPolicy.get()->addBigramWords(word0, length0, word1, length1, + probability, timestamp); } void Dictionary::removeBigramWords(const int *const word0, const int length0, const int *const word1, const int length1) { - mDictionaryStructureWithBufferPolicy->removeBigramWords(word0, length0, word1, length1); + TimeKeeper::setCurrentTime(); + mDictionaryStructureWithBufferPolicy.get()->removeBigramWords(word0, length0, word1, length1); } void Dictionary::flush(const char *const filePath) { - mDictionaryStructureWithBufferPolicy->flush(filePath); + TimeKeeper::setCurrentTime(); + mDictionaryStructureWithBufferPolicy.get()->flush(filePath); } void Dictionary::flushWithGC(const char *const filePath) { - mDictionaryStructureWithBufferPolicy->flushWithGC(filePath); + TimeKeeper::setCurrentTime(); + mDictionaryStructureWithBufferPolicy.get()->flushWithGC(filePath); } bool Dictionary::needsToRunGC(const bool mindsBlockByGC) { - return mDictionaryStructureWithBufferPolicy->needsToRunGC(mindsBlockByGC); + TimeKeeper::setCurrentTime(); + return mDictionaryStructureWithBufferPolicy.get()->needsToRunGC(mindsBlockByGC); } -void Dictionary::getProperty(const char *const query, char *const outResult, +void Dictionary::getProperty(const char *const query, const int queryLength, char *const outResult, const int maxResultLength) { - return mDictionaryStructureWithBufferPolicy->getProperty(query, outResult, maxResultLength); + TimeKeeper::setCurrentTime(); + return mDictionaryStructureWithBufferPolicy.get()->getProperty(query, queryLength, outResult, + maxResultLength); +} + +const WordProperty Dictionary::getWordProperty(const int *const codePoints, + const int codePointCount) { + TimeKeeper::setCurrentTime(); + return mDictionaryStructureWithBufferPolicy.get()->getWordProperty( + codePoints, codePointCount); +} + +int Dictionary::getNextWordAndNextToken(const int token, int *const outCodePoints) { + TimeKeeper::setCurrentTime(); + return mDictionaryStructureWithBufferPolicy.get()->getNextWordAndNextToken( + token, outCodePoints); } void Dictionary::logDictionaryInfo(JNIEnv *const env) const { diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index 0195d5bf0..c58be8475 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -21,15 +21,20 @@ #include "defines.h" #include "jni.h" +#include "suggest/core/dictionary/bigram_dictionary.h" +#include "suggest/core/dictionary/word_property.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "suggest/core/suggest_interface.h" +#include "utils/exclusive_ownership_pointer.h" namespace latinime { -class BigramDictionary; class DictionaryStructureWithBufferPolicy; class DicTraverseSession; class ProximityInfo; -class SuggestInterface; class SuggestOptions; +class WordProperty; class Dictionary { public: @@ -53,26 +58,29 @@ class Dictionary { static const int KIND_FLAG_POSSIBLY_OFFENSIVE = 0x80000000; static const int KIND_FLAG_EXACT_MATCH = 0x40000000; - Dictionary(JNIEnv *env, - DictionaryStructureWithBufferPolicy *const dictionaryStructureWithBufferPoilcy); + Dictionary(JNIEnv *env, const DictionaryStructureWithBufferPolicy::StructurePolicyPtr + &dictionaryStructureWithBufferPolicy); int getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession, int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints, int inputSize, int *prevWordCodePoints, int prevWordLength, int commitPoint, - const SuggestOptions *const suggestOptions, int *outWords, int *frequencies, + const SuggestOptions *const suggestOptions, int *outWords, int *outputScores, int *spaceIndices, int *outputTypes, int *outputAutoCommitFirstWordConfidence) const; - int getBigrams(const int *word, int length, int *outWords, int *frequencies, + int getBigrams(const int *word, int length, int *outWords, int *outputScores, int *outputTypes) const; int getProbability(const int *word, int length) const; int getBigramProbability(const int *word0, int length0, const int *word1, int length1) const; - void addUnigramWord(const int *const word, const int length, const int probability); + void addUnigramWord(const int *const word, const int length, const int probability, + const int *const shortcutTargetCodePoints, const int shortcutLength, + const int shortcutProbability, const bool isNotAWord, const bool isBlacklisted, + const int timestamp); void addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability); + const int length1, const int probability, const int timestamp); void removeBigramWords(const int *const word0, const int length0, const int *const word1, const int length1); @@ -83,24 +91,33 @@ class Dictionary { bool needsToRunGC(const bool mindsBlockByGC); - void getProperty(const char *const query, char *const outResult, + void getProperty(const char *const query, const int queryLength, char *const outResult, const int maxResultLength); + const WordProperty getWordProperty(const int *const codePoints, const int codePointCount); + + // Method to iterate all words in the dictionary. + // The returned token has to be used to get the next word. If token is 0, this method newly + // starts iterating the dictionary. + int getNextWordAndNextToken(const int token, int *const outCodePoints); + const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const { - return mDictionaryStructureWithBufferPolicy; + return mDictionaryStructureWithBufferPolicy.get(); } - virtual ~Dictionary(); - private: DISALLOW_IMPLICIT_CONSTRUCTORS(Dictionary); + typedef ExclusiveOwnershipPointer<BigramDictionary> BigramDictionaryPtr; + typedef ExclusiveOwnershipPointer<SuggestInterface> SuggestInterfacePtr; + static const int HEADER_ATTRIBUTE_BUFFER_SIZE; - DictionaryStructureWithBufferPolicy *const mDictionaryStructureWithBufferPolicy; - const BigramDictionary *const mBigramDictionary; - const SuggestInterface *const mGestureSuggest; - const SuggestInterface *const mTypingSuggest; + const DictionaryStructureWithBufferPolicy::StructurePolicyPtr + mDictionaryStructureWithBufferPolicy; + const BigramDictionaryPtr mBigramDictionary; + const SuggestInterfacePtr mGestureSuggest; + const SuggestInterfacePtr mTypingSuggest; void logDictionaryInfo(JNIEnv *const env) const; }; diff --git a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp index 3271c1bfb..5f9b8f3e2 100644 --- a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp @@ -28,11 +28,8 @@ const DigraphUtils::digraph_t DigraphUtils::GERMAN_UMLAUT_DIGRAPHS[] = { { 'a', 'e', 0x00E4 }, // U+00E4 : LATIN SMALL LETTER A WITH DIAERESIS { 'o', 'e', 0x00F6 }, // U+00F6 : LATIN SMALL LETTER O WITH DIAERESIS { 'u', 'e', 0x00FC } }; // U+00FC : LATIN SMALL LETTER U WITH DIAERESIS -const DigraphUtils::digraph_t DigraphUtils::FRENCH_LIGATURES_DIGRAPHS[] = - { { 'a', 'e', 0x00E6 }, // U+00E6 : LATIN SMALL LETTER AE - { 'o', 'e', 0x0153 } }; // U+0153 : LATIN SMALL LIGATURE OE const DigraphUtils::DigraphType DigraphUtils::USED_DIGRAPH_TYPES[] = - { DIGRAPH_TYPE_GERMAN_UMLAUT, DIGRAPH_TYPE_FRENCH_LIGATURES }; + { DIGRAPH_TYPE_GERMAN_UMLAUT }; /* static */ bool DigraphUtils::hasDigraphForCodePoint( const DictionaryHeaderStructurePolicy *const headerPolicy, @@ -50,9 +47,6 @@ const DigraphUtils::DigraphType DigraphUtils::USED_DIGRAPH_TYPES[] = if (headerPolicy->requiresGermanUmlautProcessing()) { return DIGRAPH_TYPE_GERMAN_UMLAUT; } - if (headerPolicy->requiresFrenchLigatureProcessing()) { - return DIGRAPH_TYPE_FRENCH_LIGATURES; - } return DIGRAPH_TYPE_NONE; } @@ -86,10 +80,6 @@ const DigraphUtils::DigraphType DigraphUtils::USED_DIGRAPH_TYPES[] = *digraphs = GERMAN_UMLAUT_DIGRAPHS; return NELEMS(GERMAN_UMLAUT_DIGRAPHS); } - if (digraphType == DIGRAPH_TYPE_FRENCH_LIGATURES) { - *digraphs = FRENCH_LIGATURES_DIGRAPHS; - return NELEMS(FRENCH_LIGATURES_DIGRAPHS); - } return 0; } diff --git a/native/jni/src/suggest/core/dictionary/digraph_utils.h b/native/jni/src/suggest/core/dictionary/digraph_utils.h index 6ae16e390..bec2cd6e2 100644 --- a/native/jni/src/suggest/core/dictionary/digraph_utils.h +++ b/native/jni/src/suggest/core/dictionary/digraph_utils.h @@ -34,7 +34,6 @@ class DigraphUtils { typedef enum { DIGRAPH_TYPE_NONE, DIGRAPH_TYPE_GERMAN_UMLAUT, - DIGRAPH_TYPE_FRENCH_LIGATURES } DigraphType; typedef struct { int first; int second; int compositeGlyph; } digraph_t; @@ -55,7 +54,6 @@ class DigraphUtils { const DigraphType digraphType, const int compositeGlyphCodePoint); static const digraph_t GERMAN_UMLAUT_DIGRAPHS[]; - static const digraph_t FRENCH_LIGATURES_DIGRAPHS[]; static const DigraphType USED_DIGRAPH_TYPES[]; }; } // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp new file mode 100644 index 000000000..0635fef7e --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/error_type_utils.h" + +namespace latinime { + +const ErrorTypeUtils::ErrorType ErrorTypeUtils::NOT_AN_ERROR = 0x0; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_CASE_ERROR = 0x1; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR = 0x2; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_DIGRAPH = 0x4; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::INTENTIONAL_OMISSION = 0x8; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::EDIT_CORRECTION = 0x10; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::PROXIMITY_CORRECTION = 0x20; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::COMPLETION = 0x40; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x80; + +const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH = + NOT_AN_ERROR | MATCH_WITH_CASE_ERROR | MATCH_WITH_ACCENT_ERROR | MATCH_WITH_DIGRAPH; + +} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.h b/native/jni/src/suggest/core/dictionary/error_type_utils.h new file mode 100644 index 000000000..1122291a6 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_ERROR_TYPE_UTILS_H +#define LATINIME_ERROR_TYPE_UTILS_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class ErrorTypeUtils { + public: + // ErrorType is mainly decided by CorrectionType but it is also depending on if + // the correction has really been performed or not. + typedef uint32_t ErrorType; + + static const ErrorType NOT_AN_ERROR; + static const ErrorType MATCH_WITH_CASE_ERROR; + static const ErrorType MATCH_WITH_ACCENT_ERROR; + static const ErrorType MATCH_WITH_DIGRAPH; + // Treat error as an intentional omission when the CorrectionType is omission and the node can + // be intentional omission. + static const ErrorType INTENTIONAL_OMISSION; + // Substitution, omission and transposition + static const ErrorType EDIT_CORRECTION; + // Proximity error + static const ErrorType PROXIMITY_CORRECTION; + // Completion + static const ErrorType COMPLETION; + // New word + // TODO: Remove. + // A new word error should be an edit correction error or a proximity correction error. + static const ErrorType NEW_WORD; + + static bool isExactMatch(const ErrorType containedErrorTypes) { + return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0; + } + + static bool isEditCorrectionError(const ErrorType errorType) { + return (errorType & EDIT_CORRECTION) != 0; + } + + static bool isProximityCorrectionError(const ErrorType errorType) { + return (errorType & PROXIMITY_CORRECTION) != 0; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils); + + static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH; +}; +} // namespace latinime +#endif // LATINIME_ERROR_TYPE_UTILS_H diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp b/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp index b1d2f4b4d..49d82e69a 100644 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp +++ b/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp @@ -30,4 +30,75 @@ const size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25; // Most common previous word contexts currently have 100 bigrams const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100; +// Look up the bigram probability for the given word pair from the cached bigram maps. +// Also caches the bigrams if there is space remaining and they have not been cached already. +int MultiBigramMap::getBigramProbability( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const int wordPosition, const int nextWordPosition, const int unigramProbability) { + hash_map_compat<int, BigramMap>::const_iterator mapPosition = + mBigramMaps.find(wordPosition); + if (mapPosition != mBigramMaps.end()) { + return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition, + unigramProbability); + } + if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { + addBigramsForWordPosition(structurePolicy, wordPosition); + return mBigramMaps[wordPosition].getBigramProbability(structurePolicy, + nextWordPosition, unigramProbability); + } + return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition, + nextWordPosition, unigramProbability); +} + +void MultiBigramMap::BigramMap::init( + const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos) { + const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos); + BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(), + bigramsListPos); + while (bigramsIt.hasNext()) { + bigramsIt.next(); + if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) { + continue; + } + mBigramMap[bigramsIt.getBigramPos()] = bigramsIt.getProbability(); + mBloomFilter.setInFilter(bigramsIt.getBigramPos()); + } +} + +int MultiBigramMap::BigramMap::getBigramProbability( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const int nextWordPosition, const int unigramProbability) const { + int bigramProbability = NOT_A_PROBABILITY; + if (mBloomFilter.isInFilter(nextWordPosition)) { + const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = + mBigramMap.find(nextWordPosition); + if (bigramProbabilityIt != mBigramMap.end()) { + bigramProbability = bigramProbabilityIt->second; + } + } + return structurePolicy->getProbability(unigramProbability, bigramProbability); +} + +void MultiBigramMap::addBigramsForWordPosition( + const DictionaryStructureWithBufferPolicy *const structurePolicy, const int position) { + mBigramMaps[position].init(structurePolicy, position); +} + +int MultiBigramMap::readBigramProbabilityFromBinaryDictionary( + const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos, + const int nextWordPosition, const int unigramProbability) { + int bigramProbability = NOT_A_PROBABILITY; + const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos); + BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(), + bigramsListPos); + while (bigramsIt.hasNext()) { + bigramsIt.next(); + if (bigramsIt.getBigramPos() == nextWordPosition) { + bigramProbability = bigramsIt.getProbability(); + break; + } + } + return structurePolicy->getProbability(unigramProbability, bigramProbability); +} + } // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h index 4633c07b0..421b2681c 100644 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h +++ b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h @@ -38,21 +38,7 @@ class MultiBigramMap { // Look up the bigram probability for the given word pair from the cached bigram maps. // Also caches the bigrams if there is space remaining and they have not been cached already. int getBigramProbability(const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int wordPosition, const int nextWordPosition, const int unigramProbability) { - hash_map_compat<int, BigramMap>::const_iterator mapPosition = - mBigramMaps.find(wordPosition); - if (mapPosition != mBigramMaps.end()) { - return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition, - unigramProbability); - } - if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { - addBigramsForWordPosition(structurePolicy, wordPosition); - return mBigramMaps[wordPosition].getBigramProbability(structurePolicy, - nextWordPosition, unigramProbability); - } - return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition, - nextWordPosition, unigramProbability); - } + const int wordPosition, const int nextWordPosition, const int unigramProbability); void clear() { mBigramMaps.clear(); @@ -67,33 +53,11 @@ class MultiBigramMap { ~BigramMap() {} void init(const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int nodePos) { - const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos); - BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(), - bigramsListPos); - while (bigramsIt.hasNext()) { - bigramsIt.next(); - if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) { - continue; - } - mBigramMap[bigramsIt.getBigramPos()] = bigramsIt.getProbability(); - mBloomFilter.setInFilter(bigramsIt.getBigramPos()); - } - } + const int nodePos); - AK_FORCE_INLINE int getBigramProbability( + int getBigramProbability( const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int nextWordPosition, const int unigramProbability) const { - int bigramProbability = NOT_A_PROBABILITY; - if (mBloomFilter.isInFilter(nextWordPosition)) { - const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = - mBigramMap.find(nextWordPosition); - if (bigramProbabilityIt != mBigramMap.end()) { - bigramProbability = bigramProbabilityIt->second; - } - } - return structurePolicy->getProbability(unigramProbability, bigramProbability); - } + const int nextWordPosition, const int unigramProbability) const; private: // NOTE: The BigramMap class doesn't use DISALLOW_COPY_AND_ASSIGN() because its default @@ -103,27 +67,12 @@ class MultiBigramMap { BloomFilter mBloomFilter; }; - AK_FORCE_INLINE void addBigramsForWordPosition( - const DictionaryStructureWithBufferPolicy *const structurePolicy, const int position) { - mBigramMaps[position].init(structurePolicy, position); - } + void addBigramsForWordPosition( + const DictionaryStructureWithBufferPolicy *const structurePolicy, const int position); - AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary( + int readBigramProbabilityFromBinaryDictionary( const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos, - const int nextWordPosition, const int unigramProbability) { - int bigramProbability = NOT_A_PROBABILITY; - const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos); - BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(), - bigramsListPos); - while (bigramsIt.hasNext()) { - bigramsIt.next(); - if (bigramsIt.getBigramPos() == nextWordPosition) { - bigramProbability = bigramsIt.getProbability(); - break; - } - } - return structurePolicy->getProbability(unigramProbability, bigramProbability); - } + const int nextWordPosition, const int unigramProbability); static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP; hash_map_compat<int, BigramMap> mBigramMaps; diff --git a/native/jni/src/suggest/core/dictionary/shortcut_utils.h b/native/jni/src/suggest/core/dictionary/shortcut_utils.h deleted file mode 100644 index 9ccef020f..000000000 --- a/native/jni/src/suggest/core/dictionary/shortcut_utils.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (C) 2012 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_SHORTCUT_UTILS -#define LATINIME_SHORTCUT_UTILS - -#include "defines.h" -#include "suggest/core/dicnode/dic_node_utils.h" -#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" - -namespace latinime { - -class ShortcutUtils { - public: - static int outputShortcuts(BinaryDictionaryShortcutIterator *const shortcutIt, - int outputWordIndex, const int finalScore, int *const outputCodePoints, - int *const frequencies, int *const outputTypes, const bool sameAsTyped) { - int shortcutTarget[MAX_WORD_LENGTH]; - while (shortcutIt->hasNextShortcutTarget() && outputWordIndex < MAX_RESULTS) { - bool isWhilelist; - int shortcutTargetStringLength; - shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget, - &shortcutTargetStringLength, &isWhilelist); - int shortcutScore; - int kind; - if (isWhilelist && sameAsTyped) { - shortcutScore = S_INT_MAX; - kind = Dictionary::KIND_WHITELIST; - } else { - // shortcut entry's score == its base entry's score - 1 - shortcutScore = finalScore; - // Protection against int underflow - shortcutScore = max(S_INT_MIN + 1, shortcutScore) - 1; - kind = Dictionary::KIND_SHORTCUT; - } - outputTypes[outputWordIndex] = kind; - frequencies[outputWordIndex] = shortcutScore; - frequencies[outputWordIndex] = max(S_INT_MIN + 1, shortcutScore) - 1; - const int startIndex2 = outputWordIndex * MAX_WORD_LENGTH; - DicNodeUtils::appendTwoWords(0, 0, shortcutTarget, shortcutTargetStringLength, - &outputCodePoints[startIndex2]); - ++outputWordIndex; - } - return outputWordIndex; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutUtils); -}; -} // namespace latinime -#endif // LATINIME_SHORTCUT_UTILS diff --git a/native/jni/src/suggest/core/dictionary/suggestions_output_utils.cpp b/native/jni/src/suggest/core/dictionary/suggestions_output_utils.cpp new file mode 100644 index 000000000..07c2e6ea9 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/suggestions_output_utils.cpp @@ -0,0 +1,261 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/suggestions_output_utils.h" + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_utils.h" +#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" +#include "suggest/core/dictionary/dictionary.h" +#include "suggest/core/dictionary/error_type_utils.h" +#include "suggest/core/policy/scoring.h" +#include "suggest/core/session/dic_traverse_session.h" + +namespace latinime { + +const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; + +// TODO: Split this method. +/* static */ int SuggestionsOutputUtils::outputSuggestions( + const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, + int *outputScores, int *outputCodePoints, int *outputIndicesToPartialCommit, + int *outputTypes, int *outputAutoCommitFirstWordConfidence) { +#if DEBUG_EVALUATE_MOST_PROBABLE_STRING + const int terminalSize = 0; +#else + const int terminalSize = min(MAX_RESULTS, + static_cast<int>(traverseSession->getDicTraverseCache()->terminalSize())); +#endif + DicNode terminals[MAX_RESULTS]; // Avoiding non-POD variable length array + + for (int index = terminalSize - 1; index >= 0; --index) { + traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]); + } + + const float languageWeight = scoringPolicy->getAdjustedLanguageWeight( + traverseSession, terminals, terminalSize); + + int outputWordIndex = 0; + // Insert most probable word at index == 0 as long as there is one terminal at least + const bool hasMostProbableString = + scoringPolicy->getMostProbableString(traverseSession, terminalSize, languageWeight, + &outputCodePoints[0], &outputTypes[0], &outputScores[0]); + if (hasMostProbableString) { + outputIndicesToPartialCommit[outputWordIndex] = NOT_AN_INDEX; + ++outputWordIndex; + } + + int maxScore = S_INT_MIN; + // Force autocorrection for obvious long multi-word suggestions when the top suggestion is + // a long multiple words suggestion. + // TODO: Implement a smarter auto-commit method for handling multi-word suggestions. + // traverseSession->isPartiallyCommited() always returns false because we never auto partial + // commit for now. + const bool forceCommitMultiWords = (terminalSize > 0) ? + scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop() + && (traverseSession->isPartiallyCommited() + || (traverseSession->getInputSize() + >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT + && terminals[0].hasMultipleWords())) : false; + // TODO: have partial commit work even with multiple pointers. + const bool outputSecondWordFirstLetterInputIndex = + traverseSession->isOnlyOnePointerUsed(0 /* pointerId */); + if (terminalSize > 0) { + // If we have no suggestions, don't write this + outputAutoCommitFirstWordConfidence[0] = + computeFirstWordConfidence(&terminals[0]); + } + const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()-> + getHeaderStructurePolicy()->shouldBoostExactMatches(); + // Output suggestion results here + for (int terminalIndex = 0; terminalIndex < terminalSize && outputWordIndex < MAX_RESULTS; + ++terminalIndex) { + DicNode *terminalDicNode = &terminals[terminalIndex]; + if (DEBUG_GEO_FULL) { + terminalDicNode->dump("OUT:"); + } + const float doubleLetterCost = + scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode); + const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight) + + doubleLetterCost; + const bool isPossiblyOffensiveWord = + traverseSession->getDictionaryStructurePolicy()->getProbability( + terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0; + const bool isExactMatch = + ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); + const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); + // Heuristic: We exclude probability=0 first-char-uppercase words from exact match. + // (e.g. "AMD" and "and") + const bool isSafeExactMatch = isExactMatch + && !(isPossiblyOffensiveWord && isFirstCharUppercase); + const int outputTypeFlags = + (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) + | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0); + + // Entries that are blacklisted or do not represent a word should not be output. + const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord(); + + // Increase output score of top typing suggestion to ensure autocorrection. + // TODO: Better integration with java side autocorrection logic. + const int finalScore = scoringPolicy->calculateFinalScore( + compoundDistance, traverseSession->getInputSize(), + terminalDicNode->getContainedErrorTypes(), + (forceCommitMultiWords && terminalDicNode->hasMultipleWords()) + || (isValidWord && scoringPolicy->doesAutoCorrectValidWord()), + boostExactMatches); + if (maxScore < finalScore && isValidWord) { + maxScore = finalScore; + } + + // Don't output invalid words. However, we still need to submit their shortcuts if any. + if (isValidWord) { + outputTypes[outputWordIndex] = Dictionary::KIND_CORRECTION | outputTypeFlags; + outputScores[outputWordIndex] = finalScore; + if (outputSecondWordFirstLetterInputIndex) { + outputIndicesToPartialCommit[outputWordIndex] = + terminalDicNode->getSecondWordFirstInputIndex( + traverseSession->getProximityInfoState(0)); + } else { + outputIndicesToPartialCommit[outputWordIndex] = NOT_AN_INDEX; + } + // Populate the outputChars array with the suggested word. + const int startIndex = outputWordIndex * MAX_WORD_LENGTH; + terminalDicNode->outputResult(&outputCodePoints[startIndex]); + ++outputWordIndex; + } + + if (!terminalDicNode->hasMultipleWords()) { + BinaryDictionaryShortcutIterator shortcutIt( + traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(), + traverseSession->getDictionaryStructurePolicy() + ->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos())); + // Shortcut is not supported for multiple words suggestions. + // TODO: Check shortcuts during traversal for multiple words suggestions. + const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode); + const int shortcutBaseScore = scoringPolicy->doesAutoCorrectValidWord() ? + scoringPolicy->calculateFinalScore(compoundDistance, + traverseSession->getInputSize(), + terminalDicNode->getContainedErrorTypes(), + true /* forceCommit */, boostExactMatches) : finalScore; + const int updatedOutputWordIndex = outputShortcuts(&shortcutIt, + outputWordIndex, shortcutBaseScore, outputCodePoints, outputScores, outputTypes, + sameAsTyped); + const int secondWordFirstInputIndex = terminalDicNode->getSecondWordFirstInputIndex( + traverseSession->getProximityInfoState(0)); + for (int i = outputWordIndex; i < updatedOutputWordIndex; ++i) { + if (outputSecondWordFirstLetterInputIndex) { + outputIndicesToPartialCommit[i] = secondWordFirstInputIndex; + } else { + outputIndicesToPartialCommit[i] = NOT_AN_INDEX; + } + } + outputWordIndex = updatedOutputWordIndex; + } + DicNode::managedDelete(terminalDicNode); + } + + if (hasMostProbableString) { + scoringPolicy->safetyNetForMostProbableString(outputWordIndex, maxScore, + &outputCodePoints[0], outputScores); + } + return outputWordIndex; +} + +/* static */ int SuggestionsOutputUtils::computeFirstWordConfidence( + const DicNode *const terminalDicNode) { + // Get the number of spaces in the first suggestion + const int spaceCount = terminalDicNode->getTotalNodeSpaceCount(); + // Get the number of characters in the first suggestion + const int length = terminalDicNode->getTotalNodeCodePointCount(); + // Get the distance for the first word of the suggestion + const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord(); + + // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000. + // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or + // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means + // we are very confident. + // Expected space count is 1 ~ 5 + static const int MIN_EXPECTED_SPACE_COUNT = 1; + static const int MAX_EXPECTED_SPACE_COUNT = 5; + // Expected length is about 4 ~ 30 + static const int MIN_EXPECTED_LENGTH = 4; + static const int MAX_EXPECTED_LENGTH = 30; + // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0 + static const float MIN_EXPECTED_DISTANCE = 0.0; + static const float MAX_EXPECTED_DISTANCE = 2.0; + // This is not strict: it's where most stuff will be falling, but it's still fine if it's + // outside these values. We want to output a value that reflects all of these. Each factor + // contributes a bit. + + // We need at least a space. + if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE; + + // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0 + // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the + // weight of the distance. Clamp to avoid overflows. + const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE + : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance; + const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT + * (MAX_EXPECTED_DISTANCE - clampedDistance) + / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE); + // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no + // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the + // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp. + const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT + * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH); + // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no + // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the + // weight of the space count. + const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT + * (spaceCount - MIN_EXPECTED_SPACE_COUNT) + / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT); + + return distanceContribution + lengthContribution + spaceContribution; +} + +/* static */ int SuggestionsOutputUtils::outputShortcuts( + BinaryDictionaryShortcutIterator *const shortcutIt, + int outputWordIndex, const int finalScore, int *const outputCodePoints, + int *const outputScores, int *const outputTypes, const bool sameAsTyped) { + int shortcutTarget[MAX_WORD_LENGTH]; + while (shortcutIt->hasNextShortcutTarget() && outputWordIndex < MAX_RESULTS) { + bool isWhilelist; + int shortcutTargetStringLength; + shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget, + &shortcutTargetStringLength, &isWhilelist); + int shortcutScore; + int kind; + if (isWhilelist && sameAsTyped) { + shortcutScore = S_INT_MAX; + kind = Dictionary::KIND_WHITELIST; + } else { + // shortcut entry's score == its base entry's score - 1 + shortcutScore = finalScore; + // Protection against int underflow + shortcutScore = max(S_INT_MIN + 1, shortcutScore) - 1; + kind = Dictionary::KIND_SHORTCUT; + } + outputTypes[outputWordIndex] = kind; + outputScores[outputWordIndex] = shortcutScore; + outputScores[outputWordIndex] = max(S_INT_MIN + 1, shortcutScore) - 1; + const int startIndex2 = outputWordIndex * MAX_WORD_LENGTH; + DicNodeUtils::appendTwoWords(0, 0, shortcutTarget, shortcutTargetStringLength, + &outputCodePoints[startIndex2]); + ++outputWordIndex; + } + return outputWordIndex; +} +} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/suggestions_output_utils.h b/native/jni/src/suggest/core/dictionary/suggestions_output_utils.h new file mode 100644 index 000000000..d456a545f --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/suggestions_output_utils.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SUGGESTIONS_OUTPUT_UTILS +#define LATINIME_SUGGESTIONS_OUTPUT_UTILS + +#include "defines.h" + +namespace latinime { + +class BinaryDictionaryShortcutIterator; +class DicNode; +class DicTraverseSession; +class Scoring; + +class SuggestionsOutputUtils { + public: + /** + * Outputs the final list of suggestions (i.e., terminal nodes). + */ + static int outputSuggestions(const Scoring *const scoringPolicy, + DicTraverseSession *traverseSession, int *outputScores, int *outputCodePoints, + int *outputIndicesToPartialCommit, int *outputTypes, + int *outputAutoCommitFirstWordConfidence); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(SuggestionsOutputUtils); + + // Inputs longer than this will autocorrect if the suggestion is multi-word + static const int MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT; + + static int computeFirstWordConfidence(const DicNode *const terminalDicNode); + + static int outputShortcuts(BinaryDictionaryShortcutIterator *const shortcutIt, + int outputWordIndex, const int finalScore, int *const outputCodePoints, + int *const outputScores, int *const outputTypes, const bool sameAsTyped); +}; +} // namespace latinime +#endif // LATINIME_SUGGESTIONS_OUTPUT_UTILS diff --git a/native/jni/src/suggest/core/dictionary/word_property.cpp b/native/jni/src/suggest/core/dictionary/word_property.cpp new file mode 100644 index 000000000..473311842 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/word_property.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/word_property.h" + +namespace latinime { + +void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, + jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets, + jobject outBigramProbabilities, jobject outShortcutTargets, + jobject outShortcutProbabilities) const { + env->SetIntArrayRegion(outCodePoints, 0 /* start */, mCodePoints.size(), &mCodePoints[0]); + jboolean flags[] = {mIsNotAWord, mIsBlacklisted, mHasBigrams, mHasShortcuts}; + env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags); + int probabilityInfo[] = {mProbability, mTimestamp, mLevel, mCount}; + env->SetIntArrayRegion(outProbabilityInfo, 0 /* start */, NELEMS(probabilityInfo), + probabilityInfo); + + jclass integerClass = env->FindClass("java/lang/Integer"); + jmethodID intToIntegerConstructorId = env->GetMethodID(integerClass, "<init>", "(I)V"); + jclass arrayListClass = env->FindClass("java/util/ArrayList"); + jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z"); + + // Output bigrams. + const int bigramCount = mBigrams.size(); + for (int i = 0; i < bigramCount; ++i) { + const BigramProperty *const bigramProperty = &mBigrams[i]; + const std::vector<int> *const word1CodePoints = bigramProperty->getTargetCodePoints(); + jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size()); + env->SetIntArrayRegion(bigramWord1CodePointArray, 0 /* start */, + word1CodePoints->size(), &word1CodePoints->at(0)); + env->CallBooleanMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray); + env->DeleteLocalRef(bigramWord1CodePointArray); + + int bigramProbabilityInfo[] = {bigramProperty->getProbability(), + bigramProperty->getTimestamp(), bigramProperty->getLevel(), + bigramProperty->getCount()}; + jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo)); + env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */, + NELEMS(bigramProbabilityInfo), bigramProbabilityInfo); + env->CallBooleanMethod(outBigramProbabilities, addMethodId, bigramProbabilityInfoArray); + env->DeleteLocalRef(bigramProbabilityInfoArray); + } + + // Output shortcuts. + const int shortcutTargetCount = mShortcuts.size(); + for (int i = 0; i < shortcutTargetCount; ++i) { + const std::vector<int> *const targetCodePoints = mShortcuts[i].getTargetCodePoints(); + jintArray shortcutTargetCodePointArray = env->NewIntArray(targetCodePoints->size()); + env->SetIntArrayRegion(shortcutTargetCodePointArray, 0 /* start */, + targetCodePoints->size(), &targetCodePoints->at(0)); + env->CallBooleanMethod(outShortcutTargets, addMethodId, shortcutTargetCodePointArray); + env->DeleteLocalRef(shortcutTargetCodePointArray); + jobject integerProbability = env->NewObject(integerClass, intToIntegerConstructorId, + mShortcuts[i].getProbability()); + env->CallBooleanMethod(outShortcutProbabilities, addMethodId, integerProbability); + env->DeleteLocalRef(integerProbability); + } + env->DeleteLocalRef(integerClass); + env->DeleteLocalRef(arrayListClass); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/word_property.h b/native/jni/src/suggest/core/dictionary/word_property.h new file mode 100644 index 000000000..40b1a91a4 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/word_property.h @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_WORD_PROPERTY_H +#define LATINIME_WORD_PROPERTY_H + +#include <cstring> +#include <vector> + +#include "defines.h" +#include "jni.h" + +namespace latinime { + +// This class is used for returning information belonging to a word to java side. +class WordProperty { + public: + class BigramProperty { + public: + BigramProperty(const std::vector<int> *const targetCodePoints, + const int probability, const int timestamp, const int level, const int count) + : mTargetCodePoints(*targetCodePoints), mProbability(probability), + mTimestamp(timestamp), mLevel(level), mCount(count) {} + + const std::vector<int> *getTargetCodePoints() const { + return &mTargetCodePoints; + } + + int getProbability() const { + return mProbability; + } + + int getTimestamp() const { + return mTimestamp; + } + + int getLevel() const { + return mLevel; + } + + int getCount() const { + return mCount; + } + + private: + std::vector<int> mTargetCodePoints; + int mProbability; + int mTimestamp; + int mLevel; + int mCount; + }; + + class ShortcutProperty { + public: + ShortcutProperty(const std::vector<int> *const targetCodePoints, const int probability) + : mTargetCodePoints(*targetCodePoints), mProbability(probability) {} + + const std::vector<int> *getTargetCodePoints() const { + return &mTargetCodePoints; + } + + int getProbability() const { + return mProbability; + } + + private: + std::vector<int> mTargetCodePoints; + int mProbability; + }; + + // Invalid word. + WordProperty() + : mCodePoints(), mIsNotAWord(false), mIsBlacklisted(false), + mHasBigrams(false), mHasShortcuts(false), mProbability(NOT_A_PROBABILITY), + mTimestamp(0), mLevel(0), mCount(0), mBigrams(), mShortcuts() {} + + WordProperty(const std::vector<int> *const codePoints, + const bool isNotAWord, const bool isBlacklisted, const bool hasBigrams, + const bool hasShortcuts, const int probability, const int timestamp, + const int level, const int count, const std::vector<BigramProperty> *const bigrams, + const std::vector<ShortcutProperty> *const shortcuts) + : mCodePoints(*codePoints), mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), + mHasBigrams(hasBigrams), mHasShortcuts(hasShortcuts), mProbability(probability), + mTimestamp(timestamp), mLevel(level), mCount(count), mBigrams(*bigrams), + mShortcuts(*shortcuts) {} + + void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags, + jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities, + jobject outShortcutTargets, jobject outShortcutProbabilities) const; + + private: + DISALLOW_ASSIGNMENT_OPERATOR(WordProperty); + + std::vector<int> mCodePoints; + bool mIsNotAWord; + bool mIsBlacklisted; + bool mHasBigrams; + bool mHasShortcuts; + int mProbability; + // Historical information + int mTimestamp; + int mLevel; + int mCount; + std::vector<BigramProperty> mBigrams; + std::vector<ShortcutProperty> mShortcuts; +}; +} // namespace latinime +#endif // LATINIME_WORD_PROPERTY_H diff --git a/native/jni/src/suggest/core/layout/proximity_info.cpp b/native/jni/src/suggest/core/layout/proximity_info.cpp index e64476d82..ee8e59ef9 100644 --- a/native/jni/src/suggest/core/layout/proximity_info.cpp +++ b/native/jni/src/suggest/core/layout/proximity_info.cpp @@ -71,7 +71,7 @@ ProximityInfo::ProximityInfo(JNIEnv *env, const jstring localeJStr, && sweetSpotCenterYs && sweetSpotRadii), mProximityCharsArray(new int[GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE /* proximityCharsLength */]), - mCodeToKeyMap() { + mLowerCodePointToKeyMap() { /* Let's check the input array length here to make sure */ const jsize proximityCharsLength = env->GetArrayLength(proximityChars); if (proximityCharsLength != GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE) { @@ -147,7 +147,14 @@ int ProximityInfo::getCodePointOf(const int keyIndex) const { if (keyIndex < 0 || keyIndex >= KEY_COUNT) { return NOT_A_CODE_POINT; } - return mKeyIndexToCodePointG[keyIndex]; + return mKeyIndexToLowerCodePointG[keyIndex]; +} + +int ProximityInfo::getOriginalCodePointOf(const int keyIndex) const { + if (keyIndex < 0 || keyIndex >= KEY_COUNT) { + return NOT_A_CODE_POINT; + } + return mKeyIndexToOriginalCodePoint[keyIndex]; } void ProximityInfo::initializeG() { @@ -164,8 +171,9 @@ void ProximityInfo::initializeG() { const float gapY = sweetSpotCenterY - mCenterYsG[i]; mSweetSpotCenterYsG[i] = static_cast<int>(mCenterYsG[i] + gapY * verticalScale); } - mCodeToKeyMap[lowerCode] = i; - mKeyIndexToCodePointG[i] = lowerCode; + mLowerCodePointToKeyMap[lowerCode] = i; + mKeyIndexToOriginalCodePoint[i] = code; + mKeyIndexToLowerCodePointG[i] = lowerCode; } for (int i = 0; i < KEY_COUNT; i++) { mKeyKeyDistancesG[i][i] = 0; diff --git a/native/jni/src/suggest/core/layout/proximity_info.h b/native/jni/src/suggest/core/layout/proximity_info.h index f25949001..a91b9d674 100644 --- a/native/jni/src/suggest/core/layout/proximity_info.h +++ b/native/jni/src/suggest/core/layout/proximity_info.h @@ -39,6 +39,7 @@ class ProximityInfo { float getNormalizedSquaredDistanceFromCenterFloatG( const int keyId, const int x, const int y, const bool isGeometric) const; int getCodePointOf(const int keyIndex) const; + int getOriginalCodePointOf(const int keyIndex) const; bool hasSweetSpotData(const int keyIndex) const { // When there are no calibration data for a key, // the radius of the key is assigned to zero. @@ -76,11 +77,11 @@ class ProximityInfo { ProximityInfoUtils::initializeProximities(inputCodes, inputXCoordinates, inputYCoordinates, inputSize, mKeyXCoordinates, mKeyYCoordinates, mKeyWidths, mKeyHeights, mProximityCharsArray, CELL_HEIGHT, CELL_WIDTH, GRID_WIDTH, MOST_COMMON_KEY_WIDTH, - KEY_COUNT, mLocaleStr, &mCodeToKeyMap, allInputCodes); + KEY_COUNT, mLocaleStr, &mLowerCodePointToKeyMap, allInputCodes); } AK_FORCE_INLINE int getKeyIndexOf(const int c) const { - return ProximityInfoUtils::getKeyIndexOf(KEY_COUNT, c, &mCodeToKeyMap); + return ProximityInfoUtils::getKeyIndexOf(KEY_COUNT, c, &mLowerCodePointToKeyMap); } AK_FORCE_INLINE bool isCodePointOnKeyboard(const int codePoint) const { @@ -117,9 +118,9 @@ class ProximityInfo { // Sweet spots for geometric input. Note that we have extra sweet spots only for Y coordinates. float mSweetSpotCenterYsG[MAX_KEY_COUNT_IN_A_KEYBOARD]; float mSweetSpotRadii[MAX_KEY_COUNT_IN_A_KEYBOARD]; - hash_map_compat<int, int> mCodeToKeyMap; - - int mKeyIndexToCodePointG[MAX_KEY_COUNT_IN_A_KEYBOARD]; + hash_map_compat<int, int> mLowerCodePointToKeyMap; + int mKeyIndexToOriginalCodePoint[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyIndexToLowerCodePointG[MAX_KEY_COUNT_IN_A_KEYBOARD]; int mCenterXsG[MAX_KEY_COUNT_IN_A_KEYBOARD]; int mCenterYsG[MAX_KEY_COUNT_IN_A_KEYBOARD]; int mKeyKeyDistancesG[MAX_KEY_COUNT_IN_A_KEYBOARD][MAX_KEY_COUNT_IN_A_KEYBOARD]; diff --git a/native/jni/src/suggest/core/layout/proximity_info_state.cpp b/native/jni/src/suggest/core/layout/proximity_info_state.cpp index fbabd92f2..40c3448ef 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_state.cpp +++ b/native/jni/src/suggest/core/layout/proximity_info_state.cpp @@ -18,7 +18,7 @@ #include "suggest/core/layout/proximity_info_state.h" -#include <cstring> // for memset() and memcpy() +#include <cstring> // for memset() and memmove() #include <sstream> // for debug prints #include <vector> @@ -30,6 +30,12 @@ namespace latinime { +int ProximityInfoState::getPrimaryOriginalCodePointAt(const int index) const { + const int primaryCodePoint = getPrimaryCodePointAt(index); + const int keyIndex = mProximityInfo->getKeyIndexOf(primaryCodePoint); + return mProximityInfo->getOriginalCodePointOf(keyIndex); +} + // TODO: Remove the dependency of "isGeometric" void ProximityInfoState::initInputParams(const int pointerId, const float maxPointToKeyLength, const ProximityInfo *proximityInfo, const int *const inputCodes, const int inputSize, @@ -249,6 +255,14 @@ ProximityType ProximityInfoState::getProximityTypeG(const int index, const int c if (!isUsed()) { return UNRELATED_CHAR; } + const int sampledSearchKeyVectorsSize = static_cast<int>(mSampledSearchKeyVectors.size()); + if (index < 0 || index >= sampledSearchKeyVectorsSize) { + AKLOGE("getProximityTypeG() is called with an invalid index(%d). " + "mSampledSearchKeyVectors.size() = %d, codePoint = %x.", index, + sampledSearchKeyVectorsSize, codePoint); + ASSERT(false); + return UNRELATED_CHAR; + } const int lowerCodePoint = CharUtils::toLowerCase(codePoint); const int baseLowerCodePoint = CharUtils::toBaseCodePoint(lowerCodePoint); for (int i = 0; i < static_cast<int>(mSampledSearchKeyVectors[index].size()); ++i) { @@ -271,7 +285,7 @@ float ProximityInfoState::getDirection(const int index0, const int index1) const } float ProximityInfoState::getMostProbableString(int *const codePointBuf) const { - memcpy(codePointBuf, mMostProbableString, sizeof(mMostProbableString)); + memmove(codePointBuf, mMostProbableString, sizeof(mMostProbableString)); return mMostProbableStringProbability; } diff --git a/native/jni/src/suggest/core/layout/proximity_info_state.h b/native/jni/src/suggest/core/layout/proximity_info_state.h index c94060fa9..e941e43d8 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_state.h +++ b/native/jni/src/suggest/core/layout/proximity_info_state.h @@ -65,6 +65,8 @@ class ProximityInfoState { return getProximityCodePointsAt(index)[0]; } + int getPrimaryOriginalCodePointAt(const int index) const; + inline bool sameAsTyped(const int *word, int length) const { if (length != mSampledInputSize) { return false; diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h index 5492c6070..59748c80d 100644 --- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h @@ -17,6 +17,9 @@ #ifndef LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H #define LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H +#include <map> +#include <vector> + #include "defines.h" namespace latinime { @@ -27,13 +30,17 @@ namespace latinime { */ class DictionaryHeaderStructurePolicy { public: + typedef std::map<std::vector<int>, std::vector<int> > AttributeMap; + virtual ~DictionaryHeaderStructurePolicy() {} - virtual bool supportsDynamicUpdate() const = 0; + virtual int getFormatVersionNumber() const = 0; - virtual bool requiresGermanUmlautProcessing() const = 0; + virtual int getSize() const = 0; - virtual bool requiresFrenchLigatureProcessing() const = 0; + virtual const AttributeMap *getAttributeMap() const = 0; + + virtual bool requiresGermanUmlautProcessing() const = 0; virtual float getMultiWordCostMultiplier() const = 0; @@ -42,6 +49,8 @@ class DictionaryHeaderStructurePolicy { virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const = 0; + virtual bool shouldBoostExactMatches() const = 0; + protected: DictionaryHeaderStructurePolicy() {} diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h index 41f82049f..38e8ff183 100644 --- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h @@ -18,6 +18,8 @@ #define LATINIME_DICTIONARY_STRUCTURE_POLICY_H #include "defines.h" +#include "suggest/core/dictionary/word_property.h" +#include "utils/exclusive_ownership_pointer.h" namespace latinime { @@ -28,23 +30,25 @@ class DictionaryHeaderStructurePolicy; class DictionaryShortcutsStructurePolicy; /* - * This class abstracts structure of dictionaries. + * This class abstracts the structure of dictionaries. * Implement this policy to support additional dictionaries. */ class DictionaryStructureWithBufferPolicy { public: + typedef ExclusiveOwnershipPointer<DictionaryStructureWithBufferPolicy> StructurePolicyPtr; + virtual ~DictionaryStructureWithBufferPolicy() {} virtual int getRootPosition() const = 0; - virtual void createAndGetAllChildNodes(const DicNode *const dicNode, + virtual void createAndGetAllChildDicNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const = 0; virtual int getCodePointsAndProbabilityAndReturnCodePointCount( const int nodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const = 0; - virtual int getTerminalNodePositionOfWord(const int *const inWord, + virtual int getTerminalPtNodePositionOfWord(const int *const inWord, const int length, const bool forceLowerCaseSearch) const = 0; virtual int getProbability(const int unigramProbability, @@ -64,11 +68,13 @@ class DictionaryStructureWithBufferPolicy { // Returns whether the update was success or not. virtual bool addUnigramWord(const int *const word, const int length, - const int probability) = 0; + const int probability, const int *const shortcutTargetCodePoints, + const int shortcutLength, const int shortcutProbability, const bool isNotAWord, + const bool isBlacklisted,const int timestamp) = 0; // Returns whether the update was success or not. virtual bool addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability) = 0; + const int length1, const int probability, const int timestamp) = 0; // Returns whether the update was success or not. virtual bool removeBigramWords(const int *const word0, const int length0, @@ -82,9 +88,20 @@ class DictionaryStructureWithBufferPolicy { // Currently, this method is used only for testing. You may want to consider creating new // dedicated method instead of this if you want to use this in the production. - virtual void getProperty(const char *const query, char *const outResult, + virtual void getProperty(const char *const query, const int queryLength, char *const outResult, const int maxResultLength) = 0; + // Used for testing. + virtual const WordProperty getWordProperty(const int *const codePonts, + const int codePointCount) const = 0; + + // Method to iterate all words in the dictionary. + // The returned token has to be used to get the next word. If token is 0, this method newly + // starts iterating the dictionary. + virtual int getNextWordAndNextToken(const int token, int *const outCodePoints) = 0; + + virtual bool isCorrupted() const = 0; + protected: DictionaryStructureWithBufferPolicy() {} diff --git a/native/jni/src/suggest/core/policy/scoring.h b/native/jni/src/suggest/core/policy/scoring.h index 102e856f5..0251475d5 100644 --- a/native/jni/src/suggest/core/policy/scoring.h +++ b/native/jni/src/suggest/core/policy/scoring.h @@ -28,21 +28,21 @@ class DicTraverseSession; class Scoring { public: virtual int calculateFinalScore(const float compoundDistance, const int inputSize, - const bool forceCommit) const = 0; + const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit, + const bool boostExactMatches) const = 0; virtual bool getMostProbableString(const DicTraverseSession *const traverseSession, const int terminalSize, const float languageWeight, int *const outputCodePoints, int *const type, int *const freq) const = 0; - virtual void safetyNetForMostProbableString(const int terminalSize, - const int maxScore, int *const outputCodePoints, int *const frequencies) const = 0; - // TODO: Make more generic - virtual void searchWordWithDoubleLetter(DicNode *terminals, const int terminalSize, - int *doubleLetterTerminalIndex, DoubleLetterLevel *doubleLetterLevel) const = 0; + virtual void safetyNetForMostProbableString(const int scoreCount, + const int maxScore, int *const outputCodePoints, int *const scores) const = 0; virtual float getAdjustedLanguageWeight(DicTraverseSession *const traverseSession, DicNode *const terminals, const int size) const = 0; - virtual float getDoubleLetterDemotionDistanceCost(const int terminalIndex, - const int doubleLetterTerminalIndex, - const DoubleLetterLevel doubleLetterLevel) const = 0; + virtual float getDoubleLetterDemotionDistanceCost( + const DicNode *const terminalDicNode) const = 0; virtual bool doesAutoCorrectValidWord() const = 0; + virtual bool autoCorrectsToMultiWordSuggestionIfTop() const = 0; + virtual bool sameAsTyped(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; protected: Scoring() {} diff --git a/native/jni/src/suggest/core/policy/traversal.h b/native/jni/src/suggest/core/policy/traversal.h index e935533f2..d3b8da0cc 100644 --- a/native/jni/src/suggest/core/policy/traversal.h +++ b/native/jni/src/suggest/core/policy/traversal.h @@ -41,11 +41,8 @@ class Traversal { const DicNode *const dicNode) const = 0; virtual ProximityType getProximityType(const DicTraverseSession *const traverseSession, const DicNode *const dicNode, const DicNode *const childDicNode) const = 0; - virtual bool sameAsTyped(const DicTraverseSession *const traverseSession, - const DicNode *const dicNode) const = 0; virtual bool needsToTraverseAllUserInput() const = 0; virtual float getMaxSpatialDistance() const = 0; - virtual bool autoCorrectsToMultiWordSuggestionIfTop() const = 0; virtual int getDefaultExpandDicNodeSize() const = 0; virtual int getMaxCacheSize(const int inputSize) const = 0; virtual bool isPossibleOmissionChildNode(const DicTraverseSession *const traverseSession, diff --git a/native/jni/src/suggest/core/policy/weighting.cpp b/native/jni/src/suggest/core/policy/weighting.cpp index 0c4016893..c202b81fe 100644 --- a/native/jni/src/suggest/core/policy/weighting.cpp +++ b/native/jni/src/suggest/core/policy/weighting.cpp @@ -20,6 +20,7 @@ #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_profiler.h" #include "suggest/core/dicnode/dic_node_utils.h" +#include "suggest/core/dictionary/error_type_utils.h" #include "suggest/core/session/dic_traverse_session.h" namespace latinime { @@ -82,8 +83,8 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n traverseSession, parentDicNode, dicNode, &inputStateG); const float languageCost = Weighting::getLanguageCost(weighting, correctionType, traverseSession, parentDicNode, dicNode, multiBigramMap); - const ErrorType errorType = weighting->getErrorType(correctionType, traverseSession, - parentDicNode, dicNode); + const ErrorTypeUtils::ErrorType errorType = weighting->getErrorType(correctionType, + traverseSession, parentDicNode, dicNode); profile(correctionType, dicNode); if (inputStateG.mNeedsToUpdateInputStateG) { dicNode->updateInputIndexG(&inputStateG); diff --git a/native/jni/src/suggest/core/policy/weighting.h b/native/jni/src/suggest/core/policy/weighting.h index 2d49e98a6..bd6b3cf41 100644 --- a/native/jni/src/suggest/core/policy/weighting.h +++ b/native/jni/src/suggest/core/policy/weighting.h @@ -18,6 +18,7 @@ #define LATINIME_WEIGHTING_H #include "defines.h" +#include "suggest/core/dictionary/error_type_utils.h" namespace latinime { @@ -84,7 +85,7 @@ class Weighting { virtual float getSpaceSubstitutionCost(const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const = 0; - virtual ErrorType getErrorType(const CorrectionType correctionType, + virtual ErrorTypeUtils::ErrorType getErrorType(const CorrectionType correctionType, const DicTraverseSession *const traverseSession, const DicNode *const parentDicNode, const DicNode *const dicNode) const = 0; diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.cpp b/native/jni/src/suggest/core/session/dic_traverse_session.cpp index 50f2bbd8d..5070491f4 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.cpp +++ b/native/jni/src/suggest/core/session/dic_traverse_session.cpp @@ -35,16 +35,16 @@ void DicTraverseSession::init(const Dictionary *const dictionary, const int *pre ->getMultiWordCostMultiplier(); mSuggestOptions = suggestOptions; if (!prevWord) { - mPrevWordPos = NOT_A_DICT_POS; + mPrevWordPtNodePos = NOT_A_DICT_POS; return; } // TODO: merge following similar calls to getTerminalPosition into one case-insensitive call. - mPrevWordPos = getDictionaryStructurePolicy()->getTerminalNodePositionOfWord( + mPrevWordPtNodePos = getDictionaryStructurePolicy()->getTerminalPtNodePositionOfWord( prevWord, prevWordLength, false /* forceLowerCaseSearch */); - if (mPrevWordPos == NOT_A_DICT_POS) { + if (mPrevWordPtNodePos == NOT_A_DICT_POS) { // Check bigrams for lower-cased previous word if original was not found. Useful for // auto-capitalized words like "The [current_word]". - mPrevWordPos = getDictionaryStructurePolicy()->getTerminalNodePositionOfWord( + mPrevWordPtNodePos = getDictionaryStructurePolicy()->getTerminalPtNodePositionOfWord( prevWord, prevWordLength, true /* forceLowerCaseSearch */); } } diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.h b/native/jni/src/suggest/core/session/dic_traverse_session.h index e0b1c67d9..6e4dda44d 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.h +++ b/native/jni/src/suggest/core/session/dic_traverse_session.h @@ -59,7 +59,7 @@ class DicTraverseSession { } AK_FORCE_INLINE DicTraverseSession(JNIEnv *env, jstring localeStr, bool usesLargeCache) - : mPrevWordPos(NOT_A_DICT_POS), mProximityInfo(0), + : mPrevWordPtNodePos(NOT_A_DICT_POS), mProximityInfo(0), mDictionary(0), mSuggestOptions(0), mDicNodesCache(usesLargeCache), mMultiBigramMap(), mInputSize(0), mPartiallyCommited(false), mMaxPointerCount(1), mMultiWordCostMultiplier(1.0f) { @@ -86,11 +86,9 @@ class DicTraverseSession { //-------------------- const ProximityInfo *getProximityInfo() const { return mProximityInfo; } const SuggestOptions *getSuggestOptions() const { return mSuggestOptions; } - int getPrevWordPos() const { return mPrevWordPos; } + int getPrevWordPtNodePos() const { return mPrevWordPtNodePos; } // TODO: REMOVE - void setPrevWordPos(int pos) { mPrevWordPos = pos; } - // TODO: Use proper parameter when changed - int getDicRootPos() const { return 0; } + void setPrevWordPtNodePos(const int ptNodePos) { mPrevWordPtNodePos = ptNodePos; } DicNodesCache *getDicTraverseCache() { return &mDicNodesCache; } MultiBigramMap *getMultiBigramMap() { return &mMultiBigramMap; } const ProximityInfoState *getProximityInfoState(int id) const { @@ -119,26 +117,13 @@ class DicTraverseSession { return true; } - void getSearchKeys(const DicNode *node, std::vector<int> *const outputSearchKeyVector) const { - for (int i = 0; i < MAX_POINTER_COUNT_G; ++i) { - if (!mProximityInfoStates[i].isUsed()) { - continue; - } - const int pointerId = node->getInputIndex(i); - const std::vector<int> *const searchKeyVector = - mProximityInfoStates[i].getSearchKeyVector(pointerId); - outputSearchKeyVector->insert(outputSearchKeyVector->end(), searchKeyVector->begin(), - searchKeyVector->end()); - } - } - - ProximityType getProximityTypeG(const DicNode *const node, const int childCodePoint) const { + ProximityType getProximityTypeG(const DicNode *const dicNode, const int childCodePoint) const { ProximityType proximityType = UNRELATED_CHAR; for (int i = 0; i < MAX_POINTER_COUNT_G; ++i) { if (!mProximityInfoStates[i].isUsed()) { continue; } - const int pointerId = node->getInputIndex(i); + const int pointerId = dicNode->getInputIndex(i); proximityType = mProximityInfoStates[i].getProximityTypeG(pointerId, childCodePoint); ASSERT(proximityType == UNRELATED_CHAR || proximityType == MATCH_CHAR); // TODO: Make this more generic @@ -192,7 +177,7 @@ class DicTraverseSession { const int *const inputYs, const int *const times, const int *const pointerIds, const int inputSize, const float maxSpatialDistance, const int maxPointerCount); - int mPrevWordPos; + int mPrevWordPtNodePos; const ProximityInfo *mProximityInfo; const Dictionary *mDictionary; const SuggestOptions *mSuggestOptions; diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp index 73ccebc88..56acc2dc1 100644 --- a/native/jni/src/suggest/core/suggest.cpp +++ b/native/jni/src/suggest/core/suggest.cpp @@ -19,13 +19,11 @@ #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_priority_queue.h" #include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/digraph_utils.h" -#include "suggest/core/dictionary/shortcut_utils.h" +#include "suggest/core/dictionary/suggestions_output_utils.h" #include "suggest/core/layout/proximity_info.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/core/policy/scoring.h" #include "suggest/core/policy/traversal.h" #include "suggest/core/policy/weighting.h" #include "suggest/core/session/dic_traverse_session.h" @@ -33,9 +31,7 @@ namespace latinime { // Initialization of class constants. -const int Suggest::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; const int Suggest::MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE = 2; -const float Suggest::AUTOCORRECT_CLASSIFICATION_THRESHOLD = 0.33f; /** * Returns a set of suggestions for the given input touch points. The commitPoint argument indicates @@ -48,7 +44,7 @@ const float Suggest::AUTOCORRECT_CLASSIFICATION_THRESHOLD = 0.33f; */ int Suggest::getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, int *inputYs, int *times, int *pointerIds, int *inputCodePoints, - int inputSize, int commitPoint, int *outWords, int *frequencies, int *outputIndices, + int inputSize, int commitPoint, int *outWords, int *outputScores, int *outputIndices, int *outputTypes, int *outputAutoCommitFirstWordConfidence) const { PROF_OPEN; PROF_START(0); @@ -70,8 +66,8 @@ int Suggest::getSuggestions(ProximityInfo *pInfo, void *traverseSession, } PROF_END(1); PROF_START(2); - const int size = outputSuggestions(tSession, frequencies, outWords, outputIndices, outputTypes, - outputAutoCommitFirstWordConfidence); + const int size = SuggestionsOutputUtils::outputSuggestions(SCORING, tSession, outputScores, + outWords, outputIndices, outputTypes, outputAutoCommitFirstWordConfidence); PROF_END(2); PROF_CLOSE; return size; @@ -98,7 +94,7 @@ void Suggest::initializeSearch(DicTraverseSession *traverseSession, int commitPo // Continue suggestion after partial commit. DicNode *topDicNode = traverseSession->getDicTraverseCache()->setCommitPoint(commitPoint); - traverseSession->setPrevWordPos(topDicNode->getPrevWordNodePos()); + traverseSession->setPrevWordPtNodePos(topDicNode->getPrevWordPtNodePos()); traverseSession->getDicTraverseCache()->continueSearch(); traverseSession->setPartiallyCommited(); } @@ -109,208 +105,12 @@ void Suggest::initializeSearch(DicTraverseSession *traverseSession, int commitPo // Create a new dic node here DicNode rootNode; DicNodeUtils::initAsRoot(traverseSession->getDictionaryStructurePolicy(), - traverseSession->getPrevWordPos(), &rootNode); + traverseSession->getPrevWordPtNodePos(), &rootNode); traverseSession->getDicTraverseCache()->copyPushActive(&rootNode); } } /** - * Outputs the final list of suggestions (i.e., terminal nodes). - */ -int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequencies, - int *outputCodePoints, int *outputIndicesToPartialCommit, int *outputTypes, - int *outputAutoCommitFirstWordConfidence) const { -#if DEBUG_EVALUATE_MOST_PROBABLE_STRING - const int terminalSize = 0; -#else - const int terminalSize = min(MAX_RESULTS, - static_cast<int>(traverseSession->getDicTraverseCache()->terminalSize())); -#endif - DicNode terminals[MAX_RESULTS]; // Avoiding non-POD variable length array - - for (int index = terminalSize - 1; index >= 0; --index) { - traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]); - } - - const float languageWeight = SCORING->getAdjustedLanguageWeight( - traverseSession, terminals, terminalSize); - - int outputWordIndex = 0; - // Insert most probable word at index == 0 as long as there is one terminal at least - const bool hasMostProbableString = - SCORING->getMostProbableString(traverseSession, terminalSize, languageWeight, - &outputCodePoints[0], &outputTypes[0], &frequencies[0]); - if (hasMostProbableString) { - outputIndicesToPartialCommit[outputWordIndex] = NOT_AN_INDEX; - ++outputWordIndex; - } - - // Initial value of the loop index for terminal nodes (words) - int doubleLetterTerminalIndex = -1; - DoubleLetterLevel doubleLetterLevel = NOT_A_DOUBLE_LETTER; - SCORING->searchWordWithDoubleLetter(terminals, terminalSize, - &doubleLetterTerminalIndex, &doubleLetterLevel); - - int maxScore = S_INT_MIN; - // Force autocorrection for obvious long multi-word suggestions when the top suggestion is - // a long multiple words suggestion. - // TODO: Implement a smarter auto-commit method for handling multi-word suggestions. - // traverseSession->isPartiallyCommited() always returns false because we never auto partial - // commit for now. - const bool forceCommitMultiWords = (terminalSize > 0) ? - TRAVERSAL->autoCorrectsToMultiWordSuggestionIfTop() - && (traverseSession->isPartiallyCommited() - || (traverseSession->getInputSize() - >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT - && terminals[0].hasMultipleWords())) : false; - // TODO: have partial commit work even with multiple pointers. - const bool outputSecondWordFirstLetterInputIndex = - traverseSession->isOnlyOnePointerUsed(0 /* pointerId */); - if (terminalSize > 0) { - // If we have no suggestions, don't write this - outputAutoCommitFirstWordConfidence[0] = - computeFirstWordConfidence(&terminals[0]); - } - - // Output suggestion results here - for (int terminalIndex = 0; terminalIndex < terminalSize && outputWordIndex < MAX_RESULTS; - ++terminalIndex) { - DicNode *terminalDicNode = &terminals[terminalIndex]; - if (DEBUG_GEO_FULL) { - terminalDicNode->dump("OUT:"); - } - const float doubleLetterCost = SCORING->getDoubleLetterDemotionDistanceCost( - terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel); - const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight) - + doubleLetterCost; - const bool isPossiblyOffensiveWord = - traverseSession->getDictionaryStructurePolicy()->getProbability( - terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0; - const bool isExactMatch = terminalDicNode->isExactMatch(); - const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); - // Heuristic: We exclude freq=0 first-char-uppercase words from exact match. - // (e.g. "AMD" and "and") - const bool isSafeExactMatch = isExactMatch - && !(isPossiblyOffensiveWord && isFirstCharUppercase); - const int outputTypeFlags = - (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) - | (isSafeExactMatch ? Dictionary::KIND_FLAG_EXACT_MATCH : 0); - - // Entries that are blacklisted or do not represent a word should not be output. - const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord(); - - // Increase output score of top typing suggestion to ensure autocorrection. - // TODO: Better integration with java side autocorrection logic. - const int finalScore = SCORING->calculateFinalScore( - compoundDistance, traverseSession->getInputSize(), - terminalDicNode->isExactMatch() - || (forceCommitMultiWords && terminalDicNode->hasMultipleWords()) - || (isValidWord && SCORING->doesAutoCorrectValidWord())); - if (maxScore < finalScore && isValidWord) { - maxScore = finalScore; - } - - // Don't output invalid words. However, we still need to submit their shortcuts if any. - if (isValidWord) { - outputTypes[outputWordIndex] = Dictionary::KIND_CORRECTION | outputTypeFlags; - frequencies[outputWordIndex] = finalScore; - if (outputSecondWordFirstLetterInputIndex) { - outputIndicesToPartialCommit[outputWordIndex] = - terminalDicNode->getSecondWordFirstInputIndex( - traverseSession->getProximityInfoState(0)); - } else { - outputIndicesToPartialCommit[outputWordIndex] = NOT_AN_INDEX; - } - // Populate the outputChars array with the suggested word. - const int startIndex = outputWordIndex * MAX_WORD_LENGTH; - terminalDicNode->outputResult(&outputCodePoints[startIndex]); - ++outputWordIndex; - } - - if (!terminalDicNode->hasMultipleWords()) { - BinaryDictionaryShortcutIterator shortcutIt( - traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(), - traverseSession->getDictionaryStructurePolicy() - ->getShortcutPositionOfPtNode(terminalDicNode->getPos())); - // Shortcut is not supported for multiple words suggestions. - // TODO: Check shortcuts during traversal for multiple words suggestions. - const bool sameAsTyped = TRAVERSAL->sameAsTyped(traverseSession, terminalDicNode); - const int updatedOutputWordIndex = ShortcutUtils::outputShortcuts(&shortcutIt, - outputWordIndex, finalScore, outputCodePoints, frequencies, outputTypes, - sameAsTyped); - const int secondWordFirstInputIndex = terminalDicNode->getSecondWordFirstInputIndex( - traverseSession->getProximityInfoState(0)); - for (int i = outputWordIndex; i < updatedOutputWordIndex; ++i) { - if (outputSecondWordFirstLetterInputIndex) { - outputIndicesToPartialCommit[i] = secondWordFirstInputIndex; - } else { - outputIndicesToPartialCommit[i] = NOT_AN_INDEX; - } - } - outputWordIndex = updatedOutputWordIndex; - } - DicNode::managedDelete(terminalDicNode); - } - - if (hasMostProbableString) { - SCORING->safetyNetForMostProbableString(terminalSize, maxScore, - &outputCodePoints[0], &frequencies[0]); - } - return outputWordIndex; -} - -int Suggest::computeFirstWordConfidence(const DicNode *const terminalDicNode) const { - // Get the number of spaces in the first suggestion - const int spaceCount = terminalDicNode->getTotalNodeSpaceCount(); - // Get the number of characters in the first suggestion - const int length = terminalDicNode->getTotalNodeCodePointCount(); - // Get the distance for the first word of the suggestion - const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord(); - - // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000. - // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or - // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means - // we are very confident. - // Expected space count is 1 ~ 5 - static const int MIN_EXPECTED_SPACE_COUNT = 1; - static const int MAX_EXPECTED_SPACE_COUNT = 5; - // Expected length is about 4 ~ 30 - static const int MIN_EXPECTED_LENGTH = 4; - static const int MAX_EXPECTED_LENGTH = 30; - // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0 - static const float MIN_EXPECTED_DISTANCE = 0.0; - static const float MAX_EXPECTED_DISTANCE = 2.0; - // This is not strict: it's where most stuff will be falling, but it's still fine if it's - // outside these values. We want to output a value that reflects all of these. Each factor - // contributes a bit. - - // We need at least a space. - if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE; - - // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0 - // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the - // weight of the distance. Clamp to avoid overflows. - const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE - : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance; - const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT - * (MAX_EXPECTED_DISTANCE - clampedDistance) - / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE); - // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no - // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the - // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp. - const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT - * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH); - // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no - // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the - // weight of the space count. - const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT - * (spaceCount - MIN_EXPECTED_SPACE_COUNT) - / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT); - - return distanceContribution + lengthContribution + spaceContribution; -} - -/** * Expands the dicNodes in the current search priority queue by advancing to the possible child * nodes based on the next touch point(s) (or no touch points for lookahead) */ @@ -421,15 +221,15 @@ void Suggest::expandCurrentDicNodes(DicTraverseSession *traverseSession) const { } break; case UNRELATED_CHAR: - // Just drop this node and do nothing. + // Just drop this dicNode and do nothing. break; default: - // Just drop this node and do nothing. + // Just drop this dicNode and do nothing. break; } } - // Push the node for look-ahead correction + // Push the dicNode for look-ahead correction if (allowsErrorCorrections && canDoLookAheadCorrection) { traverseSession->getDicTraverseCache()->copyPushNextActive(&dicNode); } @@ -442,7 +242,7 @@ void Suggest::processTerminalDicNode( if (dicNode->getCompoundDistance() >= static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) { return; } - if (!dicNode->isTerminalWordNode()) { + if (!dicNode->isTerminalDicNode()) { return; } if (dicNode->shouldBeFilteredBySafetyNetForBigram()) { @@ -456,6 +256,9 @@ void Suggest::processTerminalDicNode( Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_TERMINAL_INSERTION, traverseSession, 0, &terminalDicNode, traverseSession->getMultiBigramMap()); } + if (!dicNode->hasMatchedOrProximityCodePoints()) { + return; + } Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_TERMINAL, traverseSession, 0, &terminalDicNode, traverseSession->getMultiBigramMap()); traverseSession->getDicTraverseCache()->copyPushTerminal(&terminalDicNode); @@ -463,7 +266,7 @@ void Suggest::processTerminalDicNode( /** * Adds the expanded dicNode to the next search priority queue. Also creates an additional next word - * (by the space omission error correction) search path if input dicNode is on a terminal node. + * (by the space omission error correction) search path if input dicNode is on a terminal. */ void Suggest::processExpandedDicNode( DicTraverseSession *traverseSession, DicNode *dicNode) const { @@ -505,7 +308,7 @@ void Suggest::processDicNodeAsSubstitution(DicTraverseSession *traverseSession, processExpandedDicNode(traverseSession, childDicNode); } -// Process the node codepoint as a digraph. This means that composite glyphs like the German +// Process the DicNode codepoint as a digraph. This means that composite glyphs like the German // u-umlaut is expanded to the transliteration "ue". Note that this happens in parallel with // the normal non-digraph traversal, so both "uber" and "ueber" can be corrected to "[u-umlaut]ber". void Suggest::processDicNodeAsDigraph(DicTraverseSession *traverseSession, @@ -518,7 +321,7 @@ void Suggest::processDicNodeAsDigraph(DicTraverseSession *traverseSession, /** * Handle the dicNode as an omission error (e.g., ths => this). Skip the current letter and consider * matches for all possible next letters. Note that just skipping the current letter without any - * other conditions tends to flood the search dic nodes cache with omission nodes. Instead, check + * other conditions tends to flood the search DicNodes cache with omission DicNodes. Instead, check * the possible *next* letters after the omission to better limit search to plausible omissions. * Note that apostrophes are handled as omissions. */ @@ -605,7 +408,7 @@ void Suggest::processDicNodeAsTransposition(DicTraverseSession *traverseSession, } /** - * Weight child node by aligning it to the key + * Weight child dicNode by aligning it to the key */ void Suggest::weightChildNode(DicTraverseSession *traverseSession, DicNode *dicNode) const { const int inputSize = traverseSession->getInputSize(); diff --git a/native/jni/src/suggest/core/suggest.h b/native/jni/src/suggest/core/suggest.h index b20343d29..b1d12ad9a 100644 --- a/native/jni/src/suggest/core/suggest.h +++ b/native/jni/src/suggest/core/suggest.h @@ -48,25 +48,18 @@ class Suggest : public SuggestInterface { AK_FORCE_INLINE virtual ~Suggest() {} int getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, int *inputYs, int *times, int *pointerIds, int *inputCodePoints, int inputSize, int commitPoint, - int *outWords, int *frequencies, int *outputIndices, int *outputTypes, + int *outWords, int *outputScores, int *outputIndices, int *outputTypes, int *outputAutoCommitFirstWordConfidence) const; private: DISALLOW_IMPLICIT_CONSTRUCTORS(Suggest); void createNextWordDicNode(DicTraverseSession *traverseSession, DicNode *dicNode, const bool spaceSubstitution) const; - int outputSuggestions(DicTraverseSession *traverseSession, int *frequencies, - int *outputCodePoints, int *outputIndicesToPartialCommit, int *outputTypes, - int *outputAutoCommitFirstWordConfidence) const; - int computeFirstWordConfidence(const DicNode *const terminalDicNode) const; void initializeSearch(DicTraverseSession *traverseSession, int commitPoint) const; void expandCurrentDicNodes(DicTraverseSession *traverseSession) const; void processTerminalDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const; void processExpandedDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const; void weightChildNode(DicTraverseSession *traverseSession, DicNode *dicNode) const; - float getAutocorrectScore(DicTraverseSession *traverseSession, DicNode *dicNode) const; - void generateFeatures( - DicTraverseSession *traverseSession, DicNode *dicNode, float *features) const; void processDicNodeAsOmission(DicTraverseSession *traverseSession, DicNode *dicNode) const; void processDicNodeAsDigraph(DicTraverseSession *traverseSession, DicNode *dicNode) const; void processDicNodeAsTransposition(DicTraverseSession *traverseSession, @@ -79,13 +72,8 @@ class Suggest : public SuggestInterface { void processDicNodeAsMatch(DicTraverseSession *traverseSession, DicNode *childDicNode) const; - // Inputs longer than this will autocorrect if the suggestion is multi-word - static const int MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT; static const int MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE; - // Threshold for autocorrection classifier - static const float AUTOCORRECT_CLASSIFICATION_THRESHOLD; - const Traversal *const TRAVERSAL; const Scoring *const SCORING; const Weighting *const WEIGHTING; diff --git a/native/jni/src/suggest/core/suggest_interface.h b/native/jni/src/suggest/core/suggest_interface.h index 4deb4d924..9a0758085 100644 --- a/native/jni/src/suggest/core/suggest_interface.h +++ b/native/jni/src/suggest/core/suggest_interface.h @@ -27,7 +27,7 @@ class SuggestInterface { public: virtual int getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, int *inputYs, int *times, int *pointerIds, int *inputCodePoints, int inputSize, - int commitPoint, int *outWords, int *frequencies, int *outputIndices, + int commitPoint, int *outWords, int *outputScores, int *outputIndices, int *outputTypes, int *outputAutoCommitFirstWordConfidence) const = 0; SuggestInterface() {} virtual ~SuggestInterface() {} diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp index 1926b9831..7d0d09631 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp @@ -16,7 +16,6 @@ #include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" @@ -38,7 +37,6 @@ const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::FLAG_ATTRI // Mask for attribute probability, stored on 4 bits inside the flags byte. const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; -const int BigramListReadWriteUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; /* static */ void BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( const uint8_t *const bigramsBuf, BigramFlags *const outBigramFlags, @@ -79,11 +77,6 @@ const int BigramListReadWriteUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; offset = ByteArrayUtils::readUint24AndAdvancePosition(bigramsBuf, pos); break; } - if (offset == DynamicPatriciaTrieReadingUtils::DICT_OFFSET_INVALID) { - return NOT_A_DICT_POS; - } else if (offset == DynamicPatriciaTrieReadingUtils::DICT_OFFSET_ZERO_OFFSET) { - return origin; - } if (isOffsetNegative(flags)) { return origin - offset; } else { @@ -91,92 +84,4 @@ const int BigramListReadWriteUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; } } -/* static */ bool BigramListReadWriteUtils::setHasNextFlag( - BufferWithExtendableBuffer *const buffer, const bool hasNext, const int entryPos) { - const bool usesAdditionalBuffer = buffer->isInAdditionalBuffer(entryPos); - int readingPos = entryPos; - if (usesAdditionalBuffer) { - readingPos -= buffer->getOriginalBufferSize(); - } - BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition( - buffer->getBuffer(usesAdditionalBuffer), &readingPos); - if (hasNext) { - bigramFlags = bigramFlags | FLAG_ATTRIBUTE_HAS_NEXT; - } else { - bigramFlags = bigramFlags & (~FLAG_ATTRIBUTE_HAS_NEXT); - } - int writingPos = entryPos; - return buffer->writeUintAndAdvancePosition(bigramFlags, 1 /* size */, &writingPos); -} - -/* static */ bool BigramListReadWriteUtils::createAndWriteBigramEntry( - BufferWithExtendableBuffer *const buffer, const int targetPos, const int probability, - const bool hasNext, int *const writingPos) { - BigramFlags flags; - if (!createAndGetBigramFlags(*writingPos, targetPos, probability, hasNext, &flags)) { - return false; - } - return writeBigramEntry(buffer, flags, targetPos, writingPos); -} - -/* static */ bool BigramListReadWriteUtils::writeBigramEntry( - BufferWithExtendableBuffer *const bufferToWrite, const BigramFlags flags, - const int targetPtNodePos, int *const writingPos) { - const int offset = getBigramTargetOffset(targetPtNodePos, *writingPos); - const BigramFlags flagsToWrite = (offset < 0) ? - (flags | FLAG_ATTRIBUTE_OFFSET_NEGATIVE) : (flags & ~FLAG_ATTRIBUTE_OFFSET_NEGATIVE); - if (!bufferToWrite->writeUintAndAdvancePosition(flagsToWrite, 1 /* size */, writingPos)) { - return false; - } - const uint32_t absOffest = abs(offset); - const int bigramTargetFieldSize = attributeAddressSize(flags); - return bufferToWrite->writeUintAndAdvancePosition(absOffest, bigramTargetFieldSize, - writingPos); -} - -// Returns true if the bigram entry is valid and put entry flags into out*. -/* static */ bool BigramListReadWriteUtils::createAndGetBigramFlags(const int entryPos, - const int targetPtNodePos, const int probability, const bool hasNext, - BigramFlags *const outBigramFlags) { - BigramFlags flags = probability & MASK_ATTRIBUTE_PROBABILITY; - if (hasNext) { - flags |= FLAG_ATTRIBUTE_HAS_NEXT; - } - const int offset = getBigramTargetOffset(targetPtNodePos, entryPos); - if (offset < 0) { - flags |= FLAG_ATTRIBUTE_OFFSET_NEGATIVE; - } - const uint32_t absOffest = abs(offset); - if ((absOffest >> 24) != 0) { - // Offset is too large. - return false; - } else if ((absOffest >> 16) != 0) { - flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; - } else if ((absOffest >> 8) != 0) { - flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; - } else { - flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; - } - // Currently, all newly written bigram position fields are 3 bytes to simplify dictionary - // writing. - // TODO: Remove following 2 lines and optimize memory space. - flags = (flags & (~MASK_ATTRIBUTE_ADDRESS_TYPE)) | FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; - *outBigramFlags = flags; - return true; -} - -/* static */ int BigramListReadWriteUtils::getBigramTargetOffset(const int targetPtNodePos, - const int entryPos) { - if (targetPtNodePos == NOT_A_DICT_POS) { - return DynamicPatriciaTrieReadingUtils::DICT_OFFSET_INVALID; - } else { - const int offset = targetPtNodePos - (entryPos + 1 /* bigramFlagsField */); - if (offset == 0) { - return DynamicPatriciaTrieReadingUtils::DICT_OFFSET_ZERO_OFFSET; - } else { - return offset; - } - } -} - } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h index eabe4e099..7e1038300 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h @@ -45,34 +45,6 @@ public: // Bigrams reading methods static void skipExistingBigrams(const uint8_t *const bigramsBuf, int *const bigramListPos); - // Returns the size of the bigram position field that is stored in bigram flags. - static AK_FORCE_INLINE int attributeAddressSize(const BigramFlags flags) { - return (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; - /* Note: this is a value-dependant optimization of what may probably be - more readably written this way: - switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) { - case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; - default: return 0; - } - */ - } - - static bool setHasNextFlag(BufferWithExtendableBuffer *const buffer, - const bool hasNext, const int entryPos); - - static AK_FORCE_INLINE BigramFlags setProbabilityInFlags(const BigramFlags flags, - const int probability) { - return (flags & (~MASK_ATTRIBUTE_PROBABILITY)) | (probability & MASK_ATTRIBUTE_PROBABILITY); - } - - static bool createAndWriteBigramEntry(BufferWithExtendableBuffer *const buffer, - const int targetPos, const int probability, const bool hasNext, int *const writingPos); - - static bool writeBigramEntry(BufferWithExtendableBuffer *const buffer, const BigramFlags flags, - const int targetOffset, int *const writingPos); - private: DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadWriteUtils); @@ -83,11 +55,6 @@ private: static const BigramFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE; static const BigramFlags FLAG_ATTRIBUTE_HAS_NEXT; static const BigramFlags MASK_ATTRIBUTE_PROBABILITY; - static const int ATTRIBUTE_ADDRESS_SHIFT; - - // Returns true if the bigram entry is valid and put entry flags into out*. - static bool createAndGetBigramFlags(const int entryPos, const int targetPos, - const int probability, const bool hasNext, BigramFlags *const outBigramFlags); static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) { return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0; @@ -95,8 +62,6 @@ private: static int getBigramAddressAndAdvancePosition(const uint8_t *const bigramsBuf, const BigramFlags flags, int *const pos); - - static int getBigramTargetOffset(const int targetPtNodePos, const int entryPos); }; } // namespace latinime #endif // LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp deleted file mode 100644 index b1170e251..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp +++ /dev/null @@ -1,391 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" - -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" - -namespace latinime { - -const int DynamicBigramListPolicy::CONTINUING_BIGRAM_LINK_COUNT_LIMIT = 10000; -const int DynamicBigramListPolicy::BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT = 100000; - -void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability, - bool *const outHasNext, int *const bigramEntryPos) const { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramEntryPos); - const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - *bigramEntryPos -= mBuffer->getOriginalBufferSize(); - } - BigramListReadWriteUtils::BigramFlags bigramFlags; - int originalBigramPos; - BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(buffer, &bigramFlags, - &originalBigramPos, bigramEntryPos); - if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { - originalBigramPos += mBuffer->getOriginalBufferSize(); - } - *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags); - *outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags); - if (mIsDecayingDict && !ForgettingCurveUtils::isValidEncodedProbability(*outProbability)) { - // This bigram is too weak to output. - *outBigramPos = NOT_A_DICT_POS; - } else { - *outBigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); - } - if (usesAdditionalBuffer) { - *bigramEntryPos += mBuffer->getOriginalBufferSize(); - } -} - -void DynamicBigramListPolicy::skipAllBigrams(int *const bigramListPos) const { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); - const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - *bigramListPos -= mBuffer->getOriginalBufferSize(); - } - BigramListReadWriteUtils::skipExistingBigrams(buffer, bigramListPos); - if (usesAdditionalBuffer) { - *bigramListPos += mBuffer->getOriginalBufferSize(); - } -} - -bool DynamicBigramListPolicy::copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, - int *const fromPos, int *const toPos, int *const outBigramsCount) const { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos); - if (usesAdditionalBuffer) { - *fromPos -= mBuffer->getOriginalBufferSize(); - } - *outBigramsCount = 0; - BigramListReadWriteUtils::BigramFlags bigramFlags; - int bigramEntryCount = 0; - int lastWrittenEntryPos = NOT_A_DICT_POS; - do { - if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { - AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", - bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); - ASSERT(false); - return false; - } - // The buffer address can be changed after calling buffer writing methods. - int originalBigramPos; - BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( - mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, - fromPos); - if (originalBigramPos == NOT_A_DICT_POS) { - // skip invalid bigram entry. - continue; - } - if (usesAdditionalBuffer) { - originalBigramPos += mBuffer->getOriginalBufferSize(); - } - const int bigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); - if (bigramPos == NOT_A_DICT_POS) { - // Target PtNode has been invalidated. - continue; - } - lastWrittenEntryPos = *toPos; - if (!BigramListReadWriteUtils::createAndWriteBigramEntry(bufferToWrite, bigramPos, - BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags), - BigramListReadWriteUtils::hasNext(bigramFlags), toPos)) { - return false; - } - (*outBigramsCount)++; - } while(BigramListReadWriteUtils::hasNext(bigramFlags)); - // Makes the last entry the terminal of the list. Updates the flags. - if (lastWrittenEntryPos != NOT_A_DICT_POS) { - if (!BigramListReadWriteUtils::setHasNextFlag(bufferToWrite, false /* hasNext */, - lastWrittenEntryPos)) { - return false; - } - } - if (usesAdditionalBuffer) { - *fromPos += mBuffer->getOriginalBufferSize(); - } - return true; -} - -// Finding useless bigram entries and remove them. Bigram entry is useless when the target PtNode -// has been deleted or is not a valid terminal. -bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries( - int *const bigramListPos, int *const outValidBigramEntryCount) { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); - if (usesAdditionalBuffer) { - *bigramListPos -= mBuffer->getOriginalBufferSize(); - } - DynamicPatriciaTrieNodeReader nodeReader(mBuffer, this /* bigramsPolicy */, mShortcutPolicy); - BigramListReadWriteUtils::BigramFlags bigramFlags; - int bigramEntryCount = 0; - do { - if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { - AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", - bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); - ASSERT(false); - return false; - } - int bigramEntryPos = *bigramListPos; - int originalBigramPos; - // The buffer address can be changed after calling buffer writing methods. - BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( - mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, - bigramListPos); - if (usesAdditionalBuffer) { - bigramEntryPos += mBuffer->getOriginalBufferSize(); - } - if (originalBigramPos == NOT_A_DICT_POS) { - // This entry has already been removed. - continue; - } - if (usesAdditionalBuffer) { - originalBigramPos += mBuffer->getOriginalBufferSize(); - } - const int bigramTargetNodePos = - followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); - nodeReader.fetchNodeInfoInBufferFromPtNodePos(bigramTargetNodePos); - if (nodeReader.isDeleted() || !nodeReader.isTerminal() - || bigramTargetNodePos == NOT_A_DICT_POS) { - // The target is no longer valid terminal. Invalidate the current bigram entry. - if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, - NOT_A_DICT_POS /* targetPtNodePos */, &bigramEntryPos)) { - return false; - } - continue; - } - bool isRemoved = false; - if (!updateProbabilityForDecay(bigramFlags, bigramTargetNodePos, &bigramEntryPos, - &isRemoved)) { - return false; - } - if (!isRemoved) { - (*outValidBigramEntryCount) += 1; - } - } while(BigramListReadWriteUtils::hasNext(bigramFlags)); - return true; -} - -// Updates bigram target PtNode positions in the list after the placing step in GC. -bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos, - const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const - ptNodePositionRelocationMap, int *const outBigramEntryCount) { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); - if (usesAdditionalBuffer) { - *bigramListPos -= mBuffer->getOriginalBufferSize(); - } - BigramListReadWriteUtils::BigramFlags bigramFlags; - int bigramEntryCount = 0; - do { - if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { - AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", - bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); - ASSERT(false); - return false; - } - int bigramEntryPos = *bigramListPos; - if (usesAdditionalBuffer) { - bigramEntryPos += mBuffer->getOriginalBufferSize(); - } - int bigramTargetPtNodePos; - // The buffer address can be changed after calling buffer writing methods. - BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( - mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &bigramTargetPtNodePos, - bigramListPos); - if (bigramTargetPtNodePos == NOT_A_DICT_POS) { - continue; - } - if (usesAdditionalBuffer) { - bigramTargetPtNodePos += mBuffer->getOriginalBufferSize(); - } - - DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::const_iterator it = - ptNodePositionRelocationMap->find(bigramTargetPtNodePos); - if (it != ptNodePositionRelocationMap->end()) { - bigramTargetPtNodePos = it->second; - } else { - bigramTargetPtNodePos = NOT_A_DICT_POS; - } - if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, - bigramTargetPtNodePos, &bigramEntryPos)) { - return false; - } - } while(BigramListReadWriteUtils::hasNext(bigramFlags)); - (*outBigramEntryCount) = bigramEntryCount; - return true; -} - -bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos, - const int probability, int *const bigramListPos, bool *const outAddedNewBigram) { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); - if (usesAdditionalBuffer) { - *bigramListPos -= mBuffer->getOriginalBufferSize(); - } - BigramListReadWriteUtils::BigramFlags bigramFlags; - int bigramEntryCount = 0; - do { - if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { - AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", - bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); - ASSERT(false); - return false; - } - int entryPos = *bigramListPos; - if (usesAdditionalBuffer) { - entryPos += mBuffer->getOriginalBufferSize(); - } - int originalBigramPos; - // The buffer address can be changed after calling buffer writing methods. - BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( - mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, - bigramListPos); - if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { - originalBigramPos += mBuffer->getOriginalBufferSize(); - } - if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) { - // Update this bigram entry. - *outAddedNewBigram = false; - const int originalProbability = BigramListReadWriteUtils::getProbabilityFromFlags( - bigramFlags); - const int probabilityToWrite = mIsDecayingDict ? - ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, - probability) : probability; - const BigramListReadWriteUtils::BigramFlags updatedFlags = - BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, - probabilityToWrite); - return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags, - originalBigramPos, &entryPos); - } - if (BigramListReadWriteUtils::hasNext(bigramFlags)) { - continue; - } - // The current last entry is found. - // First, update the flags of the last entry. - if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) { - *outAddedNewBigram = false; - return false; - } - if (usesAdditionalBuffer) { - *bigramListPos += mBuffer->getOriginalBufferSize(); - } - // Then, add a new entry after the last entry. - *outAddedNewBigram = true; - return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos); - } while(BigramListReadWriteUtils::hasNext(bigramFlags)); - // We return directly from the while loop. - ASSERT(false); - return false; -} - -bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, const int probability, - int *const writingPos) { - // hasNext is false because we are adding a new bigram entry at the end of the bigram list. - const int probabilityToWrite = mIsDecayingDict ? - ForgettingCurveUtils::getUpdatedEncodedProbability(NOT_A_PROBABILITY, probability) : - probability; - return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos, - probabilityToWrite, false /* hasNext */, writingPos); -} - -bool DynamicBigramListPolicy::removeBigram(const int bigramListPos, const int bigramTargetPos) { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(bigramListPos); - int pos = bigramListPos; - if (usesAdditionalBuffer) { - pos -= mBuffer->getOriginalBufferSize(); - } - BigramListReadWriteUtils::BigramFlags bigramFlags; - int bigramEntryCount = 0; - do { - if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { - AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", - bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); - ASSERT(false); - return false; - } - int bigramEntryPos = pos; - int originalBigramPos; - // The buffer address can be changed after calling buffer writing methods. - BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( - mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, &pos); - if (usesAdditionalBuffer) { - bigramEntryPos += mBuffer->getOriginalBufferSize(); - } - if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { - originalBigramPos += mBuffer->getOriginalBufferSize(); - } - const int bigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); - if (bigramPos != bigramTargetPos) { - continue; - } - // Target entry is found. Write an invalid target position to mark the bigram invalid. - return BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, - NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos); - } while(BigramListReadWriteUtils::hasNext(bigramFlags)); - return false; -} - -int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos( - const int originalBigramPos) const { - if (originalBigramPos == NOT_A_DICT_POS) { - return NOT_A_DICT_POS; - } - int currentPos = originalBigramPos; - DynamicPatriciaTrieNodeReader nodeReader(mBuffer, this /* bigramsPolicy */, mShortcutPolicy); - nodeReader.fetchNodeInfoInBufferFromPtNodePos(currentPos); - int bigramLinkCount = 0; - while (nodeReader.getBigramLinkedNodePos() != NOT_A_DICT_POS) { - currentPos = nodeReader.getBigramLinkedNodePos(); - nodeReader.fetchNodeInfoInBufferFromPtNodePos(currentPos); - bigramLinkCount++; - if (bigramLinkCount > CONTINUING_BIGRAM_LINK_COUNT_LIMIT) { - AKLOGE("Bigram link is invalid. start position: %d", originalBigramPos); - ASSERT(false); - return NOT_A_DICT_POS; - } - } - return currentPos; -} - -bool DynamicBigramListPolicy::updateProbabilityForDecay( - const BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos, - int *const bigramEntryPos, bool *const outRemoved) const { - *outRemoved = false; - if (mIsDecayingDict) { - // Update bigram probability for decaying. - const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave( - BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags), mHeaderPolicy); - if (ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { - // Write new probability. - const BigramListReadWriteUtils::BigramFlags updatedBigramFlags = - BigramListReadWriteUtils::setProbabilityInFlags( - bigramFlags, newProbability); - if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedBigramFlags, - targetPtNodePos, bigramEntryPos)) { - return false; - } - } else { - // Remove current bigram entry. - *outRemoved = true; - if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, - NOT_A_DICT_POS /* targetPtNodePos */, bigramEntryPos)) { - return false; - } - } - } - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h deleted file mode 100644 index 0504b59d5..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H -#define LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H - -#include <stdint.h> - -#include "defines.h" -#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" - -namespace latinime { - -class BufferWithExtendableBuffer; -class DictionaryHeaderStructurePolicy; -class DictionaryShortcutsStructurePolicy; - -/* - * This is a dynamic version of BigramListPolicy and supports an additional buffer. - */ -class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { - public: - DynamicBigramListPolicy(const DictionaryHeaderStructurePolicy *const headerPolicy, - BufferWithExtendableBuffer *const buffer, - const DictionaryShortcutsStructurePolicy *const shortcutPolicy, - const bool isDecayingDict) - : mHeaderPolicy(headerPolicy), mBuffer(buffer), mShortcutPolicy(shortcutPolicy), - mIsDecayingDict(isDecayingDict) {} - - ~DynamicBigramListPolicy() {} - - void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, - int *const bigramEntryPos) const; - - void skipAllBigrams(int *const bigramListPos) const; - - // Copy bigrams from the bigram list that starts at fromPos in mBuffer to toPos in - // bufferToWrite and advance these positions after bigram lists. This method skips invalid - // bigram entries and write the valid bigram entry count to outBigramsCount. - bool copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, int *const fromPos, - int *const toPos, int *const outBigramsCount) const; - - bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos, - int *const outBigramEntryCount); - - bool updateAllBigramTargetPtNodePositions(int *const bigramListPos, - const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const - ptNodePositionRelocationMap, int *const outValidBigramEntryCount); - - bool addNewBigramEntryToBigramList(const int bigramTargetPos, const int probability, - int *const bigramListPos, bool *const outAddedNewBigram); - - bool writeNewBigramEntry(const int bigramTargetPos, const int probability, - int *const writingPos); - - // Return whether or not targetBigramPos is found. - bool removeBigram(const int bigramListPos, const int bigramTargetPos); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicBigramListPolicy); - - static const int CONTINUING_BIGRAM_LINK_COUNT_LIMIT; - static const int BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT; - - const DictionaryHeaderStructurePolicy *const mHeaderPolicy; - BufferWithExtendableBuffer *const mBuffer; - const DictionaryShortcutsStructurePolicy *const mShortcutPolicy; - const bool mIsDecayingDict; - - // Follow bigram link and return the position of bigram target PtNode that is currently valid. - int followBigramLinkAndGetCurrentBigramPtNodePos(const int originalBigramPos) const; - - bool updateProbabilityForDecay(const BigramListReadWriteUtils::BigramFlags bigramFlags, - const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const; -}; -} // namespace latinime -#endif // LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp new file mode 100644 index 000000000..cd2243025 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp @@ -0,0 +1,240 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" + +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { + +void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability, + bool *const outHasNext, int *const bigramEntryPos) const { + const BigramEntry bigramEntry = + mBigramDictContent->getBigramEntryAndAdvancePosition(bigramEntryPos); + if (outBigramPos) { + // Lookup target PtNode position. + *outBigramPos = mTerminalPositionLookupTable->getTerminalPtNodePosition( + bigramEntry.getTargetTerminalId()); + } + if (outProbability) { + if (bigramEntry.hasHistoricalInfo()) { + *outProbability = + ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo()); + } else { + *outProbability = bigramEntry.getProbability(); + } + } + if (outHasNext) { + *outHasNext = bigramEntry.hasNext(); + } +} + +bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId, + const int newProbability, const int timestamp, bool *const outAddedNewEntry) { + if (outAddedNewEntry) { + *outAddedNewEntry = false; + } + const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (bigramListPos == NOT_A_DICT_POS) { + // Updating PtNode doesn't have a bigram list. + // Create new bigram list. + if (!mBigramDictContent->createNewBigramList(terminalId)) { + return false; + } + const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, + newTargetTerminalId); + const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry, + newProbability, timestamp); + // Write an entry. + const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) { + return false; + } + if (outAddedNewEntry) { + *outAddedNewEntry = true; + } + return true; + } + + const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos); + if (entryPosToUpdate != NOT_A_DICT_POS) { + // Overwrite existing entry. + const BigramEntry originalBigramEntry = + mBigramDictContent->getBigramEntry(entryPosToUpdate); + if (!originalBigramEntry.isValid()) { + // Reuse invalid entry. + if (outAddedNewEntry) { + *outAddedNewEntry = true; + } + } + const BigramEntry updatedBigramEntry = + originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId); + const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( + &updatedBigramEntry, newProbability, timestamp); + return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate); + } + + // Add new entry to the bigram list. + // Create new bigram list. + if (!mBigramDictContent->createNewBigramList(terminalId)) { + return false; + } + // Write new entry at a head position of the bigram list. + int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); + const BigramEntry newBigramEntry(true /* hasNext */, NOT_A_PROBABILITY, newTargetTerminalId); + const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( + &newBigramEntry, newProbability, timestamp); + if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(&bigramEntryToWrite, &writingPos)) { + return false; + } + if (outAddedNewEntry) { + *outAddedNewEntry = true; + } + // Append existing entries by copying. + return mBigramDictContent->copyBigramList(bigramListPos, writingPos); +} + +bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) { + const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (bigramListPos == NOT_A_DICT_POS) { + // Bigram list doesn't exist. + return false; + } + const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos); + if (entryPosToUpdate == NOT_A_DICT_POS) { + // Bigram entry doesn't exist. + return false; + } + const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate); + if (targetTerminalId != bigramEntry.getTargetTerminalId()) { + // Bigram entry doesn't exist. + return false; + } + // Remove bigram entry by marking it as invalid entry and overwriting the original entry. + const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); + return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPosToUpdate); +} + +bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId, + int *const outBigramCount) { + const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (bigramListPos == NOT_A_DICT_POS) { + // Bigram list doesn't exist. + return true; + } + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const int entryPos = readingPos; + const BigramEntry bigramEntry = + mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (!bigramEntry.isValid()) { + continue; + } + const int targetPtNodePos = mTerminalPositionLookupTable->getTerminalPtNodePosition( + bigramEntry.getTargetTerminalId()); + if (targetPtNodePos == NOT_A_DICT_POS) { + // Invalidate bigram entry. + const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); + if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { + return false; + } + } else if (bigramEntry.hasHistoricalInfo()) { + const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( + bigramEntry.getHistoricalInfo()); + if (ForgettingCurveUtils::needsToKeep(&historicalInfo)) { + const BigramEntry updatedBigramEntry = + bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo); + if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { + return false; + } + *outBigramCount += 1; + } else { + // Remove entry. + const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); + if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { + return false; + } + } + } else { + *outBigramCount += 1; + } + } + return true; +} + +int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) { + const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (bigramListPos == NOT_A_DICT_POS) { + // Bigram list doesn't exist. + return 0; + } + int bigramCount = 0; + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const BigramEntry bigramEntry = + mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (bigramEntry.isValid()) { + bigramCount++; + } + } + return bigramCount; +} + +int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, + const int bigramListPos) const { + bool hasNext = true; + int invalidEntryPos = NOT_A_DICT_POS; + int readingPos = bigramListPos; + while (hasNext) { + const int entryPos = readingPos; + const BigramEntry bigramEntry = + mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (bigramEntry.getTargetTerminalId() == targetTerminalIdToFind) { + // Entry with same target is found. + return entryPos; + } else if (!bigramEntry.isValid()) { + // Invalid entry that can be reused is found. + invalidEntryPos = entryPos; + } + } + return invalidEntryPos; +} + +const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom( + const BigramEntry *const originalBigramEntry, const int newProbability, + const int timestamp) const { + // TODO: Consolidate historical info and probability. + if (mHeaderPolicy->hasHistoricalInfoOfWords()) { + const HistoricalInfo updatedHistoricalInfo = + ForgettingCurveUtils::createUpdatedHistoricalInfo( + originalBigramEntry->getHistoricalInfo(), newProbability, timestamp); + return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo); + } else { + return originalBigramEntry->updateProbabilityAndGetEntry(newProbability); + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h new file mode 100644 index 000000000..5b6c5a173 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_BIGRAM_LIST_POLICY_H +#define LATINIME_VER4_BIGRAM_LIST_POLICY_H + +#include "defines.h" +#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h" + +namespace latinime { + +class BigramDictContent; +class HeaderPolicy; +class TerminalPositionLookupTable; + +class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { + public: + Ver4BigramListPolicy(BigramDictContent *const bigramDictContent, + const TerminalPositionLookupTable *const terminalPositionLookupTable, + const HeaderPolicy *const headerPolicy) + : mBigramDictContent(bigramDictContent), + mTerminalPositionLookupTable(terminalPositionLookupTable), + mHeaderPolicy(headerPolicy) {} + + void getNextBigram(int *const outBigramPos, int *const outProbability, + bool *const outHasNext, int *const bigramEntryPos) const; + + void skipAllBigrams(int *const pos) const { + // Do nothing because we don't need to skip bigram lists in ver4 dictionaries. + } + + bool addNewEntry(const int terminalId, const int newTargetTerminalId, const int newProbability, + const int timestamp, bool *const outAddedNewEntry); + + bool removeEntry(const int terminalId, const int targetTerminalId); + + bool updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId, + int *const outBigramCount); + + int getBigramEntryConut(const int terminalId); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy); + + int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos) const; + + const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry, + const int newProbability, const int timestamp) const; + + BigramDictContent *const mBigramDictContent; + const TerminalPositionLookupTable *const mTerminalPositionLookupTable; + const HeaderPolicy *const mHeaderPolicy; +}; +} // namespace latinime +#endif /* LATINIME_VER4_BIGRAM_LIST_POLICY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp deleted file mode 100644 index ff80dd2f6..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h" - -#include <stdint.h> - -#include "defines.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h" -#include "suggest/policyimpl/dictionary/patricia_trie_policy.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" - -namespace latinime { - -/* static */ DictionaryStructureWithBufferPolicy *DictionaryStructureWithBufferPolicyFactory - ::newDictionaryStructureWithBufferPolicy(const char *const path, const int bufOffset, - const int size, const bool isUpdatable) { - // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of - // impl classes of DictionaryStructureWithBufferPolicy. - const MmappedBuffer *const mmapedBuffer = MmappedBuffer::openBuffer(path, bufOffset, size, - isUpdatable); - if (!mmapedBuffer) { - return 0; - } - switch (FormatUtils::detectFormatVersion(mmapedBuffer->getBuffer(), - mmapedBuffer->getBufferSize())) { - case FormatUtils::VERSION_2: - return new PatriciaTriePolicy(mmapedBuffer); - case FormatUtils::VERSION_3: - return new DynamicPatriciaTriePolicy(mmapedBuffer); - default: - AKLOGE("DICT: dictionary format is unknown, bad magic number"); - delete mmapedBuffer; - ASSERT(false); - return 0; - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp deleted file mode 100644 index 5724c5d88..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h" - -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" - -namespace latinime { - -bool DynamicPatriciaTrieGcEventListeners - ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, - const int *const nodeCodePoints) { - // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless - // children. - bool isUselessPtNode = !node->isTerminal(); - if (node->isTerminal() && mIsDecayingDict) { - const int newProbability = - ForgettingCurveUtils::getEncodedProbabilityToSave(node->getProbability(), - mHeaderPolicy); - int writingPos = node->getProbabilityFieldPos(); - // Update probability. - if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition( - mBuffer, newProbability, &writingPos)) { - return false; - } - if (!ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { - isUselessPtNode = true; - } - } - if (mChildrenValue > 0) { - isUselessPtNode = false; - } else if (node->isTerminal()) { - // Remove children as all children are useless. - int writingPos = node->getChildrenPosFieldPos(); - if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition( - mBuffer, NOT_A_DICT_POS /* childrenPosition */, &writingPos)) { - return false; - } - } - if (isUselessPtNode) { - // Current PtNode is no longer needed. Mark it as deleted. - if (!mWritingHelper->markNodeAsDeleted(node)) { - return false; - } - } else { - mValueStack.back() += 1; - if (node->isTerminal()) { - mValidUnigramCount += 1; - } - } - return true; -} - -bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability - ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, - const int *const nodeCodePoints) { - if (!node->isDeleted()) { - int pos = node->getBigramsPos(); - if (pos != NOT_A_DICT_POS) { - int bigramEntryCount = 0; - if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos, - &bigramEntryCount)) { - return false; - } - mValidBigramEntryCount += bigramEntryCount; - } - } - return true; -} - -// Writes dummy PtNode array size when the head of PtNode array is read. -bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer - ::onDescend(const int ptNodeArrayPos) { - mValidPtNodeCount = 0; - int writingPos = mBufferToWrite->getTailPosition(); - mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.insert( - DynamicPatriciaTrieWritingHelper::PtNodeArrayPositionRelocationMap::value_type( - ptNodeArrayPos, writingPos)); - // Writes dummy PtNode array size because arrays can have a forward link or needles PtNodes. - // This field will be updated later in onReadingPtNodeArrayTail() with actual PtNode count. - mPtNodeArraySizeFieldPos = writingPos; - return DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition( - mBufferToWrite, 0 /* arraySize */, &writingPos); -} - -// Write PtNode array terminal and actual PtNode array size. -bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer - ::onReadingPtNodeArrayTail() { - int writingPos = mBufferToWrite->getTailPosition(); - // Write PtNode array terminal. - if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition( - mBufferToWrite, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { - return false; - } - // Write actual PtNode array size. - if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition( - mBufferToWrite, mValidPtNodeCount, &mPtNodeArraySizeFieldPos)) { - return false; - } - return true; -} - -// Write valid PtNode to buffer and memorize mapping from the old position to the new position. -bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer - ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, - const int *const nodeCodePoints) { - if (node->isDeleted()) { - // Current PtNode is not written in new buffer because it has been deleted. - mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( - DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::value_type( - node->getHeadPos(), NOT_A_DICT_POS)); - return true; - } - int writingPos = mBufferToWrite->getTailPosition(); - mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( - DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::value_type( - node->getHeadPos(), writingPos)); - mValidPtNodeCount++; - // Writes current PtNode. - return mWritingHelper->writePtNodeToBufferByCopyingPtNodeInfo(mBufferToWrite, node, - node->getParentPos(), nodeCodePoints, node->getCodePointCount(), - node->getProbability(), &writingPos); -} - -bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields - ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, - const int *const nodeCodePoints) { - // Updates parent position. - int parentPos = node->getParentPos(); - if (parentPos != NOT_A_DICT_POS) { - DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::const_iterator it = - mDictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos); - if (it != mDictPositionRelocationMap->mPtNodePositionRelocationMap.end()) { - parentPos = it->second; - } - } - int writingPos = node->getHeadPos() + DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE; - // Write updated parent offset. - if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(mBufferToWrite, - parentPos, node->getHeadPos(), &writingPos)) { - return false; - } - - // Updates children position. - int childrenPos = node->getChildrenPos(); - if (childrenPos != NOT_A_DICT_POS) { - DynamicPatriciaTrieWritingHelper::PtNodeArrayPositionRelocationMap::const_iterator it = - mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos); - if (it != mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) { - childrenPos = it->second; - } - } - writingPos = node->getChildrenPosFieldPos(); - if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(mBufferToWrite, - childrenPos, &writingPos)) { - return false; - } - - // Updates bigram target PtNode positions in the bigram list. - int bigramsPos = node->getBigramsPos(); - if (bigramsPos != NOT_A_DICT_POS) { - int bigramEntryCount; - if (!mBigramPolicy->updateAllBigramTargetPtNodePositions(&bigramsPos, - &mDictPositionRelocationMap->mPtNodePositionRelocationMap, &bigramEntryCount)) { - return false; - } - mBigramCount += bigramEntryCount; - } - if (node->isTerminal()) { - mUnigramCount++; - } - - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp deleted file mode 100644 index 2fa3111d3..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" - -#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -void DynamicPatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode( - const int ptNodePos, const int maxCodePointCount, int *const outCodePoints) { - if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) { - // Reading invalid position because of bug or broken dictionary. - AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d", - ptNodePos, mBuffer->getTailPosition()); - ASSERT(false); - invalidatePtNodeInfo(); - return; - } - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos); - const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); - int pos = ptNodePos; - mHeadPos = ptNodePos; - if (usesAdditionalBuffer) { - pos -= mBuffer->getOriginalBufferSize(); - } - mFlags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); - const int parentPosOffset = - DynamicPatriciaTrieReadingUtils::getParentPtNodePosOffsetAndAdvancePosition(dictBuf, - &pos); - mParentPos = DynamicPatriciaTrieReadingUtils::getParentPtNodePos(parentPosOffset, mHeadPos); - if (outCodePoints != 0) { - mCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( - dictBuf, mFlags, maxCodePointCount, outCodePoints, &pos); - } else { - mCodePointCount = PatriciaTrieReadingUtils::skipCharacters( - dictBuf, mFlags, MAX_WORD_LENGTH, &pos); - } - if (isTerminal()) { - mProbabilityFieldPos = pos; - if (usesAdditionalBuffer) { - mProbabilityFieldPos += mBuffer->getOriginalBufferSize(); - } - mProbability = PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictBuf, &pos); - } else { - mProbabilityFieldPos = NOT_A_DICT_POS; - mProbability = NOT_A_PROBABILITY; - } - mChildrenPosFieldPos = pos; - if (usesAdditionalBuffer) { - mChildrenPosFieldPos += mBuffer->getOriginalBufferSize(); - } - mChildrenPos = DynamicPatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( - dictBuf, &pos); - if (usesAdditionalBuffer && mChildrenPos != NOT_A_DICT_POS) { - mChildrenPos += mBuffer->getOriginalBufferSize(); - } - if (mSiblingPos == NOT_A_DICT_POS) { - if (DynamicPatriciaTrieReadingUtils::isMoved(mFlags)) { - mBigramLinkedNodePos = mChildrenPos; - } else { - mBigramLinkedNodePos = NOT_A_DICT_POS; - } - } - if (usesAdditionalBuffer) { - pos += mBuffer->getOriginalBufferSize(); - } - if (PatriciaTrieReadingUtils::hasShortcutTargets(mFlags)) { - mShortcutPos = pos; - mShortcutsPolicy->skipAllShortcuts(&pos); - } else { - mShortcutPos = NOT_A_DICT_POS; - } - if (PatriciaTrieReadingUtils::hasBigrams(mFlags)) { - mBigramPos = pos; - mBigramsPolicy->skipAllBigrams(&pos); - } else { - mBigramPos = NOT_A_DICT_POS; - } - // Update siblingPos if needed. - if (mSiblingPos == NOT_A_DICT_POS) { - // Sibling position is the tail position of current node. - mSiblingPos = pos; - } - // Read destination node if the read node is a moved node. - if (DynamicPatriciaTrieReadingUtils::isMoved(mFlags)) { - // The destination position is stored at the same place as the parent position. - fetchPtNodeInfoFromBufferAndProcessMovedPtNode(mParentPos, maxCodePointCount, - outCodePoints); - } -} - -void DynamicPatriciaTrieNodeReader::invalidatePtNodeInfo() { - mHeadPos = NOT_A_DICT_POS; - mFlags = 0; - mParentPos = NOT_A_DICT_POS; - mCodePointCount = 0; - mProbabilityFieldPos = NOT_A_DICT_POS; - mProbability = NOT_A_PROBABILITY; - mChildrenPosFieldPos = NOT_A_DICT_POS; - mChildrenPos = NOT_A_DICT_POS; - mBigramLinkedNodePos = NOT_A_DICT_POS; - mShortcutPos = NOT_A_DICT_POS; - mBigramPos = NOT_A_DICT_POS; - mSiblingPos = NOT_A_DICT_POS; -} - -} diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h deleted file mode 100644 index 3b36d425f..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_NODE_READER_H -#define LATINIME_DYNAMIC_PATRICIA_TRIE_NODE_READER_H - -#include <stdint.h> - -#include "defines.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" - -namespace latinime { - -class BufferWithExtendableBuffer; -class DictionaryBigramsStructurePolicy; -class DictionaryShortcutsStructurePolicy; - -/* - * This class is used for helping to read nodes of dynamic patricia trie. This class handles moved - * node and reads node attributes. - */ -class DynamicPatriciaTrieNodeReader { - public: - DynamicPatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer, - const DictionaryBigramsStructurePolicy *const bigramsPolicy, - const DictionaryShortcutsStructurePolicy *const shortcutsPolicy) - : mBuffer(buffer), mBigramsPolicy(bigramsPolicy), - mShortcutsPolicy(shortcutsPolicy), mHeadPos(NOT_A_DICT_POS), mFlags(0), - mParentPos(NOT_A_DICT_POS), mCodePointCount(0), mProbabilityFieldPos(NOT_A_DICT_POS), - mProbability(NOT_A_PROBABILITY), mChildrenPosFieldPos(NOT_A_DICT_POS), - mChildrenPos(NOT_A_DICT_POS), mBigramLinkedNodePos(NOT_A_DICT_POS), - mShortcutPos(NOT_A_DICT_POS), mBigramPos(NOT_A_DICT_POS), - mSiblingPos(NOT_A_DICT_POS) {} - - ~DynamicPatriciaTrieNodeReader() {} - - // Reads PtNode information from dictionary buffer and updates members with the information. - AK_FORCE_INLINE void fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) { - fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(ptNodePos , - 0 /* maxCodePointCount */, 0 /* outCodePoints */); - } - - AK_FORCE_INLINE void fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints( - const int ptNodePos, const int maxCodePointCount, int *const outCodePoints) { - mSiblingPos = NOT_A_DICT_POS; - mBigramLinkedNodePos = NOT_A_DICT_POS; - fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos, maxCodePointCount, outCodePoints); - } - - // HeadPos is different from NodePos when the current PtNode is a moved PtNode. - AK_FORCE_INLINE int getHeadPos() const { - return mHeadPos; - } - - // Flags - AK_FORCE_INLINE bool isDeleted() const { - return DynamicPatriciaTrieReadingUtils::isDeleted(mFlags); - } - - AK_FORCE_INLINE bool hasChildren() const { - return mChildrenPos != NOT_A_DICT_POS; - } - - AK_FORCE_INLINE bool isTerminal() const { - return PatriciaTrieReadingUtils::isTerminal(mFlags); - } - - AK_FORCE_INLINE bool isBlacklisted() const { - return PatriciaTrieReadingUtils::isBlacklisted(mFlags); - } - - AK_FORCE_INLINE bool isNotAWord() const { - return PatriciaTrieReadingUtils::isNotAWord(mFlags); - } - - // Parent node position - AK_FORCE_INLINE int getParentPos() const { - return mParentPos; - } - - // Number of code points - AK_FORCE_INLINE uint8_t getCodePointCount() const { - return mCodePointCount; - } - - // Probability - AK_FORCE_INLINE int getProbabilityFieldPos() const { - return mProbabilityFieldPos; - } - - AK_FORCE_INLINE int getProbability() const { - return mProbability; - } - - // Children PtNode array position - AK_FORCE_INLINE int getChildrenPosFieldPos() const { - return mChildrenPosFieldPos; - } - - AK_FORCE_INLINE int getChildrenPos() const { - return mChildrenPos; - } - - // Bigram linked node position. - AK_FORCE_INLINE int getBigramLinkedNodePos() const { - return mBigramLinkedNodePos; - } - - // Shortcutlist position - AK_FORCE_INLINE int getShortcutPos() const { - return mShortcutPos; - } - - // Bigrams position - AK_FORCE_INLINE int getBigramsPos() const { - return mBigramPos; - } - - // Sibling node position - AK_FORCE_INLINE int getSiblingNodePos() const { - return mSiblingPos; - } - - private: - DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieNodeReader); - - const BufferWithExtendableBuffer *const mBuffer; - const DictionaryBigramsStructurePolicy *const mBigramsPolicy; - const DictionaryShortcutsStructurePolicy *const mShortcutsPolicy; - int mHeadPos; - DynamicPatriciaTrieReadingUtils::NodeFlags mFlags; - int mParentPos; - uint8_t mCodePointCount; - int mProbabilityFieldPos; - int mProbability; - int mChildrenPosFieldPos; - int mChildrenPos; - int mBigramLinkedNodePos; - int mShortcutPos; - int mBigramPos; - int mSiblingPos; - - void fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos, - const int maxCodePointCount, int *const outCodePoints); - - void invalidatePtNodeInfo(); -}; -} // namespace latinime -#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp deleted file mode 100644 index 495b146c2..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h" - -#include <cstdio> -#include <cstring> -#include <ctime> - -#include "defines.h" -#include "suggest/core/dicnode/dic_node.h" -#include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" -#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" - -namespace latinime { - -// Note that these are corresponding definitions in Java side in BinaryDictionaryTests and -// BinaryDictionaryDecayingTests. -const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; -const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; -const char *const DynamicPatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; -const char *const DynamicPatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; -const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY = - "SET_NEEDS_TO_DECAY_FOR_TESTING"; -const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024; -const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = - DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024; - -void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, - DicNodeVector *const childDicNodes) const { - if (!dicNode->hasChildren()) { - return; - } - DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, - getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPos()); - const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); - while (!readingHelper.isEnd()) { - bool isTerminal = nodeReader->isTerminal() && !nodeReader->isDeleted(); - if (isTerminal && mHeaderPolicy.isDecayingDict()) { - // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose - // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a - // valid terminal DicNode. - isTerminal = getProbability(nodeReader->getProbability(), NOT_A_PROBABILITY) - != NOT_A_PROBABILITY; - } - childDicNodes->pushLeavingChild(dicNode, nodeReader->getHeadPos(), - nodeReader->getChildrenPos(), nodeReader->getProbability(), isTerminal, - nodeReader->hasChildren(), nodeReader->isBlacklisted() || nodeReader->isNotAWord(), - nodeReader->getCodePointCount(), readingHelper.getMergedNodeCodePoints()); - readingHelper.readNextSiblingNode(); - } -} - -int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( - const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, - int *const outUnigramProbability) const { - // This method traverses parent nodes from the terminal by following parent pointers; thus, - // node code points are stored in the buffer in the reverse order. - int reverseCodePoints[maxCodePointCount]; - DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, - getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - // First, read the terminal node and get its probability. - readingHelper.initWithPtNodePos(ptNodePos); - if (!readingHelper.isValidTerminalNode()) { - // Node at the ptNodePos is not a valid terminal node. - *outUnigramProbability = NOT_A_PROBABILITY; - return 0; - } - // Store terminal node probability. - *outUnigramProbability = readingHelper.getNodeReader()->getProbability(); - // Then, following parent node link to the dictionary root and fetch node code points. - while (!readingHelper.isEnd()) { - if (readingHelper.getTotalCodePointCount() > maxCodePointCount) { - // The ptNodePos is not a valid terminal node position in the dictionary. - *outUnigramProbability = NOT_A_PROBABILITY; - return 0; - } - // Store node code points to buffer in the reverse order. - readingHelper.fetchMergedNodeCodePointsInReverseOrder( - readingHelper.getPrevTotalCodePointCount(), reverseCodePoints); - // Follow parent node toward the root node. - readingHelper.readParentNode(); - } - if (readingHelper.isError()) { - // The node position or the dictionary is invalid. - *outUnigramProbability = NOT_A_PROBABILITY; - return 0; - } - // Reverse the stored code points to output them. - const int codePointCount = readingHelper.getTotalCodePointCount(); - for (int i = 0; i < codePointCount; ++i) { - outCodePoints[i] = reverseCodePoints[codePointCount - i - 1]; - } - return codePointCount; -} - -int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord, - const int length, const bool forceLowerCaseSearch) const { - int searchCodePoints[length]; - for (int i = 0; i < length; ++i) { - searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i]; - } - DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, - getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - readingHelper.initWithPtNodeArrayPos(getRootPosition()); - const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); - while (!readingHelper.isEnd()) { - const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount(); - if (readingHelper.getTotalCodePointCount() > length - || !readingHelper.isMatchedCodePoint(0 /* index */, - searchCodePoints[matchedCodePointCount])) { - // Current node has too many code points or its first code point is different from - // target code point. Skip this node and read the next sibling node. - readingHelper.readNextSiblingNode(); - continue; - } - // Check following merged node code points. - const int nodeCodePointCount = nodeReader->getCodePointCount(); - for (int j = 1; j < nodeCodePointCount; ++j) { - if (!readingHelper.isMatchedCodePoint( - j, searchCodePoints[matchedCodePointCount + j])) { - // Different code point is found. The given word is not included in the dictionary. - return NOT_A_DICT_POS; - } - } - // All characters are matched. - if (length == readingHelper.getTotalCodePointCount()) { - // Terminal position is found. - return nodeReader->getHeadPos(); - } - if (!nodeReader->hasChildren()) { - return NOT_A_DICT_POS; - } - // Advance to the children nodes. - readingHelper.readChildNode(); - } - // If we already traversed the tree further than the word is long, there means - // there was no match (or we would have found it). - return NOT_A_DICT_POS; -} - -int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability, - const int bigramProbability) const { - if (mHeaderPolicy.isDecayingDict()) { - return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability); - } else { - if (unigramProbability == NOT_A_PROBABILITY) { - return NOT_A_PROBABILITY; - } else if (bigramProbability == NOT_A_PROBABILITY) { - return ProbabilityUtils::backoff(unigramProbability); - } else { - return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, - bigramProbability); - } - } -} - -int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const { - if (ptNodePos == NOT_A_DICT_POS) { - return NOT_A_PROBABILITY; - } - DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, - getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); - if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) { - return NOT_A_PROBABILITY; - } - return getProbability(nodeReader.getProbability(), NOT_A_PROBABILITY); -} - -int DynamicPatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { - if (ptNodePos == NOT_A_DICT_POS) { - return NOT_A_DICT_POS; - } - DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, - getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); - if (nodeReader.isDeleted()) { - return NOT_A_DICT_POS; - } - return nodeReader.getShortcutPos(); -} - -int DynamicPatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { - if (ptNodePos == NOT_A_DICT_POS) { - return NOT_A_DICT_POS; - } - DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, - getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); - if (nodeReader.isDeleted()) { - return NOT_A_DICT_POS; - } - return nodeReader.getBigramsPos(); -} - -bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int length, - const int probability) { - if (!mBuffer->isUpdatable()) { - AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); - return false; - } - if (mBufferWithExtendableBuffer.getTailPosition() - >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { - AKLOGE("The dictionary is too large to dynamically update."); - return false; - } - DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, - getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - readingHelper.initWithPtNodeArrayPos(getRootPosition()); - DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); - bool addedNewUnigram = false; - if (writingHelper.addUnigramWord(&readingHelper, word, length, probability, - &addedNewUnigram)) { - if (addedNewUnigram) { - mUnigramCount++; - } - return true; - } else { - return false; - } -} - -bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0, - const int *const word1, const int length1, const int probability) { - if (!mBuffer->isUpdatable()) { - AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); - return false; - } - if (mBufferWithExtendableBuffer.getTailPosition() - >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { - AKLOGE("The dictionary is too large to dynamically update."); - return false; - } - const int word0Pos = getTerminalNodePositionOfWord(word0, length0, - false /* forceLowerCaseSearch */); - if (word0Pos == NOT_A_DICT_POS) { - return false; - } - const int word1Pos = getTerminalNodePositionOfWord(word1, length1, - false /* forceLowerCaseSearch */); - if (word1Pos == NOT_A_DICT_POS) { - return false; - } - DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); - bool addedNewBigram = false; - if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) { - if (addedNewBigram) { - mBigramCount++; - } - return true; - } else { - return false; - } -} - -bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0, - const int *const word1, const int length1) { - if (!mBuffer->isUpdatable()) { - AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); - return false; - } - if (mBufferWithExtendableBuffer.getTailPosition() - >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { - AKLOGE("The dictionary is too large to dynamically update."); - return false; - } - const int word0Pos = getTerminalNodePositionOfWord(word0, length0, - false /* forceLowerCaseSearch */); - if (word0Pos == NOT_A_DICT_POS) { - return false; - } - const int word1Pos = getTerminalNodePositionOfWord(word1, length1, - false /* forceLowerCaseSearch */); - if (word1Pos == NOT_A_DICT_POS) { - return false; - } - DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); - if (writingHelper.removeBigramWords(word0Pos, word1Pos)) { - mBigramCount--; - return true; - } else { - return false; - } -} - -void DynamicPatriciaTriePolicy::flush(const char *const filePath) { - if (!mBuffer->isUpdatable()) { - AKLOGI("Warning: flush() is called for non-updatable dictionary."); - return; - } - DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &mBigramListPolicy, &mShortcutListPolicy, false /* needsToDecay */); - writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount); -} - -void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { - if (!mBuffer->isUpdatable()) { - AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); - return; - } - const bool needsToDecay = mHeaderPolicy.isDecayingDict() - && (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( - false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy)); - DynamicBigramListPolicy bigramListPolicyForGC(&mHeaderPolicy, &mBufferWithExtendableBuffer, - &mShortcutListPolicy, needsToDecay); - DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &bigramListPolicyForGC, &mShortcutListPolicy, needsToDecay); - writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy); - mNeedsToDecayForTesting = false; -} - -bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { - if (!mBuffer->isUpdatable()) { - AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); - return false; - } - if (mBufferWithExtendableBuffer.isNearSizeLimit()) { - // Additional buffer size is near the limit. - return true; - } else if (mHeaderPolicy.getExtendedRegionSize() - + mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() - > MAX_DICT_EXTENDED_REGION_SIZE) { - // Total extended region size exceeds the limit. - return true; - } else if (mBufferWithExtendableBuffer.getTailPosition() - >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS - && mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() > 0) { - // Needs to reduce dictionary size. - return true; - } else if (mHeaderPolicy.isDecayingDict()) { - return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( - mindsBlockByGC, mUnigramCount, mBigramCount, &mHeaderPolicy); - } - return false; -} - -void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult, - const int maxResultLength) { - if (strncmp(query, UNIGRAM_COUNT_QUERY, maxResultLength) == 0) { - snprintf(outResult, maxResultLength, "%d", mUnigramCount); - } else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) { - snprintf(outResult, maxResultLength, "%d", mBigramCount); - } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, maxResultLength) == 0) { - snprintf(outResult, maxResultLength, "%d", - mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT : - static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE)); - } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, maxResultLength) == 0) { - snprintf(outResult, maxResultLength, "%d", - mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT : - static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE)); - } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, maxResultLength) == 0) { - mNeedsToDecayForTesting = true; - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h deleted file mode 100644 index be97ee1a5..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H -#define LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H - -#include "defines.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" - -namespace latinime { - -class DicNode; -class DicNodeVector; - -class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { - public: - DynamicPatriciaTriePolicy(const MmappedBuffer *const buffer) - : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer(), buffer->getBufferSize()), - mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(), - mBuffer->getBufferSize() - mHeaderPolicy.getSize()), - mShortcutListPolicy(&mBufferWithExtendableBuffer), - mBigramListPolicy(&mHeaderPolicy, &mBufferWithExtendableBuffer, &mShortcutListPolicy, - mHeaderPolicy.isDecayingDict()), - mUnigramCount(mHeaderPolicy.getUnigramCount()), - mBigramCount(mHeaderPolicy.getBigramCount()), mNeedsToDecayForTesting(false) {} - - ~DynamicPatriciaTriePolicy() { - delete mBuffer; - } - - AK_FORCE_INLINE int getRootPosition() const { - return 0; - } - - void createAndGetAllChildNodes(const DicNode *const dicNode, - DicNodeVector *const childDicNodes) const; - - int getCodePointsAndProbabilityAndReturnCodePointCount( - const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints, - int *const outUnigramProbability) const; - - int getTerminalNodePositionOfWord(const int *const inWord, - const int length, const bool forceLowerCaseSearch) const; - - int getProbability(const int unigramProbability, const int bigramProbability) const; - - int getUnigramProbabilityOfPtNode(const int ptNodePos) const; - - int getShortcutPositionOfPtNode(const int ptNodePos) const; - - int getBigramsPositionOfPtNode(const int ptNodePos) const; - - const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { - return &mHeaderPolicy; - } - - const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const { - return &mBigramListPolicy; - } - - const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const { - return &mShortcutListPolicy; - } - - bool addUnigramWord(const int *const word, const int length, const int probability); - - bool addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability); - - bool removeBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1); - - void flush(const char *const filePath); - - void flushWithGC(const char *const filePath); - - bool needsToRunGC(const bool mindsBlockByGC) const; - - void getProperty(const char *const query, char *const outResult, - const int maxResultLength); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy); - - static const char *const UNIGRAM_COUNT_QUERY; - static const char *const BIGRAM_COUNT_QUERY; - static const char *const MAX_UNIGRAM_COUNT_QUERY; - static const char *const MAX_BIGRAM_COUNT_QUERY; - static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY; - static const int MAX_DICT_EXTENDED_REGION_SIZE; - static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; - - const MmappedBuffer *const mBuffer; - const HeaderPolicy mHeaderPolicy; - BufferWithExtendableBuffer mBufferWithExtendableBuffer; - DynamicShortcutListPolicy mShortcutListPolicy; - DynamicBigramListPolicy mBigramListPolicy; - int mUnigramCount; - int mBigramCount; - int mNeedsToDecayForTesting; -}; -} // namespace latinime -#endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp deleted file mode 100644 index f108c219f..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" - -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -// To avoid infinite loop caused by invalid or malicious forward links. -const int DynamicPatriciaTrieReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; -const int DynamicPatriciaTrieReadingHelper::MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000; -const size_t DynamicPatriciaTrieReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH; - -// Visits all PtNodes in post-order depth first manner. -// For example, visits c -> b -> y -> x -> a for the following dictionary: -// a _ b _ c -// \ x _ y -bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPostorderDepthFirstManner( - TraversingEventListener *const listener) { - bool alreadyVisitedChildren = false; - // Descend from the root to the root PtNode array. - if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { - return false; - } - while (!isEnd()) { - if (!alreadyVisitedChildren) { - if (mNodeReader.hasChildren()) { - // Move to the first child. - if (!listener->onDescend(mNodeReader.getChildrenPos())) { - return false; - } - pushReadingStateToStack(); - readChildNode(); - } else { - alreadyVisitedChildren = true; - } - } else { - if (!listener->onVisitingPtNode(&mNodeReader, mMergedNodeCodePoints)) { - return false; - } - readNextSiblingNode(); - if (isEnd()) { - // All PtNodes in current linked PtNode arrays have been visited. - // Return to the parent. - if (!listener->onReadingPtNodeArrayTail()) { - return false; - } - if (mReadingStateStack.size() <= 0) { - break; - } - if (!listener->onAscend()) { - return false; - } - popReadingStateFromStack(); - alreadyVisitedChildren = true; - } else { - // Process sibling PtNode. - alreadyVisitedChildren = false; - } - } - } - // Ascend from the root PtNode array to the root. - if (!listener->onAscend()) { - return false; - } - return !isError(); -} - -// Visits all PtNodes in PtNode array level pre-order depth first manner, which is the same order -// that PtNodes are written in the dictionary buffer. -// For example, visits a -> b -> x -> c -> y for the following dictionary: -// a _ b _ c -// \ x _ y -bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( - TraversingEventListener *const listener) { - bool alreadyVisitedAllPtNodesInArray = false; - bool alreadyVisitedChildren = false; - // Descend from the root to the root PtNode array. - if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { - return false; - } - if (isEnd()) { - // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array. - if (!listener->onReadingPtNodeArrayTail()) { - return false; - } - } - pushReadingStateToStack(); - while (!isEnd()) { - if (alreadyVisitedAllPtNodesInArray) { - if (alreadyVisitedChildren) { - // Move to next sibling PtNode's children. - readNextSiblingNode(); - if (isEnd()) { - // Return to the parent PTNode. - if (!listener->onAscend()) { - return false; - } - if (mReadingStateStack.size() <= 0) { - break; - } - popReadingStateFromStack(); - alreadyVisitedChildren = true; - alreadyVisitedAllPtNodesInArray = true; - } else { - alreadyVisitedChildren = false; - } - } else { - if (mNodeReader.hasChildren()) { - // Move to the first child. - if (!listener->onDescend(mNodeReader.getChildrenPos())) { - return false; - } - pushReadingStateToStack(); - readChildNode(); - // Push state to return the head of PtNode array. - pushReadingStateToStack(); - alreadyVisitedAllPtNodesInArray = false; - alreadyVisitedChildren = false; - } else { - alreadyVisitedChildren = true; - } - } - } else { - if (!listener->onVisitingPtNode(&mNodeReader, mMergedNodeCodePoints)) { - return false; - } - readNextSiblingNode(); - if (isEnd()) { - if (!listener->onReadingPtNodeArrayTail()) { - return false; - } - // Return to the head of current PtNode array. - popReadingStateFromStack(); - alreadyVisitedAllPtNodesInArray = true; - } - } - } - popReadingStateFromStack(); - // Ascend from the root PtNode array to the root. - if (!listener->onAscend()) { - return false; - } - return !isError(); -} - -// Read node array size and process empty node arrays. Nodes and arrays are counted up in this -// method to avoid an infinite loop. -void DynamicPatriciaTrieReadingHelper::nextPtNodeArray() { - if (mReadingState.mPos < 0 || mReadingState.mPos >= mBuffer->getTailPosition()) { - // Reading invalid position because of a bug or a broken dictionary. - AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d", - mReadingState.mPos, mBuffer->getTailPosition()); - ASSERT(false); - mIsError = true; - mReadingState.mPos = NOT_A_DICT_POS; - return; - } - mReadingState.mPosOfLastPtNodeArrayHead = mReadingState.mPos; - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mReadingState.mPos); - const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - mReadingState.mPos -= mBuffer->getOriginalBufferSize(); - } - mReadingState.mNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( - dictBuf, &mReadingState.mPos); - if (usesAdditionalBuffer) { - mReadingState.mPos += mBuffer->getOriginalBufferSize(); - } - // Count up nodes and node arrays to avoid infinite loop. - mReadingState.mTotalNodeCount += mReadingState.mNodeCount; - mReadingState.mNodeArrayCount++; - if (mReadingState.mNodeCount < 0 - || mReadingState.mTotalNodeCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP - || mReadingState.mNodeArrayCount > MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) { - // Invalid dictionary. - AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d" - "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d", - mReadingState.mNodeCount, mReadingState.mTotalNodeCount, - MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, mReadingState.mNodeArrayCount, - MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP); - ASSERT(false); - mIsError = true; - mReadingState.mPos = NOT_A_DICT_POS; - return; - } - if (mReadingState.mNodeCount == 0) { - // Empty node array. Try following forward link. - followForwardLink(); - } -} - -// Follow the forward link and read the next node array if exists. -void DynamicPatriciaTrieReadingHelper::followForwardLink() { - if (mReadingState.mPos < 0 || mReadingState.mPos >= mBuffer->getTailPosition()) { - // Reading invalid position because of bug or broken dictionary. - AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d", - mReadingState.mPos, mBuffer->getTailPosition()); - ASSERT(false); - mIsError = true; - mReadingState.mPos = NOT_A_DICT_POS; - return; - } - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mReadingState.mPos); - const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - mReadingState.mPos -= mBuffer->getOriginalBufferSize(); - } - const int forwardLinkPosition = - DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(dictBuf, mReadingState.mPos); - if (usesAdditionalBuffer) { - mReadingState.mPos += mBuffer->getOriginalBufferSize(); - } - mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos; - if (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(forwardLinkPosition)) { - // Follow the forward link. - mReadingState.mPos += forwardLinkPosition; - nextPtNodeArray(); - } else { - // All node arrays have been read. - mReadingState.mPos = NOT_A_DICT_POS; - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h deleted file mode 100644 index a71c06971..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H -#define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H - -#include <cstddef> -#include <vector> - -#include "defines.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" - -namespace latinime { - -class BufferWithExtendableBuffer; -class DictionaryBigramsStructurePolicy; -class DictionaryShortcutsStructurePolicy; - -/* - * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and - * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop. - */ -class DynamicPatriciaTrieReadingHelper { - public: - class TraversingEventListener { - public: - virtual ~TraversingEventListener() {}; - - // Returns whether the event handling was succeeded or not. - virtual bool onAscend() = 0; - - // Returns whether the event handling was succeeded or not. - virtual bool onDescend(const int ptNodeArrayPos) = 0; - - // Returns whether the event handling was succeeded or not. - virtual bool onReadingPtNodeArrayTail() = 0; - - // Returns whether the event handling was succeeded or not. - virtual bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, - const int *const nodeCodePoints) = 0; - - protected: - TraversingEventListener() {}; - - private: - DISALLOW_COPY_AND_ASSIGN(TraversingEventListener); - }; - - DynamicPatriciaTrieReadingHelper(const BufferWithExtendableBuffer *const buffer, - const DictionaryBigramsStructurePolicy *const bigramsPolicy, - const DictionaryShortcutsStructurePolicy *const shortcutsPolicy) - : mIsError(false), mReadingState(), mBuffer(buffer), - mNodeReader(mBuffer, bigramsPolicy, shortcutsPolicy), mReadingStateStack() {} - - ~DynamicPatriciaTrieReadingHelper() {} - - AK_FORCE_INLINE bool isError() const { - return mIsError; - } - - AK_FORCE_INLINE bool isEnd() const { - return mReadingState.mPos == NOT_A_DICT_POS; - } - - // Initialize reading state with the head position of a PtNode array. - AK_FORCE_INLINE void initWithPtNodeArrayPos(const int ptNodeArrayPos) { - if (ptNodeArrayPos == NOT_A_DICT_POS) { - mReadingState.mPos = NOT_A_DICT_POS; - } else { - mIsError = false; - mReadingState.mPos = ptNodeArrayPos; - mReadingState.mPrevTotalCodePointCount = 0; - mReadingState.mTotalNodeCount = 0; - mReadingState.mNodeArrayCount = 0; - mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; - mReadingStateStack.clear(); - nextPtNodeArray(); - if (!isEnd()) { - fetchPtNodeInfo(); - } - } - } - - // Initialize reading state with the head position of a node. - AK_FORCE_INLINE void initWithPtNodePos(const int ptNodePos) { - if (ptNodePos == NOT_A_DICT_POS) { - mReadingState.mPos = NOT_A_DICT_POS; - } else { - mIsError = false; - mReadingState.mPos = ptNodePos; - mReadingState.mNodeCount = 1; - mReadingState.mPrevTotalCodePointCount = 0; - mReadingState.mTotalNodeCount = 1; - mReadingState.mNodeArrayCount = 1; - mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; - mReadingState.mPosOfLastPtNodeArrayHead = NOT_A_DICT_POS; - mReadingStateStack.clear(); - fetchPtNodeInfo(); - } - } - - AK_FORCE_INLINE const DynamicPatriciaTrieNodeReader* getNodeReader() const { - return &mNodeReader; - } - - AK_FORCE_INLINE bool isValidTerminalNode() const { - return !isEnd() && !mNodeReader.isDeleted() && mNodeReader.isTerminal(); - } - - AK_FORCE_INLINE bool isMatchedCodePoint(const int index, const int codePoint) const { - return mMergedNodeCodePoints[index] == codePoint; - } - - // Return code point count exclude the last read node's code points. - AK_FORCE_INLINE int getPrevTotalCodePointCount() const { - return mReadingState.mPrevTotalCodePointCount; - } - - // Return code point count include the last read node's code points. - AK_FORCE_INLINE int getTotalCodePointCount() const { - return mReadingState.mPrevTotalCodePointCount + mNodeReader.getCodePointCount(); - } - - AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder( - const int index, int *const outCodePoints) const { - const int nodeCodePointCount = mNodeReader.getCodePointCount(); - for (int i = 0; i < nodeCodePointCount; ++i) { - outCodePoints[index + i] = mMergedNodeCodePoints[nodeCodePointCount - 1 - i]; - } - } - - AK_FORCE_INLINE const int *getMergedNodeCodePoints() const { - return mMergedNodeCodePoints; - } - - AK_FORCE_INLINE void readNextSiblingNode() { - mReadingState.mNodeCount -= 1; - mReadingState.mPos = mNodeReader.getSiblingNodePos(); - if (mReadingState.mNodeCount <= 0) { - // All nodes in the current node array have been read. - followForwardLink(); - if (!isEnd()) { - fetchPtNodeInfo(); - } - } else { - fetchPtNodeInfo(); - } - } - - // Read the first child node of the current node. - AK_FORCE_INLINE void readChildNode() { - if (mNodeReader.hasChildren()) { - mReadingState.mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); - mReadingState.mTotalNodeCount = 0; - mReadingState.mNodeArrayCount = 0; - mReadingState.mPos = mNodeReader.getChildrenPos(); - mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; - // Read children node array. - nextPtNodeArray(); - if (!isEnd()) { - fetchPtNodeInfo(); - } - } else { - mReadingState.mPos = NOT_A_DICT_POS; - } - } - - // Read the parent node of the current node. - AK_FORCE_INLINE void readParentNode() { - if (mNodeReader.getParentPos() != NOT_A_DICT_POS) { - mReadingState.mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); - mReadingState.mTotalNodeCount = 1; - mReadingState.mNodeArrayCount = 1; - mReadingState.mNodeCount = 1; - mReadingState.mPos = mNodeReader.getParentPos(); - mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; - mReadingState.mPosOfLastPtNodeArrayHead = NOT_A_DICT_POS; - fetchPtNodeInfo(); - } else { - mReadingState.mPos = NOT_A_DICT_POS; - } - } - - AK_FORCE_INLINE int getPosOfLastForwardLinkField() const { - return mReadingState.mPosOfLastForwardLinkField; - } - - AK_FORCE_INLINE int getPosOfLastPtNodeArrayHead() const { - return mReadingState.mPosOfLastPtNodeArrayHead; - } - - AK_FORCE_INLINE void reloadCurrentPtNodeInfo() { - if (!isEnd()) { - fetchPtNodeInfo(); - } - } - - bool traverseAllPtNodesInPostorderDepthFirstManner(TraversingEventListener *const listener); - - bool traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( - TraversingEventListener *const listener); - - private: - DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieReadingHelper); - - class ReadingState { - public: - // Note that copy constructor and assignment operator are used for this class to use - // std::vector. - ReadingState() : mPos(NOT_A_DICT_POS), mNodeCount(0), mPrevTotalCodePointCount(0), - mTotalNodeCount(0), mNodeArrayCount(0), mPosOfLastForwardLinkField(NOT_A_DICT_POS), - mPosOfLastPtNodeArrayHead(NOT_A_DICT_POS) {} - - int mPos; - // Node count of a node array. - int mNodeCount; - int mPrevTotalCodePointCount; - int mTotalNodeCount; - int mNodeArrayCount; - int mPosOfLastForwardLinkField; - int mPosOfLastPtNodeArrayHead; - }; - - static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; - static const int MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP; - static const size_t MAX_READING_STATE_STACK_SIZE; - - // TODO: Introduce error code to track what caused the error. - bool mIsError; - ReadingState mReadingState; - const BufferWithExtendableBuffer *const mBuffer; - DynamicPatriciaTrieNodeReader mNodeReader; - int mMergedNodeCodePoints[MAX_WORD_LENGTH]; - std::vector<ReadingState> mReadingStateStack; - - void nextPtNodeArray(); - - void followForwardLink(); - - AK_FORCE_INLINE void fetchPtNodeInfo() { - mNodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(mReadingState.mPos, - MAX_WORD_LENGTH, mMergedNodeCodePoints); - if (mNodeReader.getCodePointCount() <= 0) { - // Empty node is not allowed. - mIsError = true; - mReadingState.mPos = NOT_A_DICT_POS; - } - } - - AK_FORCE_INLINE void pushReadingStateToStack() { - if (mReadingStateStack.size() > MAX_READING_STATE_STACK_SIZE) { - AKLOGI("Reading state stack overflow. Max size: %zd", MAX_READING_STATE_STACK_SIZE); - ASSERT(false); - mIsError = true; - mReadingState.mPos = NOT_A_DICT_POS; - } else { - mReadingStateStack.push_back(mReadingState); - } - } - - AK_FORCE_INLINE void popReadingStateFromStack() { - if (mReadingStateStack.empty()) { - mReadingState.mPos = NOT_A_DICT_POS; - } else { - mReadingState = mReadingStateStack.back(); - mReadingStateStack.pop_back(); - if (!isEnd()) { - fetchPtNodeInfo(); - } - } - } -}; -} // namespace latinime -#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp deleted file mode 100644 index 052558bfc..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp +++ /dev/null @@ -1,558 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" - -#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" -#include "utils/hash_map_compat.h" - -namespace latinime { - -const int DynamicPatriciaTrieWritingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; -// TODO: Make MAX_DICTIONARY_SIZE 8MB. -const size_t DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE = 2 * 1024 * 1024; - -bool DynamicPatriciaTrieWritingHelper::addUnigramWord( - DynamicPatriciaTrieReadingHelper *const readingHelper, - const int *const wordCodePoints, const int codePointCount, const int probability, - bool *const outAddedNewUnigram) { - int parentPos = NOT_A_DICT_POS; - while (!readingHelper->isEnd()) { - const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); - if (!readingHelper->isMatchedCodePoint(0 /* index */, - wordCodePoints[matchedCodePointCount])) { - // The first code point is different from target code point. Skip this node and read - // the next sibling node. - readingHelper->readNextSiblingNode(); - continue; - } - // Check following merged node code points. - const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper->getNodeReader(); - const int nodeCodePointCount = nodeReader->getCodePointCount(); - for (int j = 1; j < nodeCodePointCount; ++j) { - const int nextIndex = matchedCodePointCount + j; - if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(j, - wordCodePoints[matchedCodePointCount + j])) { - *outAddedNewUnigram = true; - return reallocatePtNodeAndAddNewPtNodes(nodeReader, - readingHelper->getMergedNodeCodePoints(), j, - getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, - probability), - wordCodePoints + matchedCodePointCount, - codePointCount - matchedCodePointCount); - } - } - // All characters are matched. - if (codePointCount == readingHelper->getTotalCodePointCount()) { - return setPtNodeProbability(nodeReader, probability, - readingHelper->getMergedNodeCodePoints(), outAddedNewUnigram); - } - if (!nodeReader->hasChildren()) { - *outAddedNewUnigram = true; - return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, - getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), - wordCodePoints + readingHelper->getTotalCodePointCount(), - codePointCount - readingHelper->getTotalCodePointCount()); - } - // Advance to the children nodes. - parentPos = nodeReader->getHeadPos(); - readingHelper->readChildNode(); - } - if (readingHelper->isError()) { - // The dictionary is invalid. - return false; - } - int pos = readingHelper->getPosOfLastForwardLinkField(); - *outAddedNewUnigram = true; - return createAndInsertNodeIntoPtNodeArray(parentPos, - wordCodePoints + readingHelper->getPrevTotalCodePointCount(), - codePointCount - readingHelper->getPrevTotalCodePointCount(), - getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), &pos); -} - -bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos, - const int probability, bool *const outAddedNewBigram) { - int mMergedNodeCodePoints[MAX_WORD_LENGTH]; - DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); - nodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH, - mMergedNodeCodePoints); - // Move node to add bigram entry. - const int newNodePos = mBuffer->getTailPosition(); - if (!markNodeAsMovedAndSetPosition(&nodeReader, newNodePos, newNodePos)) { - return false; - } - int writingPos = newNodePos; - // Write a new PtNode using original PtNode's info to the tail of the dictionary in mBuffer. - if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, &nodeReader, nodeReader.getParentPos(), - mMergedNodeCodePoints, nodeReader.getCodePointCount(), nodeReader.getProbability(), - &writingPos)) { - return false; - } - nodeReader.fetchNodeInfoInBufferFromPtNodePos(newNodePos); - if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) { - // Insert a new bigram entry into the existing bigram list. - int bigramListPos = nodeReader.getBigramsPos(); - return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos, - outAddedNewBigram); - } else { - // The PtNode doesn't have a bigram list. - *outAddedNewBigram = true; - // First, Write a bigram entry at the tail position of the PtNode. - if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) { - return false; - } - // Then, Mark as the PtNode having bigram list in the flags. - const PatriciaTrieReadingUtils::NodeFlags updatedFlags = - PatriciaTrieReadingUtils::createAndGetFlags(nodeReader.isBlacklisted(), - nodeReader.isNotAWord(), nodeReader.getProbability() != NOT_A_PROBABILITY, - nodeReader.getShortcutPos() != NOT_A_DICT_POS, true /* hasBigrams */, - nodeReader.getCodePointCount() > 1, CHILDREN_POSITION_FIELD_SIZE); - writingPos = newNodePos; - // Write updated flags into the moved PtNode's flags field. - return DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, updatedFlags, - &writingPos); - } -} - -// Remove a bigram relation from word0Pos to word1Pos. -bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, const int word1Pos) { - DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); - nodeReader.fetchNodeInfoInBufferFromPtNodePos(word0Pos); - if (nodeReader.getBigramsPos() == NOT_A_DICT_POS) { - return false; - } - return mBigramPolicy->removeBigram(nodeReader.getBigramsPos(), word1Pos); -} - -void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName, - const HeaderPolicy *const headerPolicy, const int unigramCount, const int bigramCount) { - BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); - const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + - mBuffer->getUsedAdditionalBufferSize(); - if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */, - false /* updatesLastDecayedTime */, unigramCount, bigramCount, extendedRegionSize)) { - return; - } - DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer); -} - -void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, - const char *const fileName, const HeaderPolicy *const headerPolicy) { - BufferWithExtendableBuffer newDictBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */, - MAX_DICTIONARY_SIZE); - int unigramCount = 0; - int bigramCount = 0; - if (mNeedsToDecay) { - ForgettingCurveUtils::sTimeKeeper.setCurrentTime(); - } - if (!runGC(rootPtNodeArrayPos, headerPolicy, &newDictBuffer, &unigramCount, &bigramCount)) { - return; - } - BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); - if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */, - mNeedsToDecay, unigramCount, bigramCount, 0 /* extendedRegionSize */)) { - return; - } - DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer); -} - -bool DynamicPatriciaTrieWritingHelper::markNodeAsDeleted( - const DynamicPatriciaTrieNodeReader *const nodeToUpdate) { - int pos = nodeToUpdate->getHeadPos(); - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(pos); - const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - pos -= mBuffer->getOriginalBufferSize(); - } - // Read original flags - const PatriciaTrieReadingUtils::NodeFlags originalFlags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); - const PatriciaTrieReadingUtils::NodeFlags updatedFlags = - DynamicPatriciaTrieReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, - true /* isDeleted */); - int writingPos = nodeToUpdate->getHeadPos(); - // Update flags. - return DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, updatedFlags, - &writingPos); -} - -bool DynamicPatriciaTrieWritingHelper::markNodeAsMovedAndSetPosition( - const DynamicPatriciaTrieNodeReader *const originalNode, const int movedPos, - const int bigramLinkedNodePos) { - int pos = originalNode->getHeadPos(); - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(pos); - const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - pos -= mBuffer->getOriginalBufferSize(); - } - // Read original flags - const PatriciaTrieReadingUtils::NodeFlags originalFlags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); - const PatriciaTrieReadingUtils::NodeFlags updatedFlags = - DynamicPatriciaTrieReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */, - false /* isDeleted */); - int writingPos = originalNode->getHeadPos(); - // Update flags. - if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, updatedFlags, - &writingPos)) { - return false; - } - // Update moved position, which is stored in the parent offset field. - if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( - mBuffer, movedPos, originalNode->getHeadPos(), &writingPos)) { - return false; - } - // Update bigram linked node position, which is stored in the children position field. - int childrenPosFieldPos = originalNode->getChildrenPosFieldPos(); - if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition( - mBuffer, bigramLinkedNodePos, &childrenPosFieldPos)) { - return false; - } - if (originalNode->hasChildren()) { - // Update children's parent position. - DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); - const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); - readingHelper.initWithPtNodeArrayPos(originalNode->getChildrenPos()); - while (!readingHelper.isEnd()) { - int parentOffsetFieldPos = nodeReader->getHeadPos() - + DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE; - if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( - mBuffer, bigramLinkedNodePos, nodeReader->getHeadPos(), - &parentOffsetFieldPos)) { - // Parent offset cannot be written because of a bug or a broken dictionary; thus, - // we give up to update dictionary. - return false; - } - readingHelper.readNextSiblingNode(); - } - } - return true; -} - -// Write new PtNode at writingPos. -bool DynamicPatriciaTrieWritingHelper::writePtNodeWithFullInfoToBuffer( - BufferWithExtendableBuffer *const bufferToWrite, const bool isBlacklisted, - const bool isNotAWord, const int parentPos, const int *const codePoints, - const int codePointCount, const int probability, const int childrenPos, - const int originalBigramListPos, const int originalShortcutListPos, - int *const writingPos) { - const int nodePos = *writingPos; - // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the - // PtNode writing. - if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(bufferToWrite, - 0 /* nodeFlags */, writingPos)) { - return false; - } - // Calculate a parent offset and write the offset. - if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(bufferToWrite, - parentPos, nodePos, writingPos)) { - return false; - } - // Write code points - if (!DynamicPatriciaTrieWritingUtils::writeCodePointsAndAdvancePosition(bufferToWrite, - codePoints, codePointCount, writingPos)) { - return false; - } - // Write probability when the probability is a valid probability, which means this node is - // terminal. - if (probability != NOT_A_PROBABILITY) { - if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(bufferToWrite, - probability, writingPos)) { - return false; - } - } - // Write children position - if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(bufferToWrite, - childrenPos, writingPos)) { - return false; - } - // Copy shortcut list when the originalShortcutListPos is valid dictionary position. - if (originalShortcutListPos != NOT_A_DICT_POS) { - int fromPos = originalShortcutListPos; - if (!mShortcutPolicy->copyAllShortcutsAndReturnIfSucceededOrNot(bufferToWrite, &fromPos, - writingPos)) { - return false; - } - } - // Copy bigram list when the originalBigramListPos is valid dictionary position. - int bigramCount = 0; - if (originalBigramListPos != NOT_A_DICT_POS) { - int fromPos = originalBigramListPos; - if (!mBigramPolicy->copyAllBigrams(bufferToWrite, &fromPos, writingPos, &bigramCount)) { - return false; - } - } - // Create node flags and write them. - PatriciaTrieReadingUtils::NodeFlags nodeFlags = - PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, - probability != NOT_A_PROBABILITY /* isTerminal */, - originalShortcutListPos != NOT_A_DICT_POS /* hasShortcutTargets */, - bigramCount > 0 /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */, - CHILDREN_POSITION_FIELD_SIZE); - int flagsFieldPos = nodePos; - if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(bufferToWrite, nodeFlags, - &flagsFieldPos)) { - return false; - } - return true; -} - -bool DynamicPatriciaTrieWritingHelper::writePtNodeToBuffer( - BufferWithExtendableBuffer *const bufferToWrite, const int parentPos, - const int *const codePoints, const int codePointCount, const int probability, - int *const writingPos) { - return writePtNodeWithFullInfoToBuffer(bufferToWrite, false /* isBlacklisted */, - false /* isNotAWord */, parentPos, codePoints, codePointCount, probability, - NOT_A_DICT_POS /* childrenPos */, NOT_A_DICT_POS /* originalBigramsPos */, - NOT_A_DICT_POS /* originalShortcutPos */, writingPos); -} - -bool DynamicPatriciaTrieWritingHelper::writePtNodeToBufferByCopyingPtNodeInfo( - BufferWithExtendableBuffer *const bufferToWrite, - const DynamicPatriciaTrieNodeReader *const originalNode, const int parentPos, - const int *const codePoints, const int codePointCount, const int probability, - int *const writingPos) { - return writePtNodeWithFullInfoToBuffer(bufferToWrite, originalNode->isBlacklisted(), - originalNode->isNotAWord(), parentPos, codePoints, codePointCount, probability, - originalNode->getChildrenPos(), originalNode->getBigramsPos(), - originalNode->getShortcutPos(), writingPos); -} - -bool DynamicPatriciaTrieWritingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, - const int *const nodeCodePoints, const int nodeCodePointCount, const int probability, - int *const forwardLinkFieldPos) { - const int newPtNodeArrayPos = mBuffer->getTailPosition(); - if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, - newPtNodeArrayPos, forwardLinkFieldPos)) { - return false; - } - return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount, - probability); -} - -bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability( - const DynamicPatriciaTrieNodeReader *const originalPtNode, const int probability, - const int *const codePoints, bool *const outAddedNewUnigram) { - if (originalPtNode->isTerminal()) { - // Overwrites the probability. - *outAddedNewUnigram = false; - const int probabilityToWrite = getUpdatedProbability(originalPtNode->getProbability(), - probability); - int probabilityFieldPos = originalPtNode->getProbabilityFieldPos(); - if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer, - probabilityToWrite, &probabilityFieldPos)) { - return false; - } - } else { - // Make the node terminal and write the probability. - *outAddedNewUnigram = true; - int movedPos = mBuffer->getTailPosition(); - if (!markNodeAsMovedAndSetPosition(originalPtNode, movedPos, movedPos)) { - return false; - } - if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, originalPtNode, - originalPtNode->getParentPos(), codePoints, originalPtNode->getCodePointCount(), - getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), - &movedPos)) { - return false; - } - } - return true; -} - -bool DynamicPatriciaTrieWritingHelper::createChildrenPtNodeArrayAndAChildPtNode( - const DynamicPatriciaTrieNodeReader *const parentNode, const int probability, - const int *const codePoints, const int codePointCount) { - const int newPtNodeArrayPos = mBuffer->getTailPosition(); - int childrenPosFieldPos = parentNode->getChildrenPosFieldPos(); - if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(mBuffer, - newPtNodeArrayPos, &childrenPosFieldPos)) { - return false; - } - return createNewPtNodeArrayWithAChildPtNode(parentNode->getHeadPos(), codePoints, - codePointCount, probability); -} - -bool DynamicPatriciaTrieWritingHelper::createNewPtNodeArrayWithAChildPtNode( - const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount, - const int probability) { - int writingPos = mBuffer->getTailPosition(); - if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, - 1 /* arraySize */, &writingPos)) { - return false; - } - if (!writePtNodeToBuffer(mBuffer, parentPtNodePos, nodeCodePoints, nodeCodePointCount, - probability, &writingPos)) { - return false; - } - if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, - NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { - return false; - } - return true; -} - -// Returns whether the dictionary updating was succeeded or not. -bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes( - const DynamicPatriciaTrieNodeReader *const reallocatingPtNode, - const int *const reallocatingPtNodeCodePoints, const int overlappingCodePointCount, - const int probabilityOfNewPtNode, const int *const newNodeCodePoints, - const int newNodeCodePointCount) { - // When addsExtraChild is true, split the reallocating PtNode and add new child. - // Reallocating PtNode: abcde, newNode: abcxy. - // abc (1st, not terminal) __ de (2nd) - // \_ xy (extra child, terminal) - // Otherwise, this method makes 1st part terminal and write probabilityOfNewPtNode. - // Reallocating PtNode: abcde, newNode: abc. - // abc (1st, terminal) __ de (2nd) - const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount; - const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); - int writingPos = firstPartOfReallocatedPtNodePos; - // Write the 1st part of the reallocating node. The children position will be updated later - // with actual children position. - const int newProbability = addsExtraChild ? NOT_A_PROBABILITY : probabilityOfNewPtNode; - if (!writePtNodeToBuffer(mBuffer, reallocatingPtNode->getParentPos(), - reallocatingPtNodeCodePoints, overlappingCodePointCount, newProbability, - &writingPos)) { - return false; - } - const int actualChildrenPos = writingPos; - // Create new children PtNode array. - const size_t newPtNodeCount = addsExtraChild ? 2 : 1; - if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, - newPtNodeCount, &writingPos)) { - return false; - } - // Write the 2nd part of the reallocating node. - const int secondPartOfReallocatedPtNodePos = writingPos; - if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, reallocatingPtNode, - firstPartOfReallocatedPtNodePos, - reallocatingPtNodeCodePoints + overlappingCodePointCount, - reallocatingPtNode->getCodePointCount() - overlappingCodePointCount, - reallocatingPtNode->getProbability(), &writingPos)) { - return false; - } - if (addsExtraChild) { - if (!writePtNodeToBuffer(mBuffer, firstPartOfReallocatedPtNodePos, - newNodeCodePoints + overlappingCodePointCount, - newNodeCodePointCount - overlappingCodePointCount, probabilityOfNewPtNode, - &writingPos)) { - return false; - } - } - if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, - NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { - return false; - } - // Update original reallocatingPtNode as moved. - if (!markNodeAsMovedAndSetPosition(reallocatingPtNode, firstPartOfReallocatedPtNodePos, - secondPartOfReallocatedPtNodePos)) { - return false; - } - // Load node info. Information of the 1st part will be fetched. - DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); - nodeReader.fetchNodeInfoInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos); - // Update children position. - int childrenPosFieldPos = nodeReader.getChildrenPosFieldPos(); - if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(mBuffer, - actualChildrenPos, &childrenPosFieldPos)) { - return false; - } - return true; -} - -bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, - const HeaderPolicy *const headerPolicy, BufferWithExtendableBuffer *const bufferToWrite, - int *const outUnigramCount, int *const outBigramCount) { - DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); - readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - DynamicPatriciaTrieGcEventListeners - ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( - headerPolicy, this, mBuffer, mNeedsToDecay); - if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( - &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { - return false; - } - if (mNeedsToDecay && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - .getValidUnigramCount() > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) { - // TODO: Remove more unigrams. - } - - readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability - traversePolicyToUpdateBigramProbability(mBigramPolicy); - if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( - &traversePolicyToUpdateBigramProbability)) { - return false; - } - if (mNeedsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount() - > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) { - // TODO: Remove more bigrams. - } - - // Mapping from positions in mBuffer to positions in bufferToWrite. - DictPositionRelocationMap dictPositionRelocationMap; - readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer - traversePolicyToPlaceAndWriteValidPtNodesToBuffer(this, bufferToWrite, - &dictPositionRelocationMap); - if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( - &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { - return false; - } - - // Create policy instance for the GCed dictionary. - DynamicShortcutListPolicy newDictShortcutPolicy(bufferToWrite); - DynamicBigramListPolicy newDictBigramPolicy(headerPolicy, bufferToWrite, &newDictShortcutPolicy, - mNeedsToDecay); - // Create reading helper for the GCed dictionary. - DynamicPatriciaTrieReadingHelper newDictReadingHelper(bufferToWrite, &newDictBigramPolicy, - &newDictShortcutPolicy); - newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields - traversePolicyToUpdateAllPositionFields(this, &newDictBigramPolicy, bufferToWrite, - &dictPositionRelocationMap); - if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( - &traversePolicyToUpdateAllPositionFields)) { - return false; - } - *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount(); - *outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount(); - return true; -} - -int DynamicPatriciaTrieWritingHelper::getUpdatedProbability(const int originalProbability, - const int newProbability) { - if (mNeedsToDecay) { - return ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, - newProbability); - } else { - return newProbability; - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h deleted file mode 100644 index ca8664729..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H -#define LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H - -#include <stdint.h> - -#include "defines.h" -#include "utils/hash_map_compat.h" - -namespace latinime { - -class BufferWithExtendableBuffer; -class DynamicBigramListPolicy; -class DynamicPatriciaTrieNodeReader; -class DynamicPatriciaTrieReadingHelper; -class DynamicShortcutListPolicy; -class HeaderPolicy; - -class DynamicPatriciaTrieWritingHelper { - public: - typedef hash_map_compat<int, int> PtNodeArrayPositionRelocationMap; - typedef hash_map_compat<int, int> PtNodePositionRelocationMap; - struct DictPositionRelocationMap { - public: - DictPositionRelocationMap() - : mPtNodeArrayPositionRelocationMap(), mPtNodePositionRelocationMap() {} - - PtNodeArrayPositionRelocationMap mPtNodeArrayPositionRelocationMap; - PtNodePositionRelocationMap mPtNodePositionRelocationMap; - - private: - DISALLOW_COPY_AND_ASSIGN(DictPositionRelocationMap); - }; - - static const size_t MAX_DICTIONARY_SIZE; - - DynamicPatriciaTrieWritingHelper(BufferWithExtendableBuffer *const buffer, - DynamicBigramListPolicy *const bigramPolicy, - DynamicShortcutListPolicy *const shortcutPolicy, const bool needsToDecay) - : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy), - mNeedsToDecay(needsToDecay) {} - - ~DynamicPatriciaTrieWritingHelper() {} - - // Add a word to the dictionary. If the word already exists, update the probability. - bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper, - const int *const wordCodePoints, const int codePointCount, const int probability, - bool *const outAddedNewUnigram); - - // Add a bigram relation from word0Pos to word1Pos. - bool addBigramWords(const int word0Pos, const int word1Pos, const int probability, - bool *const outAddedNewBigram); - - // Remove a bigram relation from word0Pos to word1Pos. - bool removeBigramWords(const int word0Pos, const int word1Pos); - - void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy, - const int unigramCount, const int bigramCount); - - void writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName, - const HeaderPolicy *const headerPolicy); - - // CAVEAT: This method must be called only from inner classes of - // DynamicPatriciaTrieGcEventListeners. - bool markNodeAsDeleted(const DynamicPatriciaTrieNodeReader *const nodeToUpdate); - - // CAVEAT: This method must be called only from this class or inner classes of - // DynamicPatriciaTrieGcEventListeners. - bool writePtNodeToBufferByCopyingPtNodeInfo(BufferWithExtendableBuffer *const bufferToWrite, - const DynamicPatriciaTrieNodeReader *const originalNode, const int parentPos, - const int *const codePoints, const int codePointCount, const int probability, - int *const writingPos); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper); - - static const int CHILDREN_POSITION_FIELD_SIZE; - - BufferWithExtendableBuffer *const mBuffer; - DynamicBigramListPolicy *const mBigramPolicy; - DynamicShortcutListPolicy *const mShortcutPolicy; - const bool mNeedsToDecay; - - bool markNodeAsMovedAndSetPosition(const DynamicPatriciaTrieNodeReader *const nodeToUpdate, - const int movedPos, const int bigramLinkedNodePos); - - bool writePtNodeWithFullInfoToBuffer(BufferWithExtendableBuffer *const bufferToWrite, - const bool isBlacklisted, const bool isNotAWord, - const int parentPos, const int *const codePoints, const int codePointCount, - const int probability, const int childrenPos, const int originalBigramListPos, - const int originalShortcutListPos, int *const writingPos); - - bool writePtNodeToBuffer(BufferWithExtendableBuffer *const bufferToWrite, - const int parentPos, const int *const codePoints, const int codePointCount, - const int probability, int *const writingPos); - - bool createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, - const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos); - - bool setPtNodeProbability(const DynamicPatriciaTrieNodeReader *const originalNode, - const int probability, const int *const codePoints, bool *const outAddedNewUnigram); - - bool createChildrenPtNodeArrayAndAChildPtNode( - const DynamicPatriciaTrieNodeReader *const parentNode, const int probability, - const int *const codePoints, const int codePointCount); - - bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, const int *const nodeCodePoints, - const int nodeCodePointCount, const int probability); - - bool reallocatePtNodeAndAddNewPtNodes( - const DynamicPatriciaTrieNodeReader *const reallocatingPtNode, - const int *const reallocatingPtNodeCodePoints, const int overlappingCodePointCount, - const int probabilityOfNewPtNode, const int *const newNodeCodePoints, - const int newNodeCodePointCount); - - bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, - BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount, - int *const outBigramCount); - - int getUpdatedProbability(const int originalProbability, const int newProbability); -}; -} // namespace latinime -#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp index eb072fbaf..3ce57d910 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp @@ -20,13 +20,19 @@ namespace latinime { // Note that these are corresponding definitions in Java side in FormatSpec.FileHeader. const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE"; +const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = + "REQUIRES_GERMAN_UMLAUT_PROCESSING"; // TODO: Change attribute string to "IS_DECAYING_DICT". const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; -const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date"; +const char *const HeaderPolicy::DATE_KEY = "date"; const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT"; const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT"; const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; +// Historical info is information that is needed to support decaying such as timestamp, level and +// count. +const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO"; +const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; @@ -40,7 +46,8 @@ void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *out } std::vector<int> keyCodePointVector; HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector); - HeaderReadWriteUtils::AttributeMap::const_iterator it = mAttributeMap.find(keyCodePointVector); + DictionaryHeaderStructurePolicy::AttributeMap::const_iterator it = + mAttributeMap.find(keyCodePointVector); if (it == mAttributeMap.end()) { // The key was not found. outValue[0] = '?'; @@ -54,6 +61,10 @@ void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *out outValue[terminalIndex] = '\0'; } +const std::vector<int> HeaderPolicy::readLocale() const { + return HeaderReadWriteUtils::readCodePointVectorAttributeValue(&mAttributeMap, LOCALE_KEY); +} + float HeaderPolicy::readMultipleWordCostMultiplier() const { const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE); @@ -63,54 +74,65 @@ float HeaderPolicy::readMultipleWordCostMultiplier() const { return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate); } -bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite, - const bool updatesLastUpdatedTime, const bool updatesLastDecayedTime, - const int unigramCount, const int bigramCount, const int extendedRegionSize) const { +bool HeaderPolicy::readRequiresGermanUmlautProcessing() const { + return HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, + REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false); +} + +bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, + const int unigramCount, const int bigramCount, + const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const { int writingPos = 0; - if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion, + DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap); + fillInHeader(updatesLastDecayedTime, unigramCount, bigramCount, + extendedRegionSize, &attributeMapToWrite); + if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion, &writingPos)) { return false; } - if (!HeaderReadWriteUtils::writeDictionaryFlags(bufferToWrite, mDictionaryFlags, + if (!HeaderReadWriteUtils::writeDictionaryFlags(outBuffer, mDictionaryFlags, &writingPos)) { return false; } // Temporarily writes a dummy header size. int headerSizeFieldPos = writingPos; - if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, 0 /* size */, + if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, 0 /* size */, &writingPos)) { return false; } - HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap); - HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, UNIGRAM_COUNT_KEY, unigramCount); - HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, BIGRAM_COUNT_KEY, bigramCount); - HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, EXTENDED_REGION_SIZE_KEY, - extendedRegionSize); - if (updatesLastUpdatedTime) { - // Set current time as a last updated time. - HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, LAST_UPDATED_TIME_KEY, - time(0)); - } - if (updatesLastDecayedTime) { - // Set current time as a last updated time. - HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, LAST_DECAYED_TIME_KEY, - time(0)); - } - if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite, + if (!HeaderReadWriteUtils::writeHeaderAttributes(outBuffer, &attributeMapToWrite, &writingPos)) { return false; } - // Writes an actual header size. - if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos, + // Writes the actual header size. + if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, writingPos, &headerSizeFieldPos)) { return false; } return true; } -/* static */ HeaderReadWriteUtils::AttributeMap +void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, const int unigramCount, + const int bigramCount, const int extendedRegionSize, + DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const { + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, UNIGRAM_COUNT_KEY, unigramCount); + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, BIGRAM_COUNT_KEY, bigramCount); + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY, + extendedRegionSize); + // Set the current time as the generation time. + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, DATE_KEY, + TimeKeeper::peekCurrentTime()); + HeaderReadWriteUtils::setCodePointVectorAttribute(outAttributeMap, LOCALE_KEY, mLocale); + if (updatesLastDecayedTime) { + // Set current time as the last updated time. + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_DECAYED_TIME_KEY, + TimeKeeper::peekCurrentTime()); + } +} + +/* static */ DictionaryHeaderStructurePolicy::AttributeMap HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) { - HeaderReadWriteUtils::AttributeMap attributeMap; + DictionaryHeaderStructurePolicy::AttributeMap attributeMap; HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap); return attributeMap; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index a9c7805a8..fc347618c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -17,69 +17,101 @@ #ifndef LATINIME_HEADER_POLICY_H #define LATINIME_HEADER_POLICY_H -#include <ctime> #include <stdint.h> #include "defines.h" #include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" #include "suggest/policyimpl/dictionary/utils/format_utils.h" +#include "utils/char_utils.h" +#include "utils/time_keeper.h" namespace latinime { class HeaderPolicy : public DictionaryHeaderStructurePolicy { public: // Reads information from existing dictionary buffer. - HeaderPolicy(const uint8_t *const dictBuf, const int dictSize) - : mDictFormatVersion(FormatUtils::detectFormatVersion(dictBuf, dictSize)), + HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion) + : mDictFormatVersion(formatVersion), mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)), mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), + mLocale(readLocale()), mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), + mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, IS_DECAYING_DICT_KEY, false /* defaultValue */)), - mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)), + mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - LAST_DECAYED_TIME_KEY, time(0) /* defaultValue */)), + LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, UNIGRAM_COUNT_KEY, 0 /* defaultValue */)), mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, BIGRAM_COUNT_KEY, 0 /* defaultValue */)), mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)) {} + EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), + mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( + &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)) {} // Constructs header information using an attribute map. HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, - const HeaderReadWriteUtils::AttributeMap *const attributeMap) + const std::vector<int> locale, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) : mDictFormatVersion(dictFormatVersion), mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( - attributeMap)), mSize(0), mAttributeMap(*attributeMap), + attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale), mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), + mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, IS_DECAYING_DICT_KEY, false /* defaultValue */)), - mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)), + mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)), - mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0) {} + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0), + mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( + &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)) { + } + + // Temporary dummy header. + HeaderPolicy() + : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), + mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), + mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), + mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), + mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false) {} ~HeaderPolicy() {} - AK_FORCE_INLINE int getSize() const { - return mSize; + virtual int getFormatVersionNumber() const { + // Conceptually this converts the symbolic value we use in the code into the + // hardcoded of the bytes in the file. But we want the constants to be the + // same so we use them for both here. + switch (mDictFormatVersion) { + case FormatUtils::VERSION_2: + return FormatUtils::VERSION_2; + case FormatUtils::VERSION_4: + return FormatUtils::VERSION_4; + default: + return FormatUtils::UNKNOWN_VERSION; + } } - AK_FORCE_INLINE bool supportsDynamicUpdate() const { - return HeaderReadWriteUtils::supportsDynamicUpdate(mDictionaryFlags); - } - - AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { - return HeaderReadWriteUtils::requiresGermanUmlautProcessing(mDictionaryFlags); + AK_FORCE_INLINE bool isValid() const { + // Decaying dictionary must have historical information. + if (!mIsDecayingDict) { + return true; + } + if (mHasHistoricalInfoOfWords) { + return true; + } else { + return false; + } } - AK_FORCE_INLINE bool requiresFrenchLigatureProcessing() const { - return HeaderReadWriteUtils::requiresFrenchLigatureProcessing(mDictionaryFlags); + AK_FORCE_INLINE int getSize() const { + return mSize; } AK_FORCE_INLINE float getMultiWordCostMultiplier() const { @@ -90,8 +122,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mIsDecayingDict; } - AK_FORCE_INLINE int getLastUpdatedTime() const { - return mLastUpdatedTime; + AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { + return mRequiresGermanUmlautProcessing; + } + + AK_FORCE_INLINE int getDate() const { + return mDate; } AK_FORCE_INLINE int getLastDecayedTime() const { @@ -110,41 +146,66 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mExtendedRegionSize; } + AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const { + return mHasHistoricalInfoOfWords; + } + + AK_FORCE_INLINE bool shouldBoostExactMatches() const { + // TODO: Investigate better ways to handle exact matches for personalized dictionaries. + return !isDecayingDict(); + } + + const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const { + return &mAttributeMap; + } + void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const; - bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite, - const bool updatesLastUpdatedTime, const bool updatesLastDecayedTime, - const int unigramCount, const int bigramCount, const int extendedRegionSize) const; + bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, + const int unigramCount, const int bigramCount, + const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const; + + void fillInHeader(const bool updatesLastDecayedTime, + const int unigramCount, const int bigramCount, const int extendedRegionSize, + DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const; private: - DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy); + DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; + static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; static const char *const IS_DECAYING_DICT_KEY; - static const char *const LAST_UPDATED_TIME_KEY; + static const char *const DATE_KEY; static const char *const LAST_DECAYED_TIME_KEY; static const char *const UNIGRAM_COUNT_KEY; static const char *const BIGRAM_COUNT_KEY; static const char *const EXTENDED_REGION_SIZE_KEY; + static const char *const HAS_HISTORICAL_INFO_KEY; + static const char *const LOCALE_KEY; static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; const FormatUtils::FORMAT_VERSION mDictFormatVersion; const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; const int mSize; - HeaderReadWriteUtils::AttributeMap mAttributeMap; + DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap; + const std::vector<int> mLocale; const float mMultiWordCostMultiplier; + const bool mRequiresGermanUmlautProcessing; const bool mIsDecayingDict; - const int mLastUpdatedTime; + const int mDate; const int mLastDecayedTime; const int mUnigramCount; const int mBigramCount; const int mExtendedRegionSize; + const bool mHasHistoricalInfoOfWords; + const std::vector<int> readLocale() const; float readMultipleWordCostMultiplier() const; + bool readRequiresGermanUmlautProcessing() const; - static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes( + static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes( const uint8_t *const dictBuf); }; } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp index 5ded8f6a1..d20accfbc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp @@ -35,22 +35,8 @@ const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2; const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4; const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0; -// Flags for special processing -// Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAG) or -// something very bad (like, the apocalypse) will happen. Please update both at the same time. -const HeaderReadWriteUtils::DictionaryFlags - HeaderReadWriteUtils::GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; -const HeaderReadWriteUtils::DictionaryFlags - HeaderReadWriteUtils::SUPPORTS_DYNAMIC_UPDATE_FLAG = 0x2; -const HeaderReadWriteUtils::DictionaryFlags - HeaderReadWriteUtils::FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; - -// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader. -const char *const HeaderReadWriteUtils::SUPPORTS_DYNAMIC_UPDATE_KEY = "SUPPORTS_DYNAMIC_UPDATE"; -const char *const HeaderReadWriteUtils::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = - "REQUIRES_GERMAN_UMLAUT_PROCESSING"; -const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY = - "REQUIRES_FRENCH_LIGATURE_PROCESSING"; + +typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; /* static */ int HeaderReadWriteUtils::getHeaderSize(const uint8_t *const dictBuf) { // See the format of the header in the comment in @@ -67,18 +53,8 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY /* static */ HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( - const HeaderReadWriteUtils::AttributeMap *const attributeMap) { - const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap, - REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false /* defaultValue */); - const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap, - REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, false /* defaultValue */); - const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap, - SUPPORTS_DYNAMIC_UPDATE_KEY, false /* defaultValue */); - DictionaryFlags dictflags = NO_FLAGS; - dictflags |= requiresGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0; - dictflags |= requiresFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0; - dictflags |= supportsDynamicUpdate ? SUPPORTS_DYNAMIC_UPDATE_FLAG : 0; - return dictflags; + const AttributeMap *const attributeMap) { + return NO_FLAGS; } /* static */ void HeaderReadWriteUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf, @@ -115,8 +91,8 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY case FormatUtils::VERSION_2: // Version 2 dictionary writing is not supported. return false; - case FormatUtils::VERSION_3: - return buffer->writeUintAndAdvancePosition(3 /* data */, + case FormatUtils::VERSION_4: + return buffer->writeUintAndAdvancePosition(FormatUtils::VERSION_4 /* data */, HEADER_DICTIONARY_VERSION_SIZE, writingPos); default: return false; @@ -156,6 +132,13 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY return true; } +/* static */ void HeaderReadWriteUtils::setCodePointVectorAttribute( + AttributeMap *const headerAttributes, const char *const key, const std::vector<int> value) { + AttributeMap::key_type keyVector; + insertCharactersIntoVector(key, &keyVector); + (*headerAttributes)[keyVector] = value; +} + /* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes, const char *const key, const bool value) { setIntAttribute(headerAttributes, key, value ? 1 : 0); @@ -177,6 +160,18 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY (*headerAttributes)[*key] = valueVector; } +/* static */ const std::vector<int> HeaderReadWriteUtils::readCodePointVectorAttributeValue( + const AttributeMap *const headerAttributes, const char *const key) { + AttributeMap::key_type keyVector; + insertCharactersIntoVector(key, &keyVector); + AttributeMap::const_iterator it = headerAttributes->find(keyVector); + if (it == headerAttributes->end()) { + return std::vector<int>(); + } else { + return it->second; + } +} + /* static */ bool HeaderReadWriteUtils::readBoolAttributeValue( const AttributeMap *const headerAttributes, const char *const key, const bool defaultValue) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h index 225968323..4185a2e7c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h @@ -17,11 +17,10 @@ #ifndef LATINIME_HEADER_READ_WRITE_UTILS_H #define LATINIME_HEADER_READ_WRITE_UTILS_H -#include <map> #include <stdint.h> -#include <vector> #include "defines.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/policyimpl/dictionary/utils/format_utils.h" namespace latinime { @@ -31,34 +30,21 @@ class BufferWithExtendableBuffer; class HeaderReadWriteUtils { public: typedef uint16_t DictionaryFlags; - typedef std::map<std::vector<int>, std::vector<int> > AttributeMap; static int getHeaderSize(const uint8_t *const dictBuf); static DictionaryFlags getFlags(const uint8_t *const dictBuf); - static AK_FORCE_INLINE bool supportsDynamicUpdate(const DictionaryFlags flags) { - return (flags & SUPPORTS_DYNAMIC_UPDATE_FLAG) != 0; - } - - static AK_FORCE_INLINE bool requiresGermanUmlautProcessing(const DictionaryFlags flags) { - return (flags & GERMAN_UMLAUT_PROCESSING_FLAG) != 0; - } - - static AK_FORCE_INLINE bool requiresFrenchLigatureProcessing(const DictionaryFlags flags) { - return (flags & FRENCH_LIGATURE_PROCESSING_FLAG) != 0; - } - static AK_FORCE_INLINE int getHeaderOptionsPosition() { return HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE + HEADER_SIZE_FIELD_SIZE; } static DictionaryFlags createAndGetDictionaryFlagsUsingAttributeMap( - const HeaderReadWriteUtils::AttributeMap *const attributeMap); + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); static void fetchAllHeaderAttributes(const uint8_t *const dictBuf, - AttributeMap *const headerAttributes); + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes); static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version, int *const writingPos); @@ -70,25 +56,38 @@ class HeaderReadWriteUtils { const int size, int *const writingPos); static bool writeHeaderAttributes(BufferWithExtendableBuffer *const buffer, - const AttributeMap *const headerAttributes, int *const writingPos); + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + int *const writingPos); /** * Methods for header attributes. */ - static void setBoolAttribute(AttributeMap *const headerAttributes, + static void setCodePointVectorAttribute( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key, const std::vector<int> value); + + static void setBoolAttribute( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, const char *const key, const bool value); - static void setIntAttribute(AttributeMap *const headerAttributes, + static void setIntAttribute( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, const char *const key, const int value); - static bool readBoolAttributeValue(const AttributeMap *const headerAttributes, + static const std::vector<int> readCodePointVectorAttributeValue( + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key); + + static bool readBoolAttributeValue( + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, const char *const key, const bool defaultValue); - static int readIntAttributeValue(const AttributeMap *const headerAttributes, + static int readIntAttributeValue( + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, const char *const key, const int defaultValue); static void insertCharactersIntoVector(const char *const characters, - AttributeMap::key_type *const key); + DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key); private: DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils); @@ -101,23 +100,18 @@ class HeaderReadWriteUtils { static const int HEADER_FLAG_SIZE; static const int HEADER_SIZE_FIELD_SIZE; + // Value for the "flags" field. It's unused at the moment. static const DictionaryFlags NO_FLAGS; - // Flags for special processing - // Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAGS) or - // something very bad (like, the apocalypse) will happen. Please update both at the same time. - static const DictionaryFlags GERMAN_UMLAUT_PROCESSING_FLAG; - static const DictionaryFlags SUPPORTS_DYNAMIC_UPDATE_FLAG; - static const DictionaryFlags FRENCH_LIGATURE_PROCESSING_FLAG; - - static const char *const SUPPORTS_DYNAMIC_UPDATE_KEY; - static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; - static const char *const REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY; - - static void setIntAttributeInner(AttributeMap *const headerAttributes, - const AttributeMap::key_type *const key, const int value); - - static int readIntAttributeValueInner(const AttributeMap *const headerAttributes, - const AttributeMap::key_type *const key, const int defaultValue); + + static void setIntAttributeInner( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key, + const int value); + + static int readIntAttributeValueInner( + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key, + const int defaultValue); }; } #endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h deleted file mode 100644 index bd3211f6a..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H -#define LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H - -#include <stdint.h> - -#include "defines.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -/* - * This is a dynamic version of ShortcutListPolicy and supports an additional buffer. - */ -class DynamicShortcutListPolicy : public DictionaryShortcutsStructurePolicy { - public: - explicit DynamicShortcutListPolicy(const BufferWithExtendableBuffer *const buffer) - : mBuffer(buffer) {} - - ~DynamicShortcutListPolicy() {} - - int getStartPos(const int pos) const { - if (pos == NOT_A_DICT_POS) { - return NOT_A_DICT_POS; - } - return pos + ShortcutListReadingUtils::getShortcutListSizeFieldSize(); - } - - void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, - int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, - int *const pos) const { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); - const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - *pos -= mBuffer->getOriginalBufferSize(); - } - const ShortcutListReadingUtils::ShortcutFlags flags = - ShortcutListReadingUtils::getFlagsAndForwardPointer(buffer, pos); - if (outHasNext) { - *outHasNext = ShortcutListReadingUtils::hasNext(flags); - } - if (outIsWhitelist) { - *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(flags); - } - if (outCodePoint) { - *outCodePointCount = ShortcutListReadingUtils::readShortcutTarget( - buffer, maxCodePointCount, outCodePoint, pos); - } - if (usesAdditionalBuffer) { - *pos += mBuffer->getOriginalBufferSize(); - } - } - - void skipAllShortcuts(int *const pos) const { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); - const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - *pos -= mBuffer->getOriginalBufferSize(); - } - const int shortcutListSize = ShortcutListReadingUtils - ::getShortcutListSizeAndForwardPointer(buffer, pos); - *pos += shortcutListSize; - if (usesAdditionalBuffer) { - *pos += mBuffer->getOriginalBufferSize(); - } - } - - // Copy shortcuts from the shortcut list that starts at fromPos in mBuffer to toPos in - // bufferToWrite and advance these positions after the shortcut lists. This returns whether - // the copy was succeeded or not. - bool copyAllShortcutsAndReturnIfSucceededOrNot(BufferWithExtendableBuffer *const bufferToWrite, - int *const fromPos, int *const toPos) const { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos); - if (usesAdditionalBuffer) { - *fromPos -= mBuffer->getOriginalBufferSize(); - } - const int shortcutListSize = ShortcutListReadingUtils - ::getShortcutListSizeAndForwardPointer(mBuffer->getBuffer(usesAdditionalBuffer), - fromPos); - // Copy shortcut list size. - if (!bufferToWrite->writeUintAndAdvancePosition( - shortcutListSize + ShortcutListReadingUtils::getShortcutListSizeFieldSize(), - ShortcutListReadingUtils::getShortcutListSizeFieldSize(), toPos)) { - return false; - } - // Copy shortcut list. - for (int i = 0; i < shortcutListSize; ++i) { - const uint8_t data = ByteArrayUtils::readUint8AndAdvancePosition( - mBuffer->getBuffer(usesAdditionalBuffer), fromPos); - if (!bufferToWrite->writeUintAndAdvancePosition(data, 1 /* size */, toPos)) { - return false; - } - } - if (usesAdditionalBuffer) { - *fromPos += mBuffer->getOriginalBufferSize(); - } - return true; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicShortcutListPolicy); - - const BufferWithExtendableBuffer *const mBuffer; -}; -} // namespace latinime -#endif // LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h new file mode 100644 index 000000000..ae863af57 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_SHORTCUT_LIST_POLICY_H +#define LATINIME_VER4_SHORTCUT_LIST_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" + +namespace latinime { + +class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { + public: + Ver4ShortcutListPolicy(ShortcutDictContent *const shortcutDictContent, + const TerminalPositionLookupTable *const terminalPositionLookupTable) + : mShortcutDictContent(shortcutDictContent), + mTerminalPositionLookupTable(terminalPositionLookupTable) {} + + ~Ver4ShortcutListPolicy() {} + + int getStartPos(const int pos) const { + // The first shortcut entry is located at the head position of the shortcut list. + return pos; + } + + void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, + int *const pos) const { + int probability = 0; + mShortcutDictContent->getShortcutEntryAndAdvancePosition(maxCodePointCount, + outCodePoint, outCodePointCount, &probability, outHasNext, pos); + if (outIsWhitelist) { + *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(probability); + } + } + + void skipAllShortcuts(int *const pos) const { + // Do nothing because we don't need to skip shortcut lists in ver4 dictionaries. + } + + bool addNewShortcut(const int terminalId, const int *const codePoints, const int codePointCount, + const int probability) { + const int shortcutListPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); + if (shortcutListPos == NOT_A_DICT_POS) { + // Create shortcut list. + if (!mShortcutDictContent->createNewShortcutList(terminalId)) { + AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId); + return false; + } + const int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); + return mShortcutDictContent->writeShortcutEntry(codePoints, codePointCount, probability, + false /* hasNext */, writingPos); + } + const int entryPos = mShortcutDictContent->findShortcutEntryAndGetPos(shortcutListPos, + codePoints, codePointCount); + if (entryPos == NOT_A_DICT_POS) { + // Add new entry to the shortcut list. + // Create new shortcut list. + if (!mShortcutDictContent->createNewShortcutList(terminalId)) { + AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId); + return false; + } + int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); + if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints, + codePointCount, probability, true /* hasNext */, &writingPos)) { + AKLOGE("Cannot write shortcut entry. terminal id: %d, pos: %d", terminalId, + writingPos); + return false; + } + return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos); + } + // Overwrite existing entry. + bool hasNext = false; + mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, 0 /* outCodePoint */, + 0 /* outCodePointCount */ , 0 /* probability */, &hasNext, entryPos); + if (!mShortcutDictContent->writeShortcutEntry(codePoints, + codePointCount, probability, hasNext, entryPos)) { + AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId, + entryPos); + return false; + } + return true; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4ShortcutListPolicy); + + ShortcutDictContent *const mShortcutDictContent; + const TerminalPositionLookupTable *const mTerminalPositionLookupTable; +}; +} // namespace latinime +#endif // LATINIME_VER4_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp new file mode 100644 index 000000000..04f119803 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h" + +#include <climits> +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h" +#include "suggest/policyimpl/dictionary/utils/file_utils.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" + +namespace latinime { + +/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr + DictionaryStructureWithBufferPolicyFactory + ::newDictionaryStructureWithBufferPolicy(const char *const path, + const int bufOffset, const int size, const bool isUpdatable) { + if (FileUtils::existsDir(path)) { + // Given path represents a directory. + return newPolicyforDirectoryDict(path, isUpdatable); + } else { + if (isUpdatable) { + AKLOGE("One file dictionaries don't support updating. path: %s", path); + ASSERT(false); + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(0); + } + return newPolicyforFileDict(path, bufOffset, size); + } +} + +/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr + DictionaryStructureWithBufferPolicyFactory::newPolicyforDirectoryDict( + const char *const path, const bool isUpdatable) { + const int headerFilePathBufSize = PATH_MAX + 1 /* terminator */; + char headerFilePath[headerFilePathBufSize]; + getHeaderFilePathInDictDir(path, headerFilePathBufSize, headerFilePath); + // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of + // MmappedBufferPtr if the instance has the responsibility. + MmappedBuffer::MmappedBufferPtr mmappedBuffer = MmappedBuffer::openBuffer(headerFilePath, + isUpdatable); + if (!mmappedBuffer.get()) { + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(0); + } + switch (FormatUtils::detectFormatVersion(mmappedBuffer.get()->getBuffer(), + mmappedBuffer.get()->getBufferSize())) { + case FormatUtils::VERSION_2: + AKLOGE("Given path is a directory but the format is version 2. path: %s", path); + break; + case FormatUtils::VERSION_4: { + const int dictDirPathBufSize = strlen(headerFilePath) + 1 /* terminator */; + char dictPath[dictDirPathBufSize]; + if (!FileUtils::getFilePathWithoutSuffix(headerFilePath, + Ver4DictConstants::HEADER_FILE_EXTENSION, dictDirPathBufSize, dictPath)) { + AKLOGE("Dictionary file name is not valid as a ver4 dictionary. path: %s", path); + ASSERT(false); + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(0); + } + const Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers = + Ver4DictBuffers::openVer4DictBuffers(dictPath, mmappedBuffer); + if (!dictBuffers.get() || !dictBuffers.get()->isValid()) { + AKLOGE("DICT: The dictionary doesn't satisfy ver4 format requirements. path: %s", + path); + ASSERT(false); + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(0); + } + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( + new Ver4PatriciaTriePolicy(dictBuffers)); + } + default: + AKLOGE("DICT: dictionary format is unknown, bad magic number. path: %s", path); + break; + } + ASSERT(false); + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(0); +} + +/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr + DictionaryStructureWithBufferPolicyFactory::newPolicyforFileDict( + const char *const path, const int bufOffset, const int size) { + // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of + // MmappedBufferPtr if the instance has the responsibility. + MmappedBuffer::MmappedBufferPtr mmappedBuffer = MmappedBuffer::openBuffer(path, bufOffset, + size, false /* isUpdatable */); + if (!mmappedBuffer.get()) { + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(0); + } + switch (FormatUtils::detectFormatVersion(mmappedBuffer.get()->getBuffer(), + mmappedBuffer.get()->getBufferSize())) { + case FormatUtils::VERSION_2: + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( + new PatriciaTriePolicy(mmappedBuffer)); + case FormatUtils::VERSION_4: + AKLOGE("Given path is a file but the format is version 4. path: %s", path); + break; + default: + AKLOGE("DICT: dictionary format is unknown, bad magic number. path: %s", path); + break; + } + ASSERT(false); + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(0); +} + +/* static */ void DictionaryStructureWithBufferPolicyFactory::getHeaderFilePathInDictDir( + const char *const dictDirPath, const int outHeaderFileBufSize, + char *const outHeaderFilePath) { + const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */; + char dictName[dictNameBufSize]; + FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName); + snprintf(outHeaderFilePath, outHeaderFileBufSize, "%s/%s%s", dictDirPath, + dictName, Ver4DictConstants::HEADER_FILE_EXTENSION); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h index 8cebc3b16..45ab52931 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h @@ -21,16 +21,27 @@ #include "defines.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "utils/exclusive_ownership_pointer.h" namespace latinime { class DictionaryStructureWithBufferPolicyFactory { public: - static DictionaryStructureWithBufferPolicy *newDictionaryStructureWithBufferPolicy( - const char *const path, const int bufOffset, const int size, const bool isUpdatable); + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr + newDictionaryStructureWithBufferPolicy(const char *const path, const int bufOffset, + const int size, const bool isUpdatable); private: DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryStructureWithBufferPolicyFactory); + + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr + newPolicyforDirectoryDict(const char *const path, const bool isUpdatable); + + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr + newPolicyforFileDict(const char *const path, const int bufOffset, const int size); + + static void getHeaderFilePathInDictDir(const char *const dirPath, + const int outHeaderFileBufSize, char *const outHeaderFilePath); }; } // namespace latinime #endif // LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp new file mode 100644 index 000000000..8f42df6d2 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" + +#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" + +namespace latinime { + +bool DynamicPtGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless + // children. + bool isUselessPtNode = !ptNodeParams->isTerminal(); + if (ptNodeParams->isTerminal()) { + bool needsToKeepPtNode = true; + if (!mPtNodeWriter->updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(ptNodeParams, + &needsToKeepPtNode)) { + AKLOGE("Cannot update PtNode probability or get needs to keep PtNode after GC."); + return false; + } + if (!needsToKeepPtNode) { + isUselessPtNode = true; + } + } + if (mChildrenValue > 0) { + isUselessPtNode = false; + } else if (ptNodeParams->isTerminal()) { + // Remove children as all children are useless. + if (!mPtNodeWriter->updateChildrenPosition(ptNodeParams, + NOT_A_DICT_POS /* newChildrenPosition */)) { + return false; + } + } + if (isUselessPtNode) { + // Current PtNode is no longer needed. Mark it as deleted. + if (!mPtNodeWriter->markPtNodeAsDeleted(ptNodeParams)) { + return false; + } + } else { + mValueStack.back() += 1; + if (ptNodeParams->isTerminal()) { + mValidUnigramCount += 1; + } + } + return true; +} + +bool DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + if (!ptNodeParams->isDeleted() && ptNodeParams->hasBigrams()) { + int bigramEntryCount = 0; + if (!mPtNodeWriter->updateAllBigramEntriesAndDeleteUselessEntries(ptNodeParams, + &bigramEntryCount)) { + return false; + } + mValidBigramEntryCount += bigramEntryCount; + } + return true; +} + +// Writes dummy PtNode array size when the head of PtNode array is read. +bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onDescend(const int ptNodeArrayPos) { + mValidPtNodeCount = 0; + int writingPos = mBufferToWrite->getTailPosition(); + mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.insert( + PtNodeWriter::PtNodeArrayPositionRelocationMap::value_type(ptNodeArrayPos, writingPos)); + // Writes dummy PtNode array size because arrays can have a forward link or needles PtNodes. + // This field will be updated later in onReadingPtNodeArrayTail() with actual PtNode count. + mPtNodeArraySizeFieldPos = writingPos; + return DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition( + mBufferToWrite, 0 /* arraySize */, &writingPos); +} + +// Write PtNode array terminal and actual PtNode array size. +bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onReadingPtNodeArrayTail() { + int writingPos = mBufferToWrite->getTailPosition(); + // Write PtNode array terminal. + if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition( + mBufferToWrite, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + // Write actual PtNode array size. + if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition( + mBufferToWrite, mValidPtNodeCount, &mPtNodeArraySizeFieldPos)) { + return false; + } + return true; +} + +// Write valid PtNode to buffer and memorize mapping from the old position to the new position. +bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + if (ptNodeParams->isDeleted()) { + // Current PtNode is not written in new buffer because it has been deleted. + mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( + PtNodeWriter::PtNodePositionRelocationMap::value_type( + ptNodeParams->getHeadPos(), NOT_A_DICT_POS)); + return true; + } + int writingPos = mBufferToWrite->getTailPosition(); + mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( + PtNodeWriter::PtNodePositionRelocationMap::value_type( + ptNodeParams->getHeadPos(), writingPos)); + mValidPtNodeCount++; + // Writes current PtNode. + return mPtNodeWriter->writePtNodeAndAdvancePosition(ptNodeParams, &writingPos); +} + +bool DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + // Updates parent position. + int bigramCount = 0; + if (!mPtNodeWriter->updateAllPositionFields(ptNodeParams, mDictPositionRelocationMap, + &bigramCount)) { + return false; + } + mBigramCount += bigramCount; + if (ptNodeParams->isTerminal()) { + mUnigramCount++; + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h index 9755120b0..d8867754d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h @@ -14,37 +14,32 @@ * limitations under the License. */ -#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_GC_EVENT_LISTENERS_H -#define LATINIME_DYNAMIC_PATRICIA_TRIE_GC_EVENT_LISTENERS_H +#ifndef LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H +#define LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H #include <vector> #include "defines.h" -#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" #include "utils/hash_map_compat.h" namespace latinime { -class DictionaryHeaderStructurePolicy; +class PtNodeParams; -class DynamicPatriciaTrieGcEventListeners { +class DynamicPtGcEventListeners { public: // Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or // not and marks useless PtNodes as deleted. Such deleted PtNodes will be discarded in the GC. // TODO: Concatenate non-terminal PtNodes. class TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { + : public DynamicPtReadingHelper::TraversingEventListener { public: TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( - const DictionaryHeaderStructurePolicy *const headerPolicy, - DynamicPatriciaTrieWritingHelper *const writingHelper, - BufferWithExtendableBuffer *const buffer, const bool isDecayingDict) - : mHeaderPolicy(headerPolicy), mWritingHelper(writingHelper), mBuffer(buffer), - mIsDecayingDict(isDecayingDict), mValueStack(), mChildrenValue(0), + PtNodeWriter *const ptNodeWriter) + : mPtNodeWriter(ptNodeWriter), mValueStack(), mChildrenValue(0), mValidUnigramCount(0) {} ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {}; @@ -66,8 +61,7 @@ class DynamicPatriciaTrieGcEventListeners { bool onReadingPtNodeArrayTail() { return true; } - bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, - const int *const nodeCodePoints); + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); int getValidUnigramCount() const { return mValidUnigramCount; @@ -77,10 +71,7 @@ class DynamicPatriciaTrieGcEventListeners { DISALLOW_IMPLICIT_CONSTRUCTORS( TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted); - const DictionaryHeaderStructurePolicy *const mHeaderPolicy; - DynamicPatriciaTrieWritingHelper *const mWritingHelper; - BufferWithExtendableBuffer *const mBuffer; - const bool mIsDecayingDict; + PtNodeWriter *const mPtNodeWriter; std::vector<int> mValueStack; int mChildrenValue; int mValidUnigramCount; @@ -89,11 +80,10 @@ class DynamicPatriciaTrieGcEventListeners { // Updates all bigram entries that are held by valid PtNodes. This removes useless bigram // entries. class TraversePolicyToUpdateBigramProbability - : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { + : public DynamicPtReadingHelper::TraversingEventListener { public: - TraversePolicyToUpdateBigramProbability( - DynamicBigramListPolicy *const bigramPolicy) - : mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {} + TraversePolicyToUpdateBigramProbability(PtNodeWriter *const ptNodeWriter) + : mPtNodeWriter(ptNodeWriter), mValidBigramEntryCount(0) {} bool onAscend() { return true; } @@ -101,8 +91,7 @@ class DynamicPatriciaTrieGcEventListeners { bool onReadingPtNodeArrayTail() { return true; } - bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, - const int *const nodeCodePoints); + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); int getValidBigramEntryCount() const { return mValidBigramEntryCount; @@ -111,19 +100,17 @@ class DynamicPatriciaTrieGcEventListeners { private: DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability); - DynamicBigramListPolicy *const mBigramPolicy; + PtNodeWriter *const mPtNodeWriter; int mValidBigramEntryCount; }; class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer - : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { + : public DynamicPtReadingHelper::TraversingEventListener { public: TraversePolicyToPlaceAndWriteValidPtNodesToBuffer( - DynamicPatriciaTrieWritingHelper *const writingHelper, - BufferWithExtendableBuffer *const bufferToWrite, - DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const - dictPositionRelocationMap) - : mWritingHelper(writingHelper), mBufferToWrite(bufferToWrite), + PtNodeWriter *const ptNodeWriter, BufferWithExtendableBuffer *const bufferToWrite, + PtNodeWriter::DictPositionRelocationMap *const dictPositionRelocationMap) + : mPtNodeWriter(ptNodeWriter), mBufferToWrite(bufferToWrite), mDictPositionRelocationMap(dictPositionRelocationMap), mValidPtNodeCount(0), mPtNodeArraySizeFieldPos(NOT_A_DICT_POS) {}; @@ -133,31 +120,24 @@ class DynamicPatriciaTrieGcEventListeners { bool onReadingPtNodeArrayTail(); - bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, - const int *const nodeCodePoints); + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); private: DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToPlaceAndWriteValidPtNodesToBuffer); - DynamicPatriciaTrieWritingHelper *const mWritingHelper; + PtNodeWriter *const mPtNodeWriter; BufferWithExtendableBuffer *const mBufferToWrite; - DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const - mDictPositionRelocationMap; + PtNodeWriter::DictPositionRelocationMap *const mDictPositionRelocationMap; int mValidPtNodeCount; int mPtNodeArraySizeFieldPos; }; class TraversePolicyToUpdateAllPositionFields - : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { + : public DynamicPtReadingHelper::TraversingEventListener { public: - TraversePolicyToUpdateAllPositionFields( - DynamicPatriciaTrieWritingHelper *const writingHelper, - DynamicBigramListPolicy *const bigramPolicy, - BufferWithExtendableBuffer *const bufferToWrite, - const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const - dictPositionRelocationMap) - : mWritingHelper(writingHelper), mBigramPolicy(bigramPolicy), - mBufferToWrite(bufferToWrite), + TraversePolicyToUpdateAllPositionFields(PtNodeWriter *const ptNodeWriter, + const PtNodeWriter::DictPositionRelocationMap *const dictPositionRelocationMap) + : mPtNodeWriter(ptNodeWriter), mDictPositionRelocationMap(dictPositionRelocationMap), mUnigramCount(0), mBigramCount(0) {}; @@ -167,8 +147,7 @@ class DynamicPatriciaTrieGcEventListeners { bool onReadingPtNodeArrayTail() { return true; } - bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, - const int *const nodeCodePoints); + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); int getUnigramCount() const { return mUnigramCount; @@ -181,17 +160,14 @@ class DynamicPatriciaTrieGcEventListeners { private: DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields); - DynamicPatriciaTrieWritingHelper *const mWritingHelper; - DynamicBigramListPolicy *const mBigramPolicy; - BufferWithExtendableBuffer *const mBufferToWrite; - const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const - mDictPositionRelocationMap; + PtNodeWriter *const mPtNodeWriter; + const PtNodeWriter::DictPositionRelocationMap *const mDictPositionRelocationMap; int mUnigramCount; int mBigramCount; }; private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieGcEventListeners); + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtGcEventListeners); }; } // namespace latinime -#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_GC_EVENT_LISTENERS_H */ +#endif /* LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp new file mode 100644 index 000000000..086d98b4a --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp @@ -0,0 +1,326 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" + +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h" +#include "utils/char_utils.h" + +namespace latinime { + +// To avoid infinite loop caused by invalid or malicious forward links. +const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; +const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000; +const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH; + +bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode( + const PtNodeParams *const ptNodeParams) { + if (ptNodeParams->isTerminal() && !ptNodeParams->isDeleted()) { + mTerminalPositions->push_back(ptNodeParams->getHeadPos()); + } + return true; +} + +// Visits all PtNodes in post-order depth first manner. +// For example, visits c -> b -> y -> x -> a for the following dictionary: +// a _ b _ c +// \ x _ y +bool DynamicPtReadingHelper::traverseAllPtNodesInPostorderDepthFirstManner( + TraversingEventListener *const listener) { + bool alreadyVisitedChildren = false; + // Descend from the root to the root PtNode array. + if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { + return false; + } + while (!isEnd()) { + const PtNodeParams ptNodeParams(getPtNodeParams()); + if (!ptNodeParams.isValid()) { + break; + } + if (!alreadyVisitedChildren) { + if (ptNodeParams.hasChildren()) { + // Move to the first child. + if (!listener->onDescend(ptNodeParams.getChildrenPos())) { + return false; + } + pushReadingStateToStack(); + readChildNode(ptNodeParams); + } else { + alreadyVisitedChildren = true; + } + } else { + if (!listener->onVisitingPtNode(&ptNodeParams)) { + return false; + } + readNextSiblingNode(ptNodeParams); + if (isEnd()) { + // All PtNodes in current linked PtNode arrays have been visited. + // Return to the parent. + if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + if (mReadingStateStack.size() <= 0) { + break; + } + if (!listener->onAscend()) { + return false; + } + popReadingStateFromStack(); + alreadyVisitedChildren = true; + } else { + // Process sibling PtNode. + alreadyVisitedChildren = false; + } + } + } + // Ascend from the root PtNode array to the root. + if (!listener->onAscend()) { + return false; + } + return !isError(); +} + +// Visits all PtNodes in PtNode array level pre-order depth first manner, which is the same order +// that PtNodes are written in the dictionary buffer. +// For example, visits a -> b -> x -> c -> y for the following dictionary: +// a _ b _ c +// \ x _ y +bool DynamicPtReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + TraversingEventListener *const listener) { + bool alreadyVisitedAllPtNodesInArray = false; + bool alreadyVisitedChildren = false; + // Descend from the root to the root PtNode array. + if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { + return false; + } + if (isEnd()) { + // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array. + if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + } + pushReadingStateToStack(); + while (!isEnd()) { + const PtNodeParams ptNodeParams(getPtNodeParams()); + if (!ptNodeParams.isValid()) { + break; + } + if (alreadyVisitedAllPtNodesInArray) { + if (alreadyVisitedChildren) { + // Move to next sibling PtNode's children. + readNextSiblingNode(ptNodeParams); + if (isEnd()) { + // Return to the parent PTNode. + if (!listener->onAscend()) { + return false; + } + if (mReadingStateStack.size() <= 0) { + break; + } + popReadingStateFromStack(); + alreadyVisitedChildren = true; + alreadyVisitedAllPtNodesInArray = true; + } else { + alreadyVisitedChildren = false; + } + } else { + if (ptNodeParams.hasChildren()) { + // Move to the first child. + if (!listener->onDescend(ptNodeParams.getChildrenPos())) { + return false; + } + pushReadingStateToStack(); + readChildNode(ptNodeParams); + // Push state to return the head of PtNode array. + pushReadingStateToStack(); + alreadyVisitedAllPtNodesInArray = false; + alreadyVisitedChildren = false; + } else { + alreadyVisitedChildren = true; + } + } + } else { + if (!listener->onVisitingPtNode(&ptNodeParams)) { + return false; + } + readNextSiblingNode(ptNodeParams); + if (isEnd()) { + if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + // Return to the head of current PtNode array. + popReadingStateFromStack(); + alreadyVisitedAllPtNodesInArray = true; + } + } + } + popReadingStateFromStack(); + // Ascend from the root PtNode array to the root. + if (!listener->onAscend()) { + return false; + } + return !isError(); +} + +int DynamicPtReadingHelper::getCodePointsAndProbabilityAndReturnCodePointCount( + const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) { + // This method traverses parent nodes from the terminal by following parent pointers; thus, + // node code points are stored in the buffer in the reverse order. + int reverseCodePoints[maxCodePointCount]; + const PtNodeParams terminalPtNodeParams(getPtNodeParams()); + // First, read the terminal node and get its probability. + if (!isValidTerminalNode(terminalPtNodeParams)) { + // Node at the ptNodePos is not a valid terminal node. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; + } + // Store terminal node probability. + *outUnigramProbability = terminalPtNodeParams.getProbability(); + // Then, following parent node link to the dictionary root and fetch node code points. + int totalCodePointCount = 0; + while (!isEnd()) { + const PtNodeParams ptNodeParams(getPtNodeParams()); + totalCodePointCount = getTotalCodePointCount(ptNodeParams); + if (!ptNodeParams.isValid() || totalCodePointCount > maxCodePointCount) { + // The ptNodePos is not a valid terminal node position in the dictionary. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; + } + // Store node code points to buffer in the reverse order. + fetchMergedNodeCodePointsInReverseOrder(ptNodeParams, getPrevTotalCodePointCount(), + reverseCodePoints); + // Follow parent node toward the root node. + readParentNode(ptNodeParams); + } + if (isError()) { + // The node position or the dictionary is invalid. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; + } + // Reverse the stored code points to output them. + for (int i = 0; i < totalCodePointCount; ++i) { + outCodePoints[i] = reverseCodePoints[totalCodePointCount - i - 1]; + } + return totalCodePointCount; +} + +int DynamicPtReadingHelper::getTerminalPtNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) { + int searchCodePoints[length]; + for (int i = 0; i < length; ++i) { + searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i]; + } + while (!isEnd()) { + const PtNodeParams ptNodeParams(getPtNodeParams()); + const int matchedCodePointCount = getPrevTotalCodePointCount(); + if (getTotalCodePointCount(ptNodeParams) > length + || !isMatchedCodePoint(ptNodeParams, 0 /* index */, + searchCodePoints[matchedCodePointCount])) { + // Current node has too many code points or its first code point is different from + // target code point. Skip this node and read the next sibling node. + readNextSiblingNode(ptNodeParams); + continue; + } + // Check following merged node code points. + const int nodeCodePointCount = ptNodeParams.getCodePointCount(); + for (int j = 1; j < nodeCodePointCount; ++j) { + if (!isMatchedCodePoint(ptNodeParams, j, searchCodePoints[matchedCodePointCount + j])) { + // Different code point is found. The given word is not included in the dictionary. + return NOT_A_DICT_POS; + } + } + // All characters are matched. + if (length == getTotalCodePointCount(ptNodeParams)) { + if (!ptNodeParams.isTerminal()) { + return NOT_A_DICT_POS; + } + // Terminal position is found. + return ptNodeParams.getHeadPos(); + } + if (!ptNodeParams.hasChildren()) { + return NOT_A_DICT_POS; + } + // Advance to the children nodes. + readChildNode(ptNodeParams); + } + // If we already traversed the tree further than the word is long, there means + // there was no match (or we would have found it). + return NOT_A_DICT_POS; +} + +// Read node array size and process empty node arrays. Nodes and arrays are counted up in this +// method to avoid an infinite loop. +void DynamicPtReadingHelper::nextPtNodeArray() { + int ptNodeCountInArray = 0; + int firstPtNodePos = NOT_A_DICT_POS; + if (!mPtNodeArrayReader->readPtNodeArrayInfoAndReturnIfValid( + mReadingState.mPos, &ptNodeCountInArray, &firstPtNodePos)) { + mIsError = true; + mReadingState.mPos = NOT_A_DICT_POS; + return; + } + mReadingState.mPosOfThisPtNodeArrayHead = mReadingState.mPos; + mReadingState.mRemainingPtNodeCountInThisArray = ptNodeCountInArray; + mReadingState.mPos = firstPtNodePos; + // Count up nodes and node arrays to avoid infinite loop. + mReadingState.mTotalPtNodeIndexInThisArrayChain += + mReadingState.mRemainingPtNodeCountInThisArray; + mReadingState.mPtNodeArrayIndexInThisArrayChain++; + if (mReadingState.mRemainingPtNodeCountInThisArray < 0 + || mReadingState.mTotalPtNodeIndexInThisArrayChain + > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP + || mReadingState.mPtNodeArrayIndexInThisArrayChain + > MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) { + // Invalid dictionary. + AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d" + "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d", + mReadingState.mRemainingPtNodeCountInThisArray, + mReadingState.mTotalPtNodeIndexInThisArrayChain, + MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, + mReadingState.mPtNodeArrayIndexInThisArrayChain, + MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP); + ASSERT(false); + mIsError = true; + mReadingState.mPos = NOT_A_DICT_POS; + return; + } + if (mReadingState.mRemainingPtNodeCountInThisArray == 0) { + // Empty node array. Try following forward link. + followForwardLink(); + } +} + +// Follow the forward link and read the next node array if exists. +void DynamicPtReadingHelper::followForwardLink() { + int nextPtNodeArrayPos = NOT_A_DICT_POS; + if (!mPtNodeArrayReader->readForwardLinkAndReturnIfValid( + mReadingState.mPos, &nextPtNodeArrayPos)) { + mIsError = true; + mReadingState.mPos = NOT_A_DICT_POS; + return; + } + mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos; + if (nextPtNodeArrayPos != NOT_A_DICT_POS) { + // Follow the forward link. + mReadingState.mPos = nextPtNodeArrayPos; + nextPtNodeArray(); + } else { + // All node arrays have been read. + mReadingState.mPos = NOT_A_DICT_POS; + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h new file mode 100644 index 000000000..cc7b5ff70 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h @@ -0,0 +1,284 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PT_READING_HELPER_H +#define LATINIME_DYNAMIC_PT_READING_HELPER_H + +#include <cstddef> +#include <vector> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" + +namespace latinime { + +class DictionaryBigramsStructurePolicy; +class DictionaryShortcutsStructurePolicy; +class PtNodeArrayReader; + +/* + * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and + * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop. + */ +class DynamicPtReadingHelper { + public: + class TraversingEventListener { + public: + virtual ~TraversingEventListener() {}; + + // Returns whether the event handling was succeeded or not. + virtual bool onAscend() = 0; + + // Returns whether the event handling was succeeded or not. + virtual bool onDescend(const int ptNodeArrayPos) = 0; + + // Returns whether the event handling was succeeded or not. + virtual bool onReadingPtNodeArrayTail() = 0; + + // Returns whether the event handling was succeeded or not. + virtual bool onVisitingPtNode(const PtNodeParams *const node) = 0; + + protected: + TraversingEventListener() {}; + + private: + DISALLOW_COPY_AND_ASSIGN(TraversingEventListener); + }; + + class TraversePolicyToGetAllTerminalPtNodePositions : public TraversingEventListener { + public: + TraversePolicyToGetAllTerminalPtNodePositions(std::vector<int> *const terminalPositions) + : mTerminalPositions(terminalPositions) {} + bool onAscend() { return true; } + bool onDescend(const int ptNodeArrayPos) { return true; } + bool onReadingPtNodeArrayTail() { return true; } + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToGetAllTerminalPtNodePositions); + + std::vector<int> *const mTerminalPositions; + }; + + DynamicPtReadingHelper(const PtNodeReader *const ptNodeReader, + const PtNodeArrayReader *const ptNodeArrayReader) + : mIsError(false), mReadingState(), mPtNodeReader(ptNodeReader), + mPtNodeArrayReader(ptNodeArrayReader), mReadingStateStack() {} + + ~DynamicPtReadingHelper() {} + + AK_FORCE_INLINE bool isError() const { + return mIsError; + } + + AK_FORCE_INLINE bool isEnd() const { + return mReadingState.mPos == NOT_A_DICT_POS; + } + + // Initialize reading state with the head position of a PtNode array. + AK_FORCE_INLINE void initWithPtNodeArrayPos(const int ptNodeArrayPos) { + if (ptNodeArrayPos == NOT_A_DICT_POS) { + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mIsError = false; + mReadingState.mPos = ptNodeArrayPos; + mReadingState.mTotalCodePointCountSinceInitialization = 0; + mReadingState.mTotalPtNodeIndexInThisArrayChain = 0; + mReadingState.mPtNodeArrayIndexInThisArrayChain = 0; + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingStateStack.clear(); + nextPtNodeArray(); + } + } + + // Initialize reading state with the head position of a node. + AK_FORCE_INLINE void initWithPtNodePos(const int ptNodePos) { + if (ptNodePos == NOT_A_DICT_POS) { + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mIsError = false; + mReadingState.mPos = ptNodePos; + mReadingState.mRemainingPtNodeCountInThisArray = 1; + mReadingState.mTotalCodePointCountSinceInitialization = 0; + mReadingState.mTotalPtNodeIndexInThisArrayChain = 1; + mReadingState.mPtNodeArrayIndexInThisArrayChain = 1; + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingState.mPosOfThisPtNodeArrayHead = NOT_A_DICT_POS; + mReadingStateStack.clear(); + } + } + + AK_FORCE_INLINE const PtNodeParams getPtNodeParams() const { + if (isEnd()) { + return PtNodeParams(); + } + return mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(mReadingState.mPos); + } + + AK_FORCE_INLINE bool isValidTerminalNode(const PtNodeParams &ptNodeParams) const { + return !isEnd() && !ptNodeParams.isDeleted() && ptNodeParams.isTerminal(); + } + + AK_FORCE_INLINE bool isMatchedCodePoint(const PtNodeParams &ptNodeParams, const int index, + const int codePoint) const { + return ptNodeParams.getCodePoints()[index] == codePoint; + } + + // Return code point count exclude the last read node's code points. + AK_FORCE_INLINE int getPrevTotalCodePointCount() const { + return mReadingState.mTotalCodePointCountSinceInitialization; + } + + // Return code point count include the last read node's code points. + AK_FORCE_INLINE int getTotalCodePointCount(const PtNodeParams &ptNodeParams) const { + return mReadingState.mTotalCodePointCountSinceInitialization + + ptNodeParams.getCodePointCount(); + } + + AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder(const PtNodeParams &ptNodeParams, + const int index, int *const outCodePoints) const { + const int nodeCodePointCount = ptNodeParams.getCodePointCount(); + const int *const nodeCodePoints = ptNodeParams.getCodePoints(); + for (int i = 0; i < nodeCodePointCount; ++i) { + outCodePoints[index + i] = nodeCodePoints[nodeCodePointCount - 1 - i]; + } + } + + AK_FORCE_INLINE void readNextSiblingNode(const PtNodeParams &ptNodeParams) { + mReadingState.mRemainingPtNodeCountInThisArray -= 1; + mReadingState.mPos = ptNodeParams.getSiblingNodePos(); + if (mReadingState.mRemainingPtNodeCountInThisArray <= 0) { + // All nodes in the current node array have been read. + followForwardLink(); + } + } + + // Read the first child node of the current node. + AK_FORCE_INLINE void readChildNode(const PtNodeParams &ptNodeParams) { + if (ptNodeParams.hasChildren()) { + mReadingState.mTotalCodePointCountSinceInitialization += + ptNodeParams.getCodePointCount(); + mReadingState.mTotalPtNodeIndexInThisArrayChain = 0; + mReadingState.mPtNodeArrayIndexInThisArrayChain = 0; + mReadingState.mPos = ptNodeParams.getChildrenPos(); + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + // Read children node array. + nextPtNodeArray(); + } else { + mReadingState.mPos = NOT_A_DICT_POS; + } + } + + // Read the parent node of the current node. + AK_FORCE_INLINE void readParentNode(const PtNodeParams &ptNodeParams) { + if (ptNodeParams.getParentPos() != NOT_A_DICT_POS) { + mReadingState.mTotalCodePointCountSinceInitialization += + ptNodeParams.getCodePointCount(); + mReadingState.mTotalPtNodeIndexInThisArrayChain = 1; + mReadingState.mPtNodeArrayIndexInThisArrayChain = 1; + mReadingState.mRemainingPtNodeCountInThisArray = 1; + mReadingState.mPos = ptNodeParams.getParentPos(); + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingState.mPosOfThisPtNodeArrayHead = NOT_A_DICT_POS; + } else { + mReadingState.mPos = NOT_A_DICT_POS; + } + } + + AK_FORCE_INLINE int getPosOfLastForwardLinkField() const { + return mReadingState.mPosOfLastForwardLinkField; + } + + AK_FORCE_INLINE int getPosOfLastPtNodeArrayHead() const { + return mReadingState.mPosOfThisPtNodeArrayHead; + } + + bool traverseAllPtNodesInPostorderDepthFirstManner(TraversingEventListener *const listener); + + bool traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + TraversingEventListener *const listener); + + int getCodePointsAndProbabilityAndReturnCodePointCount(const int maxCodePointCount, + int *const outCodePoints, int *const outUnigramProbability); + + int getTerminalPtNodePositionOfWord(const int *const inWord, const int length, + const bool forceLowerCaseSearch); + + private: + DISALLOW_COPY_AND_ASSIGN(DynamicPtReadingHelper); + + // This class encapsulates the reading state of a position in the dictionary. It points at a + // specific PtNode in the dictionary. + class PtNodeReadingState { + public: + // Note that copy constructor and assignment operator are used for this class to use + // std::vector. + PtNodeReadingState() : mPos(NOT_A_DICT_POS), mRemainingPtNodeCountInThisArray(0), + mTotalCodePointCountSinceInitialization(0), mTotalPtNodeIndexInThisArrayChain(0), + mPtNodeArrayIndexInThisArrayChain(0), mPosOfLastForwardLinkField(NOT_A_DICT_POS), + mPosOfThisPtNodeArrayHead(NOT_A_DICT_POS) {} + + int mPos; + // Remaining node count in the current array. + int mRemainingPtNodeCountInThisArray; + int mTotalCodePointCountSinceInitialization; + // Counter of PtNodes used to avoid infinite loops caused by broken or malicious links. + int mTotalPtNodeIndexInThisArrayChain; + // Counter of PtNode arrays used to avoid infinite loops caused by cyclic links of empty + // PtNode arrays. + int mPtNodeArrayIndexInThisArrayChain; + int mPosOfLastForwardLinkField; + int mPosOfThisPtNodeArrayHead; + }; + + static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; + static const int MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP; + static const size_t MAX_READING_STATE_STACK_SIZE; + + // TODO: Introduce error code to track what caused the error. + bool mIsError; + PtNodeReadingState mReadingState; + const PtNodeReader *const mPtNodeReader; + const PtNodeArrayReader *const mPtNodeArrayReader; + std::vector<PtNodeReadingState> mReadingStateStack; + + void nextPtNodeArray(); + + void followForwardLink(); + + AK_FORCE_INLINE void pushReadingStateToStack() { + if (mReadingStateStack.size() > MAX_READING_STATE_STACK_SIZE) { + AKLOGI("Reading state stack overflow. Max size: %zd", MAX_READING_STATE_STACK_SIZE); + ASSERT(false); + mIsError = true; + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mReadingStateStack.push_back(mReadingState); + } + } + + AK_FORCE_INLINE void popReadingStateFromStack() { + if (mReadingStateStack.empty()) { + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mReadingState = mReadingStateStack.back(); + mReadingStateStack.pop_back(); + } + } +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PT_READING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp index d68446db6..3586b50ab 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp @@ -14,38 +14,38 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" #include "defines.h" #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" namespace latinime { -typedef DynamicPatriciaTrieReadingUtils DptReadingUtils; - -const DptReadingUtils::NodeFlags DptReadingUtils::MASK_MOVED = 0xC0; -const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_NOT_MOVED = 0xC0; -const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_MOVED = 0x40; -const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_DELETED = 0x80; +const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::MASK_MOVED = 0xC0; +const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_NOT_MOVED = 0xC0; +const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_MOVED = 0x40; +const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_DELETED = 0x80; +const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_WILL_BECOME_NON_TERMINAL = 0x00; // TODO: Make DICT_OFFSET_ZERO_OFFSET = 0. // Currently, DICT_OFFSET_INVALID is 0 in Java side but offset can be 0 during GC. So, the maximum // value of offsets, which is 0x7FFFFF is used to represent 0 offset. -const int DptReadingUtils::DICT_OFFSET_INVALID = 0; -const int DptReadingUtils::DICT_OFFSET_ZERO_OFFSET = 0x7FFFFF; +const int DynamicPtReadingUtils::DICT_OFFSET_INVALID = 0; +const int DynamicPtReadingUtils::DICT_OFFSET_ZERO_OFFSET = 0x7FFFFF; -/* static */ int DptReadingUtils::getForwardLinkPosition(const uint8_t *const buffer, +/* static */ int DynamicPtReadingUtils::getForwardLinkPosition(const uint8_t *const buffer, const int pos) { int linkAddressPos = pos; return ByteArrayUtils::readSint24AndAdvancePosition(buffer, &linkAddressPos); } -/* static */ int DptReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( +/* static */ int DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( const uint8_t *const buffer, int *const pos) { return ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); } -/* static */ int DptReadingUtils::getParentPtNodePos(const int parentOffset, const int ptNodePos) { +/* static */ int DynamicPtReadingUtils::getParentPtNodePos(const int parentOffset, + const int ptNodePos) { if (parentOffset == DICT_OFFSET_INVALID) { return NOT_A_DICT_POS; } else if (parentOffset == DICT_OFFSET_ZERO_OFFSET) { @@ -55,7 +55,7 @@ const int DptReadingUtils::DICT_OFFSET_ZERO_OFFSET = 0x7FFFFF; } } -/* static */ int DptReadingUtils::readChildrenPositionAndAdvancePosition( +/* static */ int DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition( const uint8_t *const buffer, int *const pos) { const int base = *pos; const int offset = ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h index 67c3cc57e..89ae12c0b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_UTILS_H -#define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_UTILS_H +#ifndef LATINIME_DYNAMIC_PT_READING_UTILS_H +#define LATINIME_DYNAMIC_PT_READING_UTILS_H #include <stdint.h> @@ -23,7 +23,7 @@ namespace latinime { -class DynamicPatriciaTrieReadingUtils { +class DynamicPtReadingUtils { public: typedef uint8_t NodeFlags; @@ -54,22 +54,30 @@ class DynamicPatriciaTrieReadingUtils { return FLAG_IS_DELETED == (MASK_MOVED & flags); } + static AK_FORCE_INLINE bool willBecomeNonTerminal(const NodeFlags flags) { + return FLAG_WILL_BECOME_NON_TERMINAL == (MASK_MOVED & flags); + } + static AK_FORCE_INLINE NodeFlags updateAndGetFlags(const NodeFlags originalFlags, - const bool isMoved, const bool isDeleted) { + const bool isMoved, const bool isDeleted, const bool willBecomeNonTerminal) { NodeFlags flags = originalFlags; + flags = willBecomeNonTerminal ? + ((flags & (~MASK_MOVED)) | FLAG_WILL_BECOME_NON_TERMINAL) : flags; flags = isMoved ? ((flags & (~MASK_MOVED)) | FLAG_IS_MOVED) : flags; flags = isDeleted ? ((flags & (~MASK_MOVED)) | FLAG_IS_DELETED) : flags; - flags = (!isMoved && !isDeleted) ? ((flags & (~MASK_MOVED)) | FLAG_IS_NOT_MOVED) : flags; + flags = (!isMoved && !isDeleted && !willBecomeNonTerminal) ? + ((flags & (~MASK_MOVED)) | FLAG_IS_NOT_MOVED) : flags; return flags; } private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieReadingUtils); + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtReadingUtils); static const NodeFlags MASK_MOVED; static const NodeFlags FLAG_IS_NOT_MOVED; static const NodeFlags FLAG_IS_MOVED; static const NodeFlags FLAG_IS_DELETED; + static const NodeFlags FLAG_WILL_BECOME_NON_TERMINAL; }; } // namespace latinime -#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_UTILS_H */ +#endif /* LATINIME_DYNAMIC_PT_READING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp new file mode 100644 index 000000000..2457b49c8 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp @@ -0,0 +1,294 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" + +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" +#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; + +bool DynamicPtUpdatingHelper::addUnigramWord( + DynamicPtReadingHelper *const readingHelper, + const int *const wordCodePoints, const int codePointCount, const int probability, + const bool isNotAWord, const bool isBlacklisted, const int timestamp, + bool *const outAddedNewUnigram) { + int parentPos = NOT_A_DICT_POS; + while (!readingHelper->isEnd()) { + const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); + if (!ptNodeParams.isValid()) { + break; + } + const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); + if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */, + wordCodePoints[matchedCodePointCount])) { + // The first code point is different from target code point. Skip this node and read + // the next sibling node. + readingHelper->readNextSiblingNode(ptNodeParams); + continue; + } + // Check following merged node code points. + const int nodeCodePointCount = ptNodeParams.getCodePointCount(); + for (int j = 1; j < nodeCodePointCount; ++j) { + const int nextIndex = matchedCodePointCount + j; + if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(ptNodeParams, j, + wordCodePoints[matchedCodePointCount + j])) { + *outAddedNewUnigram = true; + return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, isNotAWord, isBlacklisted, + probability, timestamp, wordCodePoints + matchedCodePointCount, + codePointCount - matchedCodePointCount); + } + } + // All characters are matched. + if (codePointCount == readingHelper->getTotalCodePointCount(ptNodeParams)) { + return setPtNodeProbability(&ptNodeParams, isNotAWord, isBlacklisted, probability, + timestamp, outAddedNewUnigram); + } + if (!ptNodeParams.hasChildren()) { + *outAddedNewUnigram = true; + return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, + isNotAWord, isBlacklisted, probability, timestamp, + wordCodePoints + readingHelper->getTotalCodePointCount(ptNodeParams), + codePointCount - readingHelper->getTotalCodePointCount(ptNodeParams)); + } + // Advance to the children nodes. + parentPos = ptNodeParams.getHeadPos(); + readingHelper->readChildNode(ptNodeParams); + } + if (readingHelper->isError()) { + // The dictionary is invalid. + return false; + } + int pos = readingHelper->getPosOfLastForwardLinkField(); + *outAddedNewUnigram = true; + return createAndInsertNodeIntoPtNodeArray(parentPos, + wordCodePoints + readingHelper->getPrevTotalCodePointCount(), + codePointCount - readingHelper->getPrevTotalCodePointCount(), + isNotAWord, isBlacklisted, probability, timestamp, &pos); +} + +bool DynamicPtUpdatingHelper::addBigramWords(const int word0Pos, const int word1Pos, + const int probability, const int timestamp, bool *const outAddedNewBigram) { + const PtNodeParams sourcePtNodeParams( + mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word0Pos)); + const PtNodeParams targetPtNodeParams( + mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word1Pos)); + return mPtNodeWriter->addNewBigramEntry(&sourcePtNodeParams, &targetPtNodeParams, probability, + timestamp, outAddedNewBigram); +} + +// Remove a bigram relation from word0Pos to word1Pos. +bool DynamicPtUpdatingHelper::removeBigramWords(const int word0Pos, const int word1Pos) { + const PtNodeParams sourcePtNodeParams( + mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word0Pos)); + const PtNodeParams targetPtNodeParams( + mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word1Pos)); + return mPtNodeWriter->removeBigramEntry(&sourcePtNodeParams, &targetPtNodeParams); +} + +bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability) { + const PtNodeParams ptNodeParams(mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(wordPos)); + return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints, targetCodePointCount, + shortcutProbability); +} + +bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, + const int *const nodeCodePoints, const int nodeCodePointCount, + const bool isNotAWord, const bool isBlacklisted, const int probability, + const int timestamp, int *const forwardLinkFieldPos) { + const int newPtNodeArrayPos = mBuffer->getTailPosition(); + if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + newPtNodeArrayPos, forwardLinkFieldPos)) { + return false; + } + return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount, + isNotAWord, isBlacklisted, probability, timestamp); +} + +bool DynamicPtUpdatingHelper::setPtNodeProbability( + const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, + const bool isBlacklisted, const int probability, const int timestamp, + bool *const outAddedNewUnigram) { + if (originalPtNodeParams->isTerminal()) { + // Overwrites the probability. + *outAddedNewUnigram = false; + return mPtNodeWriter->updatePtNodeProbability(originalPtNodeParams, probability, timestamp); + } else { + // Make the node terminal and write the probability. + *outAddedNewUnigram = true; + const int movedPos = mBuffer->getTailPosition(); + int writingPos = movedPos; + const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams, + isNotAWord, isBlacklisted, true /* isTerminal */, + originalPtNodeParams->getParentPos(), originalPtNodeParams->getCodePointCount(), + originalPtNodeParams->getCodePoints(), probability)); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, + timestamp, &writingPos)) { + return false; + } + if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) { + return false; + } + } + return true; +} + +bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode( + const PtNodeParams *const parentPtNodeParams, const bool isNotAWord, + const bool isBlacklisted, const int probability, const int timestamp, + const int *const codePoints, const int codePointCount) { + const int newPtNodeArrayPos = mBuffer->getTailPosition(); + if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) { + return false; + } + return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints, + codePointCount, isNotAWord, isBlacklisted, probability, timestamp); +} + +bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( + const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount, + const bool isNotAWord, const bool isBlacklisted, const int probability, + const int timestamp) { + int writingPos = mBuffer->getTailPosition(); + if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, + 1 /* arraySize */, &writingPos)) { + return false; + } + const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( + isNotAWord, isBlacklisted, true /* isTerminal */, + parentPtNodePos, nodeCodePointCount, nodeCodePoints, probability)); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, timestamp, + &writingPos)) { + return false; + } + if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + return true; +} + +// Returns whether the dictionary updating was succeeded or not. +bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( + const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, + const bool isNotAWord, const bool isBlacklisted, const int probabilityOfNewPtNode, + const int timestamp, const int *const newNodeCodePoints, const int newNodeCodePointCount) { + // When addsExtraChild is true, split the reallocating PtNode and add new child. + // Reallocating PtNode: abcde, newNode: abcxy. + // abc (1st, not terminal) __ de (2nd) + // \_ xy (extra child, terminal) + // Otherwise, this method makes 1st part terminal and write probabilityOfNewPtNode. + // Reallocating PtNode: abcde, newNode: abc. + // abc (1st, terminal) __ de (2nd) + const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount; + const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); + int writingPos = firstPartOfReallocatedPtNodePos; + // Write the 1st part of the reallocating node. The children position will be updated later + // with actual children position. + if (addsExtraChild) { + const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( + false /* isNotAWord */, false /* isBlacklisted */, false /* isTerminal */, + reallocatingPtNodeParams->getParentPos(), overlappingCodePointCount, + reallocatingPtNodeParams->getCodePoints(), NOT_A_PROBABILITY)); + if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { + return false; + } + } else { + const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( + isNotAWord, isBlacklisted, true /* isTerminal */, + reallocatingPtNodeParams->getParentPos(), overlappingCodePointCount, + reallocatingPtNodeParams->getCodePoints(), probabilityOfNewPtNode)); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, + timestamp, &writingPos)) { + return false; + } + } + const int actualChildrenPos = writingPos; + // Create new children PtNode array. + const size_t newPtNodeCount = addsExtraChild ? 2 : 1; + if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, + newPtNodeCount, &writingPos)) { + return false; + } + // Write the 2nd part of the reallocating node. + const int secondPartOfReallocatedPtNodePos = writingPos; + const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams, + reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isBlacklisted(), + reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos, + reallocatingPtNodeParams->getCodePointCount() - overlappingCodePointCount, + reallocatingPtNodeParams->getCodePoints() + overlappingCodePointCount, + reallocatingPtNodeParams->getProbability())); + if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) { + return false; + } + if (addsExtraChild) { + const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( + isNotAWord, isBlacklisted, true /* isTerminal */, + firstPartOfReallocatedPtNodePos, newNodeCodePointCount - overlappingCodePointCount, + newNodeCodePoints + overlappingCodePointCount, probabilityOfNewPtNode)); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams, + timestamp, &writingPos)) { + return false; + } + } + if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + // Update original reallocating PtNode as moved. + if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos, + secondPartOfReallocatedPtNodePos)) { + return false; + } + // Load node info. Information of the 1st part will be fetched. + const PtNodeParams ptNodeParams( + mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos)); + // Update children position. + return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos); +} + +const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams( + const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, + const bool isBlacklisted, const bool isTerminal, const int parentPos, + const int codePointCount, const int *const codePoints, const int probability) const { + const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( + isBlacklisted, isNotAWord, isTerminal, originalPtNodeParams->hasShortcutTargets(), + originalPtNodeParams->hasBigrams(), codePointCount > 1 /* hasMultipleChars */, + CHILDREN_POSITION_FIELD_SIZE); + return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints, + probability); +} + +const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode( + const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, + const int parentPos, const int codePointCount, const int *const codePoints, + const int probability) const { + const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( + isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */, + false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */, + CHILDREN_POSITION_FIELD_SIZE); + return PtNodeParams(flags, parentPos, codePointCount, codePoints, probability); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h new file mode 100644 index 000000000..71f473096 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PT_UPDATING_HELPER_H +#define LATINIME_DYNAMIC_PT_UPDATING_HELPER_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "utils/hash_map_compat.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class DynamicPtReadingHelper; +class PtNodeReader; +class PtNodeWriter; + +class DynamicPtUpdatingHelper { + public: + DynamicPtUpdatingHelper(BufferWithExtendableBuffer *const buffer, + const PtNodeReader *const ptNodeReader, PtNodeWriter *const ptNodeWriter) + : mBuffer(buffer), mPtNodeReader(ptNodeReader), mPtNodeWriter(ptNodeWriter) {} + + ~DynamicPtUpdatingHelper() {} + + // Add a word to the dictionary. If the word already exists, update the probability. + bool addUnigramWord(DynamicPtReadingHelper *const readingHelper, + const int *const wordCodePoints, const int codePointCount, const int probability, + const bool isNotAWord, const bool isBlacklisted, const int timestamp, + bool *const outAddedNewUnigram); + + // Add a bigram relation from word0Pos to word1Pos. + bool addBigramWords(const int word0Pos, const int word1Pos, const int probability, + const int timestamp, bool *const outAddedNewBigram); + + // Remove a bigram relation from word0Pos to word1Pos. + bool removeBigramWords(const int word0Pos, const int word1Pos); + + // Add a shortcut target. + bool addShortcutTarget(const int wordPos, const int *const targetCodePoints, + const int targetCodePointCount, const int shortcutProbability); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtUpdatingHelper); + + static const int CHILDREN_POSITION_FIELD_SIZE; + + BufferWithExtendableBuffer *const mBuffer; + const PtNodeReader *const mPtNodeReader; + PtNodeWriter *const mPtNodeWriter; + + bool createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, + const int nodeCodePointCount, const bool isNotAWord, const bool isBlacklisted, + const int probability, const int timestamp, int *const forwardLinkFieldPos); + + bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, + const bool isBlacklisted, const int probability, const int timestamp, + bool *const outAddedNewUnigram); + + bool createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams *const parentPtNodeParams, + const bool isNotAWord, const bool isBlacklisted, const int probability, + const int timestamp, const int *const codePoints, const int codePointCount); + + bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, const int *const nodeCodePoints, + const int nodeCodePointCount, const bool isNotAWord, const bool isBlacklisted, + const int probability, const int timestamp); + + bool reallocatePtNodeAndAddNewPtNodes( + const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, + const bool isNotAWord, const bool isBlacklisted, const int probabilityOfNewPtNode, + const int timestamp, const int *const newNodeCodePoints, + const int newNodeCodePointCount); + + const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams, + const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, + const int parentPos, const int codePointCount, + const int *const codePoints, const int probability) const; + + const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord, const bool isBlacklisted, + const bool isTerminal, const int parentPos, + const int codePointCount, const int *const codePoints, const int probability) const; +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_UPDATING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp index 30ff10cd6..ebbdc2ea2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" #include <cstddef> #include <cstdlib> @@ -24,19 +24,18 @@ namespace latinime { -const size_t DynamicPatriciaTrieWritingUtils::MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD = 0x7F; -const size_t DynamicPatriciaTrieWritingUtils::MAX_PTNODE_ARRAY_SIZE = 0x7FFF; -const int DynamicPatriciaTrieWritingUtils::SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE = 1; -const int DynamicPatriciaTrieWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE = 2; -const int DynamicPatriciaTrieWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG = 0x8000; -const int DynamicPatriciaTrieWritingUtils::DICT_OFFSET_FIELD_SIZE = 3; -const int DynamicPatriciaTrieWritingUtils::MAX_DICT_OFFSET_VALUE = 0x7FFFFF; -const int DynamicPatriciaTrieWritingUtils::MIN_DICT_OFFSET_VALUE = -0x7FFFFF; -const int DynamicPatriciaTrieWritingUtils::DICT_OFFSET_NEGATIVE_FLAG = 0x800000; -const int DynamicPatriciaTrieWritingUtils::PROBABILITY_FIELD_SIZE = 1; -const int DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE = 1; +const size_t DynamicPtWritingUtils::MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD = 0x7F; +const size_t DynamicPtWritingUtils::MAX_PTNODE_ARRAY_SIZE = 0x7FFF; +const int DynamicPtWritingUtils::SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE = 1; +const int DynamicPtWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE = 2; +const int DynamicPtWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG = 0x8000; +const int DynamicPtWritingUtils::DICT_OFFSET_FIELD_SIZE = 3; +const int DynamicPtWritingUtils::MAX_DICT_OFFSET_VALUE = 0x7FFFFF; +const int DynamicPtWritingUtils::MIN_DICT_OFFSET_VALUE = -0x7FFFFF; +const int DynamicPtWritingUtils::DICT_OFFSET_NEGATIVE_FLAG = 0x800000; +const int DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE = 1; -/* static */ bool DynamicPatriciaTrieWritingUtils::writeEmptyDictionary( +/* static */ bool DynamicPtWritingUtils::writeEmptyDictionary( BufferWithExtendableBuffer *const buffer, const int rootPos) { int writingPos = rootPos; if (!writePtNodeArraySizeAndAdvancePosition(buffer, 0 /* arraySize */, &writingPos)) { @@ -46,13 +45,13 @@ const int DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE = 1; &writingPos); } -/* static */ bool DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition( +/* static */ bool DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition( BufferWithExtendableBuffer *const buffer, const int forwardLinkPos, int *const forwardLinkFieldPos) { return writeDictOffset(buffer, forwardLinkPos, (*forwardLinkFieldPos), forwardLinkFieldPos); } -/* static */ bool DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition( +/* static */ bool DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition( BufferWithExtendableBuffer *const buffer, const size_t arraySize, int *const arraySizeFieldPos) { // Currently, all array size field to be created has LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE to @@ -74,20 +73,20 @@ const int DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE = 1; } } -/* static */ bool DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition( +/* static */ bool DynamicPtWritingUtils::writeFlagsAndAdvancePosition( BufferWithExtendableBuffer *const buffer, - const DynamicPatriciaTrieReadingUtils::NodeFlags nodeFlags, int *const nodeFlagsFieldPos) { + const DynamicPtReadingUtils::NodeFlags nodeFlags, int *const nodeFlagsFieldPos) { return buffer->writeUintAndAdvancePosition(nodeFlags, NODE_FLAG_FIELD_SIZE, nodeFlagsFieldPos); } // Note that parentOffset is offset from node's head position. -/* static */ bool DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( +/* static */ bool DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( BufferWithExtendableBuffer *const buffer, const int parentPos, const int basePos, int *const parentPosFieldPos) { return writeDictOffset(buffer, parentPos, basePos, parentPosFieldPos); } -/* static */ bool DynamicPatriciaTrieWritingUtils::writeCodePointsAndAdvancePosition( +/* static */ bool DynamicPtWritingUtils::writeCodePointsAndAdvancePosition( BufferWithExtendableBuffer *const buffer, const int *const codePoints, const int codePointCount, int *const codePointFieldPos) { if (codePointCount <= 0) { @@ -101,34 +100,20 @@ const int DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE = 1; hasMultipleCodePoints, codePointFieldPos); } -/* static */ bool DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition( - BufferWithExtendableBuffer *const buffer, const int probability, - int *const probabilityFieldPos) { - if (probability < 0 || probability > MAX_PROBABILITY) { - AKLOGI("probability cannot be written because the probability is invalid: %d", - probability); - ASSERT(false); - return false; - } - return buffer->writeUintAndAdvancePosition(probability, PROBABILITY_FIELD_SIZE, - probabilityFieldPos); -} - -/* static */ bool DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition( +/* static */ bool DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition( BufferWithExtendableBuffer *const buffer, const int childrenPosition, int *const childrenPositionFieldPos) { return writeDictOffset(buffer, childrenPosition, (*childrenPositionFieldPos), childrenPositionFieldPos); } -/* static */ bool DynamicPatriciaTrieWritingUtils::writeDictOffset( - BufferWithExtendableBuffer *const buffer, const int targetPos, const int basePos, - int *const offsetFieldPos) { +/* static */ bool DynamicPtWritingUtils::writeDictOffset(BufferWithExtendableBuffer *const buffer, + const int targetPos, const int basePos, int *const offsetFieldPos) { int offset = targetPos - basePos; if (targetPos == NOT_A_DICT_POS) { - offset = DynamicPatriciaTrieReadingUtils::DICT_OFFSET_INVALID; + offset = DynamicPtReadingUtils::DICT_OFFSET_INVALID; } else if (offset == 0) { - offset = DynamicPatriciaTrieReadingUtils::DICT_OFFSET_ZERO_OFFSET; + offset = DynamicPtReadingUtils::DICT_OFFSET_ZERO_OFFSET; } if (offset > MAX_DICT_OFFSET_VALUE || offset < MIN_DICT_OFFSET_VALUE) { AKLOGI("offset cannot be written because the offset is too large or too small: %d", diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h index af76bc6b5..362fbd1cc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h @@ -14,19 +14,19 @@ * limitations under the License. */ -#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_UTILS_H -#define LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_UTILS_H +#ifndef LATINIME_DYNAMIC_PT_WRITING_UTILS_H +#define LATINIME_DYNAMIC_PT_WRITING_UTILS_H #include <cstddef> #include "defines.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" namespace latinime { class BufferWithExtendableBuffer; -class DynamicPatriciaTrieWritingUtils { +class DynamicPtWritingUtils { public: static const int NODE_FLAG_FIELD_SIZE; @@ -39,8 +39,15 @@ class DynamicPatriciaTrieWritingUtils { static bool writePtNodeArraySizeAndAdvancePosition(BufferWithExtendableBuffer *const buffer, const size_t arraySize, int *const arraySizeFieldPos); + static bool writeFlags(BufferWithExtendableBuffer *const buffer, + const DynamicPtReadingUtils::NodeFlags nodeFlags, + const int nodeFlagsFieldPos) { + int writingPos = nodeFlagsFieldPos; + return writeFlagsAndAdvancePosition(buffer, nodeFlags, &writingPos); + } + static bool writeFlagsAndAdvancePosition(BufferWithExtendableBuffer *const buffer, - const DynamicPatriciaTrieReadingUtils::NodeFlags nodeFlags, + const DynamicPtReadingUtils::NodeFlags nodeFlags, int *const nodeFlagsFieldPos); static bool writeParentPosOffsetAndAdvancePosition(BufferWithExtendableBuffer *const buffer, @@ -49,14 +56,11 @@ class DynamicPatriciaTrieWritingUtils { static bool writeCodePointsAndAdvancePosition(BufferWithExtendableBuffer *const buffer, const int *const codePoints, const int codePointCount, int *const codePointFieldPos); - static bool writeProbabilityAndAdvancePosition(BufferWithExtendableBuffer *const buffer, - const int probability, int *const probabilityFieldPos); - static bool writeChildrenPositionAndAdvancePosition(BufferWithExtendableBuffer *const buffer, const int childrenPosition, int *const childrenPositionFieldPos); private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingUtils); + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtWritingUtils); static const size_t MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD; static const size_t MAX_PTNODE_ARRAY_SIZE; @@ -67,10 +71,9 @@ class DynamicPatriciaTrieWritingUtils { static const int MAX_DICT_OFFSET_VALUE; static const int MIN_DICT_OFFSET_VALUE; static const int DICT_OFFSET_NEGATIVE_FLAG; - static const int PROBABILITY_FIELD_SIZE; static bool writeDictOffset(BufferWithExtendableBuffer *const buffer, const int targetPos, const int basePos, int *const offsetFieldPos); }; } // namespace latinime -#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_UTILS_H */ +#endif /* LATINIME_DYNAMIC_PT_WRITING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h new file mode 100644 index 000000000..6078d8285 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PT_NODE_ARRAY_READER_H +#define LATINIME_PT_NODE_ARRAY_READER_H + +#include "defines.h" + +namespace latinime { + +// Interface class used to read PtNode array information. +class PtNodeArrayReader { + public: + virtual ~PtNodeArrayReader() {} + + // Returns if the position is valid or not. + virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const = 0; + + // Returns if the position is valid or not. NOT_A_DICT_POS is set to outNextPtNodeArrayPos when + // the next array doesn't exist. + virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const = 0; + + protected: + PtNodeArrayReader() {}; + + private: + DISALLOW_COPY_AND_ASSIGN(PtNodeArrayReader); +}; +} // namespace latinime +#endif /* LATINIME_PT_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h new file mode 100644 index 000000000..e4847fcf9 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h @@ -0,0 +1,244 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PT_NODE_PARAMS_H +#define LATINIME_PT_NODE_PARAMS_H + +#include <cstring> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +// This class has information of a PtNode. This class is immutable. +class PtNodeParams { + public: + // Invalid PtNode. + PtNodeParams() : mHeadPos(NOT_A_DICT_POS), mFlags(0), mHasMovedFlag(false), + mParentPos(NOT_A_DICT_POS), mCodePointCount(0), mCodePoints(), + mTerminalIdFieldPos(NOT_A_DICT_POS), mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), + mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), + mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS), + mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS), + mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) {} + + PtNodeParams(const PtNodeParams& ptNodeParams) + : mHeadPos(ptNodeParams.mHeadPos), mFlags(ptNodeParams.mFlags), + mHasMovedFlag(ptNodeParams.mHasMovedFlag), mParentPos(ptNodeParams.mParentPos), + mCodePointCount(ptNodeParams.mCodePointCount), mCodePoints(), + mTerminalIdFieldPos(ptNodeParams.mTerminalIdFieldPos), + mTerminalId(ptNodeParams.mTerminalId), + mProbabilityFieldPos(ptNodeParams.mProbabilityFieldPos), + mProbability(ptNodeParams.mProbability), + mChildrenPosFieldPos(ptNodeParams.mChildrenPosFieldPos), + mChildrenPos(ptNodeParams.mChildrenPos), + mBigramLinkedNodePos(ptNodeParams.mBigramLinkedNodePos), + mShortcutPos(ptNodeParams.mShortcutPos), mBigramPos(ptNodeParams.mBigramPos), + mSiblingPos(ptNodeParams.mSiblingPos) { + memcpy(mCodePoints, ptNodeParams.getCodePoints(), sizeof(int) * mCodePointCount); + } + + // PtNode read from version 2 dictionary. + PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags, + const int codePointCount, const int *const codePoints, const int probability, + const int childrenPos, const int shortcutPos, const int bigramPos, + const int siblingPos) + : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(false), mParentPos(NOT_A_DICT_POS), + mCodePointCount(codePointCount), mCodePoints(), mTerminalIdFieldPos(NOT_A_DICT_POS), + mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), + mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability), + mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(childrenPos), + mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(shortcutPos), + mBigramPos(bigramPos), mSiblingPos(siblingPos) { + memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount); + } + + // PtNode with a terminal id. + PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags, + const int parentPos, const int codePointCount, const int *const codePoints, + const int terminalIdFieldPos, const int terminalId, const int probability, + const int childrenPosFieldPos, const int childrenPos, const int siblingPos) + : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos), + mCodePointCount(codePointCount), mCodePoints(), + mTerminalIdFieldPos(terminalIdFieldPos), mTerminalId(terminalId), + mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability), + mChildrenPosFieldPos(childrenPosFieldPos), mChildrenPos(childrenPos), + mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(terminalId), + mBigramPos(terminalId), mSiblingPos(siblingPos) { + memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount); + } + + // Construct new params by updating existing PtNode params. + PtNodeParams(const PtNodeParams *const ptNodeParams, + const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos, + const int codePointCount, const int *const codePoints, const int probability) + : mHeadPos(ptNodeParams->getHeadPos()), mFlags(flags), mHasMovedFlag(true), + mParentPos(parentPos), mCodePointCount(codePointCount), mCodePoints(), + mTerminalIdFieldPos(ptNodeParams->getTerminalIdFieldPos()), + mTerminalId(ptNodeParams->getTerminalId()), + mProbabilityFieldPos(ptNodeParams->getProbabilityFieldPos()), + mProbability(probability), + mChildrenPosFieldPos(ptNodeParams->getChildrenPosFieldPos()), + mChildrenPos(ptNodeParams->getChildrenPos()), + mBigramLinkedNodePos(ptNodeParams->getBigramLinkedNodePos()), + mShortcutPos(ptNodeParams->getShortcutPos()), + mBigramPos(ptNodeParams->getBigramsPos()), + mSiblingPos(ptNodeParams->getSiblingNodePos()) { + memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount); + } + + PtNodeParams(const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos, + const int codePointCount, const int *const codePoints, const int probability) + : mHeadPos(NOT_A_DICT_POS), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos), + mCodePointCount(codePointCount), mCodePoints(), + mTerminalIdFieldPos(NOT_A_DICT_POS), + mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), + mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability), + mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS), + mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS), + mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) { + memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount); + } + + AK_FORCE_INLINE bool isValid() const { + return mCodePointCount > 0; + } + + // Head position of the PtNode + AK_FORCE_INLINE int getHeadPos() const { + return mHeadPos; + } + + // Flags + AK_FORCE_INLINE bool isDeleted() const { + return mHasMovedFlag && DynamicPtReadingUtils::isDeleted(mFlags); + } + + AK_FORCE_INLINE bool willBecomeNonTerminal() const { + return mHasMovedFlag && DynamicPtReadingUtils::willBecomeNonTerminal(mFlags); + } + + AK_FORCE_INLINE bool hasChildren() const { + return mChildrenPos != NOT_A_DICT_POS; + } + + AK_FORCE_INLINE bool isTerminal() const { + return PatriciaTrieReadingUtils::isTerminal(mFlags); + } + + AK_FORCE_INLINE bool isBlacklisted() const { + return PatriciaTrieReadingUtils::isBlacklisted(mFlags); + } + + AK_FORCE_INLINE bool isNotAWord() const { + return PatriciaTrieReadingUtils::isNotAWord(mFlags); + } + + AK_FORCE_INLINE bool hasBigrams() const { + return PatriciaTrieReadingUtils::hasBigrams(mFlags); + } + + AK_FORCE_INLINE bool hasShortcutTargets() const { + return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags); + } + + // Parent node position + AK_FORCE_INLINE int getParentPos() const { + return mParentPos; + } + + // Number of code points + AK_FORCE_INLINE uint8_t getCodePointCount() const { + return mCodePointCount; + } + + AK_FORCE_INLINE const int *getCodePoints() const { + return mCodePoints; + } + + // Probability + AK_FORCE_INLINE int getTerminalIdFieldPos() const { + return mTerminalIdFieldPos; + } + + AK_FORCE_INLINE int getTerminalId() const { + return mTerminalId; + } + + // Probability + AK_FORCE_INLINE int getProbabilityFieldPos() const { + return mProbabilityFieldPos; + } + + AK_FORCE_INLINE int getProbability() const { + return mProbability; + } + + // Children PtNode array position + AK_FORCE_INLINE int getChildrenPosFieldPos() const { + return mChildrenPosFieldPos; + } + + AK_FORCE_INLINE int getChildrenPos() const { + return mChildrenPos; + } + + // Bigram linked node position. + AK_FORCE_INLINE int getBigramLinkedNodePos() const { + return mBigramLinkedNodePos; + } + + // Shortcutlist position + AK_FORCE_INLINE int getShortcutPos() const { + return mShortcutPos; + } + + // Bigrams position + AK_FORCE_INLINE int getBigramsPos() const { + return mBigramPos; + } + + // Sibling node position + AK_FORCE_INLINE int getSiblingNodePos() const { + return mSiblingPos; + } + + private: + // This class have a public copy constructor to be used as a return value. + DISALLOW_ASSIGNMENT_OPERATOR(PtNodeParams); + + const int mHeadPos; + const PatriciaTrieReadingUtils::NodeFlags mFlags; + const bool mHasMovedFlag; + const int mParentPos; + const uint8_t mCodePointCount; + int mCodePoints[MAX_WORD_LENGTH]; + const int mTerminalIdFieldPos; + const int mTerminalId; + const int mProbabilityFieldPos; + const int mProbability; + const int mChildrenPosFieldPos; + const int mChildrenPos; + const int mBigramLinkedNodePos; + const int mShortcutPos; + const int mBigramPos; + const int mSiblingPos; +}; +} // namespace latinime +#endif /* LATINIME_PT_NODE_PARAMS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h new file mode 100644 index 000000000..c6b2a8bed --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PT_NODE_READER_H +#define LATINIME_PT_NODE_READER_H + +#include "defines.h" + +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" + +namespace latinime { + +// Interface class used to read PtNode information. +class PtNodeReader { + public: + virtual ~PtNodeReader() {} + virtual const PtNodeParams fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) const = 0; + + protected: + PtNodeReader() {}; + + private: + DISALLOW_COPY_AND_ASSIGN(PtNodeReader); +}; +} // namespace latinime +#endif /* LATINIME_PT_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h new file mode 100644 index 000000000..84dd6870e --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PT_NODE_WRITER_H +#define LATINIME_PT_NODE_WRITER_H + +#include "defines.h" + +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "utils/hash_map_compat.h" + +namespace latinime { + +// Interface class used to write PtNode information. +class PtNodeWriter { + public: + typedef hash_map_compat<int, int> PtNodeArrayPositionRelocationMap; + typedef hash_map_compat<int, int> PtNodePositionRelocationMap; + struct DictPositionRelocationMap { + public: + DictPositionRelocationMap() + : mPtNodeArrayPositionRelocationMap(), mPtNodePositionRelocationMap() {} + + PtNodeArrayPositionRelocationMap mPtNodeArrayPositionRelocationMap; + PtNodePositionRelocationMap mPtNodePositionRelocationMap; + + private: + DISALLOW_COPY_AND_ASSIGN(DictPositionRelocationMap); + }; + + virtual ~PtNodeWriter() {} + + virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams) = 0; + + virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int movedPos, const int bigramLinkedNodePos) = 0; + + virtual bool markPtNodeAsWillBecomeNonTerminal( + const PtNodeParams *const toBeUpdatedPtNodeParams) = 0; + + virtual bool updatePtNodeProbability(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int probability, const int timestamp) = 0; + + virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + const PtNodeParams *const toBeUpdatedPtNodeParams, + bool *const outNeedsToKeepPtNode) = 0; + + virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newChildrenPosition) = 0; + + virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + int *const ptNodeWritingPos) = 0; + + virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + const int timestamp, int *const ptNodeWritingPos) = 0; + + virtual bool addNewBigramEntry(const PtNodeParams *const sourcePtNodeParams, + const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp, + bool *const outAddedNewBigram) = 0; + + virtual bool removeBigramEntry(const PtNodeParams *const sourcePtNodeParams, + const PtNodeParams *const targetPtNodeParam) = 0; + + virtual bool updateAllBigramEntriesAndDeleteUselessEntries( + const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) = 0; + + virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams, + const DictPositionRelocationMap *const dictPositionRelocationMap, + int *const outBigramEntryCount) = 0; + + virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability) = 0; + + protected: + PtNodeWriter() {}; + + private: + DISALLOW_COPY_AND_ASSIGN(PtNodeWriter); +}; +} // namespace latinime +#endif /* LATINIME_PT_NODE_WRITER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp index 8a84bd261..84a6ccf33 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp @@ -15,25 +15,28 @@ */ -#include "suggest/policyimpl/dictionary/patricia_trie_policy.h" +#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h" #include "defines.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" +#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h" namespace latinime { -void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, +void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const { if (!dicNode->hasChildren()) { return; } - int nextPos = dicNode->getChildrenPos(); + int nextPos = dicNode->getChildrenPtNodeArrayPos(); if (nextPos < 0 || nextPos >= mDictBufferSize) { AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d", nextPos, mDictBufferSize); + mIsCorrupted = true; ASSERT(false); return; } @@ -43,6 +46,7 @@ void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, if (nextPos < 0 || nextPos >= mDictBufferSize) { AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %d, childCount: %d / %d", nextPos, mDictBufferSize, i, childCount); + mIsCorrupted = true; ASSERT(false); return; } @@ -52,14 +56,14 @@ void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, // This retrieves code points and the probability of the word by its terminal position. // Due to the fact that words are ordered in the dictionary in a strict breadth-first order, -// it is possible to check for this with advantageous complexity. For each node, we search +// it is possible to check for this with advantageous complexity. For each PtNode array, we search // for PtNodes with children and compare the children position with the position we look for. // When we shoot the position we look for, it means the word we look for is in the children // of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a // PtNode array with the last PtNode's children position still less than what we are searching for, // we must descend the last PtNode's children (for example, if the word we are searching for starts // with a z, it's the last PtNode of the root array, so all children addresses will be smaller -// than the position we look for, and we have to descend the z node). +// than the position we look for, and we have to descend the z PtNode). /* Parameters : * ptNodePos: the byte position of the terminal PtNode of the word we are searching for (this is * what is stored as the "bigram position" in each bigram) @@ -74,18 +78,33 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( int pos = getRootPosition(); int wordPos = 0; // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will - // only traverse nodes that are actually a part of the terminal we are searching, so each time - // we enter this loop we are one depth level further than last time. - // The only reason we count nodes is because we want to reduce the probability of infinite + // only traverse PtNodes that are actually a part of the terminal we are searching, so each + // time we enter this loop we are one depth level further than last time. + // The only reason we count PtNodes is because we want to reduce the probability of infinite // looping in case there is a bug. Since we know there is an upper bound to the depth we are // supposed to traverse, it does not hurt to count iterations. for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) { int lastCandidatePtNodePos = 0; // Let's loop through PtNodes in this PtNode array searching for either the terminal // or one of its ascendants. + if (pos < 0 || pos >= mDictBufferSize) { + AKLOGE("PtNode array position is invalid. pos: %d, dict size: %d", + pos, mDictBufferSize); + mIsCorrupted = true; + ASSERT(false); + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; + } for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( mDictRoot, &pos); ptNodeCount > 0; --ptNodeCount) { const int startPos = pos; + if (pos < 0 || pos >= mDictBufferSize) { + AKLOGE("PtNode position is invalid. pos: %d, dict size: %d", pos, mDictBufferSize); + mIsCorrupted = true; + ASSERT(false); + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; + } const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( @@ -140,8 +159,9 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( found = true; } else if (1 >= ptNodeCount) { // However if we are on the LAST PtNode of this array, and we have NOT shot the - // position we should descend THIS node. So we trick the lastCandidatePtNodePos - // so that we will descend this PtNode, not the previous one. + // position we should descend THIS PtNode. So we trick the + // lastCandidatePtNodePos so that we will descend this PtNode, not the previous + // one. lastCandidatePtNodePos = startPos; found = true; } else { @@ -149,7 +169,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( found = false; } } else { - // Even if we don't have children here, we could still be on the last PtNode of / + // Even if we don't have children here, we could still be on the last PtNode of // this array. If this is the case, we should descend the last PtNode that had // children, and their position is already in lastCandidatePtNodePos. found = (1 >= ptNodeCount); @@ -230,93 +250,19 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( return 0; } -// This function gets the position of the terminal node of the exact matching word in the +// This function gets the position of the terminal PtNode of the exact matching word in the // dictionary. If no match is found, it returns NOT_A_DICT_POS. -int PatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord, +int PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord, const int length, const bool forceLowerCaseSearch) const { - int pos = getRootPosition(); - int wordPos = 0; - - while (true) { - // If we already traversed the tree further than the word is long, there means - // there was no match (or we would have found it). - if (wordPos >= length) return NOT_A_DICT_POS; - int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(mDictRoot, - &pos); - const int wChar = forceLowerCaseSearch - ? CharUtils::toLowerCase(inWord[wordPos]) : inWord[wordPos]; - while (true) { - // If there are no more PtNodes in this array, it means we could not - // find a matching character for this depth, therefore there is no match. - if (0 >= ptNodeCount) return NOT_A_DICT_POS; - const int ptNodePos = pos; - const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); - int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot, - &pos); - if (character == wChar) { - // This is the correct PtNode. Only one PtNode may start with the same char within - // a PtNode array, so either we found our match in this array, or there is - // no match and we can return NOT_A_DICT_POS. So we will check all the - // characters in this PtNode indeed does match. - if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { - character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot, - &pos); - while (NOT_A_CODE_POINT != character) { - ++wordPos; - // If we shoot the length of the word we search for, or if we find a single - // character that does not match, as explained above, it means the word is - // not in the dictionary (by virtue of this PtNode being the only one to - // match the word on the first character, but not matching the whole word). - if (wordPos >= length) return NOT_A_DICT_POS; - if (inWord[wordPos] != character) return NOT_A_DICT_POS; - character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mDictRoot, &pos); - } - } - // If we come here we know that so far, we do match. Either we are on a terminal - // and we match the length, in which case we found it, or we traverse children. - // If we don't match the length AND don't have children, then a word in the - // dictionary fully matches a prefix of the searched word but not the full word. - ++wordPos; - if (PatriciaTrieReadingUtils::isTerminal(flags)) { - if (wordPos == length) { - return ptNodePos; - } - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); - } - if (!PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { - return NOT_A_DICT_POS; - } - // We have children and we are still shorter than the word we are searching for, so - // we need to traverse children. Put the pointer on the children position, and - // break - pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, - flags, &pos); - break; - } else { - // This PtNode does not match, so skip the remaining part and go to the next. - if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { - PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, - &pos); - } - if (PatriciaTrieReadingUtils::isTerminal(flags)) { - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); - } - if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { - PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, - flags, &pos); - } - if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { - mShortcutListPolicy.skipAllShortcuts(&pos); - } - if (PatriciaTrieReadingUtils::hasBigrams(flags)) { - mBigramListPolicy.skipAllBigrams(&pos); - } - } - --ptNodeCount; - } + DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + const int ptNodePos = + readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); } + return ptNodePos; } int PatriciaTriePolicy::getProbability(const int unigramProbability, @@ -335,99 +281,138 @@ int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_PROBABILITY; } - int pos = ptNodePos; - const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); - if (!PatriciaTrieReadingUtils::isTerminal(flags)) { - return NOT_A_PROBABILITY; - } - if (PatriciaTrieReadingUtils::isNotAWord(flags) - || PatriciaTrieReadingUtils::isBlacklisted(flags)) { + const PtNodeParams ptNodeParams = mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + if (ptNodeParams.isNotAWord() || ptNodeParams.isBlacklisted()) { // If this is not a word, or if it's a blacklisted entry, it should behave as // having no probability outside of the suggestion process (where it should be used // for shortcuts). return NOT_A_PROBABILITY; } - PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); - return getProbability(PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition( - mDictRoot, &pos), NOT_A_PROBABILITY); + return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); } int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; } - int pos = ptNodePos; - const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); - if (!PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { - return NOT_A_DICT_POS; - } - PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); - if (PatriciaTrieReadingUtils::isTerminal(flags)) { - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); - } - if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { - PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos); - } - return pos; + return mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos).getShortcutPos(); } int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; } - int pos = ptNodePos; - const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); - if (!PatriciaTrieReadingUtils::hasBigrams(flags)) { - return NOT_A_DICT_POS; - } - PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); - if (PatriciaTrieReadingUtils::isTerminal(flags)) { - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); - } - if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { - PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos); - } - if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { - mShortcutListPolicy.skipAllShortcuts(&pos);; - } - return pos; + return mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos).getBigramsPos(); } int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos, DicNodeVector *childDicNodes) const { - int pos = ptNodePos; - const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + PatriciaTrieReadingUtils::NodeFlags flags; + int mergedNodeCodePointCount = 0; int mergedNodeCodePoints[MAX_WORD_LENGTH]; - const int mergedNodeCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( - mDictRoot, flags, MAX_WORD_LENGTH, mergedNodeCodePoints, &pos); - const int probability = (PatriciaTrieReadingUtils::isTerminal(flags))? - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos) - : NOT_A_PROBABILITY; - const int childrenPos = PatriciaTrieReadingUtils::hasChildrenInFlags(flags) ? - PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( - mDictRoot, flags, &pos) : NOT_A_DICT_POS; - if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { - getShortcutsStructurePolicy()->skipAllShortcuts(&pos); - } - if (PatriciaTrieReadingUtils::hasBigrams(flags)) { - getBigramsStructurePolicy()->skipAllBigrams(&pos); - } - if (mergedNodeCodePointCount <= 0) { - AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount); - ASSERT(false); - return pos; - } + int probability = NOT_A_PROBABILITY; + int childrenPos = NOT_A_DICT_POS; + int shortcutPos = NOT_A_DICT_POS; + int bigramPos = NOT_A_DICT_POS; + int siblingPos = NOT_A_DICT_POS; + PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(), + getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, + &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, PatriciaTrieReadingUtils::isTerminal(flags), PatriciaTrieReadingUtils::hasChildrenInFlags(flags), - PatriciaTrieReadingUtils::isBlacklisted(flags) || - PatriciaTrieReadingUtils::isNotAWord(flags), + PatriciaTrieReadingUtils::isBlacklisted(flags) + || PatriciaTrieReadingUtils::isNotAWord(flags), mergedNodeCodePointCount, mergedNodeCodePoints); - return pos; + return siblingPos; +} + +const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoints, + const int codePointCount) const { + const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount, + false /* forceLowerCaseSearch */); + if (ptNodePos == NOT_A_DICT_POS) { + AKLOGE("getWordProperty was called for invalid word."); + return WordProperty(); + } + const PtNodeParams ptNodeParams = mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + std::vector<int> codePointVector(ptNodeParams.getCodePoints(), + ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); + // Fetch bigram information. + std::vector<WordProperty::BigramProperty> bigrams; + const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); + int bigramWord1CodePoints[MAX_WORD_LENGTH]; + BinaryDictionaryBigramsIterator bigramsIt(getBigramsStructurePolicy(), bigramListPos); + while (bigramsIt.hasNext()) { + // Fetch the next bigram information and forward the iterator. + bigramsIt.next(); + // Skip the entry if the entry has been deleted. This never happens for ver2 dicts. + if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) { + int word1Probability = NOT_A_PROBABILITY; + int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( + bigramsIt.getBigramPos(), MAX_WORD_LENGTH, bigramWord1CodePoints, + &word1Probability); + std::vector<int> word1(bigramWord1CodePoints, + bigramWord1CodePoints + word1CodePointCount); + bigrams.push_back(WordProperty::BigramProperty(&word1, bigramsIt.getProbability(), + NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */)); + } + } + // Fetch shortcut information. + std::vector<WordProperty::ShortcutProperty> shortcuts; + int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); + if (shortcutPos != NOT_A_DICT_POS) { + int shortcutTargetCodePoints[MAX_WORD_LENGTH]; + ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mDictRoot, &shortcutPos); + bool hasNext = true; + while (hasNext) { + const ShortcutListReadingUtils::ShortcutFlags shortcutFlags = + ShortcutListReadingUtils::getFlagsAndForwardPointer(mDictRoot, &shortcutPos); + hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags); + const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget( + mDictRoot, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos); + std::vector<int> shortcutTarget(shortcutTargetCodePoints, + shortcutTargetCodePoints + shortcutTargetLength); + const int shortcutProbability = + ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags); + shortcuts.push_back( + WordProperty::ShortcutProperty(&shortcutTarget, shortcutProbability)); + } + } + return WordProperty(&codePointVector, ptNodeParams.isNotAWord(), + ptNodeParams.isBlacklisted(), ptNodeParams.hasBigrams(), + ptNodeParams.hasShortcutTargets(), ptNodeParams.getProbability(), + NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, + &bigrams, &shortcuts); +} + +int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { + if (token == 0) { + // Start iterating the dictionary. + mTerminalPtNodePositionsForIteratingWords.clear(); + DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( + &mTerminalPtNodePositionsForIteratingWords); + DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); + } + const int terminalPtNodePositionsVectorSize = + static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); + if (token < 0 || token >= terminalPtNodePositionsVectorSize) { + AKLOGE("Given token %d is invalid.", token); + return 0; + } + const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; + int unigramProbability = NOT_A_PROBABILITY; + getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH, + outCodePoints, &unigramProbability); + const int nextToken = token + 1; + if (nextToken >= terminalPtNodePositionsVectorSize) { + // All words have been iterated. + mTerminalPtNodePositionsForIteratingWords.clear(); + return 0; + } + return nextToken; } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h index 0f8662aea..6a2345a05 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h @@ -18,12 +18,16 @@ #define LATINIME_PATRICIA_TRIE_POLICY_H #include <stdint.h> +#include <vector> #include "defines.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/policyimpl/dictionary/bigram/bigram_list_policy.h" #include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" #include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" namespace latinime { @@ -33,28 +37,29 @@ class DicNodeVector; class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { public: - PatriciaTriePolicy(const MmappedBuffer *const buffer) - : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer(), buffer->getBufferSize()), - mDictRoot(mBuffer->getBuffer() + mHeaderPolicy.getSize()), - mDictBufferSize(mBuffer->getBufferSize() - mHeaderPolicy.getSize()), - mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot) {} - - ~PatriciaTriePolicy() { - delete mBuffer; - } + PatriciaTriePolicy(const MmappedBuffer::MmappedBufferPtr &mmappedBuffer) + : mMmappedBuffer(mmappedBuffer), + mHeaderPolicy(mMmappedBuffer.get()->getBuffer(), FormatUtils::VERSION_2), + mDictRoot(mMmappedBuffer.get()->getBuffer() + mHeaderPolicy.getSize()), + mDictBufferSize(mMmappedBuffer.get()->getBufferSize() + - mHeaderPolicy.getSize()), + mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot), + mPtNodeReader(mDictRoot, mDictBufferSize, &mBigramListPolicy, &mShortcutListPolicy), + mPtNodeArrayReader(mDictRoot, mDictBufferSize), + mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {} AK_FORCE_INLINE int getRootPosition() const { return 0; } - void createAndGetAllChildNodes(const DicNode *const dicNode, + void createAndGetAllChildDicNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const; int getCodePointsAndProbabilityAndReturnCodePointCount( const int terminalNodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const; - int getTerminalNodePositionOfWord(const int *const inWord, + int getTerminalPtNodePositionOfWord(const int *const inWord, const int length, const bool forceLowerCaseSearch) const; int getProbability(const int unigramProbability, const int bigramProbability) const; @@ -77,14 +82,17 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { return &mShortcutListPolicy; } - bool addUnigramWord(const int *const word, const int length, const int probability) { + bool addUnigramWord(const int *const word, const int length, const int probability, + const int *const shortcutTargetCodePoints, const int shortcutLength, + const int shortcutProbability, const bool isNotAWord, const bool isBlacklisted, + const int timestamp) { // This method should not be called for non-updatable dictionary. AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); return false; } bool addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability) { + const int length1, const int probability, const int timestamp) { // This method should not be called for non-updatable dictionary. AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); return false; @@ -113,7 +121,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { return false; } - void getProperty(const char *const query, char *const outResult, + void getProperty(const char *const query, const int queryLength, char *const outResult, const int maxResultLength) { // getProperty is not supported for this class. if (maxResultLength > 0) { @@ -121,15 +129,28 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { } } + const WordProperty getWordProperty(const int *const codePoints, + const int codePointCount) const; + + int getNextWordAndNextToken(const int token, int *const outCodePoints); + + bool isCorrupted() const { + return mIsCorrupted; + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy); - const MmappedBuffer *const mBuffer; + const MmappedBuffer::MmappedBufferPtr mMmappedBuffer; const HeaderPolicy mHeaderPolicy; const uint8_t *const mDictRoot; const int mDictBufferSize; const BigramListPolicy mBigramListPolicy; const ShortcutListPolicy mShortcutListPolicy; + const Ver2ParticiaTrieNodeReader mPtNodeReader; + const Ver2PtNodeArrayReader mPtNodeArrayReader; + std::vector<int> mTerminalPtNodePositionsForIteratingWords; + mutable bool mIsCorrupted; int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos, DicNodeVector *const childDicNodes) const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.cpp index 7df55815f..b4eee5572 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.cpp @@ -14,9 +14,11 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" #include "defines.h" +#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" namespace latinime { @@ -130,4 +132,32 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; return base + offset; } +/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, + const DictionaryShortcutsStructurePolicy *const shortcutPolicy, + const DictionaryBigramsStructurePolicy *const bigramPolicy, + NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, + int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, + int *const outBigramPos, int *const outSiblingPos) { + int readingPos = ptNodePos; + const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos); + *outFlags = flags; + *outCodePointCount = getCharsAndAdvancePosition( + dictBuf, flags, MAX_WORD_LENGTH, outCodePoint, &readingPos); + *outProbability = isTerminal(flags) ? + readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY; + *outChildrenPos = hasChildrenInFlags(flags) ? + readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS; + *outShortcutPos = NOT_A_DICT_POS; + if (hasShortcutTargets(flags)) { + *outShortcutPos = readingPos; + shortcutPolicy->skipAllShortcuts(&readingPos); + } + *outBigramPos = NOT_A_DICT_POS; + if (hasBigrams(flags)) { + *outBigramPos = readingPos; + bigramPolicy->skipAllBigrams(&readingPos); + } + *outSiblingPos = readingPos; +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h index 8420ee95a..fa1430ce6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h @@ -23,6 +23,10 @@ namespace latinime { +class DictionaryShortcutsStructurePolicy; +class DictionaryBigramsStructurePolicy; + +// TODO: Move to pt_common class PatriciaTrieReadingUtils { public: typedef uint8_t NodeFlags; @@ -100,6 +104,13 @@ class PatriciaTrieReadingUtils { return nodeFlags; } + static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, + const DictionaryShortcutsStructurePolicy *const shortcutPolicy, + const DictionaryBigramsStructurePolicy *const bigramPolicy, + NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, + int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, + int *const outBigramPos, int *const outSiblingPos); + private: DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp new file mode 100644 index 000000000..778d7a408 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h" + +#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" + +namespace latinime { + +const PtNodeParams Ver2ParticiaTrieNodeReader::fetchNodeInfoInBufferFromPtNodePos( + const int ptNodePos) const { + if (ptNodePos < 0 || ptNodePos >= mDictSize) { + // Reading invalid position because of bug or broken dictionary. + AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d", + ptNodePos, mDictSize); + ASSERT(false); + return PtNodeParams(); + } + PatriciaTrieReadingUtils::NodeFlags flags; + int mergedNodeCodePointCount = 0; + int mergedNodeCodePoints[MAX_WORD_LENGTH]; + int probability = NOT_A_PROBABILITY; + int childrenPos = NOT_A_DICT_POS; + int shortcutPos = NOT_A_DICT_POS; + int bigramPos = NOT_A_DICT_POS; + int siblingPos = NOT_A_DICT_POS; + PatriciaTrieReadingUtils::readPtNodeInfo(mDictBuffer, ptNodePos, mShortuctPolicy, + mBigramPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability, + &childrenPos, &shortcutPos, &bigramPos, &siblingPos); + if (mergedNodeCodePointCount <= 0) { + AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount); + ASSERT(false); + return PtNodeParams(); + } + return PtNodeParams(ptNodePos, flags, mergedNodeCodePointCount, mergedNodeCodePoints, + probability, childrenPos, shortcutPos, bigramPos, siblingPos); +} + +} diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h new file mode 100644 index 000000000..dd1a0da51 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H +#define LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" + +namespace latinime { + +class DictionaryBigramsStructurePolicy; +class DictionaryShortcutsStructurePolicy; + +class Ver2ParticiaTrieNodeReader : public PtNodeReader { + public: + Ver2ParticiaTrieNodeReader(const uint8_t *const dictBuffer, const int dictSize, + const DictionaryBigramsStructurePolicy *const bigramPolicy, + const DictionaryShortcutsStructurePolicy *const shortcutPolicy) + : mDictBuffer(dictBuffer), mDictSize(dictSize), mBigramPolicy(bigramPolicy), + mShortuctPolicy(shortcutPolicy) {} + + virtual const PtNodeParams fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) const; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver2ParticiaTrieNodeReader); + + const uint8_t *const mDictBuffer; + const int mDictSize; + const DictionaryBigramsStructurePolicy *const mBigramPolicy; + const DictionaryShortcutsStructurePolicy *const mShortuctPolicy; +}; +} // namespace latinime +#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp new file mode 100644 index 000000000..125ea31dc --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h" + +#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" + +namespace latinime { + +bool Ver2PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const { + if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mDictSize) { + // Reading invalid position because of a bug or a broken dictionary. + AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d", + ptNodeArrayPos, mDictSize); + ASSERT(false); + return false; + } + int readingPos = ptNodeArrayPos; + const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + mDictBuffer, &readingPos); + *outPtNodeCount = ptNodeCountInArray; + *outFirstPtNodePos = readingPos; + return true; +} + +bool Ver2PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const { + if (forwordLinkPos < 0 || forwordLinkPos >= mDictSize) { + // Reading invalid position because of bug or broken dictionary. + AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d", + forwordLinkPos, mDictSize); + ASSERT(false); + return false; + } + // Ver2 dicts don't have forward links. + *outNextPtNodeArrayPos = NOT_A_DICT_POS; + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h new file mode 100644 index 000000000..77404adf8 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER2_PT_NODE_ARRAY_READER_H +#define LATINIME_VER2_PT_NODE_ARRAY_READER_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h" + +namespace latinime { + +class Ver2PtNodeArrayReader : public PtNodeArrayReader { + public: + Ver2PtNodeArrayReader(const uint8_t *const dictBuffer, const int dictSize) + : mDictBuffer(dictBuffer), mDictSize(dictSize) {}; + + virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const; + virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const; + + private: + DISALLOW_COPY_AND_ASSIGN(Ver2PtNodeArrayReader); + + const uint8_t *const mDictBuffer; + const int mDictSize; +}; +} // namespace latinime +#endif /* LATINIME_VER2_PT_NODE_ARRAY_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp new file mode 100644 index 000000000..cb9d450ec --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h" + +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( + int *const bigramEntryPos) const { + const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer(); + const int bigramFlags = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, bigramEntryPos); + const bool hasNext = (bigramFlags & Ver4DictConstants::BIGRAM_HAS_NEXT_MASK) != 0; + int probability = NOT_A_PROBABILITY; + int timestamp = NOT_A_TIMESTAMP; + int level = 0; + int count = 0; + if (mHasHistoricalInfo) { + probability = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::PROBABILITY_SIZE, bigramEntryPos); + timestamp = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, bigramEntryPos); + level = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, bigramEntryPos); + count = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, bigramEntryPos); + } else { + probability = bigramFlags & Ver4DictConstants::BIGRAM_PROBABILITY_MASK; + } + const int encodedTargetTerminalId = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, bigramEntryPos); + const int targetTerminalId = + (encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ? + Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId; + if (mHasHistoricalInfo) { + const HistoricalInfo historicalInfo(timestamp, level, count); + return BigramEntry(hasNext, probability, &historicalInfo, targetTerminalId); + } else { + return BigramEntry(hasNext, probability, targetTerminalId); + } +} + +bool BigramDictContent::writeBigramEntryAndAdvancePosition( + const BigramEntry *const bigramEntryToWrite, int *const entryWritingPos) { + BufferWithExtendableBuffer *const bigramListBuffer = getWritableContentBuffer(); + const int bigramFlags = createAndGetBigramFlags( + mHasHistoricalInfo ? 0 : bigramEntryToWrite->getProbability(), + bigramEntryToWrite->hasNext()); + if (!bigramListBuffer->writeUintAndAdvancePosition(bigramFlags, + Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram flags. pos: %d, flags: %x", *entryWritingPos, bigramFlags); + return false; + } + if (mHasHistoricalInfo) { + if (!bigramListBuffer->writeUintAndAdvancePosition(bigramEntryToWrite->getProbability(), + Ver4DictConstants::PROBABILITY_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram probability. pos: %d, probability: %d", *entryWritingPos, + bigramEntryToWrite->getProbability()); + return false; + } + const HistoricalInfo *const historicalInfo = bigramEntryToWrite->getHistoricalInfo(); + if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getTimeStamp(), + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram timestamps. pos: %d, timestamp: %d", *entryWritingPos, + historicalInfo->getTimeStamp()); + return false; + } + if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getLevel(), + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram level. pos: %d, level: %d", *entryWritingPos, + historicalInfo->getLevel()); + return false; + } + if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getCount(), + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram count. pos: %d, count: %d", *entryWritingPos, + historicalInfo->getCount()); + return false; + } + } + const int targetTerminalIdToWrite = + (bigramEntryToWrite->getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) ? + Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID : + bigramEntryToWrite->getTargetTerminalId(); + if (!bigramListBuffer->writeUintAndAdvancePosition(targetTerminalIdToWrite, + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram target terminal id. pos: %d, target terminal id: %d", + *entryWritingPos, bigramEntryToWrite->getTargetTerminalId()); + return false; + } + return true; +} + +bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos) { + int readingPos = bigramListPos; + int writingPos = toPos; + bool hasNext = true; + while (hasNext) { + const BigramEntry bigramEntry = getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (!writeBigramEntryAndAdvancePosition(&bigramEntry, &writingPos)) { + AKLOGE("Cannot write bigram entry to copy. pos: %d", writingPos); + return false; + } + } + return true; +} + +bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const BigramDictContent *const originalBigramDictContent, + int *const outBigramEntryCount) { + for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); + it != terminalIdMap->end(); ++it) { + const int originalBigramListPos = + originalBigramDictContent->getBigramListHeadPos(it->first); + if (originalBigramListPos == NOT_A_DICT_POS) { + // This terminal does not have a bigram list. + continue; + } + const int bigramListPos = getContentBuffer()->getTailPosition(); + int bigramEntryCount = 0; + // Copy bigram list with GC from original content. + if (!runGCBigramList(originalBigramListPos, originalBigramDictContent, bigramListPos, + terminalIdMap, &bigramEntryCount)) { + AKLOGE("Cannot complete GC for the bigram list. original pos: %d, pos: %d", + originalBigramListPos, bigramListPos); + return false; + } + if (bigramEntryCount == 0) { + // All bigram entries are useless. This terminal does not have a bigram list. + continue; + } + *outBigramEntryCount += bigramEntryCount; + // Set bigram list position to the lookup table. + if (!getUpdatableAddressLookupTable()->set(it->second, bigramListPos)) { + AKLOGE("Cannot set bigram list position. terminal id: %d, pos: %d", + it->second, bigramListPos); + return false; + } + } + return true; +} + +// Returns whether GC for the bigram list was succeeded or not. +bool BigramDictContent::runGCBigramList(const int bigramListPos, + const BigramDictContent *const sourceBigramDictContent, const int toPos, + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + int *const outEntrycount) { + bool hasNext = true; + int readingPos = bigramListPos; + int writingPos = toPos; + int lastEntryPos = NOT_A_DICT_POS; + while (hasNext) { + const BigramEntry originalBigramEntry = + sourceBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = originalBigramEntry.hasNext(); + if (originalBigramEntry.getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) { + continue; + } + TerminalPositionLookupTable::TerminalIdMap::const_iterator it = + terminalIdMap->find(originalBigramEntry.getTargetTerminalId()); + if (it == terminalIdMap->end()) { + // Target word has been removed. + continue; + } + lastEntryPos = hasNext ? writingPos : NOT_A_DICT_POS; + const BigramEntry updatedBigramEntry = + originalBigramEntry.updateTargetTerminalIdAndGetEntry(it->second); + if (!writeBigramEntryAndAdvancePosition(&updatedBigramEntry, &writingPos)) { + AKLOGE("Cannot write bigram entry to run GC. pos: %d", writingPos); + return false; + } + *outEntrycount += 1; + } + if (lastEntryPos != NOT_A_DICT_POS) { + // Update has next flag in the last written entry. + const BigramEntry bigramEntry = getBigramEntry(lastEntryPos).updateHasNextAndGetEntry( + false /* hasNext */); + if (!writeBigramEntry(&bigramEntry, lastEntryPos)) { + AKLOGE("Cannot write bigram entry to set hasNext flag after GC. pos: %d", writingPos); + return false; + } + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h new file mode 100644 index 000000000..ba2a05209 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BIGRAM_DICT_CONTENT_H +#define LATINIME_BIGRAM_DICT_CONTENT_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +class BigramDictContent : public SparseTableDictContent { + public: + BigramDictContent(const char *const dictPath, const bool hasHistoricalInfo, + const bool isUpdatable) + : SparseTableDictContent(dictPath, + Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION, + Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION, + Ver4DictConstants::BIGRAM_FILE_EXTENSION, isUpdatable, + Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE), + mHasHistoricalInfo(hasHistoricalInfo) {} + + BigramDictContent(const bool hasHistoricalInfo) + : SparseTableDictContent(Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE), + mHasHistoricalInfo(hasHistoricalInfo) {} + + const BigramEntry getBigramEntry(const int bigramEntryPos) const { + int readingPos = bigramEntryPos; + return getBigramEntryAndAdvancePosition(&readingPos); + } + + const BigramEntry getBigramEntryAndAdvancePosition(int *const bigramEntryPos) const; + + // Returns head position of bigram list for a PtNode specified by terminalId. + int getBigramListHeadPos(const int terminalId) const { + const SparseTable *const addressLookupTable = getAddressLookupTable(); + if (!addressLookupTable->contains(terminalId)) { + return NOT_A_DICT_POS; + } + return addressLookupTable->get(terminalId); + } + + bool writeBigramEntry(const BigramEntry *const bigramEntryToWrite, const int entryWritingPos) { + int writingPos = entryWritingPos; + return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); + } + + bool writeBigramEntryAndAdvancePosition(const BigramEntry *const bigramEntryToWrite, + int *const entryWritingPos); + + bool createNewBigramList(const int terminalId) { + const int bigramListPos = getContentBuffer()->getTailPosition(); + return getUpdatableAddressLookupTable()->set(terminalId, bigramListPos); + } + + bool copyBigramList(const int bigramListPos, const int toPos); + + bool flushToFile(const char *const dictPath) const { + return flush(dictPath, Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION, + Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION, + Ver4DictConstants::BIGRAM_FILE_EXTENSION); + } + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const BigramDictContent *const originalBigramDictContent, + int *const outBigramEntryCount); + + private: + DISALLOW_COPY_AND_ASSIGN(BigramDictContent); + + int createAndGetBigramFlags(const int probability, const bool hasNext) const { + return (probability & Ver4DictConstants::BIGRAM_PROBABILITY_MASK) + | (hasNext ? Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0); + } + + bool runGCBigramList(const int bigramListPos, + const BigramDictContent *const sourceBigramDictContent, const int toPos, + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + int *const outEntryCount); + + bool mHasHistoricalInfo; +}; +} // namespace latinime +#endif /* LATINIME_BIGRAM_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h new file mode 100644 index 000000000..2b0cbd93b --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BIGRAM_ENTRY_H +#define LATINIME_BIGRAM_ENTRY_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/historical_info.h" + +namespace latinime { + +class BigramEntry { + public: + BigramEntry(const BigramEntry& bigramEntry) + : mHasNext(bigramEntry.mHasNext), mProbability(bigramEntry.mProbability), + mHistoricalInfo(), mTargetTerminalId(bigramEntry.mTargetTerminalId) {} + + // Entry with historical information. + BigramEntry(const bool hasNext, const int probability, const int targetTerminalId) + : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(), + mTargetTerminalId(targetTerminalId) {} + + // Entry with historical information. + BigramEntry(const bool hasNext, const int probability, + const HistoricalInfo *const historicalInfo, const int targetTerminalId) + : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(*historicalInfo), + mTargetTerminalId(targetTerminalId) {} + + const BigramEntry getInvalidatedEntry() const { + return updateTargetTerminalIdAndGetEntry(Ver4DictConstants::NOT_A_TERMINAL_ID); + } + + const BigramEntry updateHasNextAndGetEntry(const bool hasNext) const { + return BigramEntry(hasNext, mProbability, &mHistoricalInfo, mTargetTerminalId); + } + + const BigramEntry updateTargetTerminalIdAndGetEntry(const int newTargetTerminalId) const { + return BigramEntry(mHasNext, mProbability, &mHistoricalInfo, newTargetTerminalId); + } + + const BigramEntry updateProbabilityAndGetEntry(const int probability) const { + return BigramEntry(mHasNext, probability, &mHistoricalInfo, mTargetTerminalId); + } + + const BigramEntry updateHistoricalInfoAndGetEntry( + const HistoricalInfo *const historicalInfo) const { + return BigramEntry(mHasNext, mProbability, historicalInfo, mTargetTerminalId); + } + + bool isValid() const { + return mTargetTerminalId != Ver4DictConstants::NOT_A_TERMINAL_ID; + } + + bool hasNext() const { + return mHasNext; + } + + int getProbability() const { + return mProbability; + } + + bool hasHistoricalInfo() const { + return mHistoricalInfo.isValid(); + } + + const HistoricalInfo *getHistoricalInfo() const { + return &mHistoricalInfo; + } + + int getTargetTerminalId() const { + return mTargetTerminalId; + } + + private: + // Copy constructor is public to use this class as a type of return value. + DISALLOW_DEFAULT_CONSTRUCTOR(BigramEntry); + DISALLOW_ASSIGNMENT_OPERATOR(BigramEntry); + + const bool mHasNext; + const int mProbability; + const HistoricalInfo mHistoricalInfo; + const int mTargetTerminalId; +}; +} // namespace latinime +#endif /* LATINIME_BIGRAM_ENTRY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dict_content.h new file mode 100644 index 000000000..0c2f47073 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dict_content.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICT_CONTENT_H +#define LATINIME_DICT_CONTENT_H + +#include "defines.h" + +namespace latinime { + +class DictContent { + public: + virtual ~DictContent() {} + virtual bool isValid() const = 0; + + protected: + DictContent() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictContent); +}; +} // namespace latinime +#endif /* LATINIME_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.cpp new file mode 100644 index 000000000..3b7c70efd --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h" + +#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const ProbabilityEntry ProbabilityDictContent::getProbabilityEntry(const int terminalId) const { + if (terminalId < 0 || terminalId >= mSize) { + // This method can be called with invalid terminal id during GC. + return ProbabilityEntry(0 /* flags */, NOT_A_PROBABILITY); + } + const BufferWithExtendableBuffer *const buffer = getBuffer(); + int entryPos = getEntryPos(terminalId); + const int flags = buffer->readUintAndAdvancePosition( + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &entryPos); + const int probability = buffer->readUintAndAdvancePosition( + Ver4DictConstants::PROBABILITY_SIZE, &entryPos); + if (mHasHistoricalInfo) { + const int timestamp = buffer->readUintAndAdvancePosition( + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &entryPos); + const int level = buffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &entryPos); + const int count = buffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &entryPos); + const HistoricalInfo historicalInfo(timestamp, level, count); + return ProbabilityEntry(flags, probability, &historicalInfo); + } else { + return ProbabilityEntry(flags, probability); + } +} + +bool ProbabilityDictContent::setProbabilityEntry(const int terminalId, + const ProbabilityEntry *const probabilityEntry) { + if (terminalId < 0) { + return false; + } + const int entryPos = getEntryPos(terminalId); + if (terminalId >= mSize) { + ProbabilityEntry dummyEntry; + // Write new entry. + int writingPos = getBuffer()->getTailPosition(); + while (writingPos <= entryPos) { + // Fulfilling with dummy entries until writingPos. + if (!writeEntry(&dummyEntry, writingPos)) { + AKLOGE("Cannot write dummy entry. pos: %d, mSize: %d", writingPos, mSize); + return false; + } + writingPos += getEntrySize(); + mSize++; + } + } + return writeEntry(probabilityEntry, entryPos); +} + +bool ProbabilityDictContent::flushToFile(const char *const dictPath) const { + if (getEntryPos(mSize) < getBuffer()->getTailPosition()) { + ProbabilityDictContent probabilityDictContentToWrite(mHasHistoricalInfo); + for (int i = 0; i < mSize; ++i) { + const ProbabilityEntry probabilityEntry = getProbabilityEntry(i); + if (!probabilityDictContentToWrite.setProbabilityEntry(i, &probabilityEntry)) { + AKLOGE("Cannot set probability entry in flushToFile. terminalId: %d", i); + return false; + } + } + return probabilityDictContentToWrite.flush(dictPath, + Ver4DictConstants::FREQ_FILE_EXTENSION); + } else { + return flush(dictPath, Ver4DictConstants::FREQ_FILE_EXTENSION); + } +} + +bool ProbabilityDictContent::runGC( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ProbabilityDictContent *const originalProbabilityDictContent) { + mSize = 0; + for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); + it != terminalIdMap->end(); ++it) { + const ProbabilityEntry probabilityEntry = + originalProbabilityDictContent->getProbabilityEntry(it->first); + if (!setProbabilityEntry(it->second, &probabilityEntry)) { + AKLOGE("Cannot set probability entry in runGC. terminalId: %d", it->second); + return false; + } + mSize++; + } + return true; +} + +int ProbabilityDictContent::getEntrySize() const { + if (mHasHistoricalInfo) { + return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE + + Ver4DictConstants::PROBABILITY_SIZE + + Ver4DictConstants::TIME_STAMP_FIELD_SIZE + + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE; + } else { + return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE + + Ver4DictConstants::PROBABILITY_SIZE; + } +} + +int ProbabilityDictContent::getEntryPos(const int terminalId) const { + return terminalId * getEntrySize(); +} + +bool ProbabilityDictContent::writeEntry(const ProbabilityEntry *const probabilityEntry, + const int entryPos) { + BufferWithExtendableBuffer *const bufferToWrite = getWritableBuffer(); + int writingPos = entryPos; + if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getFlags(), + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) { + AKLOGE("Cannot write flags in probability dict content. pos: %d", writingPos); + return false; + } + if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getProbability(), + Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) { + AKLOGE("Cannot write probability in probability dict content. pos: %d", writingPos); + return false; + } + if (mHasHistoricalInfo) { + const HistoricalInfo *const historicalInfo = probabilityEntry->getHistoricalInfo(); + if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getTimeStamp(), + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &writingPos)) { + AKLOGE("Cannot write timestamp in probability dict content. pos: %d", writingPos); + return false; + } + if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getLevel(), + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &writingPos)) { + AKLOGE("Cannot write level in probability dict content. pos: %d", writingPos); + return false; + } + if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getCount(), + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &writingPos)) { + AKLOGE("Cannot write count in probability dict content. pos: %d", writingPos); + return false; + } + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h new file mode 100644 index 000000000..b065bc954 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROBABILITY_DICT_CONTENT_H +#define LATINIME_PROBABILITY_DICT_CONTENT_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +class ProbabilityEntry; + +class ProbabilityDictContent : public SingleDictContent { + public: + ProbabilityDictContent(const char *const dictPath, const bool hasHistoricalInfo, + const bool isUpdatable) + : SingleDictContent(dictPath, Ver4DictConstants::FREQ_FILE_EXTENSION, isUpdatable), + mHasHistoricalInfo(hasHistoricalInfo), + mSize(getBuffer()->getTailPosition() / getEntrySize()) {} + + ProbabilityDictContent(const bool hasHistoricalInfo) + : mHasHistoricalInfo(hasHistoricalInfo), mSize(0) {} + + const ProbabilityEntry getProbabilityEntry(const int terminalId) const; + + bool setProbabilityEntry(const int terminalId, const ProbabilityEntry *const probabilityEntry); + + bool flushToFile(const char *const dictPath) const; + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ProbabilityDictContent *const originalProbabilityDictContent); + + private: + DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent); + + int getEntrySize() const; + + int getEntryPos(const int terminalId) const; + + bool writeEntry(const ProbabilityEntry *const probabilityEntry, const int entryPos); + + bool mHasHistoricalInfo; + int mSize; +}; +} // namespace latinime +#endif /* LATINIME_PROBABILITY_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h new file mode 100644 index 000000000..36ba82be1 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROBABILITY_ENTRY_H +#define LATINIME_PROBABILITY_ENTRY_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/historical_info.h" + +namespace latinime { + +class ProbabilityEntry { + public: + ProbabilityEntry(const ProbabilityEntry &probabilityEntry) + : mFlags(probabilityEntry.mFlags), mProbability(probabilityEntry.mProbability), + mHistoricalInfo(probabilityEntry.mHistoricalInfo) {} + + // Dummy entry + ProbabilityEntry() + : mFlags(0), mProbability(NOT_A_PROBABILITY), mHistoricalInfo() {} + + // Entry without historical information + ProbabilityEntry(const int flags, const int probability) + : mFlags(flags), mProbability(probability), mHistoricalInfo() {} + + // Entry with historical information. + ProbabilityEntry(const int flags, const int probability, + const HistoricalInfo *const historicalInfo) + : mFlags(flags), mProbability(probability), mHistoricalInfo(*historicalInfo) {} + + const ProbabilityEntry createEntryWithUpdatedProbability(const int probability) const { + return ProbabilityEntry(mFlags, probability, &mHistoricalInfo); + } + + const ProbabilityEntry createEntryWithUpdatedHistoricalInfo( + const HistoricalInfo *const historicalInfo) const { + return ProbabilityEntry(mFlags, mProbability, historicalInfo); + } + + bool hasHistoricalInfo() const { + return mHistoricalInfo.isValid(); + } + + int getFlags() const { + return mFlags; + } + + int getProbability() const { + return mProbability; + } + + const HistoricalInfo *getHistoricalInfo() const { + return &mHistoricalInfo; + } + + private: + // Copy constructor is public to use this class as a type of return value. + DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry); + + const int mFlags; + const int mProbability; + const HistoricalInfo mHistoricalInfo; +}; +} // namespace latinime +#endif /* LATINIME_PROBABILITY_ENTRY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp new file mode 100644 index 000000000..29972a4e8 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h" + +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoint, int *const outCodePointCount, int *const outProbability, + bool *const outhasNext, int *const shortcutEntryPos) const { + const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer(); + const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); + if (outProbability) { + *outProbability = shortcutFlags & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK; + } + if (outhasNext) { + *outhasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; + } + if (outCodePoint && outCodePointCount) { + shortcutListBuffer->readCodePointsAndAdvancePosition( + maxCodePointCount, outCodePoint, outCodePointCount, shortcutEntryPos); + } +} + +int ShortcutDictContent::getShortcutListHeadPos(const int terminalId) const { + const SparseTable *const addressLookupTable = getAddressLookupTable(); + if (!addressLookupTable->contains(terminalId)) { + return NOT_A_DICT_POS; + } + return addressLookupTable->get(terminalId); +} + +bool ShortcutDictContent::flushToFile(const char *const dictPath) const { + return flush(dictPath, Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION, + Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION, + Ver4DictConstants::SHORTCUT_FILE_EXTENSION); +} + +bool ShortcutDictContent::runGC( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ShortcutDictContent *const originalShortcutDictContent) { + for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); + it != terminalIdMap->end(); ++it) { + const int originalShortcutListPos = + originalShortcutDictContent->getShortcutListHeadPos(it->first); + if (originalShortcutListPos == NOT_A_DICT_POS) { + continue; + } + const int shortcutListPos = getContentBuffer()->getTailPosition(); + // Copy shortcut list from original content. + if (!copyShortcutListFromDictContent(originalShortcutListPos, originalShortcutDictContent, + shortcutListPos)) { + AKLOGE("Cannot copy shortcut list during GC. original pos: %d, pos: %d", + originalShortcutListPos, shortcutListPos); + return false; + } + // Set shortcut list position to the lookup table. + if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) { + AKLOGE("Cannot set shortcut list position. terminal id: %d, pos: %d", + it->second, shortcutListPos); + return false; + } + } + return true; +} + +bool ShortcutDictContent::createNewShortcutList(const int terminalId) { + const int shortcutListListPos = getContentBuffer()->getTailPosition(); + return getUpdatableAddressLookupTable()->set(terminalId, shortcutListListPos); +} + +bool ShortcutDictContent::copyShortcutList(const int shortcutListPos, const int toPos) { + return copyShortcutListFromDictContent(shortcutListPos, this, toPos); +} + +bool ShortcutDictContent::copyShortcutListFromDictContent(const int shortcutListPos, + const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) { + bool hasNext = true; + int readingPos = shortcutListPos; + int writingPos = toPos; + int codePoints[MAX_WORD_LENGTH]; + while (hasNext) { + int probability = 0; + int codePointCount = 0; + sourceShortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, + codePoints, &codePointCount, &probability, &hasNext, &readingPos); + if (!writeShortcutEntryAndAdvancePosition(codePoints, codePointCount, probability, + hasNext, &writingPos)) { + AKLOGE("Cannot write shortcut entry to copy. pos: %d", writingPos); + return false; + } + } + return true; +} + +bool ShortcutDictContent::setProbability(const int probability, const int shortcutEntryPos) { + BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); + const int shortcutFlags = shortcutListBuffer->readUint( + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); + const bool hasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; + const int shortcutFlagsToWrite = createAndGetShortcutFlags(probability, hasNext); + return shortcutListBuffer->writeUint(shortcutFlagsToWrite, + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); +} + +bool ShortcutDictContent::writeShortcutEntryAndAdvancePosition(const int *const codePoint, + const int codePointCount, const int probability, const bool hasNext, + int *const shortcutEntryPos) { + BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); + const int shortcutFlags = createAndGetShortcutFlags(probability, hasNext); + if (!shortcutListBuffer->writeUintAndAdvancePosition(shortcutFlags, + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos)) { + AKLOGE("Cannot write shortcut flags. flags; %x, pos: %d", shortcutFlags, *shortcutEntryPos); + return false; + } + if (!shortcutListBuffer->writeCodePointsAndAdvancePosition(codePoint, codePointCount, + true /* writesTerminator */, shortcutEntryPos)) { + AKLOGE("Cannot write shortcut target code points. pos: %d", *shortcutEntryPos); + return false; + } + return true; +} + +// Find a shortcut entry that has specified target and return its position. +int ShortcutDictContent::findShortcutEntryAndGetPos(const int shortcutListPos, + const int *const targetCodePointsToFind, const int codePointCount) const { + bool hasNext = true; + int readingPos = shortcutListPos; + int targetCodePoints[MAX_WORD_LENGTH]; + while (hasNext) { + const int entryPos = readingPos; + int probability = 0; + int targetCodePointCount = 0; + getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, targetCodePoints, &targetCodePointCount, + &probability, &hasNext, &readingPos); + if (targetCodePointCount != codePointCount) { + continue; + } + bool matched = true; + for (int i = 0; i < codePointCount; ++i) { + if (targetCodePointsToFind[i] != targetCodePoints[i]) { + matched = false; + break; + } + } + if (matched) { + return entryPos; + } + } + return NOT_A_DICT_POS; +} + +int ShortcutDictContent::createAndGetShortcutFlags(const int probability, + const bool hasNext) const { + return (probability & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK) + | (hasNext ? Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK : 0); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h new file mode 100644 index 000000000..eaafc27bc --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SHORTCUT_DICT_CONTENT_H +#define LATINIME_SHORTCUT_DICT_CONTENT_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +class ShortcutDictContent : public SparseTableDictContent { + public: + ShortcutDictContent(const char *const dictPath, const bool isUpdatable) + : SparseTableDictContent(dictPath, + Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION, + Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION, + Ver4DictConstants::SHORTCUT_FILE_EXTENSION, isUpdatable, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} + + ShortcutDictContent() + : SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} + + void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, int *const outProbability, bool *const outhasNext, + const int shortcutEntryPos) { + int readingPos = shortcutEntryPos; + return getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint, + outCodePointCount, outProbability, outhasNext, &readingPos); + } + + void getShortcutEntryAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoint, int *const outCodePointCount, int *const outProbability, + bool *const outhasNext, int *const shortcutEntryPos) const; + + // Returns head position of shortcut list for a PtNode specified by terminalId. + int getShortcutListHeadPos(const int terminalId) const; + + bool flushToFile(const char *const dictPath) const; + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ShortcutDictContent *const originalShortcutDictContent); + + bool createNewShortcutList(const int terminalId); + + bool copyShortcutList(const int shortcutListPos, const int toPos); + + bool setProbability(const int probability, const int shortcutEntryPos); + + bool writeShortcutEntry(const int *const codePoint, const int codePointCount, + const int probability, const bool hasNext, const int shortcutEntryPos) { + int writingPos = shortcutEntryPos; + return writeShortcutEntryAndAdvancePosition(codePoint, codePointCount, probability, + hasNext, &writingPos); + } + + bool writeShortcutEntryAndAdvancePosition(const int *const codePoint, + const int codePointCount, const int probability, const bool hasNext, + int *const shortcutEntryPos); + + int findShortcutEntryAndGetPos(const int shortcutListPos, + const int *const targetCodePointsToFind, const int codePointCount) const; + + private: + DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent); + + bool copyShortcutListFromDictContent(const int shortcutListPos, + const ShortcutDictContent *const sourceShortcutDictContent, const int toPos); + + int createAndGetShortcutFlags(const int probability, const bool hasNext) const; +}; +} // namespace latinime +#endif /* LATINIME_SHORTCUT_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h new file mode 100644 index 000000000..9064b7e72 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SINGLE_DICT_CONTENT_H +#define LATINIME_SINGLE_DICT_CONTENT_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" + +namespace latinime { + +class SingleDictContent : public DictContent { + public: + SingleDictContent(const char *const dictPath, const char *const contentFileName, + const bool isUpdatable) + : mMmappedBuffer(MmappedBuffer::openBuffer(dictPath, contentFileName, isUpdatable)), + mExpandableContentBuffer(mMmappedBuffer.get() ? mMmappedBuffer.get()->getBuffer() : 0, + mMmappedBuffer.get() ? mMmappedBuffer.get()->getBufferSize() : 0, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mIsValid(mMmappedBuffer.get() != 0) {} + + SingleDictContent() + : mMmappedBuffer(0), mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mIsValid(true) {} + + virtual ~SingleDictContent() {} + + virtual bool isValid() const { + return mIsValid; + } + + bool isNearSizeLimit() const { + return mExpandableContentBuffer.isNearSizeLimit(); + } + + protected: + BufferWithExtendableBuffer *getWritableBuffer() { + return &mExpandableContentBuffer; + } + + const BufferWithExtendableBuffer *getBuffer() const { + return &mExpandableContentBuffer; + } + + bool flush(const char *const dictPath, const char *const contentFileNameSuffix) const { + return DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, + contentFileNameSuffix, &mExpandableContentBuffer); + } + + private: + DISALLOW_COPY_AND_ASSIGN(SingleDictContent); + + const MmappedBuffer::MmappedBufferPtr mMmappedBuffer; + BufferWithExtendableBuffer mExpandableContentBuffer; + const bool mIsValid; +}; +} // namespace latinime +#endif /* LATINIME_SINGLE_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp new file mode 100644 index 000000000..63c6ea3a4 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h" + +namespace latinime { + +bool SparseTableDictContent::flush(const char *const dictPath, + const char *const lookupTableFileNameSuffix, const char *const addressTableFileNameSuffix, + const char *const contentFileNameSuffix) const { + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, lookupTableFileNameSuffix, + &mExpandableLookupTableBuffer)){ + return false; + } + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, addressTableFileNameSuffix, + &mExpandableAddressTableBuffer)) { + return false; + } + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, contentFileNameSuffix, + &mExpandableContentBuffer)) { + return false; + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h new file mode 100644 index 000000000..a82e3f50a --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SPARSE_TABLE_DICT_CONTENT_H +#define LATINIME_SPARSE_TABLE_DICT_CONTENT_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" +#include "suggest/policyimpl/dictionary/utils/sparse_table.h" + +namespace latinime { + +// TODO: Support multiple contents. +class SparseTableDictContent : public DictContent { + public: + AK_FORCE_INLINE SparseTableDictContent(const char *const dictPath, + const char *const lookupTableFileName, const char *const addressTableFileName, + const char *const contentFileName, const bool isUpdatable, + const int sparseTableBlockSize, const int sparseTableDataSize) + : mLookupTableBuffer( + MmappedBuffer::openBuffer(dictPath, lookupTableFileName, isUpdatable)), + mAddressTableBuffer( + MmappedBuffer::openBuffer(dictPath, addressTableFileName, isUpdatable)), + mContentBuffer(MmappedBuffer::openBuffer(dictPath, contentFileName, isUpdatable)), + mExpandableLookupTableBuffer( + mLookupTableBuffer.get() ? mLookupTableBuffer.get()->getBuffer() : 0, + mLookupTableBuffer.get() ? mLookupTableBuffer.get()->getBufferSize() : 0, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mExpandableAddressTableBuffer( + mAddressTableBuffer.get() ? mAddressTableBuffer.get()->getBuffer() : 0, + mAddressTableBuffer.get() ? mAddressTableBuffer.get()->getBufferSize() : 0, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mExpandableContentBuffer(mContentBuffer.get() ? mContentBuffer.get()->getBuffer() : 0, + mContentBuffer.get() ? mContentBuffer.get()->getBufferSize() : 0, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer, + sparseTableBlockSize, sparseTableDataSize), + mIsValid(mLookupTableBuffer.get() != 0 && mAddressTableBuffer.get() != 0 + && mContentBuffer.get() != 0) {} + + SparseTableDictContent(const int sparseTableBlockSize, const int sparseTableDataSize) + : mLookupTableBuffer(0), mAddressTableBuffer(0), mContentBuffer(0), + mExpandableLookupTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mExpandableAddressTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer, + sparseTableBlockSize, sparseTableDataSize), mIsValid(true) {} + + virtual ~SparseTableDictContent() {} + + virtual bool isValid() const { + return mIsValid; + } + + bool isNearSizeLimit() const { + return mExpandableLookupTableBuffer.isNearSizeLimit() + || mExpandableAddressTableBuffer.isNearSizeLimit() + || mExpandableContentBuffer.isNearSizeLimit(); + } + + protected: + SparseTable *getUpdatableAddressLookupTable() { + return &mAddressLookupTable; + } + + const SparseTable *getAddressLookupTable() const { + return &mAddressLookupTable; + } + + BufferWithExtendableBuffer *getWritableContentBuffer() { + return &mExpandableContentBuffer; + } + + const BufferWithExtendableBuffer *getContentBuffer() const { + return &mExpandableContentBuffer; + } + + bool flush(const char *const dictDirPath, const char *const lookupTableFileName, + const char *const addressTableFileName, const char *const contentFileName) const; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTableDictContent); + + const MmappedBuffer::MmappedBufferPtr mLookupTableBuffer; + const MmappedBuffer::MmappedBufferPtr mAddressTableBuffer; + const MmappedBuffer::MmappedBufferPtr mContentBuffer; + BufferWithExtendableBuffer mExpandableLookupTableBuffer; + BufferWithExtendableBuffer mExpandableAddressTableBuffer; + BufferWithExtendableBuffer mExpandableContentBuffer; + SparseTable mAddressLookupTable; + const bool mIsValid; +}; +} // namespace latinime +#endif /* LATINIME_SPARSE_TABLE_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp new file mode 100644 index 000000000..0b17a009d --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" + +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +int TerminalPositionLookupTable::getTerminalPtNodePosition(const int terminalId) const { + if (terminalId < 0 || terminalId >= mSize) { + return NOT_A_DICT_POS; + } + const int terminalPos = getBuffer()->readUint( + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId)); + return (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) ? + NOT_A_DICT_POS : terminalPos; +} + +bool TerminalPositionLookupTable::setTerminalPtNodePosition( + const int terminalId, const int terminalPtNodePos) { + if (terminalId < 0) { + return NOT_A_DICT_POS; + } + while (terminalId >= mSize) { + // Write new entry. + if (!getWritableBuffer()->writeUint(Ver4DictConstants::NOT_A_TERMINAL_ADDRESS, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(mSize))) { + return false; + } + mSize++; + } + const int terminalPos = (terminalPtNodePos != NOT_A_DICT_POS) ? + terminalPtNodePos : Ver4DictConstants::NOT_A_TERMINAL_ADDRESS; + return getWritableBuffer()->writeUint(terminalPos, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId)); +} + +bool TerminalPositionLookupTable::flushToFile(const char *const dictPath) const { + // If the used buffer size is smaller than the actual buffer size, regenerate the lookup + // table and write the new table to the file. + if (getEntryPos(mSize) < getBuffer()->getTailPosition()) { + TerminalPositionLookupTable lookupTableToWrite; + for (int i = 0; i < mSize; ++i) { + const int terminalPtNodePosition = getTerminalPtNodePosition(i); + if (!lookupTableToWrite.setTerminalPtNodePosition(i, terminalPtNodePosition)) { + AKLOGE("Cannot set terminal position to lookupTableToWrite." + " terminalId: %d, position: %d", i, terminalPtNodePosition); + return false; + } + } + return lookupTableToWrite.flush(dictPath, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); + } else { + // We can simply use this lookup table because the buffer size has not been + // changed. + return flush(dictPath, Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); + } +} + +bool TerminalPositionLookupTable::runGCTerminalIds(TerminalIdMap *const terminalIdMap) { + int removedEntryCount = 0; + int nextNewTerminalId = 0; + for (int i = 0; i < mSize; ++i) { + const int terminalPos = getBuffer()->readUint( + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(i)); + if (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) { + // This entry is a garbage. + removedEntryCount++; + } else { + // Give a new terminal id to the entry. + if (!getWritableBuffer()->writeUint(terminalPos, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, + getEntryPos(nextNewTerminalId))) { + return false; + } + // Memorize the mapping to the old terminal id to the new terminal id. + terminalIdMap->insert(TerminalIdMap::value_type(i, nextNewTerminalId)); + nextNewTerminalId++; + } + } + mSize = nextNewTerminalId; + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h new file mode 100644 index 000000000..f73e22754 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H +#define LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "utils/hash_map_compat.h" + +namespace latinime { + +class TerminalPositionLookupTable : public SingleDictContent { + public: + typedef hash_map_compat<int, int> TerminalIdMap; + + TerminalPositionLookupTable(const char *const dictPath, const bool isUpdatable) + : SingleDictContent(dictPath, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION, isUpdatable), + mSize(getBuffer()->getTailPosition() + / Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE) {} + + TerminalPositionLookupTable() : mSize(0) {} + + int getTerminalPtNodePosition(const int terminalId) const; + + bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos); + + int getNextTerminalId() const { + return mSize; + } + + bool flushToFile(const char *const dictPath) const; + + bool runGCTerminalIds(TerminalIdMap *const terminalIdMap); + + private: + DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable); + + int getEntryPos(const int terminalId) const { + return terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; + } + + int mSize; +}; +} // namespace latinime +#endif // LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp new file mode 100644 index 000000000..59dedee72 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" + +#include <cerrno> +#include <cstring> +#include <sys/stat.h> +#include <sys/types.h> + +#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/file_utils.h" + +namespace latinime { + +/* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers( + const char *const dictPath, const MmappedBuffer::MmappedBufferPtr &headerBuffer) { + if (!headerBuffer.get()) { + ASSERT(false); + AKLOGE("The header buffer must be valid to open ver4 dict buffers."); + return Ver4DictBuffersPtr(0); + } + // TODO: take only dictDirPath, and open both header and trie files in the constructor below + return Ver4DictBuffersPtr(new Ver4DictBuffers( + dictPath, headerBuffer, headerBuffer.get()->isUpdatable())); +} + +bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath, + const BufferWithExtendableBuffer *const headerBuffer) const { + // Create temporary directory. + const int tmpDirPathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictDirPath, + DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE); + char tmpDirPath[tmpDirPathBufSize]; + FileUtils::getFilePathWithSuffix(dictDirPath, + DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE, tmpDirPathBufSize, + tmpDirPath); + if (FileUtils::existsDir(tmpDirPath)) { + if (!FileUtils::removeDirAndFiles(tmpDirPath)) { + AKLOGE("Existing directory %s cannot be removed.", tmpDirPath); + ASSERT(false); + return false; + } + } + if (mkdir(tmpDirPath, S_IRWXU) == -1) { + AKLOGE("Cannot create directory: %s. errno: %d.", tmpDirPath, errno); + return false; + } + // Get dictionary base path. + const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */; + char dictName[dictNameBufSize]; + FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName); + const int dictPathBufSize = FileUtils::getFilePathBufSize(tmpDirPath, dictName); + char dictPath[dictPathBufSize]; + FileUtils::getFilePath(tmpDirPath, dictName, dictPathBufSize, dictPath); + + // Write header file. + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, + Ver4DictConstants::HEADER_FILE_EXTENSION, headerBuffer)) { + AKLOGE("Dictionary header file %s%s cannot be written.", tmpDirPath, + Ver4DictConstants::HEADER_FILE_EXTENSION); + return false; + } + // Write trie file. + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, + Ver4DictConstants::TRIE_FILE_EXTENSION, &mExpandableTrieBuffer)) { + AKLOGE("Dictionary trie file %s%s cannot be written.", tmpDirPath, + Ver4DictConstants::TRIE_FILE_EXTENSION); + return false; + } + // Write dictionary contents. + if (!mTerminalPositionLookupTable.flushToFile(dictPath)) { + AKLOGE("Terminal position lookup table cannot be written. %s", tmpDirPath); + return false; + } + if (!mProbabilityDictContent.flushToFile(dictPath)) { + AKLOGE("Probability dict content cannot be written. %s", tmpDirPath); + return false; + } + if (!mBigramDictContent.flushToFile(dictPath)) { + AKLOGE("Bigram dict content cannot be written. %s", tmpDirPath); + return false; + } + if (!mShortcutDictContent.flushToFile(dictPath)) { + AKLOGE("Shortcut dict content cannot be written. %s", tmpDirPath); + return false; + } + // Remove existing dictionary. + if (!FileUtils::removeDirAndFiles(dictDirPath)) { + AKLOGE("Existing directory %s cannot be removed.", dictDirPath); + ASSERT(false); + return false; + } + // Rename temporary directory. + if (rename(tmpDirPath, dictDirPath) != 0) { + AKLOGE("%s cannot be renamed to %s", tmpDirPath, dictDirPath); + ASSERT(false); + return false; + } + return true; +} + +Ver4DictBuffers::Ver4DictBuffers(const char *const dictPath, + const MmappedBuffer::MmappedBufferPtr &headerBuffer, const bool isUpdatable) + : mHeaderBuffer(headerBuffer), + mDictBuffer(MmappedBuffer::openBuffer(dictPath, + Ver4DictConstants::TRIE_FILE_EXTENSION, isUpdatable)), + mHeaderPolicy(headerBuffer.get()->getBuffer(), FormatUtils::VERSION_4), + mExpandableHeaderBuffer(headerBuffer.get() ? headerBuffer.get()->getBuffer() : 0, + mHeaderPolicy.getSize(), + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mExpandableTrieBuffer(mDictBuffer.get() ? mDictBuffer.get()->getBuffer() : 0, + mDictBuffer.get() ? mDictBuffer.get()->getBufferSize() : 0, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mTerminalPositionLookupTable(dictPath, isUpdatable), + mProbabilityDictContent(dictPath, mHeaderPolicy.hasHistoricalInfoOfWords(), + isUpdatable), + mBigramDictContent(dictPath, mHeaderPolicy.hasHistoricalInfoOfWords(), + isUpdatable), + mShortcutDictContent(dictPath, isUpdatable), + mIsUpdatable(isUpdatable) {} + +Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy) + : mHeaderBuffer(0), mDictBuffer(0), mHeaderPolicy(), + mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mExpandableTrieBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mTerminalPositionLookupTable(), + mProbabilityDictContent(headerPolicy->hasHistoricalInfoOfWords()), + mBigramDictContent(headerPolicy->hasHistoricalInfoOfWords()), mShortcutDictContent(), + mIsUpdatable(true) {} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h new file mode 100644 index 000000000..776bb9882 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_DICT_BUFFER_H +#define LATINIME_VER4_DICT_BUFFER_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" + +namespace latinime { + +class Ver4DictBuffers { + public: + typedef ExclusiveOwnershipPointer<Ver4DictBuffers> Ver4DictBuffersPtr; + + static Ver4DictBuffersPtr openVer4DictBuffers(const char *const dictDirPath, + const MmappedBuffer::MmappedBufferPtr &headerBuffer); + + static AK_FORCE_INLINE Ver4DictBuffersPtr createVer4DictBuffers( + const HeaderPolicy *const headerPolicy) { + return Ver4DictBuffersPtr(new Ver4DictBuffers(headerPolicy)); + } + + AK_FORCE_INLINE bool isValid() const { + return mHeaderBuffer.get() && mDictBuffer.get() && mHeaderPolicy.isValid() + && mProbabilityDictContent.isValid() && mTerminalPositionLookupTable.isValid() + && mBigramDictContent.isValid() && mShortcutDictContent.isValid(); + } + + AK_FORCE_INLINE bool isNearSizeLimit() const { + return mExpandableTrieBuffer.isNearSizeLimit() + || mTerminalPositionLookupTable.isNearSizeLimit() + || mProbabilityDictContent.isNearSizeLimit() + || mBigramDictContent.isNearSizeLimit() + || mShortcutDictContent.isNearSizeLimit(); + } + + AK_FORCE_INLINE const HeaderPolicy *getHeaderPolicy() const { + return &mHeaderPolicy; + } + + AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableHeaderBuffer() { + return &mExpandableHeaderBuffer; + } + + AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableTrieBuffer() { + return &mExpandableTrieBuffer; + } + + AK_FORCE_INLINE const BufferWithExtendableBuffer *getTrieBuffer() const { + return &mExpandableTrieBuffer; + } + + AK_FORCE_INLINE TerminalPositionLookupTable *getMutableTerminalPositionLookupTable() { + return &mTerminalPositionLookupTable; + } + + AK_FORCE_INLINE const TerminalPositionLookupTable *getTerminalPositionLookupTable() const { + return &mTerminalPositionLookupTable; + } + + AK_FORCE_INLINE ProbabilityDictContent *getMutableProbabilityDictContent() { + return &mProbabilityDictContent; + } + + AK_FORCE_INLINE const ProbabilityDictContent *getProbabilityDictContent() const { + return &mProbabilityDictContent; + } + + AK_FORCE_INLINE BigramDictContent *getMutableBigramDictContent() { + return &mBigramDictContent; + } + + AK_FORCE_INLINE const BigramDictContent *getBigramDictContent() const { + return &mBigramDictContent; + } + + AK_FORCE_INLINE ShortcutDictContent *getMutableShortcutDictContent() { + return &mShortcutDictContent; + } + + AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const { + return &mShortcutDictContent; + } + + AK_FORCE_INLINE bool isUpdatable() const { + return mIsUpdatable; + } + + bool flush(const char *const dictDirPath) const { + return flushHeaderAndDictBuffers(dictDirPath, &mExpandableHeaderBuffer); + } + + bool flushHeaderAndDictBuffers(const char *const dictDirPath, + const BufferWithExtendableBuffer *const headerBuffer) const; + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4DictBuffers); + + Ver4DictBuffers(const char *const dictDirPath, + const MmappedBuffer::MmappedBufferPtr &headerBuffer, const bool isUpdatable); + + Ver4DictBuffers(const HeaderPolicy *const headerPolicy); + + const MmappedBuffer::MmappedBufferPtr mHeaderBuffer; + const MmappedBuffer::MmappedBufferPtr mDictBuffer; + const HeaderPolicy mHeaderPolicy; + BufferWithExtendableBuffer mExpandableHeaderBuffer; + BufferWithExtendableBuffer mExpandableTrieBuffer; + TerminalPositionLookupTable mTerminalPositionLookupTable; + ProbabilityDictContent mProbabilityDictContent; + BigramDictContent mBigramDictContent; + ShortcutDictContent mShortcutDictContent; + const int mIsUpdatable; +}; +} // namespace latinime +#endif /* LATINIME_VER4_DICT_BUFFER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp new file mode 100644 index 000000000..deed010cd --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +// These values MUST match the definitions in FormatSpec.java. +const char *const Ver4DictConstants::TRIE_FILE_EXTENSION = ".trie"; +const char *const Ver4DictConstants::HEADER_FILE_EXTENSION = ".header"; +const char *const Ver4DictConstants::FREQ_FILE_EXTENSION = ".freq"; +// tat = Terminal Address Table +const char *const Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; +const char *const Ver4DictConstants::BIGRAM_FILE_EXTENSION = ".bigram_freq"; +const char *const Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION = ".bigram_lookup"; +const char *const Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION = ".bigram_index_freq"; +const char *const Ver4DictConstants::SHORTCUT_FILE_EXTENSION = ".shortcut_shortcut"; +const char *const Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION = ".shortcut_lookup"; +const char *const Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION = + ".shortcut_index_shortcut"; + +// Version 4 dictionary size is implicitly limited to 8MB due to 3-byte offsets. +const int Ver4DictConstants::MAX_DICTIONARY_SIZE = 8 * 1024 * 1024; +// Extended region size, which is not GCed region size in dict file + additional buffer size, is +// limited to 1MB to prevent from inefficient traversing. +const int Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE = 1 * 1024 * 1024; + +const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1; +const int Ver4DictConstants::PROBABILITY_SIZE = 1; +const int Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1; +const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; +const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0; +const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4; +const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4; +const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1; +const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1; + +const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16; +const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE = 4; +const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; +const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4; + +const int Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE = 3; +// Unsigned int max value of BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE-byte is used for representing +// invalid terminal ID in bigram lists. +const int Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID = + (1 << (BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE * 8)) - 1; +const int Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE = 1; +const int Ver4DictConstants::BIGRAM_PROBABILITY_MASK = 0x0F; +const int Ver4DictConstants::BIGRAM_HAS_NEXT_MASK = 0x80; +const int Ver4DictConstants::BIGRAM_LARGE_PROBABILITY_FIELD_SIZE = 1; + +const int Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1; +const int Ver4DictConstants::SHORTCUT_PROBABILITY_MASK = 0x0F; +const int Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK = 0x80; + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h new file mode 100644 index 000000000..d6d22c5c1 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_DICT_CONSTANTS_H +#define LATINIME_VER4_DICT_CONSTANTS_H + +#include "defines.h" + +namespace latinime { + +// TODO: Create PtConstants under the pt_common and move some constant values there. +// Note that there are corresponding definitions in FormatSpec.java. +class Ver4DictConstants { + public: + static const char *const TRIE_FILE_EXTENSION; + static const char *const HEADER_FILE_EXTENSION; + static const char *const FREQ_FILE_EXTENSION; + static const char *const TERMINAL_ADDRESS_TABLE_FILE_EXTENSION; + static const char *const BIGRAM_FILE_EXTENSION; + static const char *const BIGRAM_LOOKUP_TABLE_FILE_EXTENSION; + static const char *const BIGRAM_CONTENT_TABLE_FILE_EXTENSION; + static const char *const SHORTCUT_FILE_EXTENSION; + static const char *const SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION; + static const char *const SHORTCUT_CONTENT_TABLE_FILE_EXTENSION; + + static const int MAX_DICTIONARY_SIZE; + static const int MAX_DICT_EXTENDED_REGION_SIZE; + + static const int NOT_A_TERMINAL_ID; + static const int PROBABILITY_SIZE; + static const int FLAGS_IN_PROBABILITY_FILE_SIZE; + static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; + static const int NOT_A_TERMINAL_ADDRESS; + static const int TERMINAL_ID_FIELD_SIZE; + static const int TIME_STAMP_FIELD_SIZE; + static const int WORD_LEVEL_FIELD_SIZE; + static const int WORD_COUNT_FIELD_SIZE; + + static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE; + static const int BIGRAM_ADDRESS_TABLE_DATA_SIZE; + static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE; + static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE; + + static const int BIGRAM_FLAGS_FIELD_SIZE; + static const int BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; + static const int INVALID_BIGRAM_TARGET_TERMINAL_ID; + static const int BIGRAM_PROBABILITY_MASK; + static const int BIGRAM_HAS_NEXT_MASK; + // Used when bigram list has time stamp. + static const int BIGRAM_LARGE_PROBABILITY_FIELD_SIZE; + + static const int SHORTCUT_FLAGS_FIELD_SIZE; + static const int SHORTCUT_PROBABILITY_MASK; + static const int SHORTCUT_HAS_NEXT_MASK; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4DictConstants); +}; +} // namespace latinime +#endif /* LATINIME_VER4_DICT_CONSTANTS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp new file mode 100644 index 000000000..17fc9483b --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" + +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { + +const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode( + const int ptNodePos, const int siblingNodePos) const { + if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) { + // Reading invalid position because of bug or broken dictionary. + AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d", + ptNodePos, mBuffer->getTailPosition()); + ASSERT(false); + return PtNodeParams(); + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int pos = ptNodePos; + const int headPos = ptNodePos; + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const int parentPosOffset = + DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( + dictBuf, &pos); + const int parentPos = + DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); + int codePoints[MAX_WORD_LENGTH]; + const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( + dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos); + int terminalIdFieldPos = NOT_A_DICT_POS; + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + int probability = NOT_A_PROBABILITY; + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + terminalIdFieldPos = pos; + if (usesAdditionalBuffer) { + terminalIdFieldPos += mBuffer->getOriginalBufferSize(); + } + terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos); + const ProbabilityEntry probabilityEntry = + mProbabilityDictContent->getProbabilityEntry(terminalId); + if (probabilityEntry.hasHistoricalInfo()) { + probability = ForgettingCurveUtils::decodeProbability( + probabilityEntry.getHistoricalInfo()); + } else { + probability = probabilityEntry.getProbability(); + } + } + int childrenPosFieldPos = pos; + if (usesAdditionalBuffer) { + childrenPosFieldPos += mBuffer->getOriginalBufferSize(); + } + int childrenPos = DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition( + dictBuf, &pos); + if (usesAdditionalBuffer && childrenPos != NOT_A_DICT_POS) { + childrenPos += mBuffer->getOriginalBufferSize(); + } + if (usesAdditionalBuffer) { + pos += mBuffer->getOriginalBufferSize(); + } + // Sibling position is the tail position of original PtNode. + int newSiblingNodePos = (siblingNodePos == NOT_A_DICT_POS) ? pos : siblingNodePos; + // Read destination node if the read node is a moved node. + if (DynamicPtReadingUtils::isMoved(flags)) { + // The destination position is stored at the same place as the parent position. + return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos); + } else { + return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints, + terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos, + newSiblingNodePos); + } +} + +} diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h new file mode 100644 index 000000000..9d932457c --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H +#define LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class ProbabilityDictContent; + +/* + * This class is used for helping to read nodes of ver4 patricia trie. This class handles moved + * node and reads node attributes including probability form probabilityBuffer. + */ +class Ver4PatriciaTrieNodeReader : public PtNodeReader { + public: + Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer, + const ProbabilityDictContent *const probabilityDictContent) + : mBuffer(buffer), mProbabilityDictContent(probabilityDictContent) {} + + ~Ver4PatriciaTrieNodeReader() {} + + virtual const PtNodeParams fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) const { + return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos, + NOT_A_DICT_POS /* siblingNodePos */); + } + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader); + + const BufferWithExtendableBuffer *const mBuffer; + const ProbabilityDictContent *const mProbabilityDictContent; + + const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos, + const int siblingNodePos) const; +}; +} // namespace latinime +#endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp new file mode 100644 index 000000000..32576cf0a --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -0,0 +1,411 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" + +#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { + +const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3; + +bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted( + const PtNodeParams *const toBeUpdatedPtNodeParams) { + int pos = toBeUpdatedPtNodeParams->getHeadPos(); + const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mTrieBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, + true /* isDeleted */, false /* willBecomeNonTerminal */); + int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); + // Update flags. + if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, + &writingPos)) { + return false; + } + if (toBeUpdatedPtNodeParams->isTerminal()) { + // The PtNode is a terminal. Delete entry from the terminal position lookup table. + return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( + toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */); + } else { + return true; + } +} + +bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const int movedPos, const int bigramLinkedNodePos) { + int pos = toBeUpdatedPtNodeParams->getHeadPos(); + const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mTrieBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */, + false /* isDeleted */, false /* willBecomeNonTerminal */); + int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); + // Update flags. + if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, + &writingPos)) { + return false; + } + // Update moved position, which is stored in the parent offset field. + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( + mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { + return false; + } + if (toBeUpdatedPtNodeParams->hasChildren()) { + // Update children's parent position. + mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos()); + while (!mReadingHelper.isEnd()) { + const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams()); + int parentOffsetFieldPos = childPtNodeParams.getHeadPos() + + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( + mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(), + &parentOffsetFieldPos)) { + // Parent offset cannot be written because of a bug or a broken dictionary; thus, + // we give up to update dictionary. + return false; + } + mReadingHelper.readNextSiblingNode(childPtNodeParams); + } + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal( + const PtNodeParams *const toBeUpdatedPtNodeParams) { + int pos = toBeUpdatedPtNodeParams->getHeadPos(); + const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mTrieBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, + false /* isDeleted */, true /* willBecomeNonTerminal */); + if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( + toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) { + AKLOGE("Cannot update terminal position lookup table. terminal id: %d", + toBeUpdatedPtNodeParams->getTerminalId()); + return false; + } + // Update flags. + int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); + return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, + &writingPos); +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbability( + const PtNodeParams *const toBeUpdatedPtNodeParams, const int newProbability, + const int timestamp) { + if (!toBeUpdatedPtNodeParams->isTerminal()) { + return false; + } + const ProbabilityEntry originalProbabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId()); + const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry, + newProbability, timestamp); + return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry); +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) { + if (!toBeUpdatedPtNodeParams->isTerminal()) { + AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode."); + return false; + } + const ProbabilityEntry originalProbabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId()); + if (originalProbabilityEntry.hasHistoricalInfo()) { + const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( + originalProbabilityEntry.getHistoricalInfo()); + const ProbabilityEntry probabilityEntry = + originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo); + if (!mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) { + AKLOGE("Cannot write updated probability entry. terminalId: %d", + toBeUpdatedPtNodeParams->getTerminalId()); + return false; + } + const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo); + if (!isValid) { + if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { + AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); + return false; + } + } + *outNeedsToKeepPtNode = isValid; + } else { + // No need to update probability. + *outNeedsToKeepPtNode = true; + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition( + const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) { + int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos(); + return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, + newChildrenPosition, &childrenPosFieldPos); +} + +bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newTerminalId) { + return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, + toBeUpdatedPtNodeParams->getTerminalIdFieldPos()); +} + +bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition( + const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) { + return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */, + ptNodeWritingPos); +} + + +bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( + const PtNodeParams *const ptNodeParams, const int timestamp, int *const ptNodeWritingPos) { + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId, + ptNodeWritingPos)) { + return false; + } + // Write probability. + ProbabilityEntry newProbabilityEntry; + const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom( + &newProbabilityEntry, ptNodeParams->getProbability(), timestamp); + return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(terminalId, + &probabilityEntryToWrite); +} + +bool Ver4PatriciaTrieNodeWriter::addNewBigramEntry( + const PtNodeParams *const sourcePtNodeParams, + const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp, + bool *const outAddedNewBigram) { + if (!mBigramPolicy->addNewEntry(sourcePtNodeParams->getTerminalId(), + targetPtNodeParam->getTerminalId(), probability, timestamp, outAddedNewBigram)) { + AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d", + sourcePtNodeParams->getTerminalId(), targetPtNodeParam->getTerminalId()); + return false; + } + if (!sourcePtNodeParams->hasBigrams()) { + // Update has bigrams flag. + return updatePtNodeFlags(sourcePtNodeParams->getHeadPos(), + sourcePtNodeParams->isBlacklisted(), sourcePtNodeParams->isNotAWord(), + sourcePtNodeParams->isTerminal(), sourcePtNodeParams->hasShortcutTargets(), + true /* hasBigrams */, + sourcePtNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::removeBigramEntry( + const PtNodeParams *const sourcePtNodeParams, const PtNodeParams *const targetPtNodeParam) { + return mBigramPolicy->removeEntry(sourcePtNodeParams->getTerminalId(), + targetPtNodeParam->getTerminalId()); +} + +bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries( + const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) { + return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries( + sourcePtNodeParams->getTerminalId(), outBigramEntryCount); +} + +bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const DictPositionRelocationMap *const dictPositionRelocationMap, + int *const outBigramEntryCount) { + int parentPos = toBeUpdatedPtNodeParams->getParentPos(); + if (parentPos != NOT_A_DICT_POS) { + PtNodeWriter::PtNodePositionRelocationMap::const_iterator it = + dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos); + if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) { + parentPos = it->second; + } + } + int writingPos = toBeUpdatedPtNodeParams->getHeadPos() + + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; + // Write updated parent offset. + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, + parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { + return false; + } + + // Updates children position. + int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos(); + if (childrenPos != NOT_A_DICT_POS) { + PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it = + dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos); + if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) { + childrenPos = it->second; + } + } + if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) { + return false; + } + + // Counts bigram entries. + if (outBigramEntryCount) { + *outBigramEntryCount = mBigramPolicy->getBigramEntryConut( + toBeUpdatedPtNodeParams->getTerminalId()); + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability) { + if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(), + targetCodePoints, targetCodePointCount, shortcutProbability)) { + AKLOGE("Cannot add new shortuct entry. terminalId: %d", ptNodeParams->getTerminalId()); + return false; + } + if (!ptNodeParams->hasShortcutTargets()) { + // Update has shortcut targets flag. + return updatePtNodeFlags(ptNodeParams->getHeadPos(), + ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(), + ptNodeParams->isTerminal(), true /* hasShortcutTargets */, + ptNodeParams->hasBigrams(), + ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeHasBigramsAndShortcutTargetsFlags( + const PtNodeParams *const ptNodeParams) { + const bool hasBigrams = mBuffers->getBigramDictContent()->getBigramListHeadPos( + ptNodeParams->getTerminalId()) != NOT_A_DICT_POS; + const bool hasShortcutTargets = mBuffers->getShortcutDictContent()->getShortcutListHeadPos( + ptNodeParams->getTerminalId()) != NOT_A_DICT_POS; + return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isBlacklisted(), + ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), hasShortcutTargets, + hasBigrams, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); +} + +bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( + const PtNodeParams *const ptNodeParams, int *const outTerminalId, + int *const ptNodeWritingPos) { + const int nodePos = *ptNodeWritingPos; + // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the + // PtNode writing. + if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, + 0 /* nodeFlags */, ptNodeWritingPos)) { + return false; + } + // Calculate a parent offset and write the offset. + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, + ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) { + return false; + } + // Write code points + if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer, + ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) { + return false; + } + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + if (!ptNodeParams->willBecomeNonTerminal()) { + if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) { + terminalId = ptNodeParams->getTerminalId(); + } else if (ptNodeParams->isTerminal()) { + // Write terminal information using a new terminal id. + // Get a new unused terminal id. + terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId(); + } + } + const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID; + if (isTerminal) { + // Update the lookup table. + if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( + terminalId, nodePos)) { + return false; + } + // Write terminal Id. + if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId, + Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) { + return false; + } + if (outTerminalId) { + *outTerminalId = terminalId; + } + } + // Write children position + if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, + ptNodeParams->getChildrenPos(), ptNodeWritingPos)) { + return false; + } + return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(), + isTerminal, ptNodeParams->hasShortcutTargets(), ptNodeParams->hasBigrams(), + ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); +} + +const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( + const ProbabilityEntry *const originalProbabilityEntry, const int newProbability, + const int timestamp) const { + // TODO: Consolidate historical info and probability. + if (mBuffers->getHeaderPolicy()->hasHistoricalInfoOfWords()) { + const HistoricalInfo updatedHistoricalInfo = + ForgettingCurveUtils::createUpdatedHistoricalInfo( + originalProbabilityEntry->getHistoricalInfo(), newProbability, timestamp); + return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo( + &updatedHistoricalInfo); + } else { + return originalProbabilityEntry->createEntryWithUpdatedProbability(newProbability); + } +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, + const bool isBlacklisted, const bool isNotAWord, const bool isTerminal, + const bool hasShortcutTargets, const bool hasBigrams, const bool hasMultipleChars) { + // Create node flags and write them. + PatriciaTrieReadingUtils::NodeFlags nodeFlags = + PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal, + hasShortcutTargets, hasBigrams, hasMultipleChars, + CHILDREN_POSITION_FIELD_SIZE); + if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) { + AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos); + return false; + } + return true; +} + +} diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h new file mode 100644 index 000000000..66845bbd6 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H +#define LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class Ver4BigramListPolicy; +class Ver4DictBuffers; +class Ver4PatriciaTrieNodeReader; +class Ver4PtNodeArrayReader; +class Ver4ShortcutListPolicy; + +/* + * This class is used for helping to writes nodes of ver4 patricia trie. + */ +class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { + public: + Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer, + Ver4DictBuffers *const buffers, const PtNodeReader *const ptNodeReader, + const PtNodeArrayReader *const ptNodeArrayReader, + Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy) + : mTrieBuffer(trieBuffer), mBuffers(buffers), + mReadingHelper(ptNodeReader, ptNodeArrayReader), mBigramPolicy(bigramPolicy), + mShortcutPolicy(shortcutPolicy) {} + + virtual ~Ver4PatriciaTrieNodeWriter() {} + + virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams); + + virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int movedPos, const int bigramLinkedNodePos); + + virtual bool markPtNodeAsWillBecomeNonTerminal( + const PtNodeParams *const toBeUpdatedPtNodeParams); + + virtual bool updatePtNodeProbability(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newProbability, const int timestamp); + + virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode); + + virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newChildrenPosition); + + bool updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newTerminalId); + + virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + int *const ptNodeWritingPos); + + virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + const int timestamp, int *const ptNodeWritingPos); + + virtual bool addNewBigramEntry(const PtNodeParams *const sourcePtNodeParams, + const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp, + bool *const outAddedNewBigram); + + virtual bool removeBigramEntry(const PtNodeParams *const sourcePtNodeParams, + const PtNodeParams *const targetPtNodeParam); + + virtual bool updateAllBigramEntriesAndDeleteUselessEntries( + const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount); + + virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams, + const DictPositionRelocationMap *const dictPositionRelocationMap, + int *const outBigramEntryCount); + + virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability); + + bool updatePtNodeHasBigramsAndShortcutTargetsFlags(const PtNodeParams *const ptNodeParams); + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter); + + bool writePtNodeAndGetTerminalIdAndAdvancePosition( + const PtNodeParams *const ptNodeParams, int *const outTerminalId, + int *const ptNodeWritingPos); + + // Create updated probability entry using given probability and timestamp. In addition to the + // probability, this method updates historical information if needed. + const ProbabilityEntry createUpdatedEntryFrom( + const ProbabilityEntry *const originalProbabilityEntry, const int newProbability, + const int timestamp) const; + + bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord, + const bool isTerminal, const bool hasShortcutTargets, const bool hasBigrams, + const bool hasMultipleChars); + + static const int CHILDREN_POSITION_FIELD_SIZE; + + BufferWithExtendableBuffer *const mTrieBuffer; + Ver4DictBuffers *const mBuffers; + DynamicPtReadingHelper mReadingHelper; + Ver4BigramListPolicy *const mBigramPolicy; + Ver4ShortcutListPolicy *const mShortcutPolicy; +}; +} // namespace latinime +#endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp new file mode 100644 index 000000000..b5d80be1d --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -0,0 +1,444 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h" + +#include <vector> + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/dictionary/word_property.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" + +namespace latinime { + +// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and +// BinaryDictionaryDecayingTests. +const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; +const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; +const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = + Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; + +void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); + while (!readingHelper.isEnd()) { + const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); + if (!ptNodeParams.isValid()) { + break; + } + bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); + if (isTerminal && mHeaderPolicy->isDecayingDict()) { + // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose + // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a + // valid terminal DicNode. + isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; + } + childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), + ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, + ptNodeParams.hasChildren(), + ptNodeParams.isBlacklisted() + || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, + ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints()); + readingHelper.readNextSiblingNode(ptNodeParams); + } + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + } +} + +int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( + const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodePos(ptNodePos); + const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount( + maxCodePointCount, outCodePoints, outUnigramProbability); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); + } + return codePointCount; +} + +int Ver4PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + const int ptNodePos = + readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + } + return ptNodePos; +} + +int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, + const int bigramProbability) const { + if (mHeaderPolicy->isDecayingDict()) { + // Both probabilities are encoded. Decode them and get probability. + return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability); + } else { + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (bigramProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } else { + // bigramProbability is a bigram probability delta. + return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, + bigramProbability); + } + } +} + +int Ver4PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_PROBABILITY; + } + const PtNodeParams ptNodeParams(mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { + return NOT_A_PROBABILITY; + } + return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); +} + +int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + const PtNodeParams ptNodeParams(mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted()) { + return NOT_A_DICT_POS; + } + return mBuffers.get()->getShortcutDictContent()->getShortcutListHeadPos( + ptNodeParams.getTerminalId()); +} + +int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + const PtNodeParams ptNodeParams(mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted()) { + return NOT_A_DICT_POS; + } + return mBuffers.get()->getBigramDictContent()->getBigramListHeadPos( + ptNodeParams.getTerminalId()); +} + +bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int length, + const int probability, const int *const shortcutTargetCodePoints, const int shortcutLength, + const int shortcutProbability, const bool isNotAWord, const bool isBlacklisted, + const int timestamp) { + if (!mBuffers.get()->isUpdatable()) { + AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (length > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert to the dictionary, length: %d", length); + return false; + } + if (shortcutLength > MAX_WORD_LENGTH) { + AKLOGE("The shortcutTarget is too long to insert to the dictionary, length: %d", + shortcutLength); + return false; + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + bool addedNewUnigram = false; + if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length, probability, isNotAWord, + isBlacklisted, timestamp, &addedNewUnigram)) { + if (addedNewUnigram) { + mUnigramCount++; + } + if (shortcutLength > 0) { + // Add shortcut target. + const int wordPos = getTerminalPtNodePositionOfWord(word, length, + false /* forceLowerCaseSearch */); + if (wordPos == NOT_A_DICT_POS) { + AKLOGE("Cannot find terminal PtNode position to add shortcut target."); + return false; + } + if (!mUpdatingHelper.addShortcutTarget(wordPos, shortcutTargetCodePoints, + shortcutLength, shortcutProbability)) { + AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, probability: %d", + wordPos, shortcutLength, shortcutProbability); + return false; + } + } + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int length0, + const int *const word1, const int length1, const int probability, + const int timestamp) { + if (!mBuffers.get()->isUpdatable()) { + AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (length0 > MAX_WORD_LENGTH || length1 > MAX_WORD_LENGTH) { + AKLOGE("Either src word or target word is too long to insert the bigram to the dictionary. " + "length0: %d, length1: %d", length0, length1); + return false; + } + const int word0Pos = getTerminalPtNodePositionOfWord(word0, length0, + false /* forceLowerCaseSearch */); + if (word0Pos == NOT_A_DICT_POS) { + return false; + } + const int word1Pos = getTerminalPtNodePositionOfWord(word1, length1, + false /* forceLowerCaseSearch */); + if (word1Pos == NOT_A_DICT_POS) { + return false; + } + bool addedNewBigram = false; + if (mUpdatingHelper.addBigramWords(word0Pos, word1Pos, probability, timestamp, + &addedNewBigram)) { + if (addedNewBigram) { + mBigramCount++; + } + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0, + const int *const word1, const int length1) { + if (!mBuffers.get()->isUpdatable()) { + AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (length0 > MAX_WORD_LENGTH || length1 > MAX_WORD_LENGTH) { + AKLOGE("Either src word or target word is too long to remove the bigram to from the " + "dictionary. length0: %d, length1: %d", length0, length1); + return false; + } + const int word0Pos = getTerminalPtNodePositionOfWord(word0, length0, + false /* forceLowerCaseSearch */); + if (word0Pos == NOT_A_DICT_POS) { + return false; + } + const int word1Pos = getTerminalPtNodePositionOfWord(word1, length1, + false /* forceLowerCaseSearch */); + if (word1Pos == NOT_A_DICT_POS) { + return false; + } + if (mUpdatingHelper.removeBigramWords(word0Pos, word1Pos)) { + mBigramCount--; + return true; + } else { + return false; + } +} + +void Ver4PatriciaTriePolicy::flush(const char *const filePath) { + if (!mBuffers.get()->isUpdatable()) { + AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); + return; + } + if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { + AKLOGE("Cannot flush the dictionary to file."); + mIsCorrupted = true; + } +} + +void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { + if (!mBuffers.get()->isUpdatable()) { + AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return; + } + if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { + AKLOGE("Cannot flush the dictionary to file with GC."); + mIsCorrupted = true; + } +} + +bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { + if (!mBuffers.get()->isUpdatable()) { + AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); + return false; + } + if (mBuffers.get()->isNearSizeLimit()) { + // Additional buffer size is near the limit. + return true; + } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() + > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { + // Total extended region size of the trie exceeds the limit. + return true; + } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS + && mDictBuffer->getUsedAdditionalBufferSize() > 0) { + // Needs to reduce dictionary size. + return true; + } else if (mHeaderPolicy->isDecayingDict()) { + return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount, + mHeaderPolicy); + } + return false; +} + +void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, + char *const outResult, const int maxResultLength) { + const int compareLength = queryLength + 1 /* terminator */; + if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", mUnigramCount); + } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", mBigramCount); + } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT : + static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); + } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT : + static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); + } +} + +const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const codePoints, + const int codePointCount) const { + const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount, + false /* forceLowerCaseSearch */); + if (ptNodePos == NOT_A_DICT_POS) { + AKLOGE("getWordProperty is called for invalid word."); + return WordProperty(); + } + const PtNodeParams ptNodeParams = mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + std::vector<int> codePointVector(ptNodeParams.getCodePoints(), + ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); + const ProbabilityEntry probabilityEntry = + mBuffers.get()->getProbabilityDictContent()->getProbabilityEntry( + ptNodeParams.getTerminalId()); + const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); + // Fetch bigram information. + std::vector<WordProperty::BigramProperty> bigrams; + const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); + if (bigramListPos != NOT_A_DICT_POS) { + int bigramWord1CodePoints[MAX_WORD_LENGTH]; + const BigramDictContent *const bigramDictContent = mBuffers.get()->getBigramDictContent(); + const TerminalPositionLookupTable *const terminalPositionLookupTable = + mBuffers.get()->getTerminalPositionLookupTable(); + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const BigramEntry bigramEntry = + bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + const int word1TerminalId = bigramEntry.getTargetTerminalId(); + const int word1TerminalPtNodePos = + terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId); + if (word1TerminalPtNodePos == NOT_A_DICT_POS) { + continue; + } + // Word (unigram) probability + int word1Probability = NOT_A_PROBABILITY; + const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( + word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints, + &word1Probability); + std::vector<int> word1(bigramWord1CodePoints, + bigramWord1CodePoints + codePointCount); + const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); + const int probability = bigramEntry.hasHistoricalInfo() ? + ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo()) : + bigramEntry.getProbability(); + bigrams.push_back(WordProperty::BigramProperty(&word1, probability, + historicalInfo->getTimeStamp(), historicalInfo->getLevel(), + historicalInfo->getCount())); + } + } + // Fetch shortcut information. + std::vector<WordProperty::ShortcutProperty> shortcuts; + int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); + if (shortcutPos != NOT_A_DICT_POS) { + int shortcutTarget[MAX_WORD_LENGTH]; + const ShortcutDictContent *const shortcutDictContent = + mBuffers.get()->getShortcutDictContent(); + bool hasNext = true; + while (hasNext) { + int shortcutTargetLength = 0; + int shortcutProbability = NOT_A_PROBABILITY; + shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, + &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); + std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength); + shortcuts.push_back(WordProperty::ShortcutProperty(&target, shortcutProbability)); + } + } + return WordProperty(&codePointVector, ptNodeParams.isNotAWord(), + ptNodeParams.isBlacklisted(), ptNodeParams.hasBigrams(), + ptNodeParams.hasShortcutTargets(), ptNodeParams.getProbability(), + historicalInfo->getTimeStamp(), historicalInfo->getLevel(), + historicalInfo->getCount(), &bigrams, &shortcuts); +} + +int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { + if (token == 0) { + mTerminalPtNodePositionsForIteratingWords.clear(); + DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( + &mTerminalPtNodePositionsForIteratingWords); + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); + } + const int terminalPtNodePositionsVectorSize = + static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); + if (token < 0 || token >= terminalPtNodePositionsVectorSize) { + AKLOGE("Given token %d is invalid.", token); + return 0; + } + const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; + int unigramProbability = NOT_A_PROBABILITY; + getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH, + outCodePoints, &unigramProbability); + const int nextToken = token + 1; + if (nextToken >= terminalPtNodePositionsVectorSize) { + // All words have been iterated. + mTerminalPtNodePositionsForIteratingWords.clear(); + return 0; + } + return nextToken; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h new file mode 100644 index 000000000..7796e2ddc --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H +#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H + +#include <vector> + +#include "defines.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +class DicNode; +class DicNodeVector; + +// TODO: Implement. +class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { + public: + Ver4PatriciaTriePolicy(const Ver4DictBuffers::Ver4DictBuffersPtr &buffers) + : mBuffers(buffers), mHeaderPolicy(mBuffers.get()->getHeaderPolicy()), + mDictBuffer(mBuffers.get()->getWritableTrieBuffer()), + mBigramPolicy(mBuffers.get()->getMutableBigramDictContent(), + mBuffers.get()->getTerminalPositionLookupTable(), mHeaderPolicy), + mShortcutPolicy(mBuffers.get()->getMutableShortcutDictContent(), + mBuffers.get()->getTerminalPositionLookupTable()), + mNodeReader(mDictBuffer, mBuffers.get()->getProbabilityDictContent()), + mPtNodeArrayReader(mDictBuffer), + mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mPtNodeArrayReader, + &mBigramPolicy, &mShortcutPolicy), + mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), + mWritingHelper(mBuffers.get()), + mUnigramCount(mHeaderPolicy->getUnigramCount()), + mBigramCount(mHeaderPolicy->getBigramCount()), + mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; + + AK_FORCE_INLINE int getRootPosition() const { + return 0; + } + + void createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const; + + int getCodePointsAndProbabilityAndReturnCodePointCount( + const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const; + + int getTerminalPtNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const; + + int getProbability(const int unigramProbability, const int bigramProbability) const; + + int getUnigramProbabilityOfPtNode(const int ptNodePos) const; + + int getShortcutPositionOfPtNode(const int ptNodePos) const; + + int getBigramsPositionOfPtNode(const int ptNodePos) const; + + const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { + return mHeaderPolicy; + } + + const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const { + return &mBigramPolicy; + } + + const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const { + return &mShortcutPolicy; + } + + bool addUnigramWord(const int *const word, const int length, const int probability, + const int *const shortcutTargetCodePoints, const int shortcutLength, + const int shortcutProbability, const bool isNotAWord, const bool isBlacklisted, + const int timestamp); + + bool addBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1, const int probability, const int timestamp); + + bool removeBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1); + + void flush(const char *const filePath); + + void flushWithGC(const char *const filePath); + + bool needsToRunGC(const bool mindsBlockByGC) const; + + void getProperty(const char *const query, const int queryLength, char *const outResult, + const int maxResultLength); + + const WordProperty getWordProperty(const int *const codePoints, + const int codePointCount) const; + + int getNextWordAndNextToken(const int token, int *const outCodePoints); + + bool isCorrupted() const { + return mIsCorrupted; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy); + + static const char *const UNIGRAM_COUNT_QUERY; + static const char *const BIGRAM_COUNT_QUERY; + static const char *const MAX_UNIGRAM_COUNT_QUERY; + static const char *const MAX_BIGRAM_COUNT_QUERY; + // When the dictionary size is near the maximum size, we have to refuse dynamic operations to + // prevent the dictionary from overflowing. + static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; + static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; + + Ver4DictBuffers::Ver4DictBuffersPtr mBuffers; + const HeaderPolicy *const mHeaderPolicy; + BufferWithExtendableBuffer *const mDictBuffer; + Ver4BigramListPolicy mBigramPolicy; + Ver4ShortcutListPolicy mShortcutPolicy; + Ver4PatriciaTrieNodeReader mNodeReader; + Ver4PtNodeArrayReader mPtNodeArrayReader; + Ver4PatriciaTrieNodeWriter mNodeWriter; + DynamicPtUpdatingHelper mUpdatingHelper; + Ver4PatriciaTrieWritingHelper mWritingHelper; + int mUnigramCount; + int mBigramCount; + std::vector<int> mTerminalPtNodePositionsForIteratingWords; + mutable bool mIsCorrupted; +}; +} // namespace latinime +#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp new file mode 100644 index 000000000..254022db4 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" + +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +/* static */ int Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition( + const uint8_t *const buffer, int *pos) { + return ByteArrayUtils::readUint32AndAdvancePosition(buffer, pos); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h new file mode 100644 index 000000000..e418c4933 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H +#define LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class Ver4PatriciaTrieReadingUtils { + public: + static int getTerminalIdAndAdvancePosition(const uint8_t *const buffer, + int *const pos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieReadingUtils); +}; +} // namespace latinime +#endif /* LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp new file mode 100644 index 000000000..93053c38d --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp @@ -0,0 +1,287 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" + +#include <cstring> +#include <queue> + +#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/file_utils.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { + +bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, + const int unigramCount, const int bigramCount) const { + const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); + BufferWithExtendableBuffer headerBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); + if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, + unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) { + AKLOGE("Cannot write header structure to buffer. " + "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " + "extendedRegionSize: %d", false, unigramCount, bigramCount, + extendedRegionSize); + return false; + } + return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); +} + +bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, + const char *const dictDirPath) { + const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); + Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( + Ver4DictBuffers::createVer4DictBuffers(headerPolicy)); + int unigramCount = 0; + int bigramCount = 0; + if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) { + return false; + } + BufferWithExtendableBuffer headerBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, + unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) { + return false; + } + return dictBuffers.get()->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); +} + +bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, + const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, + int *const outUnigramCount, int *const outBigramCount) { + Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(), + mBuffers->getProbabilityDictContent()); + Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); + Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(), + mBuffers->getTerminalPositionLookupTable(), headerPolicy); + Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), + mBuffers->getTerminalPositionLookupTable()); + Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), + mBuffers, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, &shortcutPolicy); + + DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + &ptNodeWriter); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { + return false; + } + const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + .getValidUnigramCount(); + if (headerPolicy->isDecayingDict() + && unigramCount > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) { + if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, + ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC)) { + AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, + ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC); + return false; + } + } + + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability + traversePolicyToUpdateBigramProbability(&ptNodeWriter); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateBigramProbability)) { + return false; + } + const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); + if (headerPolicy->isDecayingDict() + && bigramCount > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) { + if (!truncateBigrams(ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC)) { + AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, + ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC); + return false; + } + } + + // Mapping from positions in mBuffer to positions in bufferToWrite. + PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), + buffersToWrite, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, &shortcutPolicy); + DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, + buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); + if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { + return false; + } + + // Create policy instances for the GCed dictionary. + Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(), + buffersToWrite->getProbabilityDictContent()); + Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); + Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(), + buffersToWrite->getTerminalPositionLookupTable(), headerPolicy); + Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), + buffersToWrite->getTerminalPositionLookupTable()); + Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), + buffersToWrite, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy, + &newShortcutPolicy); + // Re-assign terminal IDs for valid terminal PtNodes. + TerminalPositionLookupTable::TerminalIdMap terminalIdMap; + if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds( + &terminalIdMap)) { + return false; + } + // Run GC for probability dict content. + if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap, + mBuffers->getProbabilityDictContent())) { + return false; + } + // Run GC for bigram dict content. + if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap, + mBuffers->getBigramDictContent(), outBigramCount)) { + return false; + } + // Run GC for shortcut dict content. + if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap, + mBuffers->getShortcutDictContent())) { + return false; + } + DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader); + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields + traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap); + if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToUpdateAllPositionFields)) { + return false; + } + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap); + if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) { + return false; + } + *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount(); + return true; +} + +bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( + const Ver4PatriciaTrieNodeReader *const ptNodeReader, + Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) { + const TerminalPositionLookupTable *const terminalPosLookupTable = + mBuffers->getTerminalPositionLookupTable(); + const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); + std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator> + priorityQueue; + for (int i = 0; i < nextTerminalId; ++i) { + const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i); + if (terminalPos == NOT_A_DICT_POS) { + continue; + } + const ProbabilityEntry probabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry(i); + const int probability = probabilityEntry.hasHistoricalInfo() ? + ForgettingCurveUtils::decodeProbability(probabilityEntry.getHistoricalInfo()) : + probabilityEntry.getProbability(); + priorityQueue.push(DictProbability(terminalPos, probability, + probabilityEntry.getHistoricalInfo()->getTimeStamp())); + } + + // Delete unigrams. + while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) { + const int ptNodePos = priorityQueue.top().getDictPos(); + const PtNodeParams ptNodeParams = + ptNodeReader->fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) { + AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos); + return false; + } + priorityQueue.pop(); + } + return true; +} + +bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) { + const TerminalPositionLookupTable *const terminalPosLookupTable = + mBuffers->getTerminalPositionLookupTable(); + const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); + std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator> + priorityQueue; + BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent(); + for (int i = 0; i < nextTerminalId; ++i) { + const int bigramListPos = bigramDictContent->getBigramListHeadPos(i); + if (bigramListPos == NOT_A_DICT_POS) { + continue; + } + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const int entryPos = readingPos; + const BigramEntry bigramEntry = + bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (!bigramEntry.isValid()) { + continue; + } + const int probability = bigramEntry.hasHistoricalInfo() ? + ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo()) : + bigramEntry.getProbability(); + priorityQueue.push(DictProbability(entryPos, probability, + bigramEntry.getHistoricalInfo()->getTimeStamp())); + } + } + + // Delete bigrams. + while (static_cast<int>(priorityQueue.size()) > maxBigramCount) { + const int entryPos = priorityQueue.top().getDictPos(); + const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos); + const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry(); + if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) { + AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos); + return false; + } + priorityQueue.pop(); + } + return true; +} + +bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + if (!ptNodeParams->isTerminal()) { + return true; + } + TerminalPositionLookupTable::TerminalIdMap::const_iterator it = + mTerminalIdMap->find(ptNodeParams->getTerminalId()); + if (it == mTerminalIdMap->end()) { + AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", + ptNodeParams->getTerminalId(), mTerminalIdMap->size()); + return false; + } + if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { + AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second); + } + return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h new file mode 100644 index 000000000..bb464ad28 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H +#define LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" + +namespace latinime { + +class HeaderPolicy; +class Ver4DictBuffers; +class Ver4PatriciaTrieNodeReader; +class Ver4PatriciaTrieNodeWriter; + +class Ver4PatriciaTrieWritingHelper { + public: + Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers) + : mBuffers(buffers) {} + + bool writeToDictFile(const char *const dictDirPath, const int unigramCount, + const int bigramCount) const; + + // This method cannot be const because the original dictionary buffer will be updated to detect + // useless PtNodes during GC. + bool writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const dictDirPath); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper); + + class TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + : public DynamicPtReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds( + Ver4PatriciaTrieNodeWriter *const ptNodeWriter, + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap) + : mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {} + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos) { return true; } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds); + + Ver4PatriciaTrieNodeWriter *const mPtNodeWriter; + const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap; + }; + + // For truncateUnigrams() and truncateBigrams(). + class DictProbability { + public: + DictProbability(const int dictPos, const int probability, const int timestamp) + : mDictPos(dictPos), mProbability(probability), mTimestamp(timestamp) {} + + int getDictPos() const { + return mDictPos; + } + + int getProbability() const { + return mProbability; + } + + int getTimestamp() const { + return mTimestamp; + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(DictProbability); + + int mDictPos; + int mProbability; + int mTimestamp; + }; + + // For truncateUnigrams() and truncateBigrams(). + class DictProbabilityComparator { + public: + bool operator()(const DictProbability &left, const DictProbability &right) { + if (left.getProbability() != right.getProbability()) { + return left.getProbability() > right.getProbability(); + } + if (left.getTimestamp() != right.getTimestamp()) { + return left.getTimestamp() < right.getTimestamp(); + } + return left.getDictPos() > right.getDictPos(); + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(DictProbabilityComparator); + }; + + bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, + Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount, + int *const outBigramCount); + + bool truncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader, + Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount); + + bool truncateBigrams(const int maxBigramCount); + + Ver4DictBuffers *const mBuffers; +}; +} // namespace latinime + +#endif /* LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp new file mode 100644 index 000000000..bbdf40cdd --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h" + +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +bool Ver4PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const { + if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mBuffer->getTailPosition()) { + // Reading invalid position because of a bug or a broken dictionary. + AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d", + ptNodeArrayPos, mBuffer->getTailPosition()); + ASSERT(false); + return false; + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodeArrayPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int readingPos = ptNodeArrayPos; + if (usesAdditionalBuffer) { + readingPos -= mBuffer->getOriginalBufferSize(); + } + const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + dictBuf, &readingPos); + if (usesAdditionalBuffer) { + readingPos += mBuffer->getOriginalBufferSize(); + } + if (ptNodeCountInArray < 0) { + AKLOGE("Invalid PtNode count in an array: %d.", ptNodeCountInArray); + return false; + } + *outPtNodeCount = ptNodeCountInArray; + *outFirstPtNodePos = readingPos; + return true; +} + +bool Ver4PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const { + if (forwordLinkPos < 0 || forwordLinkPos >= mBuffer->getTailPosition()) { + // Reading invalid position because of bug or broken dictionary. + AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d", + forwordLinkPos, mBuffer->getTailPosition()); + ASSERT(false); + return false; + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(forwordLinkPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int readingPos = forwordLinkPos; + if (usesAdditionalBuffer) { + readingPos -= mBuffer->getOriginalBufferSize(); + } + const int nextPtNodeArrayOffset = + DynamicPtReadingUtils::getForwardLinkPosition(dictBuf, readingPos); + if (DynamicPtReadingUtils::isValidForwardLinkPosition(nextPtNodeArrayOffset)) { + *outNextPtNodeArrayPos = forwordLinkPos + nextPtNodeArrayOffset; + } else { + *outNextPtNodeArrayPos = NOT_A_DICT_POS; + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h new file mode 100644 index 000000000..d81808efc --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_PT_NODE_ARRAY_READER_H +#define LATINIME_VER4_PT_NODE_ARRAY_READER_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class Ver4PtNodeArrayReader : public PtNodeArrayReader { + public: + Ver4PtNodeArrayReader(const BufferWithExtendableBuffer *const buffer) : mBuffer(buffer) {}; + + virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const; + virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const; + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PtNodeArrayReader); + + const BufferWithExtendableBuffer *const mBuffer; +}; +} // namespace latinime +#endif /* LATINIME_VER4_PT_NODE_ARRAY_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp index f692882f2..259dae4c6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp @@ -18,11 +18,42 @@ namespace latinime { -const size_t BufferWithExtendableBuffer::MAX_ADDITIONAL_BUFFER_SIZE = 1024 * 1024; +const size_t BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE = 1024 * 1024; const int BufferWithExtendableBuffer::NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE = 90; // TODO: Needs to allocate larger memory corresponding to the current vector size. const size_t BufferWithExtendableBuffer::EXTEND_ADDITIONAL_BUFFER_SIZE_STEP = 128 * 1024; +uint32_t BufferWithExtendableBuffer::readUint(const int size, const int pos) const { + const bool readingPosIsInAdditionalBuffer = isInAdditionalBuffer(pos); + const int posInBuffer = readingPosIsInAdditionalBuffer ? pos - mOriginalBufferSize : pos; + return ByteArrayUtils::readUint(getBuffer(readingPosIsInAdditionalBuffer), size, posInBuffer); +} + +uint32_t BufferWithExtendableBuffer::readUintAndAdvancePosition(const int size, + int *const pos) const { + const int value = readUint(size, *pos); + *pos += size; + return value; +} + +void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoints, int *outCodePointCount, int *const pos) const { + const bool readingPosIsInAdditionalBuffer = isInAdditionalBuffer(*pos); + if (readingPosIsInAdditionalBuffer) { + *pos -= mOriginalBufferSize; + } + *outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition( + getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos); + if (readingPosIsInAdditionalBuffer) { + *pos += mOriginalBufferSize; + } +} + +bool BufferWithExtendableBuffer::writeUint(const uint32_t data, const int size, const int pos) { + int writingPos = pos; + return writeUintAndAdvancePosition(data, size, &writingPos); +} + bool BufferWithExtendableBuffer::writeUintAndAdvancePosition(const uint32_t data, const int size, int *const pos) { if (!(size >= 1 && size <= 4)) { @@ -46,7 +77,7 @@ bool BufferWithExtendableBuffer::writeUintAndAdvancePosition(const uint32_t data } bool BufferWithExtendableBuffer::writeCodePointsAndAdvancePosition(const int *const codePoints, - const int codePointCount, const bool writesTerminator ,int *const pos) { + const int codePointCount, const bool writesTerminator, int *const pos) { const size_t size = ByteArrayUtils::calculateRequiredByteCountToStoreCodePoints( codePoints, codePointCount, writesTerminator); if (!checkAndPrepareWriting(*pos, size)) { @@ -100,4 +131,21 @@ bool BufferWithExtendableBuffer::checkAndPrepareWriting(const int pos, const int return true; } +bool BufferWithExtendableBuffer::copy(const BufferWithExtendableBuffer *const sourceBuffer) { + int copyingPos = 0; + const int tailPos = sourceBuffer->getTailPosition(); + const int maxDataChunkSize = sizeof(uint32_t); + while (copyingPos < tailPos) { + const int remainingSize = tailPos - copyingPos; + const int copyingSize = (remainingSize >= maxDataChunkSize) ? + maxDataChunkSize : remainingSize; + const uint32_t data = sourceBuffer->readUint(copyingSize, copyingPos); + if (!writeUint(data, copyingSize, copyingPos)) { + return false; + } + copyingPos += copyingSize; + } + return true; +} + } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h index 9dc34823c..76be16518 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h @@ -32,12 +32,20 @@ namespace latinime { // raw pointer but provides several methods that handle boundary checking for writing data. class BufferWithExtendableBuffer { public: + static const size_t DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE; + BufferWithExtendableBuffer(uint8_t *const originalBuffer, const int originalBufferSize, - const int maxAdditionalBufferSize = MAX_ADDITIONAL_BUFFER_SIZE) + const int maxAdditionalBufferSize) : mOriginalBuffer(originalBuffer), mOriginalBufferSize(originalBufferSize), mAdditionalBuffer(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP), mUsedAdditionalBufferSize(0), mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} + // Without original buffer. + BufferWithExtendableBuffer(const int maxAdditionalBufferSize) + : mOriginalBuffer(0), mOriginalBufferSize(0), + mAdditionalBuffer(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP), mUsedAdditionalBufferSize(0), + mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} + AK_FORCE_INLINE int getTailPosition() const { return mOriginalBufferSize + mUsedAdditionalBufferSize; } @@ -63,6 +71,13 @@ class BufferWithExtendableBuffer { } } + uint32_t readUint(const int size, const int pos) const; + + uint32_t readUintAndAdvancePosition(const int size, int *const pos) const; + + void readCodePointsAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoints, int *outCodePointCount, int *const pos) const; + AK_FORCE_INLINE int getOriginalBufferSize() const { return mOriginalBufferSize; } @@ -78,15 +93,18 @@ class BufferWithExtendableBuffer { * Writing is allowed for original buffer, already written region of additional buffer and the * tail of additional buffer. */ + bool writeUint(const uint32_t data, const int size, const int pos); + bool writeUintAndAdvancePosition(const uint32_t data, const int size, int *const pos); bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount, const bool writesTerminator, int *const pos); + bool copy(const BufferWithExtendableBuffer *const sourceBuffer); + private: DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer); - static const size_t MAX_ADDITIONAL_BUFFER_SIZE; static const int NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE; static const size_t EXTEND_ADDITIONAL_BUFFER_SIZE_STEP; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h index 0c1576818..ebdd523e1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h @@ -114,6 +114,24 @@ class ByteArrayUtils { return buffer[(*pos)++]; } + static AK_FORCE_INLINE int readUint(const uint8_t *const buffer, + const int size, const int pos) { + // size must be in 1 to 4. + ASSERT(size >= 1 && size <= 4); + switch (size) { + case 1: + return ByteArrayUtils::readUint8(buffer, pos); + case 2: + return ByteArrayUtils::readUint16(buffer, pos); + case 3: + return ByteArrayUtils::readUint24(buffer, pos); + case 4: + return ByteArrayUtils::readUint32(buffer, pos); + default: + return 0; + } + } + /** * Code Point Reading * diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp index 994826fa8..faef72079 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp @@ -17,73 +17,98 @@ #include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" #include <cstdio> -#include <cstring> #include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/file_utils.h" #include "suggest/policyimpl/dictionary/utils/format_utils.h" +#include "utils/time_keeper.h" namespace latinime { const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = ".tmp"; /* static */ bool DictFileWritingUtils::createEmptyDictFile(const char *const filePath, - const int dictVersion, const HeaderReadWriteUtils::AttributeMap *const attributeMap) { + const int dictVersion, const std::vector<int> localeAsCodePointVector, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { + TimeKeeper::setCurrentTime(); switch (dictVersion) { - case 3: - return createEmptyV3DictFile(filePath, attributeMap); + case FormatUtils::VERSION_4: + return createEmptyV4DictFile(filePath, localeAsCodePointVector, attributeMap); default: - // Only version 3 dictionary is supported for now. + AKLOGE("Cannot create dictionary %s because format version %d is not supported.", + filePath, dictVersion); return false; } } -/* static */ bool DictFileWritingUtils::createEmptyV3DictFile(const char *const filePath, - const HeaderReadWriteUtils::AttributeMap *const attributeMap) { - BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); - HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap); - headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */, - true /* updatesLastDecayedTime */, 0 /* unigramCount */, 0 /* bigramCount */, - 0 /* extendedRegionSize */); - BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); - if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) { +/* static */ bool DictFileWritingUtils::createEmptyV4DictFile(const char *const dirPath, + const std::vector<int> localeAsCodePointVector, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { + HeaderPolicy headerPolicy(FormatUtils::VERSION_4, localeAsCodePointVector, attributeMap); + Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers = + Ver4DictBuffers::createVer4DictBuffers(&headerPolicy); + headerPolicy.fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, + 0 /* unigramCount */, 0 /* bigramCount */, + 0 /* extendedRegionSize */, dictBuffers.get()->getWritableHeaderBuffer()); + if (!DynamicPtWritingUtils::writeEmptyDictionary( + dictBuffers.get()->getWritableTrieBuffer(), 0 /* rootPos */)) { + AKLOGE("Empty ver4 dictionary structure cannot be created on memory."); return false; } - return flushAllHeaderAndBodyToFile(filePath, &headerBuffer, &bodyBuffer); + return dictBuffers.get()->flush(dirPath); } /* static */ bool DictFileWritingUtils::flushAllHeaderAndBodyToFile(const char *const filePath, BufferWithExtendableBuffer *const dictHeader, BufferWithExtendableBuffer *const dictBody) { - const int tmpFileNameBufSize = strlen(filePath) - + strlen(TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE) + 1 /* terminator */; + const int tmpFileNameBufSize = FileUtils::getFilePathWithSuffixBufSize(filePath, + TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE); // Name of a temporary file used for writing that is a connected string of original name and // TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE. char tmpFileName[tmpFileNameBufSize]; - snprintf(tmpFileName, tmpFileNameBufSize, "%s%s", filePath, - TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE); - FILE *const file = fopen(tmpFileName, "wb"); - if (!file) { - AKLOGE("Dictionary file %s cannnot be opened.", tmpFileName); - ASSERT(false); + FileUtils::getFilePathWithSuffix(filePath, TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE, + tmpFileNameBufSize, tmpFileName); + if (!DictFileWritingUtils::flushBufferToFile(tmpFileName, dictHeader)) { + AKLOGE("Dictionary header cannot be written to %s.", tmpFileName); + return false; + } + if (!DictFileWritingUtils::flushBufferToFile(tmpFileName, dictBody)) { + AKLOGE("Dictionary structure cannot be written to %s.", tmpFileName); + return false; + } + if (rename(tmpFileName, filePath) != 0) { + AKLOGE("Dictionary file %s cannot be renamed to %s", tmpFileName, filePath);; return false; } - // Write the dictionary header. - if (!writeBufferToFile(file, dictHeader)) { - remove(tmpFileName); - AKLOGE("Dictionary header cannnot be written. size: %d", dictHeader->getTailPosition()); + return true; +} + +/* static */ bool DictFileWritingUtils::flushBufferToFileWithSuffix(const char *const basePath, + const char *const suffix, const BufferWithExtendableBuffer *const buffer) { + const int filePathBufSize = FileUtils::getFilePathWithSuffixBufSize(basePath, suffix); + char filePath[filePathBufSize]; + FileUtils::getFilePathWithSuffix(basePath, suffix, filePathBufSize, filePath); + return flushBufferToFile(filePath, buffer); +} + +/* static */ bool DictFileWritingUtils::flushBufferToFile(const char *const filePath, + const BufferWithExtendableBuffer *const buffer) { + FILE *const file = fopen(filePath, "wb"); + if (!file) { + AKLOGE("File %s cannot be opened.", filePath); ASSERT(false); return false; } - // Write the dictionary body. - if (!writeBufferToFile(file, dictBody)) { - remove(tmpFileName); - AKLOGE("Dictionary body cannnot be written. size: %d", dictBody->getTailPosition()); + if (!writeBufferToFile(file, buffer)) { + remove(filePath); + AKLOGE("Buffer cannot be written to the file %s. size: %d", filePath, + buffer->getTailPosition()); ASSERT(false); return false; } fclose(file); - rename(tmpFileName, filePath); return true; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h index bd4ac66fd..54ec651f7 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h @@ -28,20 +28,28 @@ class BufferWithExtendableBuffer; class DictFileWritingUtils { public: + static const char *const TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE; + static bool createEmptyDictFile(const char *const filePath, const int dictVersion, - const HeaderReadWriteUtils::AttributeMap *const attributeMap); + const std::vector<int> localeAsCodePointVector, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); static bool flushAllHeaderAndBodyToFile(const char *const filePath, BufferWithExtendableBuffer *const dictHeader, BufferWithExtendableBuffer *const dictBody); + static bool flushBufferToFileWithSuffix(const char *const basePath, const char *const suffix, + const BufferWithExtendableBuffer *const buffer); + private: DISALLOW_IMPLICIT_CONSTRUCTORS(DictFileWritingUtils); - static const char *const TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE; + static bool createEmptyV4DictFile(const char *const filePath, + const std::vector<int> localeAsCodePointVector, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); - static bool createEmptyV3DictFile(const char *const filePath, - const HeaderReadWriteUtils::AttributeMap *const attributeMap); + static bool flushBufferToFile(const char *const filePath, + const BufferWithExtendableBuffer *const buffer); static bool writeBufferToFile(FILE *const file, const BufferWithExtendableBuffer *const buffer); diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp new file mode 100644 index 000000000..9441a75fc --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/utils/file_utils.h" + +#include <cstdio> +#include <cstring> +#include <dirent.h> +#include <fcntl.h> +#include <libgen.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +namespace latinime { + +// Returns -1 on error. +/* static */ int FileUtils::getFileSize(const char *const filePath) { + const int fd = open(filePath, O_RDONLY); + if (fd == -1) { + return -1; + } + struct stat statBuf; + if (fstat(fd, &statBuf) != 0) { + close(fd); + return -1; + } + close(fd); + return static_cast<int>(statBuf.st_size); +} + +/* static */ bool FileUtils::existsDir(const char *const dirPath) { + DIR *const dir = opendir(dirPath); + if (dir == NULL) { + return false; + } + closedir(dir); + return true; +} + +// Remove a directory and all files in the directory. +/* static */ bool FileUtils::removeDirAndFiles(const char *const dirPath) { + return removeDirAndFiles(dirPath, 5 /* maxTries */); +} + +// Remove a directory and all files in the directory, trying up to maxTimes. +/* static */ bool FileUtils::removeDirAndFiles(const char *const dirPath, const int maxTries) { + DIR *const dir = opendir(dirPath); + if (dir == NULL) { + AKLOGE("Cannot open dir %s.", dirPath); + return true; + } + struct dirent *dirent; + while ((dirent = readdir(dir)) != NULL) { + if (dirent->d_type == DT_DIR) { + continue; + } + const int filePathBufSize = getFilePathBufSize(dirPath, dirent->d_name); + char filePath[filePathBufSize]; + getFilePath(dirPath, dirent->d_name, filePathBufSize, filePath); + if (remove(filePath) != 0) { + AKLOGE("Cannot remove file %s.", filePath); + closedir(dir); + return false; + } + } + closedir(dir); + if (remove(dirPath) != 0) { + if (maxTries > 0) { + // On NFS, deleting files sometimes creates new files. I'm not sure what the + // correct way of dealing with this is, but for the time being, this seems to work. + removeDirAndFiles(dirPath, maxTries - 1); + } else { + AKLOGE("Cannot remove directory %s.", dirPath); + return false; + } + } + return true; +} + +/* static */ int FileUtils::getFilePathWithSuffixBufSize(const char *const filePath, + const char *const suffix) { + return strlen(filePath) + strlen(suffix) + 1 /* terminator */; +} + +/* static */ void FileUtils::getFilePathWithSuffix(const char *const filePath, + const char *const suffix, const int filePathBufSize, char *const outFilePath) { + snprintf(outFilePath, filePathBufSize, "%s%s", filePath, suffix); +} + +/* static */ int FileUtils::getFilePathBufSize(const char *const dirPath, + const char *const fileName) { + return strlen(dirPath) + 1 /* '/' */ + strlen(fileName) + 1 /* terminator */; +} + +/* static */ void FileUtils::getFilePath(const char *const dirPath, const char *const fileName, + const int filePathBufSize, char *const outFilePath) { + snprintf(outFilePath, filePathBufSize, "%s/%s", dirPath, fileName); +} + +/* static */ bool FileUtils::getFilePathWithoutSuffix(const char *const filePath, + const char *const suffix, const int outDirPathBufSize, char *const outDirPath) { + const int filePathLength = strlen(filePath); + const int suffixLength = strlen(suffix); + if (filePathLength <= suffixLength) { + AKLOGE("File path length (%s:%d) is shorter that suffix length (%s:%d).", + filePath, filePathLength, suffix, suffixLength); + return false; + } + const int resultFilePathLength = filePathLength - suffixLength; + if (outDirPathBufSize <= resultFilePathLength) { + AKLOGE("outDirPathBufSize is too small. filePath: %s, suffix: %s, outDirPathBufSize: %d", + filePath, suffix, outDirPathBufSize); + return false; + } + if (strncmp(filePath + resultFilePathLength, suffix, suffixLength) != 0) { + AKLOGE("File Path %s does not have %s as a suffix", filePath, suffix); + return false; + } + snprintf(outDirPath, resultFilePathLength + 1 /* terminator */, "%s", filePath); + return true; +} + +/* static */ void FileUtils::getDirPath(const char *const filePath, const int outDirPathBufSize, + char *const outDirPath) { + for (int i = strlen(filePath) - 1; i >= 0; --i) { + if (filePath[i] == '/') { + if (i >= outDirPathBufSize) { + AKLOGE("outDirPathBufSize is too small. filePath: %s, outDirPathBufSize: %d", + filePath, outDirPathBufSize); + ASSERT(false); + return; + } + snprintf(outDirPath, i + 1 /* terminator */, "%s", filePath); + return; + } + } +} + +/* static */ void FileUtils::getBasename(const char *const filePath, + const int outNameBufSize, char *const outName) { + const int filePathBufSize = strlen(filePath) + 1 /* terminator */; + char filePathBuf[filePathBufSize]; + snprintf(filePathBuf, filePathBufSize, "%s", filePath); + const char *const baseName = basename(filePathBuf); + const int baseNameLength = strlen(baseName); + if (baseNameLength >= outNameBufSize) { + AKLOGE("outNameBufSize is too small. filePath: %s, outNameBufSize: %d", + filePath, outNameBufSize); + return; + } + snprintf(outName, baseNameLength + 1 /* terminator */, "%s", baseName); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h new file mode 100644 index 000000000..4f1b93a6a --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_FILE_UTILS_H +#define LATINIME_FILE_UTILS_H + +#include "defines.h" + +namespace latinime { + +class FileUtils { + public: + // Returns -1 on error. + static int getFileSize(const char *const filePath); + + static bool existsDir(const char *const dirPath); + + // Remove a directory and all files in the directory. + static bool removeDirAndFiles(const char *const dirPath); + + static int getFilePathWithSuffixBufSize(const char *const filePath, const char *const suffix); + + static void getFilePathWithSuffix(const char *const filePath, const char *const suffix, + const int filePathBufSize, char *const outFilePath); + + static int getFilePathBufSize(const char *const dirPath, const char *const fileName); + + static void getFilePath(const char *const dirPath, const char *const fileName, + const int filePathBufSize, char *const outFilePath); + + // Returns whether the filePath have the suffix. + static bool getFilePathWithoutSuffix(const char *const filePath, const char *const suffix, + const int dirPathBufSize, char *const outDirPath); + + static void getDirPath(const char *const filePath, const int dirPathBufSize, + char *const outDirPath); + + static void getBasename(const char *const filePath, const int outNameBufSize, + char *const outName); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(FileUtils); + + static bool removeDirAndFiles(const char *const dirPath, const int maxTries); +}; +} // namespace latinime +#endif /* LATINIME_FILE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp index 1632fd072..d58d25989 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp @@ -14,14 +14,14 @@ * limitations under the License. */ +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + #include <cmath> -#include <ctime> #include <stdlib.h> -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" - #include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h" +#include "utils/time_keeper.h" namespace latinime { @@ -31,76 +31,87 @@ const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000; const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000; const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127; -const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15; -const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3; -const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1; -// Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected -// duration of the decay is approximately 66hours. -const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f; const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; -const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; -ForgettingCurveUtils::TimeKeeper ForgettingCurveUtils::sTimeKeeper; +const int ForgettingCurveUtils::MAX_LEVEL = 3; +const int ForgettingCurveUtils::MAX_COUNT = 3; +const int ForgettingCurveUtils::MIN_VALID_LEVEL = 1; +const int ForgettingCurveUtils::TIME_STEP_DURATION_IN_SECONDS = 6 * 60 * 60; +const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 15; +const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 14; -void ForgettingCurveUtils::TimeKeeper::setCurrentTime() { - mCurrentTime = time(0); -} +const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; -/* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability, - const int encodedBigramProbability) { - if (encodedUnigramProbability == NOT_A_PROBABILITY) { - return NOT_A_PROBABILITY; - } else if (encodedBigramProbability == NOT_A_PROBABILITY) { - return backoff(decodeProbability(encodedUnigramProbability)); +// TODO: Revise the logic to decide the initial probability depending on the given probability. +/* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo( + const HistoricalInfo *const originalHistoricalInfo, + const int newProbability, const int timestamp) { + if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) { + return HistoricalInfo(timestamp, MIN_VALID_LEVEL /* level */, 0 /* count */); + } else if (!originalHistoricalInfo->isValid()) { + // Initial information. + return HistoricalInfo(timestamp, 0 /* level */, 1 /* count */); } else { - const int unigramProbability = decodeProbability(encodedUnigramProbability); - const int bigramProbability = decodeProbability(encodedBigramProbability); - return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY); + const int updatedCount = originalHistoricalInfo->getCount() + 1; + if (updatedCount > MAX_COUNT) { + // The count exceeds the max value the level can be incremented. + if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) { + // The level is already max. + return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), + originalHistoricalInfo->getCount()); + } else { + // Level up. + return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel() + 1, + 0 /* count */); + } + } else { + return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), updatedCount); + } } } -// Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding -// (i.e. unigram probability + bigram probability delta). -/* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability( - const int originalEncodedProbability, const int newProbability) { - if (originalEncodedProbability == NOT_A_PROBABILITY) { - // The bigram relation is not in this dictionary. - if (newProbability == NOT_A_PROBABILITY) { - // The bigram target is not in other dictionaries. - return 0; - } else { - return MIN_VALID_ENCODED_PROBABILITY; - } +/* static */ int ForgettingCurveUtils::decodeProbability( + const HistoricalInfo *const historicalInfo) { + const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp()); + return sProbabilityTable.getProbability(historicalInfo->getLevel(), + min(max(elapsedTimeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT)); +} + +/* static */ int ForgettingCurveUtils::getProbability(const int unigramProbability, + const int bigramProbability) { + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (bigramProbability == NOT_A_PROBABILITY) { + return min(backoff(unigramProbability), MAX_COMPUTED_PROBABILITY); } else { - if (newProbability != NOT_A_PROBABILITY - && originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) { - return MIN_VALID_ENCODED_PROBABILITY; - } - return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY); + return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY); } } -/* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) { - return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY; +/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo) { + return historicalInfo->getLevel() > 0 + || getElapsedTimeStepCount(historicalInfo->getTimeStamp()) + < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; } -/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability, - const DictionaryHeaderStructurePolicy *const headerPolicy) { - const int elapsedTime = sTimeKeeper.peekCurrentTime() - headerPolicy->getLastDecayedTime(); - const int decayIterationCount = max(elapsedTime / DECAY_INTERVAL_SECONDS, 1); - int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0); - // TODO: Implement the decay in more proper way. - for (int i = 0; i < decayIterationCount; ++i) { - const float currentRate = static_cast<float>(currentEncodedProbability) - / static_cast<float>(MAX_ENCODED_PROBABILITY); - const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate; - const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX); - if (thresholdToDecay < randValue) { - currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, - 0); - } +/* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave( + const HistoricalInfo *const originalHistoricalInfo) { + if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) { + return HistoricalInfo(); } - return currentEncodedProbability; + const int elapsedTimeStep = getElapsedTimeStepCount(originalHistoricalInfo->getTimeStamp()); + if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) { + // No need to update historical info. + return *originalHistoricalInfo; + } + // Level down. + const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1); + const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ? + originalHistoricalInfo->getLevel() : maxLevelDownAmonut; + const int adjustedTimestamp = originalHistoricalInfo->getTimeStamp() + + levelDownAmount * (MAX_ELAPSED_TIME_STEP_COUNT + 1) * TIME_STEP_DURATION_IN_SECONDS; + return HistoricalInfo(adjustedTimestamp, + originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */); } /* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, @@ -116,21 +127,14 @@ void ForgettingCurveUtils::TimeKeeper::setCurrentTime() { if (mindsBlockByDecay) { return false; } - if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS < time(0)) { + if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS + < TimeKeeper::peekCurrentTime()) { // Time to decay. return true; } return false; } -/* static */ int ForgettingCurveUtils::decodeProbability(const int encodedProbability) { - if (encodedProbability < MIN_VALID_ENCODED_PROBABILITY) { - return NOT_A_PROBABILITY; - } else { - return min(sProbabilityTable.getProbability(encodedProbability), MAX_ENCODED_PROBABILITY); - } -} - // See comments in ProbabilityUtils::backoff(). /* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) { if (unigramProbability == NOT_A_PROBABILITY) { @@ -140,15 +144,29 @@ void ForgettingCurveUtils::TimeKeeper::setCurrentTime() { } } +/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp) { + return (TimeKeeper::peekCurrentTime() - timestamp) / TIME_STEP_DURATION_IN_SECONDS; +} + ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTable() { - // Table entry is as follows: - // 1, 1, 1, 2, 3, 5, 6, 9, 13, 18, 25, 34, 48, 66, 91, 127. - // Note that first MIN_VALID_ENCODED_PROBABILITY values are not used. - mTable.resize(MAX_ENCODED_PROBABILITY + 1); - for (int i = 0; i <= MAX_ENCODED_PROBABILITY; ++i) { - const int probability = static_cast<int>(powf(static_cast<float>(MAX_COMPUTED_PROBABILITY), - static_cast<float>(i) / static_cast<float>(MAX_ENCODED_PROBABILITY))); - mTable[i] = min(MAX_COMPUTED_PROBABILITY, max(0, probability)); + mTable.resize(MAX_LEVEL + 1); + for (int level = 0; level <= MAX_LEVEL; ++level) { + mTable[level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1); + const float initialProbability = + static_cast<float>(MAX_COMPUTED_PROBABILITY / (1 << (MAX_LEVEL - level))); + for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; ++timeStepCount) { + if (level == 0) { + mTable[level][timeStepCount] = NOT_A_PROBABILITY; + continue; + } + const int elapsedTime = timeStepCount * TIME_STEP_DURATION_IN_SECONDS; + const float probability = initialProbability + * powf(2.0f, -1.0f * static_cast<float>(elapsedTime) + / static_cast<float>(TIME_STEP_DURATION_IN_SECONDS + * (MAX_ELAPSED_TIME_STEP_COUNT + 1))); + mTable[level][timeStepCount] = + min(max(static_cast<int>(probability), 1), MAX_COMPUTED_PROBABILITY); + } } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h index 2ad423874..b37353455 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h @@ -20,45 +20,32 @@ #include <vector> #include "defines.h" +#include "suggest/policyimpl/dictionary/utils/historical_info.h" namespace latinime { class DictionaryHeaderStructurePolicy; -// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is -// required to introduced to each terminal PtNode and bigram entry. -// TODO: Quit using bigram probability to indicate the delta. class ForgettingCurveUtils { public: - class TimeKeeper { - public: - TimeKeeper() : mCurrentTime(0) {} - void setCurrentTime(); - int peekCurrentTime() const { return mCurrentTime; }; - - private: - DISALLOW_COPY_AND_ASSIGN(TimeKeeper); - - int mCurrentTime; - }; - static const int MAX_UNIGRAM_COUNT; static const int MAX_UNIGRAM_COUNT_AFTER_GC; static const int MAX_BIGRAM_COUNT; static const int MAX_BIGRAM_COUNT_AFTER_GC; - static TimeKeeper sTimeKeeper; + static const HistoricalInfo createUpdatedHistoricalInfo( + const HistoricalInfo *const originalHistoricalInfo, const int newProbability, + const int timestamp); - static int getProbability(const int encodedUnigramProbability, - const int encodedBigramProbability); + static const HistoricalInfo createHistoricalInfoToSave( + const HistoricalInfo *const originalHistoricalInfo); - static int getUpdatedEncodedProbability(const int originalEncodedProbability, - const int newProbability); + static int decodeProbability(const HistoricalInfo *const historicalInfo); - static int isValidEncodedProbability(const int encodedProbability); + static int getProbability(const int encodedUnigramProbability, + const int encodedBigramProbability); - static int getEncodedProbabilityToSave(const int encodedProbability, - const DictionaryHeaderStructurePolicy *const headerPolicy); + static bool needsToKeep(const HistoricalInfo *const historicalInfo); static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount, const int bigramCount, const DictionaryHeaderStructurePolicy *const headerPolicy); @@ -70,31 +57,32 @@ class ForgettingCurveUtils { public: ProbabilityTable(); - int getProbability(const int encodedProbability) const { - if (encodedProbability < 0 || encodedProbability > static_cast<int>(mTable.size())) { - return NOT_A_PROBABILITY; - } - return mTable[encodedProbability]; + int getProbability(const int level, const int elapsedTimeStepCount) const { + return mTable[level][elapsedTimeStepCount]; } private: DISALLOW_COPY_AND_ASSIGN(ProbabilityTable); - std::vector<int> mTable; + std::vector<std::vector<int> > mTable; }; static const int MAX_COMPUTED_PROBABILITY; - static const int MAX_ENCODED_PROBABILITY; - static const int MIN_VALID_ENCODED_PROBABILITY; - static const int ENCODED_PROBABILITY_STEP; - static const float MIN_PROBABILITY_TO_DECAY; static const int DECAY_INTERVAL_SECONDS; - static const ProbabilityTable sProbabilityTable; + static const int MAX_LEVEL; + static const int MAX_COUNT; + static const int MIN_VALID_LEVEL; + static const int TIME_STEP_DURATION_IN_SECONDS; + static const int MAX_ELAPSED_TIME_STEP_COUNT; + static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; + static const int HALF_LIFE_TIME_IN_SECONDS; - static int decodeProbability(const int encodedProbability); + static const ProbabilityTable sProbabilityTable; static int backoff(const int unigramProbability); + + static int getElapsedTimeStepCount(const int timestamp); }; } // namespace latinime #endif /* LATINIME_FORGETTING_CURVE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp index 1d77d5c27..cd3c403fa 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp @@ -41,10 +41,13 @@ const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; // Dictionary format version number (2 bytes) // Options (2 bytes) // Header size (4 bytes) : integer, big endian - if (ByteArrayUtils::readUint16(dict, 4) == 2) { + // Conceptually this converts the hardcoded value of the bytes in the file into + // the symbolic value we use in the code. But we want the constants to be the + // same so we use them for both here. + if (ByteArrayUtils::readUint16(dict, 4) == VERSION_2) { return VERSION_2; - } else if (ByteArrayUtils::readUint16(dict, 4) == 3) { - return VERSION_3; + } else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_4) { + return VERSION_4; } else { return UNKNOWN_VERSION; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h index 79ed0de29..7c6a21dd7 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h @@ -29,9 +29,10 @@ namespace latinime { class FormatUtils { public: enum FORMAT_VERSION { - VERSION_2, - VERSION_3, - UNKNOWN_VERSION + // These MUST have the same values as the relevant constants in FormatSpec.java. + VERSION_2 = 2, + VERSION_4 = 401, + UNKNOWN_VERSION = -1 }; // 32 bit magic number is stored at the beginning of the dictionary header to reject diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h b/native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h new file mode 100644 index 000000000..428ca8626 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_HISTORICAL_INFO_H +#define LATINIME_HISTORICAL_INFO_H + +#include "defines.h" + +namespace latinime { + +class HistoricalInfo { + public: + // Invalid historical info. + HistoricalInfo() + : mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0) {} + + HistoricalInfo(const int timestamp, const int level, const int count) + : mTimestamp(timestamp), mLevel(level), mCount(count) {} + + bool isValid() const { + return mTimestamp != NOT_A_TIMESTAMP; + } + + int getTimeStamp() const { + return mTimestamp; + } + + int getLevel() const { + return mLevel; + } + + int getCount() const { + return mCount; + } + + private: + // Copy constructor is public to use this class as a type of return value. + DISALLOW_ASSIGNMENT_OPERATOR(HistoricalInfo); + + const int mTimestamp; + const int mLevel; + const int mCount; +}; +} // namespace latinime +#endif /* LATINIME_HISTORICAL_INFO_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp new file mode 100644 index 000000000..e88d6e0a9 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" + +#include <cerrno> +#include <climits> +#include <cstdio> +#include <fcntl.h> +#include <sys/mman.h> +#include <unistd.h> + +#include "suggest/policyimpl/dictionary/utils/file_utils.h" + +namespace latinime { + +/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( + const char *const path, const int bufferOffset, const int bufferSize, + const bool isUpdatable) { + const int mmapFd = open(path, O_RDONLY); + if (mmapFd < 0) { + AKLOGE("DICT: Can't open the source. path=%s errno=%d", path, errno); + return MmappedBufferPtr(0); + } + const int pagesize = sysconf(_SC_PAGESIZE); + const int offset = bufferOffset % pagesize; + int alignedOffset = bufferOffset - offset; + int alignedSize = bufferSize + offset; + const int protMode = isUpdatable ? PROT_READ | PROT_WRITE : PROT_READ; + void *const mmappedBuffer = mmap(0, alignedSize, protMode, MAP_PRIVATE, mmapFd, + alignedOffset); + if (mmappedBuffer == MAP_FAILED) { + AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno); + close(mmapFd); + return MmappedBufferPtr(0); + } + uint8_t *const buffer = static_cast<uint8_t *>(mmappedBuffer) + offset; + if (!buffer) { + AKLOGE("DICT: buffer is null"); + close(mmapFd); + return MmappedBufferPtr(0); + } + return MmappedBufferPtr(new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize, + mmapFd, isUpdatable)); +} + +/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( + const char *const path, const bool isUpdatable) { + const int fileSize = FileUtils::getFileSize(path); + if (fileSize == -1) { + return MmappedBufferPtr(0); + } else if (fileSize == 0) { + return MmappedBufferPtr(new MmappedBuffer(isUpdatable)); + } else { + return openBuffer(path, 0 /* bufferOffset */, fileSize, isUpdatable); + } +} + +/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( + const char *const dirPath, const char *const fileName, const bool isUpdatable) { + const int filePathBufferSize = PATH_MAX + 1 /* terminator */; + char filePath[filePathBufferSize]; + const int filePathLength = snprintf(filePath, filePathBufferSize, "%s%s", dirPath, + fileName); + if (filePathLength >= filePathBufferSize) { + return 0; + } + return openBuffer(filePath, isUpdatable); +} + +MmappedBuffer::~MmappedBuffer() { + if (mAlignedSize == 0) { + return; + } + int ret = munmap(mMmappedBuffer, mAlignedSize); + if (ret != 0) { + AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); + } + ret = close(mMmapFd); + if (ret != 0) { + AKLOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno); + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h index 6b69116eb..73a733b0c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h @@ -17,58 +17,27 @@ #ifndef LATINIME_MMAPPED_BUFFER_H #define LATINIME_MMAPPED_BUFFER_H -#include <cerrno> -#include <fcntl.h> #include <stdint.h> -#include <sys/mman.h> -#include <unistd.h> #include "defines.h" +#include "utils/exclusive_ownership_pointer.h" namespace latinime { class MmappedBuffer { public: - static MmappedBuffer* openBuffer(const char *const path, const int bufferOffset, - const int bufferSize, const bool isUpdatable) { - const int openMode = isUpdatable ? O_RDWR : O_RDONLY; - const int mmapFd = open(path, openMode); - if (mmapFd < 0) { - AKLOGE("DICT: Can't open the source. path=%s errno=%d", path, errno); - return 0; - } - const int pagesize = getpagesize(); - const int offset = bufferOffset % pagesize; - int alignedOffset = bufferOffset - offset; - int alignedSize = bufferSize + offset; - const int protMode = isUpdatable ? PROT_READ | PROT_WRITE : PROT_READ; - void *const mmappedBuffer = mmap(0, alignedSize, protMode, MAP_PRIVATE, mmapFd, - alignedOffset); - if (mmappedBuffer == MAP_FAILED) { - AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno); - close(mmapFd); - return 0; - } - uint8_t *const buffer = static_cast<uint8_t *>(mmappedBuffer) + offset; - if (!buffer) { - AKLOGE("DICT: buffer is null"); - close(mmapFd); - return 0; - } - return new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize, mmapFd, - isUpdatable); - } + typedef ExclusiveOwnershipPointer<MmappedBuffer> MmappedBufferPtr; - ~MmappedBuffer() { - int ret = munmap(mMmappedBuffer, mAlignedSize); - if (ret != 0) { - AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); - } - ret = close(mMmapFd); - if (ret != 0) { - AKLOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno); - } - } + static MmappedBufferPtr openBuffer(const char *const path, + const int bufferOffset, const int bufferSize, const bool isUpdatable); + + // Mmap entire file. + static MmappedBufferPtr openBuffer(const char *const path, const bool isUpdatable); + + static MmappedBufferPtr openBuffer(const char *const dirPath, const char *const fileName, + const bool isUpdatable); + + ~MmappedBuffer(); AK_FORCE_INLINE uint8_t *getBuffer() const { return mBuffer; @@ -89,6 +58,11 @@ class MmappedBuffer { : mBuffer(buffer), mBufferSize(bufferSize), mMmappedBuffer(mmappedBuffer), mAlignedSize(alignedSize), mMmapFd(mmapFd), mIsUpdatable(isUpdatable) {} + // Empty file. We have to handle an empty file as a valid part of a dictionary. + AK_FORCE_INLINE MmappedBuffer(const bool isUpdatable) + : mBuffer(0), mBufferSize(0), mMmappedBuffer(0), mAlignedSize(0), mMmapFd(0), + mIsUpdatable(isUpdatable) {} + DISALLOW_IMPLICIT_CONSTRUCTORS(MmappedBuffer); uint8_t *const mBuffer; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h index 21fe355b8..14fdf53cb 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h @@ -23,6 +23,7 @@ namespace latinime { +// TODO: Quit using bigram probability to indicate the delta. class ProbabilityUtils { public: static AK_FORCE_INLINE int backoff(const int unigramProbability) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp new file mode 100644 index 000000000..c380429d9 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/utils/sparse_table.h" + +namespace latinime { + +const int SparseTable::NOT_EXIST = -1; +const int SparseTable::INDEX_SIZE = 4; + +bool SparseTable::contains(const int id) const { + const int readingPos = getPosInIndexTable(id); + if (id < 0 || mIndexTableBuffer->getTailPosition() <= readingPos) { + return false; + } + const int index = mIndexTableBuffer->readUint(INDEX_SIZE, readingPos); + return index != NOT_EXIST; +} + +uint32_t SparseTable::get(const int id) const { + const int indexTableReadingPos = getPosInIndexTable(id); + const int index = mIndexTableBuffer->readUint(INDEX_SIZE, indexTableReadingPos); + const int contentTableReadingPos = getPosInContentTable(id, index); + const int contentValue = mContentTableBuffer->readUint(mDataSize, contentTableReadingPos); + return contentValue == NOT_EXIST ? NOT_A_DICT_POS : contentValue; +} + +bool SparseTable::set(const int id, const uint32_t value) { + const int posInIndexTable = getPosInIndexTable(id); + // Extends the index table if needed. + if (mIndexTableBuffer->getTailPosition() < posInIndexTable) { + int tailPos = mIndexTableBuffer->getTailPosition(); + while(tailPos < posInIndexTable) { + if (!mIndexTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, INDEX_SIZE, &tailPos)) { + AKLOGE("cannot extend index table. tailPos: %d to: %d", tailPos, posInIndexTable); + return false; + } + } + } + if (contains(id)) { + // The entry is already in the content table. + const int index = mIndexTableBuffer->readUint(INDEX_SIZE, posInIndexTable); + if (!mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index))) { + AKLOGE("cannot update value %d. pos: %d, tailPos: %d, mDataSize: %d", value, + getPosInContentTable(id, index), mContentTableBuffer->getTailPosition(), + mDataSize); + return false; + } + return true; + } + // The entry is not in the content table. + // Create new entry in the content table. + const int index = getIndexFromContentTablePos(mContentTableBuffer->getTailPosition()); + if (!mIndexTableBuffer->writeUint(index, INDEX_SIZE, posInIndexTable)) { + AKLOGE("cannot write index %d. pos %d", index, posInIndexTable); + return false; + } + // Write a new block that containing the entry to be set. + int writingPos = getPosInContentTable(0 /* id */, index); + for (int i = 0; i < mBlockSize; ++i) { + if (!mContentTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, mDataSize, + &writingPos)) { + AKLOGE("cannot write content table to extend. writingPos: %d, tailPos: %d, " + "mDataSize: %d", writingPos, mContentTableBuffer->getTailPosition(), mDataSize); + return false; + } + } + return mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index)); +} + +int SparseTable::getIndexFromContentTablePos(const int contentTablePos) const { + return contentTablePos / mDataSize / mBlockSize; +} + +int SparseTable::getPosInIndexTable(const int id) const { + return (id / mBlockSize) * INDEX_SIZE; +} + +int SparseTable::getPosInContentTable(const int id, const int index) const { + const int offset = id % mBlockSize; + return (index * mBlockSize + offset) * mDataSize; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h b/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h new file mode 100644 index 000000000..21c167506 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SPARSE_TABLE_H +#define LATINIME_SPARSE_TABLE_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +// Note that there is a corresponding implementation in SparseTable.java. +// TODO: Support multiple content buffers. +class SparseTable { + public: + SparseTable(BufferWithExtendableBuffer *const indexTableBuffer, + BufferWithExtendableBuffer *const contentTableBuffer, const int blockSize, + const int dataSize) + : mIndexTableBuffer(indexTableBuffer), mContentTableBuffer(contentTableBuffer), + mBlockSize(blockSize), mDataSize(dataSize) {} + + bool contains(const int id) const; + + uint32_t get(const int id) const; + + bool set(const int id, const uint32_t value); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTable); + + int getIndexFromContentTablePos(const int contentTablePos) const; + + int getPosInIndexTable(const int id) const; + + int getPosInContentTable(const int id, const int index) const; + + static const int NOT_EXIST; + static const int INDEX_SIZE; + + BufferWithExtendableBuffer *const mIndexTableBuffer; + BufferWithExtendableBuffer *const mContentTableBuffer; + const int mBlockSize; + const int mDataSize; +}; +} // namespace latinime +#endif /* LATINIME_SPARSE_TABLE_H */ diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp index 104eb2a7a..7b332064c 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp @@ -22,6 +22,12 @@ const float ScoringParams::MAX_SPATIAL_DISTANCE = 1.0f; const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY = 40; const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED = 120; const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f; + +const float ScoringParams::EXACT_MATCH_PROMOTION = 1.1f; +const float ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH = 0.01f; +const float ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH = 0.02f; +const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f; + // TODO: Unlimit max cache dic node size const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE = 170; const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT = 310; diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.h b/native/jni/src/suggest/policyimpl/typing/scoring_params.h index 7d4b5c3c7..de7410d39 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.h +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.h @@ -32,6 +32,11 @@ class ScoringParams { static const int MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT; static const int THRESHOLD_SHORT_WORD_LENGTH; + static const float EXACT_MATCH_PROMOTION; + static const float CASE_ERROR_PENALTY_FOR_EXACT_MATCH; + static const float ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; + static const float DIGRAPH_PENALTY_FOR_EXACT_MATCH; + // Numerically optimized parameters (currently for tap typing only). // TODO: add ability to modify these constants programmatically. // TODO: explore optimization of gesture parameters. diff --git a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h index 56ffcc93e..7ef905df7 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h @@ -18,7 +18,9 @@ #define LATINIME_TYPING_SCORING_H #include "defines.h" +#include "suggest/core/dictionary/error_type_utils.h" #include "suggest/core/policy/scoring.h" +#include "suggest/core/session/dic_traverse_session.h" #include "suggest/policyimpl/typing/scoring_params.h" namespace latinime { @@ -37,13 +39,8 @@ class TypingScoring : public Scoring { return false; } - AK_FORCE_INLINE void safetyNetForMostProbableString(const int terminalSize, - const int maxScore, int *const outputCodePoints, int *const frequencies) const { - } - - AK_FORCE_INLINE void searchWordWithDoubleLetter(DicNode *terminals, - const int terminalSize, int *doubleLetterTerminalIndex, - DoubleLetterLevel *doubleLetterLevel) const { + AK_FORCE_INLINE void safetyNetForMostProbableString(const int scoreCount, + const int maxScore, int *const outputCodePoints, int *const scores) const { } AK_FORCE_INLINE float getAdjustedLanguageWeight(DicTraverseSession *const traverseSession, @@ -52,18 +49,31 @@ class TypingScoring : public Scoring { } AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, - const int inputSize, const bool forceCommit) const { + const int inputSize, const ErrorTypeUtils::ErrorType containedErrorTypes, + const bool forceCommit, const bool boostExactMatches) const { const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE + static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT; - const float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - - compoundDistance / maxDistance - + (forceCommit ? ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD : 0.0f); + float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - compoundDistance / maxDistance; + if (forceCommit) { + score += ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD; + } + if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) { + score += ScoringParams::EXACT_MATCH_PROMOTION; + if ((ErrorTypeUtils::MATCH_WITH_CASE_ERROR & containedErrorTypes) != 0) { + score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH; + } + if ((ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR & containedErrorTypes) != 0) { + score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; + } + if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) { + score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH; + } + } return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE); } - AK_FORCE_INLINE float getDoubleLetterDemotionDistanceCost(const int terminalIndex, - const int doubleLetterTerminalIndex, - const DoubleLetterLevel doubleLetterLevel) const { + AK_FORCE_INLINE float getDoubleLetterDemotionDistanceCost( + const DicNode *const terminalDicNode) const { return 0.0f; } @@ -71,6 +81,16 @@ class TypingScoring : public Scoring { return false; } + AK_FORCE_INLINE bool autoCorrectsToMultiWordSuggestionIfTop() const { + return true; + } + + AK_FORCE_INLINE bool sameAsTyped( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { + return traverseSession->getProximityInfoState(0)->sameAsTyped( + dicNode->getOutputWordBuf(), dicNode->getNodeCodePointCount()); + } + private: DISALLOW_COPY_AND_ASSIGN(TypingScoring); static const TypingScoring sInstance; diff --git a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h index 007c19e0a..3db00ad3a 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h @@ -81,7 +81,7 @@ class TypingTraversal : public Traversal { return false; } const int point0Index = dicNode->getInputIndex(0); - return dicNode->isTerminalWordNode() + return dicNode->isTerminalDicNode() && traverseSession->getProximityInfoState(0)-> hasSpaceProximity(point0Index); } @@ -96,7 +96,7 @@ class TypingTraversal : public Traversal { if (dicNode->isCompletion(inputSize)) { return false; } - if (!dicNode->isTerminalWordNode()) { + if (!dicNode->isTerminalDicNode()) { return false; } const int16_t pointIndex = dicNode->getInputIndex(0); @@ -137,20 +137,10 @@ class TypingTraversal : public Traversal { return ScoringParams::MAX_SPATIAL_DISTANCE; } - AK_FORCE_INLINE bool autoCorrectsToMultiWordSuggestionIfTop() const { - return true; - } - AK_FORCE_INLINE int getDefaultExpandDicNodeSize() const { return DicNodeVector::DEFAULT_NODES_SIZE_FOR_OPTIMIZATION; } - AK_FORCE_INLINE bool sameAsTyped( - const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { - return traverseSession->getProximityInfoState(0)->sameAsTyped( - dicNode->getOutputWordBuf(), dicNode->getNodeCodePointCount()); - } - AK_FORCE_INLINE int getMaxCacheSize(const int inputSize) const { return (inputSize <= 1) ? ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT : ScoringParams::MAX_CACHE_DIC_NODE_SIZE; diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp b/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp index 5b6b5e874..54f65c786 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp +++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp @@ -23,39 +23,64 @@ namespace latinime { const TypingWeighting TypingWeighting::sInstance; -ErrorType TypingWeighting::getErrorType(const CorrectionType correctionType, +ErrorTypeUtils::ErrorType TypingWeighting::getErrorType(const CorrectionType correctionType, const DicTraverseSession *const traverseSession, const DicNode *const parentDicNode, const DicNode *const dicNode) const { switch (correctionType) { case CT_MATCH: if (isProximityDicNode(traverseSession, dicNode)) { - return ET_PROXIMITY_CORRECTION; + return ErrorTypeUtils::PROXIMITY_CORRECTION; + } else if (dicNode->isInDigraph()) { + return ErrorTypeUtils::MATCH_WITH_DIGRAPH; } else { - return ET_NOT_AN_ERROR; + // Compare the node code point with original primary code point on the keyboard. + const ProximityInfoState *const pInfoState = + traverseSession->getProximityInfoState(0); + const int primaryOriginalCodePoint = pInfoState->getPrimaryOriginalCodePointAt( + dicNode->getInputIndex(0)); + const int nodeCodePoint = dicNode->getNodeCodePoint(); + if (primaryOriginalCodePoint == nodeCodePoint) { + // Node code point is same as original code point on the keyboard. + return ErrorTypeUtils::NOT_AN_ERROR; + } else if (CharUtils::toLowerCase(primaryOriginalCodePoint) == + CharUtils::toLowerCase(nodeCodePoint)) { + // Only cases of the code points are different. + return ErrorTypeUtils::MATCH_WITH_CASE_ERROR; + } else if (CharUtils::toBaseCodePoint(primaryOriginalCodePoint) == + CharUtils::toBaseCodePoint(nodeCodePoint)) { + // Node code point is a variant of original code point. + return ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR; + } else { + // Node code point is a variant of original code point and the cases are also + // different. + return ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR + | ErrorTypeUtils::MATCH_WITH_CASE_ERROR; + } } + break; case CT_ADDITIONAL_PROXIMITY: - return ET_PROXIMITY_CORRECTION; + return ErrorTypeUtils::PROXIMITY_CORRECTION; case CT_OMISSION: if (parentDicNode->canBeIntentionalOmission()) { - return ET_INTENTIONAL_OMISSION; + return ErrorTypeUtils::INTENTIONAL_OMISSION; } else { - return ET_EDIT_CORRECTION; + return ErrorTypeUtils::EDIT_CORRECTION; } break; case CT_SUBSTITUTION: case CT_INSERTION: case CT_TERMINAL_INSERTION: case CT_TRANSPOSITION: - return ET_EDIT_CORRECTION; + return ErrorTypeUtils::EDIT_CORRECTION; case CT_NEW_WORD_SPACE_OMISSION: case CT_NEW_WORD_SPACE_SUBSTITUTION: - return ET_NEW_WORD; + return ErrorTypeUtils::NEW_WORD; case CT_TERMINAL: - return ET_NOT_AN_ERROR; + return ErrorTypeUtils::NOT_AN_ERROR; case CT_COMPLETION: - return ET_COMPLETION; + return ErrorTypeUtils::COMPLETION; default: - return ET_NOT_AN_ERROR; + return ErrorTypeUtils::NOT_AN_ERROR; } } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h index 9f0a331e3..41314ef52 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h @@ -19,6 +19,7 @@ #include "defines.h" #include "suggest/core/dicnode/dic_node_utils.h" +#include "suggest/core/dictionary/error_type_utils.h" #include "suggest/core/layout/touch_position_correction_utils.h" #include "suggest/core/policy/weighting.h" #include "suggest/core/session/dic_traverse_session.h" @@ -204,7 +205,7 @@ class TypingWeighting : public Weighting { return cost * traverseSession->getMultiWordCostMultiplier(); } - ErrorType getErrorType(const CorrectionType correctionType, + ErrorTypeUtils::ErrorType getErrorType(const CorrectionType correctionType, const DicTraverseSession *const traverseSession, const DicNode *const parentDicNode, const DicNode *const dicNode) const; diff --git a/native/jni/src/utils/char_utils.cpp b/native/jni/src/utils/char_utils.cpp index 0e7039610..adc474b4c 100644 --- a/native/jni/src/utils/char_utils.cpp +++ b/native/jni/src/utils/char_utils.cpp @@ -1118,7 +1118,8 @@ static int compare_pair_capital(const void *a, const void *b) { /* U+0118 */ 0x0045, 0x0065, 0x0045, 0x0065, 0x0047, 0x0067, 0x0047, 0x0067, /* U+0120 */ 0x0047, 0x0067, 0x0047, 0x0067, 0x0048, 0x0068, 0x0126, 0x0127, /* U+0128 */ 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069, - /* U+0130 */ 0x0049, 0x0131, 0x0049, 0x0069, 0x004A, 0x006A, 0x004B, 0x006B, + // U+0131: Manually changed from 0131 to 0049 + /* U+0130 */ 0x0049, 0x0049, 0x0049, 0x0069, 0x004A, 0x006A, 0x004B, 0x006B, /* U+0138 */ 0x0138, 0x004C, 0x006C, 0x004C, 0x006C, 0x004C, 0x006C, 0x004C, /* U+0140 */ 0x006C, 0x004C, 0x006C, 0x004E, 0x006E, 0x004E, 0x006E, 0x004E, // U+0141: Manually changed from 0141 to 004C @@ -1273,4 +1274,6 @@ static int compare_pair_capital(const void *a, const void *b) { /* U+04F0 */ 0x0423, 0x0443, 0x0423, 0x0443, 0x0427, 0x0447, 0x04F6, 0x04F7, /* U+04F8 */ 0x042B, 0x044B, 0x04FA, 0x04FB, 0x04FC, 0x04FD, 0x04FE, 0x04FF, }; + +/* static */ const std::vector<int> CharUtils::EMPTY_STRING(1 /* size */, '\0' /* value */); } // namespace latinime diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h index 41663c81a..98b8966df 100644 --- a/native/jni/src/utils/char_utils.h +++ b/native/jni/src/utils/char_utils.h @@ -18,6 +18,7 @@ #define LATINIME_CHAR_UTILS_H #include <cctype> +#include <vector> #include "defines.h" @@ -85,7 +86,15 @@ class CharUtils { return spaceCount; } + static AK_FORCE_INLINE std::vector<int> convertShortArrayToIntVector( + const unsigned short *const source, const int length) { + std::vector<int> destination; + destination.insert(destination.end(), source, source + length); + return destination; // Copies the vector + } + static unsigned short latin_tolower(const unsigned short c); + static const std::vector<int> EMPTY_STRING; private: DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); diff --git a/native/jni/src/utils/exclusive_ownership_pointer.h b/native/jni/src/utils/exclusive_ownership_pointer.h new file mode 100644 index 000000000..081802e8b --- /dev/null +++ b/native/jni/src/utils/exclusive_ownership_pointer.h @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_EXCLUSIVE_OWNERSHIP_POINTER_H +#define LATINIME_EXCLUSIVE_OWNERSHIP_POINTER_H + +#include "defines.h" + +namespace latinime { + +template<class T> +class ExclusiveOwnershipPointer { + public: + // This instance become an owner of the raw pointer. + AK_FORCE_INLINE ExclusiveOwnershipPointer(T *const rawPointer) + : mPointer(rawPointer), + mSharedOwnerPtr(new (ExclusiveOwnershipPointer<T> *)(this)) {} + + // Move the ownership. + AK_FORCE_INLINE ExclusiveOwnershipPointer(const ExclusiveOwnershipPointer<T> &pointer) + : mPointer(pointer.mPointer), mSharedOwnerPtr(pointer.mSharedOwnerPtr) { + transferOwnership(&pointer); + } + + AK_FORCE_INLINE ~ExclusiveOwnershipPointer() { + deletePointersIfHavingOwnership(); + } + + AK_FORCE_INLINE T *get() const { + return mPointer; + } + + private: + // This class allows to copy and ensures only one instance has the ownership of the + // managed pointer. + DISALLOW_DEFAULT_CONSTRUCTOR(ExclusiveOwnershipPointer); + DISALLOW_ASSIGNMENT_OPERATOR(ExclusiveOwnershipPointer); + + void transferOwnership(const ExclusiveOwnershipPointer<T> *const src) { + if (*mSharedOwnerPtr != src) { + AKLOGE("Failed to transfer the ownership because src is not the current owner." + "src: %p, owner: %p", src, *mSharedOwnerPtr); + ASSERT(false); + return; + } + // Transfer the ownership from src to this instance. + *mSharedOwnerPtr = this; + } + + void deletePointersIfHavingOwnership() { + if (mSharedOwnerPtr && *mSharedOwnerPtr == this) { + if (mPointer) { + if (DEBUG_DICT) { + AKLOGI("Releasing pointer: %p", mPointer); + } + delete mPointer; + } + delete mSharedOwnerPtr; + } + } + + T *mPointer; + // mSharedOwnerPtr points a shared memory space where the instance which has the ownership is + // stored. + ExclusiveOwnershipPointer<T> **mSharedOwnerPtr; +}; +} // namespace latinime +#endif /* LATINIME_EXCLUSIVE_OWNERSHIP_POINTER_H */ diff --git a/native/jni/src/utils/time_keeper.cpp b/native/jni/src/utils/time_keeper.cpp new file mode 100644 index 000000000..026284060 --- /dev/null +++ b/native/jni/src/utils/time_keeper.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/time_keeper.h" + +#include <ctime> + +namespace latinime { + +int TimeKeeper::sCurrentTime; +bool TimeKeeper::sSetForTesting; + +/* static */ void TimeKeeper::setCurrentTime() { + if (!sSetForTesting) { + sCurrentTime = time(0); + } +} + +/* static */ void TimeKeeper::startTestModeWithForceCurrentTime(const int currentTime) { + sCurrentTime = currentTime; + sSetForTesting = true; +} + +/* static */ void TimeKeeper::stopTestMode() { + sSetForTesting = false; +} + +} // namespace latinime diff --git a/native/jni/src/utils/time_keeper.h b/native/jni/src/utils/time_keeper.h new file mode 100644 index 000000000..d066757e4 --- /dev/null +++ b/native/jni/src/utils/time_keeper.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TIME_KEEPER_H +#define LATINIME_TIME_KEEPER_H + +#include "defines.h" + +namespace latinime { + +class TimeKeeper { + public: + static void setCurrentTime(); + + static void startTestModeWithForceCurrentTime(const int currentTime); + + static void stopTestMode(); + + static int peekCurrentTime() { return sCurrentTime; }; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TimeKeeper); + + static int sCurrentTime; + static bool sSetForTesting; +}; +} // namespace latinime +#endif /* LATINIME_TIME_KEEPER_H */ |