diff options
Diffstat (limited to 'native/jni/src')
110 files changed, 5987 insertions, 508 deletions
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index 2fe2bd8fa..24d04e51f 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -293,13 +293,6 @@ static inline void prof_out(void) { #define M_PI_F 3.14159265f #define MAX_PERCENTILE 100 -// Number of base-10 digits in the largest integer + 1 to leave room for a zero terminator. -// As such, this is the maximum number of characters will be needed to represent an int as a -// string, including the terminator; this is used as the size of a string buffer large enough to -// hold any value that is intended to fit in an integer, e.g. in the code that reads the header -// of the binary dictionary where a {key,value} string pair scheme is used. -#define LARGEST_INT_DIGIT_COUNT 11 - #define NOT_A_CODE_POINT (-1) #define NOT_A_DISTANCE (-1) #define NOT_A_COORDINATE (-1) @@ -325,6 +318,8 @@ static inline void prof_out(void) { #define KEYCODE_SPACE ' ' #define KEYCODE_SINGLE_QUOTE '\'' #define KEYCODE_HYPHEN_MINUS '-' +// Code point to indicate beginning-of-sentence. This is not in the code point space of unicode. +#define CODE_POINT_BEGINNING_OF_SENTENCE 0x110000 #define SUGGEST_INTERFACE_OUTPUT_SCALE 1000000.0f #define MAX_PROBABILITY 255 @@ -341,6 +336,9 @@ static inline void prof_out(void) { #define MAX_POINTER_COUNT 1 #define MAX_POINTER_COUNT_G 2 +// (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram is supported. +#define MAX_PREV_WORD_COUNT_FOR_N_GRAM 1 + #define DISALLOW_DEFAULT_CONSTRUCTOR(TypeName) \ TypeName() = delete diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h index 47f5ec0d7..92f39ea25 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node.h +++ b/native/jni/src/suggest/core/dicnode/dic_node.h @@ -103,10 +103,10 @@ class DicNode { PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); } - // Init for root with prevWordPtNodePos which is used for bigram - void initAsRoot(const int rootPtNodeArrayPos, const int prevWordPtNodePos) { + // Init for root with prevWordsPtNodePos which is used for n-gram + void initAsRoot(const int rootPtNodeArrayPos, const int *const prevWordsPtNodePos) { mIsCachedForNextSuggestion = false; - mDicNodeProperties.init(rootPtNodeArrayPos, prevWordPtNodePos); + mDicNodeProperties.init(rootPtNodeArrayPos, prevWordsPtNodePos); mDicNodeState.init(); PROF_NODE_RESET(mProfiler); } @@ -114,13 +114,18 @@ class DicNode { // Init for root with previous word void initAsRootWithPreviousWord(const DicNode *const dicNode, const int rootPtNodeArrayPos) { mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; - mDicNodeProperties.init(rootPtNodeArrayPos, dicNode->mDicNodeProperties.getPtNodePos()); + int newPrevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + newPrevWordsPtNodePos[0] = dicNode->mDicNodeProperties.getPtNodePos(); + for (size_t i = 1; i < NELEMS(newPrevWordsPtNodePos); ++i) { + newPrevWordsPtNodePos[i] = dicNode->getNthPrevWordTerminalPtNodePos(i); + } + mDicNodeProperties.init(rootPtNodeArrayPos, newPrevWordsPtNodePos); mDicNodeState.initAsRootWithPreviousWord(&dicNode->mDicNodeState, dicNode->mDicNodeProperties.getDepth()); PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); } - void initAsPassingChild(DicNode *parentDicNode) { + void initAsPassingChild(const DicNode *parentDicNode) { mIsCachedForNextSuggestion = parentDicNode->mIsCachedForNextSuggestion; const int codePoint = parentDicNode->mDicNodeState.mDicNodeStateOutput.getCurrentWordCodePointAt( @@ -140,7 +145,7 @@ class DicNode { dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount); mDicNodeProperties.init(ptNodePos, childrenPtNodeArrayPos, mergedNodeCodePoints[0], probability, isTerminal, hasChildren, isBlacklistedOrNotAWord, newDepth, - newLeavingDepth, dicNode->mDicNodeProperties.getPrevWordTerminalPtNodePos()); + newLeavingDepth, dicNode->mDicNodeProperties.getPrevWordsTerminalPtNodePos()); mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount, mergedNodeCodePoints); PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); @@ -198,14 +203,17 @@ class DicNode { return mDicNodeState.mDicNodeStateInput.getInputIndex(0) < inputSize - 1; } - // Used to get bigram probability in DicNodeUtils + // Used to get n-gram probability in DicNodeUtils. int getPtNodePos() const { return mDicNodeProperties.getPtNodePos(); } - // Used to get bigram probability in DicNodeUtils - int getPrevWordTerminalPtNodePos() const { - return mDicNodeProperties.getPrevWordTerminalPtNodePos(); + // Used to get n-gram probability in DicNodeUtils. n is 1-indexed. + int getNthPrevWordTerminalPtNodePos(const int n) const { + if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { + return NOT_A_DICT_POS; + } + return mDicNodeProperties.getPrevWordsTerminalPtNodePos()[n - 1]; } // Used in DicNodeUtils diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index 2d02a7d9c..4445f4aaf 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -29,8 +29,8 @@ namespace latinime { /* static */ void DicNodeUtils::initAsRoot( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const int prevWordPtNodePos, DicNode *const newRootDicNode) { - newRootDicNode->initAsRoot(dictionaryStructurePolicy->getRootPosition(), prevWordPtNodePos); + const int *const prevWordsPtNodePos, DicNode *const newRootDicNode) { + newRootDicNode->initAsRoot(dictionaryStructurePolicy->getRootPosition(), prevWordsPtNodePos); } /*static */ void DicNodeUtils::initAsRootWithPreviousWord( @@ -48,7 +48,7 @@ namespace latinime { /////////////////////////////////// // Traverse node expansion utils // /////////////////////////////////// -/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, +/* static */ void DicNodeUtils::getAllChildDicNodes(const DicNode *dicNode, const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, DicNodeVector *const childDicNodes) { if (dicNode->isTotalInputSizeExceedingLimit()) { @@ -86,7 +86,7 @@ namespace latinime { const DicNode *const dicNode, MultiBigramMap *const multiBigramMap) { const int unigramProbability = dicNode->getProbability(); const int ptNodePos = dicNode->getPtNodePos(); - const int prevWordTerminalPtNodePos = dicNode->getPrevWordTerminalPtNodePos(); + const int prevWordTerminalPtNodePos = dicNode->getNthPrevWordTerminalPtNodePos(1 /* n */); if (NOT_A_DICT_POS == ptNodePos || NOT_A_DICT_POS == prevWordTerminalPtNodePos) { // Note: Normally wordPos comes from the dictionary and should never equal // NOT_A_VALID_WORD_POS. diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.h b/native/jni/src/suggest/core/dicnode/dic_node_utils.h index 4c0f1f15d..00e80c604 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.h @@ -30,12 +30,12 @@ class DicNodeUtils { public: static void initAsRoot( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const int prevWordPtNodePos, DicNode *const newRootDicNode); + const int *const prevWordPtNodePos, DicNode *const newRootDicNode); static void initAsRootWithPreviousWord( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, const DicNode *const prevWordLastDicNode, DicNode *const newRootDicNode); static void initByCopy(const DicNode *const srcDicNode, DicNode *const destDicNode); - static void getAllChildDicNodes(DicNode *dicNode, + static void getAllChildDicNodes(const DicNode *dicNode, const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, DicNodeVector *childDicNodes); static float getBigramNodeImprobability( diff --git a/native/jni/src/suggest/core/dicnode/dic_node_vector.h b/native/jni/src/suggest/core/dicnode/dic_node_vector.h index cb28e57d8..54cde1988 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_vector.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_vector.h @@ -52,7 +52,7 @@ class DicNodeVector { return static_cast<int>(mDicNodes.size()); } - void pushPassingChild(DicNode *dicNode) { + void pushPassingChild(const DicNode *dicNode) { ASSERT(!mLock); mDicNodes.emplace_back(); mDicNodes.back().initAsPassingChild(dicNode); diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h index 11f8c2905..8202176f7 100644 --- a/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h +++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h @@ -32,15 +32,14 @@ class DicNodeProperties { : mPtNodePos(NOT_A_DICT_POS), mChildrenPtNodeArrayPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), mDicNodeCodePoint(NOT_A_CODE_POINT), mIsTerminal(false), mHasChildrenPtNodes(false), - mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0), - mPrevWordTerminalPtNodePos(NOT_A_DICT_POS) {} + mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0) {} ~DicNodeProperties() {} // Should be called only once per DicNode is initialized. void init(const int pos, const int childrenPos, const int nodeCodePoint, const int probability, const bool isTerminal, const bool hasChildren, const bool isBlacklistedOrNotAWord, - const uint16_t depth, const uint16_t leavingDepth, const int prevWordNodePos) { + const uint16_t depth, const uint16_t leavingDepth, const int *const prevWordsNodePos) { mPtNodePos = pos; mChildrenPtNodeArrayPos = childrenPos; mDicNodeCodePoint = nodeCodePoint; @@ -50,11 +49,11 @@ class DicNodeProperties { mIsBlacklistedOrNotAWord = isBlacklistedOrNotAWord; mDepth = depth; mLeavingDepth = leavingDepth; - mPrevWordTerminalPtNodePos = prevWordNodePos; + memmove(mPrevWordsTerminalPtNodePos, prevWordsNodePos, sizeof(mPrevWordsTerminalPtNodePos)); } - // Init for root with prevWordPtNodePos which is used for bigram - void init(const int rootPtNodeArrayPos, const int prevWordNodePos) { + // Init for root with prevWordsPtNodePos which is used for n-gram + void init(const int rootPtNodeArrayPos, const int *const prevWordsNodePos) { mPtNodePos = NOT_A_DICT_POS; mChildrenPtNodeArrayPos = rootPtNodeArrayPos; mDicNodeCodePoint = NOT_A_CODE_POINT; @@ -64,7 +63,7 @@ class DicNodeProperties { mIsBlacklistedOrNotAWord = false; mDepth = 0; mLeavingDepth = 0; - mPrevWordTerminalPtNodePos = prevWordNodePos; + memmove(mPrevWordsTerminalPtNodePos, prevWordsNodePos, sizeof(mPrevWordsTerminalPtNodePos)); } void initByCopy(const DicNodeProperties *const dicNodeProp) { @@ -77,7 +76,8 @@ class DicNodeProperties { mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord; mDepth = dicNodeProp->mDepth; mLeavingDepth = dicNodeProp->mLeavingDepth; - mPrevWordTerminalPtNodePos = dicNodeProp->mPrevWordTerminalPtNodePos; + memmove(mPrevWordsTerminalPtNodePos, dicNodeProp->mPrevWordsTerminalPtNodePos, + sizeof(mPrevWordsTerminalPtNodePos)); } // Init as passing child @@ -91,7 +91,8 @@ class DicNodeProperties { mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord; mDepth = dicNodeProp->mDepth + 1; // Increment the depth of a passing child mLeavingDepth = dicNodeProp->mLeavingDepth; - mPrevWordTerminalPtNodePos = dicNodeProp->mPrevWordTerminalPtNodePos; + memmove(mPrevWordsTerminalPtNodePos, dicNodeProp->mPrevWordsTerminalPtNodePos, + sizeof(mPrevWordsTerminalPtNodePos)); } int getPtNodePos() const { @@ -131,8 +132,8 @@ class DicNodeProperties { return mIsBlacklistedOrNotAWord; } - int getPrevWordTerminalPtNodePos() const { - return mPrevWordTerminalPtNodePos; + const int *getPrevWordsTerminalPtNodePos() const { + return mPrevWordsTerminalPtNodePos; } private: @@ -148,7 +149,7 @@ class DicNodeProperties { bool mIsBlacklistedOrNotAWord; uint16_t mDepth; uint16_t mLeavingDepth; - int mPrevWordTerminalPtNodePos; + int mPrevWordsTerminalPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; }; } // namespace latinime #endif // LATINIME_DIC_NODE_PROPERTIES_H diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp index f793363a8..295e760d6 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp @@ -26,6 +26,7 @@ #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/result/suggestion_results.h" +#include "suggest/core/session/prev_words_info.h" #include "utils/char_utils.h" namespace latinime { @@ -42,27 +43,15 @@ BigramDictionary::~BigramDictionary() { } /* Parameters : - * prevWord: the word before, the one for which we need to look up bigrams. - * prevWordLength: its length. + * prevWordsInfo: Information of previous words to get the predictions. * outSuggestionResults: SuggestionResults to put the predictions. */ -void BigramDictionary::getPredictions(const int *prevWord, const int prevWordLength, +void BigramDictionary::getPredictions(const PrevWordsInfo *const prevWordsInfo, SuggestionResults *const outSuggestionResults) const { - int pos = getBigramListPositionForWord(prevWord, prevWordLength, - false /* forceLowerCaseSearch */); - // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams - if (NOT_A_DICT_POS == pos) { - // If no bigrams for this exact word, search again in lower case. - pos = getBigramListPositionForWord(prevWord, prevWordLength, - true /* forceLowerCaseSearch */); - } - // If still no bigrams, we really don't have them! - if (NOT_A_DICT_POS == pos) return; - int unigramProbability = 0; int bigramCodePoints[MAX_WORD_LENGTH]; - BinaryDictionaryBigramsIterator bigramsIt( - mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos); + BinaryDictionaryBigramsIterator bigramsIt = + prevWordsInfo->getBigramsIteratorForPrediction(mDictionaryStructurePolicy); while (bigramsIt.hasNext()) { bigramsIt.next(); if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) { @@ -96,17 +85,13 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in return mDictionaryStructurePolicy->getBigramsPositionOfPtNode(pos); } -int BigramDictionary::getBigramProbability(const int *word0, int length0, const int *word1, - int length1) const { - int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */); - // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams - if (NOT_A_DICT_POS == pos) return NOT_A_PROBABILITY; +int BigramDictionary::getBigramProbability(const PrevWordsInfo *const prevWordsInfo, + const int *word1, int length1) const { int nextWordPos = mDictionaryStructurePolicy->getTerminalPtNodePositionOfWord(word1, length1, false /* forceLowerCaseSearch */); if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY; - - BinaryDictionaryBigramsIterator bigramsIt( - mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos); + BinaryDictionaryBigramsIterator bigramsIt = + prevWordsInfo->getBigramsIteratorForPrediction(mDictionaryStructurePolicy); while (bigramsIt.hasNext()) { bigramsIt.next(); if (bigramsIt.getBigramPos() == nextWordPos diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.h b/native/jni/src/suggest/core/dictionary/bigram_dictionary.h index 12aaf20d3..bd3aed1bd 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.h +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.h @@ -22,15 +22,17 @@ namespace latinime { class DictionaryStructureWithBufferPolicy; +class PrevWordsInfo; class SuggestionResults; class BigramDictionary { public: BigramDictionary(const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy); - void getPredictions(const int *word, int length, + void getPredictions(const PrevWordsInfo *const prevWordsInfo, SuggestionResults *const outSuggestionResults) const; - int getBigramProbability(const int *word1, int length1, const int *word2, int length2) const; + int getBigramProbability(const PrevWordsInfo *const prevWordsInfo, + const int *word1, int length1) const; ~BigramDictionary(); private: diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h index d16ac47fe..bc9d57671 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h @@ -30,6 +30,11 @@ class BinaryDictionaryBigramsIterator { mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), mHasNext(pos != NOT_A_DICT_POS) {} + BinaryDictionaryBigramsIterator(BinaryDictionaryBigramsIterator &&bigramsIterator) + : mBigramsStructurePolicy(bigramsIterator.mBigramsStructurePolicy), + mPos(bigramsIterator.mPos), mBigramPos(bigramsIterator.mBigramPos), + mProbability(bigramsIterator.mProbability), mHasNext(bigramsIterator.mHasNext) {} + AK_FORCE_INLINE bool hasNext() const { return mHasNext; } diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index e288413a3..0bcde2294 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -19,6 +19,7 @@ #include "suggest/core/dictionary/dictionary.h" #include "defines.h" +#include "suggest/core/dictionary/dictionary_utils.h" #include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/core/result/suggestion_results.h" #include "suggest/core/session/dic_traverse_session.h" @@ -44,12 +45,11 @@ Dictionary::Dictionary(JNIEnv *env, DictionaryStructureWithBufferPolicy::Structu void Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession, int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints, - int inputSize, int *prevWordCodePoints, int prevWordLength, + int inputSize, const PrevWordsInfo *const prevWordsInfo, const SuggestOptions *const suggestOptions, const float languageWeight, SuggestionResults *const outSuggestionResults) const { TimeKeeper::setCurrentTime(); - DicTraverseSession::initSessionInstance( - traverseSession, this, prevWordCodePoints, prevWordLength, suggestOptions); + traverseSession->init(this, prevWordsInfo, suggestOptions); const auto &suggest = suggestOptions->isGesture() ? mGestureSuggest : mTypingSuggest; suggest->getSuggestions(proximityInfo, traverseSession, xcoordinates, ycoordinates, times, pointerIds, inputCodePoints, inputSize, @@ -59,11 +59,10 @@ void Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession } } -void Dictionary::getPredictions(const int *word, int length, +void Dictionary::getPredictions(const PrevWordsInfo *const prevWordsInfo, SuggestionResults *const outSuggestionResults) const { TimeKeeper::setCurrentTime(); - if (length <= 0) return; - mBigramDictionary.getPredictions(word, length, outSuggestionResults); + mBigramDictionary.getPredictions(prevWordsInfo, outSuggestionResults); } int Dictionary::getProbability(const int *word, int length) const { @@ -76,39 +75,55 @@ int Dictionary::getProbability(const int *word, int length) const { return getDictionaryStructurePolicy()->getUnigramProbabilityOfPtNode(pos); } -int Dictionary::getBigramProbability(const int *word0, int length0, const int *word1, - int length1) const { +int Dictionary::getMaxProbabilityOfExactMatches(const int *word, int length) const { TimeKeeper::setCurrentTime(); - return mBigramDictionary.getBigramProbability(word0, length0, word1, length1); + return DictionaryUtils::getMaxProbabilityOfExactMatches( + mDictionaryStructureWithBufferPolicy.get(), word, length); } -void Dictionary::addUnigramWord(const int *const word, const int length, +int Dictionary::getBigramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word, + int length) const { + TimeKeeper::setCurrentTime(); + return mBigramDictionary.getBigramProbability(prevWordsInfo, word, length); +} + +bool Dictionary::addUnigramEntry(const int *const word, const int length, const UnigramProperty *const unigramProperty) { + if (unigramProperty->representsBeginningOfSentence() + && !mDictionaryStructureWithBufferPolicy->getHeaderStructurePolicy() + ->supportsBeginningOfSentence()) { + AKLOGE("The dictionary doesn't support Beginning-of-Sentence."); + return false; + } + TimeKeeper::setCurrentTime(); + return mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty); +} + +bool Dictionary::removeUnigramEntry(const int *const codePoints, const int codePointCount) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->addUnigramWord(word, length, unigramProperty); + return mDictionaryStructureWithBufferPolicy->removeUnigramEntry(codePoints, codePointCount); } -void Dictionary::addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability, const int timestamp) { +bool Dictionary::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, + const BigramProperty *const bigramProperty) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->addBigramWords(word0, length0, word1, length1, - probability, timestamp); + return mDictionaryStructureWithBufferPolicy->addNgramEntry(prevWordsInfo, bigramProperty); } -void Dictionary::removeBigramWords(const int *const word0, const int length0, - const int *const word1, const int length1) { +bool Dictionary::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, + const int *const word, const int length) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->removeBigramWords(word0, length0, word1, length1); + return mDictionaryStructureWithBufferPolicy->removeNgramEntry(prevWordsInfo, word, length); } -void Dictionary::flush(const char *const filePath) { +bool Dictionary::flush(const char *const filePath) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->flush(filePath); + return mDictionaryStructureWithBufferPolicy->flush(filePath); } -void Dictionary::flushWithGC(const char *const filePath) { +bool Dictionary::flushWithGC(const char *const filePath) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->flushWithGC(filePath); + return mDictionaryStructureWithBufferPolicy->flushWithGC(filePath); } bool Dictionary::needsToRunGC(const bool mindsBlockByGC) { diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index b6149b338..542ba7291 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -31,6 +31,7 @@ namespace latinime { class DictionaryStructureWithBufferPolicy; class DicTraverseSession; +class PrevWordsInfo; class ProximityInfo; class SuggestionResults; class SuggestOptions; @@ -56,35 +57,41 @@ class Dictionary { static const int KIND_MASK_FLAGS = 0xFFFFFF00; // Mask to get the flags static const int KIND_FLAG_POSSIBLY_OFFENSIVE = 0x80000000; static const int KIND_FLAG_EXACT_MATCH = 0x40000000; + static const int KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = 0x20000000; Dictionary(JNIEnv *env, DictionaryStructureWithBufferPolicy::StructurePolicyPtr dictionaryStructureWithBufferPolicy); void getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession, int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints, - int inputSize, int *prevWordCodePoints, int prevWordLength, + int inputSize, const PrevWordsInfo *const prevWordsInfo, const SuggestOptions *const suggestOptions, const float languageWeight, SuggestionResults *const outSuggestionResults) const; - void getPredictions(const int *word, int length, + void getPredictions(const PrevWordsInfo *const prevWordsInfo, SuggestionResults *const outSuggestionResults) const; int getProbability(const int *word, int length) const; - int getBigramProbability(const int *word0, int length0, const int *word1, int length1) const; + int getMaxProbabilityOfExactMatches(const int *word, int length) const; - void addUnigramWord(const int *const codePoints, const int codePointCount, + int getBigramProbability(const PrevWordsInfo *const prevWordsInfo, + const int *word, int length) const; + + bool addUnigramEntry(const int *const codePoints, const int codePointCount, const UnigramProperty *const unigramProperty); - void addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability, const int timestamp); + bool removeUnigramEntry(const int *const codePoints, const int codePointCount); + + bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, + const BigramProperty *const bigramProperty); - void removeBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1); + bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, + const int length); - void flush(const char *const filePath); + bool flush(const char *const filePath); - void flushWithGC(const char *const filePath); + bool flushWithGC(const char *const filePath); bool needsToRunGC(const bool mindsBlockByGC); diff --git a/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp b/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp new file mode 100644 index 000000000..b94966cbe --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/dictionary_utils.h" + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_priority_queue.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/dictionary/dictionary.h" +#include "suggest/core/dictionary/digraph_utils.h" +#include "suggest/core/session/prev_words_info.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" + +namespace latinime { + +/* static */ int DictionaryUtils::getMaxProbabilityOfExactMatches( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const int *const codePoints, const int codePointCount) { + std::vector<DicNode> current; + std::vector<DicNode> next; + + // No prev words information. + PrevWordsInfo emptyPrevWordsInfo; + int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + emptyPrevWordsInfo.getPrevWordsTerminalPtNodePos(dictionaryStructurePolicy, + prevWordsPtNodePos, false /* tryLowerCaseSearch */); + current.emplace_back(); + DicNodeUtils::initAsRoot(dictionaryStructurePolicy, prevWordsPtNodePos, ¤t.front()); + for (int i = 0; i < codePointCount; ++i) { + // The base-lower input is used to ignore case errors and accent errors. + const int codePoint = CharUtils::toBaseLowerCase(codePoints[i]); + for (const DicNode &dicNode : current) { + if (dicNode.isInDigraph() && dicNode.getNodeCodePoint() == codePoint) { + next.emplace_back(dicNode); + next.back().advanceDigraphIndex(); + continue; + } + processChildDicNodes(dictionaryStructurePolicy, codePoint, &dicNode, &next); + } + current.clear(); + current.swap(next); + } + + int maxProbability = NOT_A_PROBABILITY; + for (const DicNode &dicNode : current) { + if (!dicNode.isTerminalDicNode()) { + continue; + } + // dicNode can contain case errors, accent errors, intentional omissions or digraphs. + maxProbability = std::max(maxProbability, dicNode.getProbability()); + } + return maxProbability; +} + +/* static */ void DictionaryUtils::processChildDicNodes( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const int inputCodePoint, const DicNode *const parentDicNode, + std::vector<DicNode> *const outDicNodes) { + DicNodeVector childDicNodes; + DicNodeUtils::getAllChildDicNodes(parentDicNode, dictionaryStructurePolicy, &childDicNodes); + for (int childIndex = 0; childIndex < childDicNodes.getSizeAndLock(); ++childIndex) { + DicNode *const childDicNode = childDicNodes[childIndex]; + const int codePoint = CharUtils::toBaseLowerCase(childDicNode->getNodeCodePoint()); + if (inputCodePoint == codePoint) { + outDicNodes->emplace_back(*childDicNode); + } + if (childDicNode->canBeIntentionalOmission()) { + processChildDicNodes(dictionaryStructurePolicy, inputCodePoint, childDicNode, + outDicNodes); + } + if (DigraphUtils::hasDigraphForCodePoint( + dictionaryStructurePolicy->getHeaderStructurePolicy(), + childDicNode->getNodeCodePoint())) { + childDicNode->advanceDigraphIndex(); + if (childDicNode->getNodeCodePoint() == codePoint) { + childDicNode->advanceDigraphIndex(); + outDicNodes->emplace_back(*childDicNode); + } + } + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/dictionary_utils.h b/native/jni/src/suggest/core/dictionary/dictionary_utils.h new file mode 100644 index 000000000..358ebf674 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/dictionary_utils.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_UTILS_H +#define LATINIME_DICTIONARY_UTILS_H + +#include <vector> + +#include "defines.h" + +namespace latinime { + +class DictionaryStructureWithBufferPolicy; +class DicNode; + +class DictionaryUtils { + public: + static int getMaxProbabilityOfExactMatches( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const int *const codePoints, const int codePointCount); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryUtils); + + static void processChildDicNodes( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const int inputCodePoint, const DicNode *const parentDicNode, + std::vector<DicNode> *const outDicNodes); +}; +} // namespace latinime +#endif // LATINIME_DICTIONARY_UTILS_H diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp index 0635fef7e..b6bf7a98c 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp @@ -31,4 +31,8 @@ const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x80; const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH = NOT_AN_ERROR | MATCH_WITH_CASE_ERROR | MATCH_WITH_ACCENT_ERROR | MATCH_WITH_DIGRAPH; +const ErrorTypeUtils::ErrorType + ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = + ERRORS_TREATED_AS_AN_EXACT_MATCH | INTENTIONAL_OMISSION; + } // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.h b/native/jni/src/suggest/core/dictionary/error_type_utils.h index 0e8e5b635..e3e76b238 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.h +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.h @@ -51,6 +51,11 @@ class ErrorTypeUtils { return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0; } + static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) { + return (containedErrorTypes + & ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0; + } + static bool isEditCorrectionError(const ErrorType errorType) { return (errorType & EDIT_CORRECTION) != 0; } @@ -67,6 +72,7 @@ class ErrorTypeUtils { DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils); static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH; + static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION; }; } // namespace latinime #endif // LATINIME_ERROR_TYPE_UTILS_H diff --git a/native/jni/src/suggest/core/dictionary/property/bigram_property.h b/native/jni/src/suggest/core/dictionary/property/bigram_property.h index 8d3429b5b..343af143c 100644 --- a/native/jni/src/suggest/core/dictionary/property/bigram_property.h +++ b/native/jni/src/suggest/core/dictionary/property/bigram_property.h @@ -23,6 +23,7 @@ namespace latinime { +// TODO: Change to NgramProperty. class BigramProperty { public: BigramProperty(const std::vector<int> *const targetCodePoints, diff --git a/native/jni/src/suggest/core/dictionary/property/unigram_property.h b/native/jni/src/suggest/core/dictionary/property/unigram_property.h index d2551057b..902eb000f 100644 --- a/native/jni/src/suggest/core/dictionary/property/unigram_property.h +++ b/native/jni/src/suggest/core/dictionary/property/unigram_property.h @@ -48,15 +48,21 @@ class UnigramProperty { }; UnigramProperty() - : mIsNotAWord(false), mIsBlacklisted(false), mProbability(NOT_A_PROBABILITY), - mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0), mShortcuts() {} - - UnigramProperty(const bool isNotAWord, const bool isBlacklisted, const int probability, - const int timestamp, const int level, const int count, - const std::vector<ShortcutProperty> *const shortcuts) - : mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability), + : mRepresentsBeginningOfSentence(false), mIsNotAWord(false), mIsBlacklisted(false), + mProbability(NOT_A_PROBABILITY), mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0), + mShortcuts() {} + + UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord, + const bool isBlacklisted, const int probability, const int timestamp, const int level, + const int count, const std::vector<ShortcutProperty> *const shortcuts) + : mRepresentsBeginningOfSentence(representsBeginningOfSentence), + mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability), mTimestamp(timestamp), mLevel(level), mCount(count), mShortcuts(*shortcuts) {} + bool representsBeginningOfSentence() const { + return mRepresentsBeginningOfSentence; + } + bool isNotAWord() const { return mIsNotAWord; } @@ -94,6 +100,7 @@ class UnigramProperty { DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty); // TODO: Make members const. + bool mRepresentsBeginningOfSentence; bool mIsNotAWord; bool mIsBlacklisted; int mProbability; diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.cpp b/native/jni/src/suggest/core/dictionary/property/word_property.cpp index 95608dcf8..6f5f808f8 100644 --- a/native/jni/src/suggest/core/dictionary/property/word_property.cpp +++ b/native/jni/src/suggest/core/dictionary/property/word_property.cpp @@ -16,14 +16,17 @@ #include "suggest/core/dictionary/property/word_property.h" +#include "utils/jni_data_utils.h" + namespace latinime { void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities, jobject outShortcutTargets, jobject outShortcutProbabilities) const { - env->SetIntArrayRegion(outCodePoints, 0 /* start */, mCodePoints.size(), &mCodePoints[0]); - + JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */, + MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(), + false /* needsNullTermination */); jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isBlacklisted(), !mBigrams.empty(), mUnigramProperty.hasShortcuts()}; env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags); @@ -41,8 +44,9 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, for (const auto &bigramProperty : mBigrams) { const std::vector<int> *const word1CodePoints = bigramProperty.getTargetCodePoints(); jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size()); - env->SetIntArrayRegion(bigramWord1CodePointArray, 0 /* start */, - word1CodePoints->size(), word1CodePoints->data()); + JniDataUtils::outputCodePoints(env, bigramWord1CodePointArray, 0 /* start */, + word1CodePoints->size(), word1CodePoints->data(), word1CodePoints->size(), + false /* needsNullTermination */); env->CallBooleanMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray); env->DeleteLocalRef(bigramWord1CodePointArray); @@ -62,6 +66,9 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, jintArray shortcutTargetCodePointArray = env->NewIntArray(targetCodePoints->size()); env->SetIntArrayRegion(shortcutTargetCodePointArray, 0 /* start */, targetCodePoints->size(), targetCodePoints->data()); + JniDataUtils::outputCodePoints(env, shortcutTargetCodePointArray, 0 /* start */, + targetCodePoints->size(), targetCodePoints->data(), targetCodePoints->size(), + false /* needsNullTermination */); env->CallBooleanMethod(outShortcutTargets, addMethodId, shortcutTargetCodePointArray); env->DeleteLocalRef(shortcutTargetCodePointArray); jobject integerProbability = env->NewObject(integerClass, intToIntegerConstructorId, diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.h b/native/jni/src/suggest/core/dictionary/property/word_property.h index 5519a917c..aa3e0b68a 100644 --- a/native/jni/src/suggest/core/dictionary/property/word_property.h +++ b/native/jni/src/suggest/core/dictionary/property/word_property.h @@ -42,6 +42,14 @@ class WordProperty { jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities, jobject outShortcutTargets, jobject outShortcutProbabilities) const; + const UnigramProperty *getUnigramProperty() const { + return &mUnigramProperty; + } + + const std::vector<BigramProperty> *getBigramProperties() const { + return &mBigrams; + } + private: // Default copy constructor is used for using as a return value. DISALLOW_ASSIGNMENT_OPERATOR(WordProperty); diff --git a/native/jni/src/suggest/core/layout/proximity_info.cpp b/native/jni/src/suggest/core/layout/proximity_info.cpp index c40a2bdca..4c75a188e 100644 --- a/native/jni/src/suggest/core/layout/proximity_info.cpp +++ b/native/jni/src/suggest/core/layout/proximity_info.cpp @@ -226,7 +226,7 @@ int ProximityInfo::getKeyCenterXOfKeyIdG( // When the referencePointY is NOT_A_COORDINATE, this method calculates the return value without // using the line segment. int ProximityInfo::getKeyCenterYOfKeyIdG( - const int keyId, const int referencePointY, const bool isGeometric) const { + const int keyId, const int referencePointY, const bool isGeometric) const { // TODO: Remove "isGeometric" and have separate "proximity_info"s for gesture and typing. if (keyId < 0) { return 0; diff --git a/native/jni/src/suggest/core/layout/proximity_info_state_utils.h b/native/jni/src/suggest/core/layout/proximity_info_state_utils.h index 71e83a80c..211a79737 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_state_utils.h +++ b/native/jni/src/suggest/core/layout/proximity_info_state_utils.h @@ -56,7 +56,7 @@ class ProximityInfoStateUtils { const std::vector<int> *const sampledLengthCache, const std::vector<int> *const sampledInputIndice, std::vector<float> *sampledSpeedRates, std::vector<float> *sampledDirections); - static void refreshBeelineSpeedRates(const int mostCommonKeyWidth, const float averageSpeed, + static void refreshBeelineSpeedRates(const int mostCommonKeyWidth, const float averageSpeed, const int inputSize, const int *const xCoordinates, const int *const yCoordinates, const int *times, const int sampledInputSize, const std::vector<int> *const sampledInputXs, diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h index a8dab9fcd..a61227626 100644 --- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h @@ -49,6 +49,10 @@ class DictionaryHeaderStructurePolicy { virtual bool shouldBoostExactMatches() const = 0; + virtual const std::vector<int> *getLocale() const = 0; + + virtual bool supportsBeginningOfSentence() const = 0; + protected: DictionaryHeaderStructurePolicy() {} diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h index 807f9b8dd..e2771f97c 100644 --- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h @@ -29,6 +29,7 @@ class DicNodeVector; class DictionaryBigramsStructurePolicy; class DictionaryHeaderStructurePolicy; class DictionaryShortcutsStructurePolicy; +class PrevWordsInfo; class UnigramProperty; /* @@ -69,20 +70,25 @@ class DictionaryStructureWithBufferPolicy { virtual const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const = 0; // Returns whether the update was success or not. - virtual bool addUnigramWord(const int *const word, const int length, + virtual bool addUnigramEntry(const int *const word, const int length, const UnigramProperty *const unigramProperty) = 0; // Returns whether the update was success or not. - virtual bool addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability, const int timestamp) = 0; + virtual bool removeUnigramEntry(const int *const word, const int length) = 0; // Returns whether the update was success or not. - virtual bool removeBigramWords(const int *const word0, const int length0, - const int *const word1, const int length1) = 0; + virtual bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, + const BigramProperty *const bigramProperty) = 0; - virtual void flush(const char *const filePath) = 0; + // Returns whether the update was success or not. + virtual bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, + const int *const word, const int length) = 0; + + // Returns whether the flush was success or not. + virtual bool flush(const char *const filePath) = 0; - virtual void flushWithGC(const char *const filePath) = 0; + // Returns whether the GC and flush were success or not. + virtual bool flushWithGC(const char *const filePath) = 0; virtual bool needsToRunGC(const bool mindsBlockByGC) const = 0; diff --git a/native/jni/src/suggest/core/result/suggestion_results.cpp b/native/jni/src/suggest/core/result/suggestion_results.cpp index 088a55f6f..4c10bd08a 100644 --- a/native/jni/src/suggest/core/result/suggestion_results.cpp +++ b/native/jni/src/suggest/core/result/suggestion_results.cpp @@ -16,6 +16,8 @@ #include "suggest/core/result/suggestion_results.h" +#include "utils/jni_data_utils.h" + namespace latinime { void SuggestionResults::outputSuggestions(JNIEnv *env, jintArray outSuggestionCount, @@ -27,31 +29,22 @@ void SuggestionResults::outputSuggestions(JNIEnv *env, jintArray outSuggestionCo const SuggestedWord &suggestedWord = mSuggestedWords.top(); suggestedWord.getCodePointCount(); const int start = outputIndex * MAX_WORD_LENGTH; - env->SetIntArrayRegion(outputCodePointsArray, start, suggestedWord.getCodePointCount(), - suggestedWord.getCodePoint()); - if (suggestedWord.getCodePointCount() < MAX_WORD_LENGTH) { - const int terminal = 0; - env->SetIntArrayRegion(outputCodePointsArray, start + suggestedWord.getCodePointCount(), - 1 /* len */, &terminal); - } - const int score = suggestedWord.getScore(); - env->SetIntArrayRegion(outScoresArray, outputIndex, 1 /* len */, &score); - const int indexToPartialCommit = suggestedWord.getIndexToPartialCommit(); - env->SetIntArrayRegion(outSpaceIndicesArray, outputIndex, 1 /* len */, - &indexToPartialCommit); - const int type = suggestedWord.getType(); - env->SetIntArrayRegion(outTypesArray, outputIndex, 1 /* len */, &type); + JniDataUtils::outputCodePoints(env, outputCodePointsArray, start, + MAX_WORD_LENGTH /* maxLength */, suggestedWord.getCodePoint(), + suggestedWord.getCodePointCount(), true /* needsNullTermination */); + JniDataUtils::putIntToArray(env, outScoresArray, outputIndex, suggestedWord.getScore()); + JniDataUtils::putIntToArray(env, outSpaceIndicesArray, outputIndex, + suggestedWord.getIndexToPartialCommit()); + JniDataUtils::putIntToArray(env, outTypesArray, outputIndex, suggestedWord.getType()); if (mSuggestedWords.size() == 1) { - const int autoCommitFirstWordConfidence = - suggestedWord.getAutoCommitFirstWordConfidence(); - env->SetIntArrayRegion(outAutoCommitFirstWordConfidenceArray, 0 /* start */, - 1 /* len */, &autoCommitFirstWordConfidence); + JniDataUtils::putIntToArray(env, outAutoCommitFirstWordConfidenceArray, 0 /* index */, + suggestedWord.getAutoCommitFirstWordConfidence()); } ++outputIndex; mSuggestedWords.pop(); } - env->SetIntArrayRegion(outSuggestionCount, 0 /* start */, 1 /* len */, &outputIndex); - env->SetFloatArrayRegion(outLanguageWeight, 0 /* start */, 1 /* len */, &mLanguageWeight); + JniDataUtils::putIntToArray(env, outSuggestionCount, 0 /* index */, outputIndex); + JniDataUtils::putFloatToArray(env, outLanguageWeight, 0 /* index */, mLanguageWeight); } void SuggestionResults::addPrediction(const int *const codePoints, const int codePointCount, diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp index a307cb45d..7b0e7e1b4 100644 --- a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp +++ b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp @@ -26,6 +26,7 @@ #include "suggest/core/policy/scoring.h" #include "suggest/core/result/suggestion_results.h" #include "suggest/core/session/dic_traverse_session.h" +#include "suggest/core/suggest_options.h" namespace latinime { @@ -89,6 +90,9 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0; const bool isExactMatch = ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); + const bool isExactMatchWithIntentionalOmission = + ErrorTypeUtils::isExactMatchWithIntentionalOmission( + terminalDicNode->getContainedErrorTypes()); const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); // Heuristic: We exclude probability=0 first-char-uppercase words from exact match. // (e.g. "AMD" and "and") @@ -96,10 +100,17 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; && !(isPossiblyOffensiveWord && isFirstCharUppercase); const int outputTypeFlags = (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) - | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0); + | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) + | (isExactMatchWithIntentionalOmission ? + Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0); // Entries that are blacklisted or do not represent a word should not be output. const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord(); + // When we have to block offensive words, non-exact matched offensive words should not be + // output. + const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords(); + const bool isBlockedOffensiveWord = blockOffensiveWords && isPossiblyOffensiveWord + && !isSafeExactMatch; // Increase output score of top typing suggestion to ensure autocorrection. // TODO: Better integration with java side autocorrection logic. @@ -110,8 +121,9 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; || (isValidWord && scoringPolicy->doesAutoCorrectValidWord()), boostExactMatches); - // Don't output invalid words. However, we still need to submit their shortcuts if any. - if (isValidWord) { + // Don't output invalid or blocked offensive words. However, we still need to submit their + // shortcuts if any. + if (isValidWord && !isBlockedOffensiveWord) { int codePoints[MAX_WORD_LENGTH]; terminalDicNode->outputResult(codePoints); const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ? diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.cpp b/native/jni/src/suggest/core/session/dic_traverse_session.cpp index 77b634e07..f1e411f38 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.cpp +++ b/native/jni/src/suggest/core/session/dic_traverse_session.cpp @@ -20,6 +20,7 @@ #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "suggest/core/session/prev_words_info.h" namespace latinime { @@ -28,25 +29,14 @@ namespace latinime { const int DicTraverseSession::DICTIONARY_SIZE_THRESHOLD_TO_USE_LARGE_CACHE_FOR_SUGGESTION = 256 * 1024; -void DicTraverseSession::init(const Dictionary *const dictionary, const int *prevWord, - int prevWordLength, const SuggestOptions *const suggestOptions) { +void DicTraverseSession::init(const Dictionary *const dictionary, + const PrevWordsInfo *const prevWordsInfo, const SuggestOptions *const suggestOptions) { mDictionary = dictionary; mMultiWordCostMultiplier = getDictionaryStructurePolicy()->getHeaderStructurePolicy() ->getMultiWordCostMultiplier(); mSuggestOptions = suggestOptions; - if (!prevWord) { - mPrevWordPtNodePos = NOT_A_DICT_POS; - return; - } - // TODO: merge following similar calls to getTerminalPosition into one case-insensitive call. - mPrevWordPtNodePos = getDictionaryStructurePolicy()->getTerminalPtNodePositionOfWord( - prevWord, prevWordLength, false /* forceLowerCaseSearch */); - if (mPrevWordPtNodePos == NOT_A_DICT_POS) { - // Check bigrams for lower-cased previous word if original was not found. Useful for - // auto-capitalized words like "The [current_word]". - mPrevWordPtNodePos = getDictionaryStructurePolicy()->getTerminalPtNodePositionOfWord( - prevWord, prevWordLength, true /* forceLowerCaseSearch */); - } + prevWordsInfo->getPrevWordsTerminalPtNodePos( + getDictionaryStructurePolicy(), mPrevWordsPtNodePos, true /* tryLowerCaseSearch */); } void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo, diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.h b/native/jni/src/suggest/core/session/dic_traverse_session.h index 843ca85a0..5a51a112d 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.h +++ b/native/jni/src/suggest/core/session/dic_traverse_session.h @@ -29,6 +29,7 @@ namespace latinime { class Dictionary; class DictionaryStructureWithBufferPolicy; +class PrevWordsInfo; class ProximityInfo; class SuggestOptions; @@ -44,32 +45,25 @@ class DicTraverseSession { dictSize >= DICTIONARY_SIZE_THRESHOLD_TO_USE_LARGE_CACHE_FOR_SUGGESTION); } - static AK_FORCE_INLINE void initSessionInstance(DicTraverseSession *traverseSession, - const Dictionary *const dictionary, const int *prevWord, const int prevWordLength, - const SuggestOptions *const suggestOptions) { - if (traverseSession) { - DicTraverseSession *tSession = static_cast<DicTraverseSession *>(traverseSession); - tSession->init(dictionary, prevWord, prevWordLength, suggestOptions); - } - } - static AK_FORCE_INLINE void releaseSessionInstance(DicTraverseSession *traverseSession) { delete traverseSession; } AK_FORCE_INLINE DicTraverseSession(JNIEnv *env, jstring localeStr, bool usesLargeCache) - : mPrevWordPtNodePos(NOT_A_DICT_POS), mProximityInfo(nullptr), - mDictionary(nullptr), mSuggestOptions(nullptr), mDicNodesCache(usesLargeCache), - mMultiBigramMap(), mInputSize(0), mMaxPointerCount(1), + : mProximityInfo(nullptr), mDictionary(nullptr), mSuggestOptions(nullptr), + mDicNodesCache(usesLargeCache), mMultiBigramMap(), mInputSize(0), mMaxPointerCount(1), mMultiWordCostMultiplier(1.0f) { // NOTE: mProximityInfoStates is an array of instances. // No need to initialize it explicitly here. + for (size_t i = 0; i < NELEMS(mPrevWordsPtNodePos); ++i) { + mPrevWordsPtNodePos[i] = NOT_A_DICT_POS; + } } // Non virtual inline destructor -- never inherit this class AK_FORCE_INLINE ~DicTraverseSession() {} - void init(const Dictionary *dictionary, const int *prevWord, int prevWordLength, + void init(const Dictionary *dictionary, const PrevWordsInfo *const prevWordsInfo, const SuggestOptions *const suggestOptions); // TODO: Remove and merge into init void setupForGetSuggestions(const ProximityInfo *pInfo, const int *inputCodePoints, @@ -85,9 +79,7 @@ class DicTraverseSession { //-------------------- const ProximityInfo *getProximityInfo() const { return mProximityInfo; } const SuggestOptions *getSuggestOptions() const { return mSuggestOptions; } - int getPrevWordPtNodePos() const { return mPrevWordPtNodePos; } - // TODO: REMOVE - void setPrevWordPtNodePos(const int ptNodePos) { mPrevWordPtNodePos = ptNodePos; } + const int *getPrevWordsPtNodePos() const { return mPrevWordsPtNodePos; } DicNodesCache *getDicTraverseCache() { return &mDicNodesCache; } MultiBigramMap *getMultiBigramMap() { return &mMultiBigramMap; } const ProximityInfoState *getProximityInfoState(int id) const { @@ -174,7 +166,7 @@ class DicTraverseSession { const int *const inputYs, const int *const times, const int *const pointerIds, const int inputSize, const float maxSpatialDistance, const int maxPointerCount); - int mPrevWordPtNodePos; + int mPrevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; const ProximityInfo *mProximityInfo; const Dictionary *mDictionary; const SuggestOptions *mSuggestOptions; diff --git a/native/jni/src/suggest/core/session/prev_words_info.h b/native/jni/src/suggest/core/session/prev_words_info.h new file mode 100644 index 000000000..640f6a2fc --- /dev/null +++ b/native/jni/src/suggest/core/session/prev_words_info.h @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PREV_WORDS_INFO_H +#define LATINIME_PREV_WORDS_INFO_H + +#include "defines.h" +#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "utils/char_utils.h" + +namespace latinime { + +// TODO: Support n-gram. +// This class does not take ownership of any code point buffers. +class PrevWordsInfo { + public: + // No prev word information. + PrevWordsInfo() { + clear(); + } + + PrevWordsInfo(const int *const prevWordCodePoints, const int prevWordCodePointCount, + const bool isBeginningOfSentence) { + clear(); + mPrevWordCodePoints[0] = prevWordCodePoints; + mPrevWordCodePointCount[0] = prevWordCodePointCount; + mIsBeginningOfSentence[0] = isBeginningOfSentence; + } + + bool isValid() const { + for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { + if (mPrevWordCodePointCount[i] > MAX_WORD_LENGTH) { + return false; + } + } + return true; + } + + void getPrevWordsTerminalPtNodePos( + const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, + int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const { + for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { + outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy, + mPrevWordCodePoints[i], mPrevWordCodePointCount[i], + mIsBeginningOfSentence[i], tryLowerCaseSearch); + } + } + + BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction( + const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const { + const int bigramListPos = getBigramListPositionForWordWithTryingLowerCaseSearch( + dictStructurePolicy, mPrevWordCodePoints[0], mPrevWordCodePointCount[0], + mIsBeginningOfSentence[0]); + return BinaryDictionaryBigramsIterator(dictStructurePolicy->getBigramsStructurePolicy(), + bigramListPos); + } + + // n is 1-indexed. + const int *getNthPrevWordCodePoints(const int n) const { + if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { + return nullptr; + } + return mPrevWordCodePoints[n - 1]; + } + + // n is 1-indexed. + int getNthPrevWordCodePointCount(const int n) const { + if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { + return 0; + } + return mPrevWordCodePointCount[n - 1]; + } + + // n is 1-indexed. + bool isNthPrevWordBeginningOfSentence(const int n) const { + if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { + return false; + } + return mIsBeginningOfSentence[n - 1]; + } + + private: + DISALLOW_COPY_AND_ASSIGN(PrevWordsInfo); + + static int getTerminalPtNodePosOfWord( + const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, + const int *const wordCodePoints, const int wordCodePointCount, + const bool isBeginningOfSentence, const bool tryLowerCaseSearch) { + if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { + return NOT_A_DICT_POS; + } + int codePoints[MAX_WORD_LENGTH]; + int codePointCount = wordCodePointCount; + memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); + if (isBeginningOfSentence) { + codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, + codePointCount, MAX_WORD_LENGTH); + if (codePointCount <= 0) { + return NOT_A_DICT_POS; + } + } + const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord( + codePoints, codePointCount, false /* forceLowerCaseSearch */); + if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) { + // Return the position when when the word was found or doesn't try lower case + // search. + return wordPtNodePos; + } + // Check bigrams for lower-cased previous word if original was not found. Useful for + // auto-capitalized words like "The [current_word]". + return dictStructurePolicy->getTerminalPtNodePositionOfWord( + codePoints, codePointCount, true /* forceLowerCaseSearch */); + } + + static int getBigramListPositionForWordWithTryingLowerCaseSearch( + const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, + const int *const wordCodePoints, const int wordCodePointCount, + const bool isBeginningOfSentence) { + if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { + return NOT_A_DICT_POS; + } + int codePoints[MAX_WORD_LENGTH]; + int codePointCount = wordCodePointCount; + memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); + if (isBeginningOfSentence) { + codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, + codePointCount, MAX_WORD_LENGTH); + if (codePointCount <= 0) { + return NOT_A_DICT_POS; + } + } + int pos = getBigramListPositionForWord(dictStructurePolicy, codePoints, + codePointCount, false /* forceLowerCaseSearch */); + // getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the + // dictionary or has no bigrams + if (NOT_A_DICT_POS == pos) { + // If no bigrams for this exact word, search again in lower case. + pos = getBigramListPositionForWord(dictStructurePolicy, codePoints, + codePointCount, true /* forceLowerCaseSearch */); + } + return pos; + } + + static int getBigramListPositionForWord( + const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, + const int *wordCodePoints, const int wordCodePointCount, + const bool forceLowerCaseSearch) { + if (!wordCodePoints || wordCodePointCount <= 0) return NOT_A_DICT_POS; + const int terminalPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord( + wordCodePoints, wordCodePointCount, forceLowerCaseSearch); + if (NOT_A_DICT_POS == terminalPtNodePos) return NOT_A_DICT_POS; + return dictStructurePolicy->getBigramsPositionOfPtNode(terminalPtNodePos); + } + + void clear() { + for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { + mPrevWordCodePoints[i] = nullptr; + mPrevWordCodePointCount[i] = 0; + mIsBeginningOfSentence[i] = false; + } + } + + const int *mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; +}; +} // namespace latinime +#endif // LATINIME_PREV_WORDS_INFO_H diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp index e675e0bb3..0cd305f5a 100644 --- a/native/jni/src/suggest/core/suggest.cpp +++ b/native/jni/src/suggest/core/suggest.cpp @@ -92,7 +92,7 @@ void Suggest::initializeSearch(DicTraverseSession *traverseSession) const { // Create a new dic node here DicNode rootNode; DicNodeUtils::initAsRoot(traverseSession->getDictionaryStructurePolicy(), - traverseSession->getPrevWordPtNodePos(), &rootNode); + traverseSession->getPrevWordsPtNodePos(), &rootNode); traverseSession->getDicTraverseCache()->copyPushActive(&rootNode); } } diff --git a/native/jni/src/suggest/core/suggest_options.h b/native/jni/src/suggest/core/suggest_options.h index 1b21aafcf..2e22a7ac3 100644 --- a/native/jni/src/suggest/core/suggest_options.h +++ b/native/jni/src/suggest/core/suggest_options.h @@ -34,6 +34,10 @@ class SuggestOptions{ return getBoolOption(USE_FULL_EDIT_DISTANCE); } + AK_FORCE_INLINE bool blockOffensiveWords() const { + return getBoolOption(BLOCK_OFFENSIVE_WORDS); + } + AK_FORCE_INLINE bool getAdditionalFeaturesBoolOption(const int key) const { return getBoolOption(key + ADDITIONAL_FEATURES_OPTIONS); } @@ -45,9 +49,10 @@ class SuggestOptions{ // reorder options. static const int IS_GESTURE = 0; static const int USE_FULL_EDIT_DISTANCE = 1; + static const int BLOCK_OFFENSIVE_WORDS = 2; // Additional features options are stored after the other options and used as setting values of // experimental features. - static const int ADDITIONAL_FEATURES_OPTIONS = 2; + static const int ADDITIONAL_FEATURES_OPTIONS = 3; const int *const mOptions; const int mLength; diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 251a71941..87cf0cd3b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -139,8 +139,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { switch (mDictFormatVersion) { case FormatUtils::VERSION_2: return FormatUtils::VERSION_2; + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + return FormatUtils::VERSION_4_ONLY_FOR_TESTING; case FormatUtils::VERSION_4: return FormatUtils::VERSION_4; + case FormatUtils::VERSION_4_DEV: + return FormatUtils::VERSION_4_DEV; default: return FormatUtils::UNKNOWN_VERSION; } @@ -238,6 +242,14 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { const int unigramCount, const int bigramCount, const int extendedRegionSize, DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const; + AK_FORCE_INLINE const std::vector<int> *getLocale() const { + return &mLocale; + } + + bool supportsBeginningOfSentence() const { + return mDictFormatVersion >= FormatUtils::VERSION_4; + } + private: DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp index d20accfbc..a8f8f284b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp @@ -26,6 +26,13 @@ namespace latinime { +// Number of base-10 digits in the largest integer + 1 to leave room for a zero terminator. +// As such, this is the maximum number of characters will be needed to represent an int as a +// string, including the terminator; this is used as the size of a string buffer large enough to +// hold any value that is intended to fit in an integer, e.g. in the code that reads the header +// of the binary dictionary where a {key,value} string pair scheme is used. +const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11; + const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256; const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256; @@ -91,8 +98,10 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; case FormatUtils::VERSION_2: // Version 2 dictionary writing is not supported. return false; + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: - return buffer->writeUintAndAdvancePosition(FormatUtils::VERSION_4 /* data */, + case FormatUtils::VERSION_4_DEV: + return buffer->writeUintAndAdvancePosition(version /* data */, HEADER_DICTIONARY_VERSION_SIZE, writingPos); default: return false; @@ -154,8 +163,8 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; /* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, const int value) { AttributeMap::mapped_type valueVector; - char charBuf[LARGEST_INT_DIGIT_COUNT + 1]; - snprintf(charBuf, LARGEST_INT_DIGIT_COUNT + 1, "%d", value); + char charBuf[LARGEST_INT_DIGIT_COUNT]; + snprintf(charBuf, sizeof(charBuf), "%d", value); insertCharactersIntoVector(charBuf, &valueVector); (*headerAttributes)[*key] = valueVector; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h index a6b4c4e14..9b90488fc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h @@ -92,6 +92,7 @@ class HeaderReadWriteUtils { private: DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils); + static const int LARGEST_INT_DIGIT_COUNT; static const int MAX_ATTRIBUTE_KEY_LENGTH; static const int MAX_ATTRIBUTE_VALUE_LENGTH; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt new file mode 100644 index 000000000..9e29e836c --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt @@ -0,0 +1 @@ +Files under this directory have been auto generated. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp new file mode 100644 index 000000000..3e8e059f2 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp @@ -0,0 +1,290 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! + * Do not edit this file other than updating policy's interface. + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" + +#include "suggest/core/dictionary/property/bigram_property.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability, + bool *const outHasNext, int *const bigramEntryPos) const { + const BigramEntry bigramEntry = + mBigramDictContent->getBigramEntryAndAdvancePosition(bigramEntryPos); + if (outBigramPos) { + // Lookup target PtNode position. + *outBigramPos = mTerminalPositionLookupTable->getTerminalPtNodePosition( + bigramEntry.getTargetTerminalId()); + } + if (outProbability) { + if (bigramEntry.hasHistoricalInfo()) { + *outProbability = + ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo(), + mHeaderPolicy); + } else { + *outProbability = bigramEntry.getProbability(); + } + } + if (outHasNext) { + *outHasNext = bigramEntry.hasNext(); + } +} + +bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId, + const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) { + // 1. The word has no bigrams yet. + // 2. The word has bigrams, and there is the target in the list. + // 3. The word has bigrams, and there is an invalid entry that can be reclaimed. + // 4. The word has bigrams. We have to append new bigram entry to the list. + // 5. Same as 4, but the list is the last entry of the content file. + if (outAddedNewEntry) { + *outAddedNewEntry = false; + } + const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (bigramListPos == NOT_A_DICT_POS) { + // Case 1. PtNode that doesn't have a bigram list. + // Create new bigram list. + if (!mBigramDictContent->createNewBigramList(terminalId)) { + return false; + } + const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, + newTargetTerminalId); + const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry, + bigramProperty); + // Write an entry. + const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) { + return false; + } + if (outAddedNewEntry) { + *outAddedNewEntry = true; + } + return true; + } + + int tailEntryPos = NOT_A_DICT_POS; + const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos, + &tailEntryPos); + if (tailEntryPos != NOT_A_DICT_POS || entryPosToUpdate == NOT_A_DICT_POS) { + // Case 4, 5. + // Add new entry to the bigram list. + if (tailEntryPos == NOT_A_DICT_POS) { + // Case 4. Create new bigram list. + if (!mBigramDictContent->createNewBigramList(terminalId)) { + return false; + } + const int destPos = mBigramDictContent->getBigramListHeadPos(terminalId); + // Copy existing bigram list. + if (!mBigramDictContent->copyBigramList(bigramListPos, destPos, &tailEntryPos)) { + return false; + } + } + // Write new entry at the tail position of the bigram content. + const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, + newTargetTerminalId); + const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( + &newBigramEntry, bigramProperty); + if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) { + return false; + } + // Update has next flag of the tail entry. + if (!updateHasNextFlag(true /* hasNext */, tailEntryPos)) { + return false; + } + if (outAddedNewEntry) { + *outAddedNewEntry = true; + } + return true; + } + + // Case 2. Overwrite the existing entry. Case 3. Reclaim and reuse the existing invalid entry. + const BigramEntry originalBigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate); + if (!originalBigramEntry.isValid()) { + // Case 3. Reuse the existing invalid entry. outAddedNewEntry is false when an existing + // entry is updated. + if (outAddedNewEntry) { + *outAddedNewEntry = true; + } + } + const BigramEntry updatedBigramEntry = + originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId); + const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( + &updatedBigramEntry, bigramProperty); + return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate); +} + +bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) { + const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (bigramListPos == NOT_A_DICT_POS) { + // Bigram list doesn't exist. + return false; + } + const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos, + nullptr /* outTailEntryPos */); + if (entryPosToUpdate == NOT_A_DICT_POS) { + // Bigram entry doesn't exist. + return false; + } + const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate); + if (targetTerminalId != bigramEntry.getTargetTerminalId()) { + // Bigram entry doesn't exist. + return false; + } + // Remove bigram entry by marking it as invalid entry and overwriting the original entry. + const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); + return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPosToUpdate); +} + +bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId, + int *const outBigramCount) { + const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (bigramListPos == NOT_A_DICT_POS) { + // Bigram list doesn't exist. + return true; + } + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const int entryPos = readingPos; + const BigramEntry bigramEntry = + mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (!bigramEntry.isValid()) { + continue; + } + const int targetPtNodePos = mTerminalPositionLookupTable->getTerminalPtNodePosition( + bigramEntry.getTargetTerminalId()); + if (targetPtNodePos == NOT_A_DICT_POS) { + // Invalidate bigram entry. + const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); + if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { + return false; + } + } else if (bigramEntry.hasHistoricalInfo()) { + const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( + bigramEntry.getHistoricalInfo(), mHeaderPolicy); + if (ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy)) { + const BigramEntry updatedBigramEntry = + bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo); + if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { + return false; + } + *outBigramCount += 1; + } else { + // Remove entry. + const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); + if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { + return false; + } + } + } else { + *outBigramCount += 1; + } + } + return true; +} + +int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) { + const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (bigramListPos == NOT_A_DICT_POS) { + // Bigram list doesn't exist. + return 0; + } + int bigramCount = 0; + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const BigramEntry bigramEntry = + mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (bigramEntry.isValid()) { + bigramCount++; + } + } + return bigramCount; +} + +int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, + const int bigramListPos, int *const outTailEntryPos) const { + if (outTailEntryPos) { + *outTailEntryPos = NOT_A_DICT_POS; + } + bool hasNext = true; + int invalidEntryPos = NOT_A_DICT_POS; + int readingPos = bigramListPos; + while (hasNext) { + const int entryPos = readingPos; + const BigramEntry bigramEntry = + mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (bigramEntry.getTargetTerminalId() == targetTerminalIdToFind) { + // Entry with same target is found. + return entryPos; + } else if (!bigramEntry.isValid()) { + // Invalid entry that can be reused is found. + invalidEntryPos = entryPos; + } + if (!hasNext && mBigramDictContent->isContentTailPos(readingPos)) { + if (outTailEntryPos) { + *outTailEntryPos = entryPos; + } + } + } + return invalidEntryPos; +} + +const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom( + const BigramEntry *const originalBigramEntry, + const BigramProperty *const bigramProperty) const { + // TODO: Consolidate historical info and probability. + if (mHeaderPolicy->hasHistoricalInfoOfWords()) { + const HistoricalInfo historicalInfoForUpdate(bigramProperty->getTimestamp(), + bigramProperty->getLevel(), bigramProperty->getCount()); + const HistoricalInfo updatedHistoricalInfo = + ForgettingCurveUtils::createUpdatedHistoricalInfo( + originalBigramEntry->getHistoricalInfo(), bigramProperty->getProbability(), + &historicalInfoForUpdate, mHeaderPolicy); + return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo); + } else { + return originalBigramEntry->updateProbabilityAndGetEntry(bigramProperty->getProbability()); + } +} + +bool Ver4BigramListPolicy::updateHasNextFlag(const bool hasNext, const int bigramEntryPos) { + const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(bigramEntryPos); + const BigramEntry updatedBigramEntry = bigramEntry.updateHasNextAndGetEntry(hasNext); + return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, bigramEntryPos); +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h new file mode 100644 index 000000000..61623468e --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! + * Do not edit this file other than updating policy's interface. + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H +#define LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H + +#include "defines.h" +#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class BigramDictContent; +} // namespace v402 +} // namespace backward +class BigramProperty; +namespace backward { +namespace v402 { +} // namespace v402 +} // namespace backward +class HeaderPolicy; +namespace backward { +namespace v402 { +class TerminalPositionLookupTable; + +class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { + public: + Ver4BigramListPolicy(BigramDictContent *const bigramDictContent, + const TerminalPositionLookupTable *const terminalPositionLookupTable, + const HeaderPolicy *const headerPolicy) + : mBigramDictContent(bigramDictContent), + mTerminalPositionLookupTable(terminalPositionLookupTable), + mHeaderPolicy(headerPolicy) {} + + void getNextBigram(int *const outBigramPos, int *const outProbability, + bool *const outHasNext, int *const bigramEntryPos) const; + + void skipAllBigrams(int *const pos) const { + // Do nothing because we don't need to skip bigram lists in ver4 dictionaries. + } + + bool addNewEntry(const int terminalId, const int newTargetTerminalId, + const BigramProperty *const bigramProperty, bool *const outAddedNewEntry); + + bool removeEntry(const int terminalId, const int targetTerminalId); + + bool updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId, + int *const outBigramCount); + + int getBigramEntryConut(const int terminalId); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy); + + int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos, + int *const outTailEntryPos) const; + + const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry, + const BigramProperty *const bigramProperty) const; + + bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos); + + BigramDictContent *const mBigramDictContent; + const TerminalPositionLookupTable *const mTerminalPositionLookupTable; + const HeaderPolicy *const mHeaderPolicy; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp new file mode 100644 index 000000000..e2dd93c5e --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h" + +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( + int *const bigramEntryPos) const { + const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer(); + const int bigramEntryTailPos = (*bigramEntryPos) + getBigramEntrySize(); + if (*bigramEntryPos < 0 || bigramEntryTailPos > bigramListBuffer->getTailPosition()) { + AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bigramEntryTailPos: %d, " + "bufSize: %d", *bigramEntryPos, bigramEntryTailPos, + bigramListBuffer->getTailPosition()); + ASSERT(false); + return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY, + Ver4DictConstants::NOT_A_TERMINAL_ID); + } + const int bigramFlags = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, bigramEntryPos); + const bool hasNext = (bigramFlags & Ver4DictConstants::BIGRAM_HAS_NEXT_MASK) != 0; + int probability = NOT_A_PROBABILITY; + int timestamp = NOT_A_TIMESTAMP; + int level = 0; + int count = 0; + if (mHasHistoricalInfo) { + timestamp = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, bigramEntryPos); + level = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, bigramEntryPos); + count = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, bigramEntryPos); + } else { + probability = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::PROBABILITY_SIZE, bigramEntryPos); + } + const int encodedTargetTerminalId = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, bigramEntryPos); + const int targetTerminalId = + (encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ? + Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId; + if (mHasHistoricalInfo) { + const HistoricalInfo historicalInfo(timestamp, level, count); + return BigramEntry(hasNext, probability, &historicalInfo, targetTerminalId); + } else { + return BigramEntry(hasNext, probability, targetTerminalId); + } +} + +bool BigramDictContent::writeBigramEntryAndAdvancePosition( + const BigramEntry *const bigramEntryToWrite, int *const entryWritingPos) { + BufferWithExtendableBuffer *const bigramListBuffer = getWritableContentBuffer(); + const int bigramFlags = createAndGetBigramFlags(bigramEntryToWrite->hasNext()); + if (!bigramListBuffer->writeUintAndAdvancePosition(bigramFlags, + Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram flags. pos: %d, flags: %x", *entryWritingPos, bigramFlags); + return false; + } + if (mHasHistoricalInfo) { + const HistoricalInfo *const historicalInfo = bigramEntryToWrite->getHistoricalInfo(); + if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getTimeStamp(), + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram timestamps. pos: %d, timestamp: %d", *entryWritingPos, + historicalInfo->getTimeStamp()); + return false; + } + if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getLevel(), + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram level. pos: %d, level: %d", *entryWritingPos, + historicalInfo->getLevel()); + return false; + } + if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getCount(), + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram count. pos: %d, count: %d", *entryWritingPos, + historicalInfo->getCount()); + return false; + } + } else { + if (!bigramListBuffer->writeUintAndAdvancePosition(bigramEntryToWrite->getProbability(), + Ver4DictConstants::PROBABILITY_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram probability. pos: %d, probability: %d", *entryWritingPos, + bigramEntryToWrite->getProbability()); + return false; + } + } + const int targetTerminalIdToWrite = + (bigramEntryToWrite->getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) ? + Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID : + bigramEntryToWrite->getTargetTerminalId(); + if (!bigramListBuffer->writeUintAndAdvancePosition(targetTerminalIdToWrite, + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram target terminal id. pos: %d, target terminal id: %d", + *entryWritingPos, bigramEntryToWrite->getTargetTerminalId()); + return false; + } + return true; +} + +bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos, + int *const outTailEntryPos) { + int readingPos = bigramListPos; + int writingPos = toPos; + bool hasNext = true; + while (hasNext) { + const BigramEntry bigramEntry = getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (!hasNext) { + *outTailEntryPos = writingPos; + } + if (!writeBigramEntryAndAdvancePosition(&bigramEntry, &writingPos)) { + AKLOGE("Cannot write bigram entry to copy. pos: %d", writingPos); + return false; + } + } + return true; +} + +bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const BigramDictContent *const originalBigramDictContent, + int *const outBigramEntryCount) { + for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); + it != terminalIdMap->end(); ++it) { + const int originalBigramListPos = + originalBigramDictContent->getBigramListHeadPos(it->first); + if (originalBigramListPos == NOT_A_DICT_POS) { + // This terminal does not have a bigram list. + continue; + } + const int bigramListPos = getContentBuffer()->getTailPosition(); + int bigramEntryCount = 0; + // Copy bigram list with GC from original content. + if (!runGCBigramList(originalBigramListPos, originalBigramDictContent, bigramListPos, + terminalIdMap, &bigramEntryCount)) { + AKLOGE("Cannot complete GC for the bigram list. original pos: %d, pos: %d", + originalBigramListPos, bigramListPos); + return false; + } + if (bigramEntryCount == 0) { + // All bigram entries are useless. This terminal does not have a bigram list. + continue; + } + *outBigramEntryCount += bigramEntryCount; + // Set bigram list position to the lookup table. + if (!getUpdatableAddressLookupTable()->set(it->second, bigramListPos)) { + AKLOGE("Cannot set bigram list position. terminal id: %d, pos: %d", + it->second, bigramListPos); + return false; + } + } + return true; +} + +// Returns whether GC for the bigram list was succeeded or not. +bool BigramDictContent::runGCBigramList(const int bigramListPos, + const BigramDictContent *const sourceBigramDictContent, const int toPos, + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + int *const outEntrycount) { + bool hasNext = true; + int readingPos = bigramListPos; + int writingPos = toPos; + int lastEntryPos = NOT_A_DICT_POS; + while (hasNext) { + const BigramEntry originalBigramEntry = + sourceBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = originalBigramEntry.hasNext(); + if (originalBigramEntry.getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) { + continue; + } + TerminalPositionLookupTable::TerminalIdMap::const_iterator it = + terminalIdMap->find(originalBigramEntry.getTargetTerminalId()); + if (it == terminalIdMap->end()) { + // Target word has been removed. + continue; + } + lastEntryPos = hasNext ? writingPos : NOT_A_DICT_POS; + const BigramEntry updatedBigramEntry = + originalBigramEntry.updateTargetTerminalIdAndGetEntry(it->second); + if (!writeBigramEntryAndAdvancePosition(&updatedBigramEntry, &writingPos)) { + AKLOGE("Cannot write bigram entry to run GC. pos: %d", writingPos); + return false; + } + *outEntrycount += 1; + } + if (lastEntryPos != NOT_A_DICT_POS) { + // Update has next flag in the last written entry. + const BigramEntry bigramEntry = getBigramEntry(lastEntryPos).updateHasNextAndGetEntry( + false /* hasNext */); + if (!writeBigramEntry(&bigramEntry, lastEntryPos)) { + AKLOGE("Cannot write bigram entry to set hasNext flag after GC. pos: %d", writingPos); + return false; + } + } + return true; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h new file mode 100644 index 000000000..b554e5676 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class BigramDictContent : public SparseTableDictContent { + public: + BigramDictContent(const char *const dictPath, const bool hasHistoricalInfo, + const bool isUpdatable) + : SparseTableDictContent(dictPath, + Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION, + Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION, + Ver4DictConstants::BIGRAM_FILE_EXTENSION, isUpdatable, + Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE), + mHasHistoricalInfo(hasHistoricalInfo) {} + + BigramDictContent(const bool hasHistoricalInfo) + : SparseTableDictContent(Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE), + mHasHistoricalInfo(hasHistoricalInfo) {} + + const BigramEntry getBigramEntry(const int bigramEntryPos) const { + int readingPos = bigramEntryPos; + return getBigramEntryAndAdvancePosition(&readingPos); + } + + const BigramEntry getBigramEntryAndAdvancePosition(int *const bigramEntryPos) const; + + // Returns head position of bigram list for a PtNode specified by terminalId. + int getBigramListHeadPos(const int terminalId) const { + const SparseTable *const addressLookupTable = getAddressLookupTable(); + if (!addressLookupTable->contains(terminalId)) { + return NOT_A_DICT_POS; + } + return addressLookupTable->get(terminalId); + } + + bool writeBigramEntryAtTail(const BigramEntry *const bigramEntryToWrite) { + int writingPos = getContentBuffer()->getTailPosition(); + return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); + } + + bool writeBigramEntry(const BigramEntry *const bigramEntryToWrite, const int entryWritingPos) { + int writingPos = entryWritingPos; + return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); + } + + bool writeBigramEntryAndAdvancePosition(const BigramEntry *const bigramEntryToWrite, + int *const entryWritingPos); + + bool createNewBigramList(const int terminalId) { + const int bigramListPos = getContentBuffer()->getTailPosition(); + return getUpdatableAddressLookupTable()->set(terminalId, bigramListPos); + } + + bool copyBigramList(const int bigramListPos, const int toPos, int *const outTailEntryPos); + + bool flushToFile(const char *const dictPath) const { + return flush(dictPath, Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION, + Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION, + Ver4DictConstants::BIGRAM_FILE_EXTENSION); + } + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const BigramDictContent *const originalBigramDictContent, + int *const outBigramEntryCount); + + bool isContentTailPos(const int pos) const { + return pos == getContentBuffer()->getTailPosition(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(BigramDictContent); + + int createAndGetBigramFlags(const bool hasNext) const { + return hasNext ? Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0; + } + + int getBigramEntrySize() const { + if (mHasHistoricalInfo) { + return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE + + Ver4DictConstants::TIME_STAMP_FIELD_SIZE + + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE + + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; + } else { + return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE + + Ver4DictConstants::PROBABILITY_SIZE + + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; + } + } + + bool runGCBigramList(const int bigramListPos, + const BigramDictContent *const sourceBigramDictContent, const int toPos, + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + int *const outEntryCount); + + bool mHasHistoricalInfo; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h new file mode 100644 index 000000000..40968b4d8 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h + */ + +#ifndef LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H +#define LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/historical_info.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class BigramEntry { + public: + BigramEntry(const BigramEntry& bigramEntry) + : mHasNext(bigramEntry.mHasNext), mProbability(bigramEntry.mProbability), + mHistoricalInfo(), mTargetTerminalId(bigramEntry.mTargetTerminalId) {} + + // Entry with historical information. + BigramEntry(const bool hasNext, const int probability, const int targetTerminalId) + : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(), + mTargetTerminalId(targetTerminalId) {} + + // Entry with historical information. + BigramEntry(const bool hasNext, const int probability, + const HistoricalInfo *const historicalInfo, const int targetTerminalId) + : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(*historicalInfo), + mTargetTerminalId(targetTerminalId) {} + + const BigramEntry getInvalidatedEntry() const { + return updateTargetTerminalIdAndGetEntry(Ver4DictConstants::NOT_A_TERMINAL_ID); + } + + const BigramEntry updateHasNextAndGetEntry(const bool hasNext) const { + return BigramEntry(hasNext, mProbability, &mHistoricalInfo, mTargetTerminalId); + } + + const BigramEntry updateTargetTerminalIdAndGetEntry(const int newTargetTerminalId) const { + return BigramEntry(mHasNext, mProbability, &mHistoricalInfo, newTargetTerminalId); + } + + const BigramEntry updateProbabilityAndGetEntry(const int probability) const { + return BigramEntry(mHasNext, probability, &mHistoricalInfo, mTargetTerminalId); + } + + const BigramEntry updateHistoricalInfoAndGetEntry( + const HistoricalInfo *const historicalInfo) const { + return BigramEntry(mHasNext, mProbability, historicalInfo, mTargetTerminalId); + } + + bool isValid() const { + return mTargetTerminalId != Ver4DictConstants::NOT_A_TERMINAL_ID; + } + + bool hasNext() const { + return mHasNext; + } + + int getProbability() const { + return mProbability; + } + + bool hasHistoricalInfo() const { + return mHistoricalInfo.isValid(); + } + + const HistoricalInfo *getHistoricalInfo() const { + return &mHistoricalInfo; + } + + int getTargetTerminalId() const { + return mTargetTerminalId; + } + + private: + // Copy constructor is public to use this class as a type of return value. + DISALLOW_DEFAULT_CONSTRUCTOR(BigramEntry); + DISALLOW_ASSIGNMENT_OPERATOR(BigramEntry); + + const bool mHasNext; + const int mProbability; + const HistoricalInfo mHistoricalInfo; + const int mTargetTerminalId; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h new file mode 100644 index 000000000..0f2f25534 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_DICT_CONTENT_H + +#include "defines.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class DictContent { + public: + virtual ~DictContent() {} + virtual bool isValid() const = 0; + + protected: + DictContent() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictContent); +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp new file mode 100644 index 000000000..c671647d4 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h" + +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +const ProbabilityEntry ProbabilityDictContent::getProbabilityEntry(const int terminalId) const { + if (terminalId < 0 || terminalId >= mSize) { + // This method can be called with invalid terminal id during GC. + return ProbabilityEntry(0 /* flags */, NOT_A_PROBABILITY); + } + const BufferWithExtendableBuffer *const buffer = getBuffer(); + int entryPos = getEntryPos(terminalId); + const int flags = buffer->readUintAndAdvancePosition( + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &entryPos); + const int probability = buffer->readUintAndAdvancePosition( + Ver4DictConstants::PROBABILITY_SIZE, &entryPos); + if (mHasHistoricalInfo) { + const int timestamp = buffer->readUintAndAdvancePosition( + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &entryPos); + const int level = buffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &entryPos); + const int count = buffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &entryPos); + const HistoricalInfo historicalInfo(timestamp, level, count); + return ProbabilityEntry(flags, probability, &historicalInfo); + } else { + return ProbabilityEntry(flags, probability); + } +} + +bool ProbabilityDictContent::setProbabilityEntry(const int terminalId, + const ProbabilityEntry *const probabilityEntry) { + if (terminalId < 0) { + return false; + } + const int entryPos = getEntryPos(terminalId); + if (terminalId >= mSize) { + ProbabilityEntry dummyEntry; + // Write new entry. + int writingPos = getBuffer()->getTailPosition(); + while (writingPos <= entryPos) { + // Fulfilling with dummy entries until writingPos. + if (!writeEntry(&dummyEntry, writingPos)) { + AKLOGE("Cannot write dummy entry. pos: %d, mSize: %d", writingPos, mSize); + return false; + } + writingPos += getEntrySize(); + mSize++; + } + } + return writeEntry(probabilityEntry, entryPos); +} + +bool ProbabilityDictContent::flushToFile(const char *const dictPath) const { + if (getEntryPos(mSize) < getBuffer()->getTailPosition()) { + ProbabilityDictContent probabilityDictContentToWrite(mHasHistoricalInfo); + for (int i = 0; i < mSize; ++i) { + const ProbabilityEntry probabilityEntry = getProbabilityEntry(i); + if (!probabilityDictContentToWrite.setProbabilityEntry(i, &probabilityEntry)) { + AKLOGE("Cannot set probability entry in flushToFile. terminalId: %d", i); + return false; + } + } + return probabilityDictContentToWrite.flush(dictPath, + Ver4DictConstants::FREQ_FILE_EXTENSION); + } else { + return flush(dictPath, Ver4DictConstants::FREQ_FILE_EXTENSION); + } +} + +bool ProbabilityDictContent::runGC( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ProbabilityDictContent *const originalProbabilityDictContent) { + mSize = 0; + for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); + it != terminalIdMap->end(); ++it) { + const ProbabilityEntry probabilityEntry = + originalProbabilityDictContent->getProbabilityEntry(it->first); + if (!setProbabilityEntry(it->second, &probabilityEntry)) { + AKLOGE("Cannot set probability entry in runGC. terminalId: %d", it->second); + return false; + } + mSize++; + } + return true; +} + +int ProbabilityDictContent::getEntrySize() const { + if (mHasHistoricalInfo) { + return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE + + Ver4DictConstants::PROBABILITY_SIZE + + Ver4DictConstants::TIME_STAMP_FIELD_SIZE + + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE; + } else { + return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE + + Ver4DictConstants::PROBABILITY_SIZE; + } +} + +int ProbabilityDictContent::getEntryPos(const int terminalId) const { + return terminalId * getEntrySize(); +} + +bool ProbabilityDictContent::writeEntry(const ProbabilityEntry *const probabilityEntry, + const int entryPos) { + BufferWithExtendableBuffer *const bufferToWrite = getWritableBuffer(); + int writingPos = entryPos; + if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getFlags(), + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) { + AKLOGE("Cannot write flags in probability dict content. pos: %d", writingPos); + return false; + } + if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getProbability(), + Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) { + AKLOGE("Cannot write probability in probability dict content. pos: %d", writingPos); + return false; + } + if (mHasHistoricalInfo) { + const HistoricalInfo *const historicalInfo = probabilityEntry->getHistoricalInfo(); + if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getTimeStamp(), + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &writingPos)) { + AKLOGE("Cannot write timestamp in probability dict content. pos: %d", writingPos); + return false; + } + if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getLevel(), + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &writingPos)) { + AKLOGE("Cannot write level in probability dict content. pos: %d", writingPos); + return false; + } + if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getCount(), + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &writingPos)) { + AKLOGE("Cannot write count in probability dict content. pos: %d", writingPos); + return false; + } + } + return true; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h new file mode 100644 index 000000000..3734797d4 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class ProbabilityEntry; + +class ProbabilityDictContent : public SingleDictContent { + public: + ProbabilityDictContent(const char *const dictPath, const bool hasHistoricalInfo, + const bool isUpdatable) + : SingleDictContent(dictPath, Ver4DictConstants::FREQ_FILE_EXTENSION, isUpdatable), + mHasHistoricalInfo(hasHistoricalInfo), + mSize(getBuffer()->getTailPosition() / getEntrySize()) {} + + ProbabilityDictContent(const bool hasHistoricalInfo) + : mHasHistoricalInfo(hasHistoricalInfo), mSize(0) {} + + const ProbabilityEntry getProbabilityEntry(const int terminalId) const; + + bool setProbabilityEntry(const int terminalId, const ProbabilityEntry *const probabilityEntry); + + bool flushToFile(const char *const dictPath) const; + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ProbabilityDictContent *const originalProbabilityDictContent); + + private: + DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent); + + int getEntrySize() const; + + int getEntryPos(const int terminalId) const; + + bool writeEntry(const ProbabilityEntry *const probabilityEntry, const int entryPos); + + bool mHasHistoricalInfo; + int mSize; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h new file mode 100644 index 000000000..8ccfa33dc --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h + */ + +#ifndef LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H +#define LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/historical_info.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class ProbabilityEntry { + public: + ProbabilityEntry(const ProbabilityEntry &probabilityEntry) + : mFlags(probabilityEntry.mFlags), mProbability(probabilityEntry.mProbability), + mHistoricalInfo(probabilityEntry.mHistoricalInfo) {} + + // Dummy entry + ProbabilityEntry() + : mFlags(0), mProbability(NOT_A_PROBABILITY), mHistoricalInfo() {} + + // Entry without historical information + ProbabilityEntry(const int flags, const int probability) + : mFlags(flags), mProbability(probability), mHistoricalInfo() {} + + // Entry with historical information. + ProbabilityEntry(const int flags, const int probability, + const HistoricalInfo *const historicalInfo) + : mFlags(flags), mProbability(probability), mHistoricalInfo(*historicalInfo) {} + + const ProbabilityEntry createEntryWithUpdatedProbability(const int probability) const { + return ProbabilityEntry(mFlags, probability, &mHistoricalInfo); + } + + const ProbabilityEntry createEntryWithUpdatedHistoricalInfo( + const HistoricalInfo *const historicalInfo) const { + return ProbabilityEntry(mFlags, mProbability, historicalInfo); + } + + bool hasHistoricalInfo() const { + return mHistoricalInfo.isValid(); + } + + int getFlags() const { + return mFlags; + } + + int getProbability() const { + return mProbability; + } + + const HistoricalInfo *getHistoricalInfo() const { + return &mHistoricalInfo; + } + + private: + // Copy constructor is public to use this class as a type of return value. + DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry); + + const int mFlags; + const int mProbability; + const HistoricalInfo mHistoricalInfo; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp new file mode 100644 index 000000000..56bc8b98d --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h" + +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoint, int *const outCodePointCount, int *const outProbability, + bool *const outhasNext, int *const shortcutEntryPos) const { + const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer(); + if (*shortcutEntryPos < 0 || *shortcutEntryPos >= shortcutListBuffer->getTailPosition()) { + AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d", + *shortcutEntryPos, shortcutListBuffer->getTailPosition()); + ASSERT(false); + if (outhasNext) { + *outhasNext = false; + } + if (outCodePointCount) { + *outCodePointCount = 0; + } + return; + } + + const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); + if (outProbability) { + *outProbability = shortcutFlags & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK; + } + if (outhasNext) { + *outhasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; + } + if (outCodePoint && outCodePointCount) { + shortcutListBuffer->readCodePointsAndAdvancePosition( + maxCodePointCount, outCodePoint, outCodePointCount, shortcutEntryPos); + } +} + +int ShortcutDictContent::getShortcutListHeadPos(const int terminalId) const { + const SparseTable *const addressLookupTable = getAddressLookupTable(); + if (!addressLookupTable->contains(terminalId)) { + return NOT_A_DICT_POS; + } + return addressLookupTable->get(terminalId); +} + +bool ShortcutDictContent::flushToFile(const char *const dictPath) const { + return flush(dictPath, Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION, + Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION, + Ver4DictConstants::SHORTCUT_FILE_EXTENSION); +} + +bool ShortcutDictContent::runGC( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ShortcutDictContent *const originalShortcutDictContent) { + for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); + it != terminalIdMap->end(); ++it) { + const int originalShortcutListPos = + originalShortcutDictContent->getShortcutListHeadPos(it->first); + if (originalShortcutListPos == NOT_A_DICT_POS) { + continue; + } + const int shortcutListPos = getContentBuffer()->getTailPosition(); + // Copy shortcut list from original content. + if (!copyShortcutListFromDictContent(originalShortcutListPos, originalShortcutDictContent, + shortcutListPos)) { + AKLOGE("Cannot copy shortcut list during GC. original pos: %d, pos: %d", + originalShortcutListPos, shortcutListPos); + return false; + } + // Set shortcut list position to the lookup table. + if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) { + AKLOGE("Cannot set shortcut list position. terminal id: %d, pos: %d", + it->second, shortcutListPos); + return false; + } + } + return true; +} + +bool ShortcutDictContent::createNewShortcutList(const int terminalId) { + const int shortcutListListPos = getContentBuffer()->getTailPosition(); + return getUpdatableAddressLookupTable()->set(terminalId, shortcutListListPos); +} + +bool ShortcutDictContent::copyShortcutList(const int shortcutListPos, const int toPos) { + return copyShortcutListFromDictContent(shortcutListPos, this, toPos); +} + +bool ShortcutDictContent::copyShortcutListFromDictContent(const int shortcutListPos, + const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) { + bool hasNext = true; + int readingPos = shortcutListPos; + int writingPos = toPos; + int codePoints[MAX_WORD_LENGTH]; + while (hasNext) { + int probability = 0; + int codePointCount = 0; + sourceShortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, + codePoints, &codePointCount, &probability, &hasNext, &readingPos); + if (!writeShortcutEntryAndAdvancePosition(codePoints, codePointCount, probability, + hasNext, &writingPos)) { + AKLOGE("Cannot write shortcut entry to copy. pos: %d", writingPos); + return false; + } + } + return true; +} + +bool ShortcutDictContent::setProbability(const int probability, const int shortcutEntryPos) { + BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); + const int shortcutFlags = shortcutListBuffer->readUint( + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); + const bool hasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; + const int shortcutFlagsToWrite = createAndGetShortcutFlags(probability, hasNext); + return shortcutListBuffer->writeUint(shortcutFlagsToWrite, + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); +} + +bool ShortcutDictContent::writeShortcutEntryAndAdvancePosition(const int *const codePoint, + const int codePointCount, const int probability, const bool hasNext, + int *const shortcutEntryPos) { + BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); + const int shortcutFlags = createAndGetShortcutFlags(probability, hasNext); + if (!shortcutListBuffer->writeUintAndAdvancePosition(shortcutFlags, + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos)) { + AKLOGE("Cannot write shortcut flags. flags; %x, pos: %d", shortcutFlags, *shortcutEntryPos); + return false; + } + if (!shortcutListBuffer->writeCodePointsAndAdvancePosition(codePoint, codePointCount, + true /* writesTerminator */, shortcutEntryPos)) { + AKLOGE("Cannot write shortcut target code points. pos: %d", *shortcutEntryPos); + return false; + } + return true; +} + +// Find a shortcut entry that has specified target and return its position. +int ShortcutDictContent::findShortcutEntryAndGetPos(const int shortcutListPos, + const int *const targetCodePointsToFind, const int codePointCount) const { + bool hasNext = true; + int readingPos = shortcutListPos; + int targetCodePoints[MAX_WORD_LENGTH]; + while (hasNext) { + const int entryPos = readingPos; + int probability = 0; + int targetCodePointCount = 0; + getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, targetCodePoints, &targetCodePointCount, + &probability, &hasNext, &readingPos); + if (targetCodePointCount != codePointCount) { + continue; + } + bool matched = true; + for (int i = 0; i < codePointCount; ++i) { + if (targetCodePointsToFind[i] != targetCodePoints[i]) { + matched = false; + break; + } + } + if (matched) { + return entryPos; + } + } + return NOT_A_DICT_POS; +} + +int ShortcutDictContent::createAndGetShortcutFlags(const int probability, + const bool hasNext) const { + return (probability & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK) + | (hasNext ? Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK : 0); +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h new file mode 100644 index 000000000..179cec5bb --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class ShortcutDictContent : public SparseTableDictContent { + public: + ShortcutDictContent(const char *const dictPath, const bool isUpdatable) + : SparseTableDictContent(dictPath, + Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION, + Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION, + Ver4DictConstants::SHORTCUT_FILE_EXTENSION, isUpdatable, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} + + ShortcutDictContent() + : SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} + + void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, int *const outProbability, bool *const outhasNext, + const int shortcutEntryPos) { + int readingPos = shortcutEntryPos; + return getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint, + outCodePointCount, outProbability, outhasNext, &readingPos); + } + + void getShortcutEntryAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoint, int *const outCodePointCount, int *const outProbability, + bool *const outhasNext, int *const shortcutEntryPos) const; + + // Returns head position of shortcut list for a PtNode specified by terminalId. + int getShortcutListHeadPos(const int terminalId) const; + + bool flushToFile(const char *const dictPath) const; + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ShortcutDictContent *const originalShortcutDictContent); + + bool createNewShortcutList(const int terminalId); + + bool copyShortcutList(const int shortcutListPos, const int toPos); + + bool setProbability(const int probability, const int shortcutEntryPos); + + bool writeShortcutEntry(const int *const codePoint, const int codePointCount, + const int probability, const bool hasNext, const int shortcutEntryPos) { + int writingPos = shortcutEntryPos; + return writeShortcutEntryAndAdvancePosition(codePoint, codePointCount, probability, + hasNext, &writingPos); + } + + bool writeShortcutEntryAndAdvancePosition(const int *const codePoint, + const int codePointCount, const int probability, const bool hasNext, + int *const shortcutEntryPos); + + int findShortcutEntryAndGetPos(const int shortcutListPos, + const int *const targetCodePointsToFind, const int codePointCount) const; + + private: + DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent); + + bool copyShortcutListFromDictContent(const int shortcutListPos, + const ShortcutDictContent *const sourceShortcutDictContent, const int toPos); + + int createAndGetShortcutFlags(const int probability, const bool hasNext) const; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h new file mode 100644 index 000000000..6433650b0 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class SingleDictContent : public DictContent { + public: + SingleDictContent(const char *const dictPath, const char *const contentFileName, + const bool isUpdatable) + : mMmappedBuffer(MmappedBuffer::openBuffer(dictPath, contentFileName, isUpdatable)), + mExpandableContentBuffer(mMmappedBuffer ? mMmappedBuffer->getBuffer() : nullptr, + mMmappedBuffer ? mMmappedBuffer->getBufferSize() : 0, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mIsValid(mMmappedBuffer) {} + + SingleDictContent() + : mMmappedBuffer(nullptr), + mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), mIsValid(true) {} + + virtual ~SingleDictContent() {} + + virtual bool isValid() const { + return mIsValid; + } + + bool isNearSizeLimit() const { + return mExpandableContentBuffer.isNearSizeLimit(); + } + + protected: + BufferWithExtendableBuffer *getWritableBuffer() { + return &mExpandableContentBuffer; + } + + const BufferWithExtendableBuffer *getBuffer() const { + return &mExpandableContentBuffer; + } + + bool flush(const char *const dictPath, const char *const contentFileNameSuffix) const { + return DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, + contentFileNameSuffix, &mExpandableContentBuffer); + } + + private: + DISALLOW_COPY_AND_ASSIGN(SingleDictContent); + + const MmappedBuffer::MmappedBufferPtr mMmappedBuffer; + BufferWithExtendableBuffer mExpandableContentBuffer; + const bool mIsValid; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp new file mode 100644 index 000000000..7c9b4967a --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h" + +namespace latinime { +namespace backward { +namespace v402 { + +bool SparseTableDictContent::flush(const char *const dictPath, + const char *const lookupTableFileNameSuffix, const char *const addressTableFileNameSuffix, + const char *const contentFileNameSuffix) const { + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, lookupTableFileNameSuffix, + &mExpandableLookupTableBuffer)){ + return false; + } + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, addressTableFileNameSuffix, + &mExpandableAddressTableBuffer)) { + return false; + } + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, contentFileNameSuffix, + &mExpandableContentBuffer)) { + return false; + } + return true; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h new file mode 100644 index 000000000..c7233edd3 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h @@ -0,0 +1,122 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" +#include "suggest/policyimpl/dictionary/utils/sparse_table.h" + +namespace latinime { +namespace backward { +namespace v402 { + +// TODO: Support multiple contents. +class SparseTableDictContent : public DictContent { + public: + AK_FORCE_INLINE SparseTableDictContent(const char *const dictPath, + const char *const lookupTableFileName, const char *const addressTableFileName, + const char *const contentFileName, const bool isUpdatable, + const int sparseTableBlockSize, const int sparseTableDataSize) + : mLookupTableBuffer( + MmappedBuffer::openBuffer(dictPath, lookupTableFileName, isUpdatable)), + mAddressTableBuffer( + MmappedBuffer::openBuffer(dictPath, addressTableFileName, isUpdatable)), + mContentBuffer( + MmappedBuffer::openBuffer(dictPath, contentFileName, isUpdatable)), + mExpandableLookupTableBuffer( + mLookupTableBuffer ? mLookupTableBuffer->getBuffer() : nullptr, + mLookupTableBuffer ? mLookupTableBuffer->getBufferSize() : 0, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mExpandableAddressTableBuffer( + mAddressTableBuffer ? mAddressTableBuffer->getBuffer() : nullptr, + mAddressTableBuffer ? mAddressTableBuffer->getBufferSize() : 0, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mExpandableContentBuffer(mContentBuffer ? mContentBuffer->getBuffer() : nullptr, + mContentBuffer ? mContentBuffer->getBufferSize() : 0, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer, + sparseTableBlockSize, sparseTableDataSize), + mIsValid(mLookupTableBuffer && mAddressTableBuffer && mContentBuffer) {} + + SparseTableDictContent(const int sparseTableBlockSize, const int sparseTableDataSize) + : mLookupTableBuffer(), mAddressTableBuffer(), mContentBuffer(), + mExpandableLookupTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mExpandableAddressTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer, + sparseTableBlockSize, sparseTableDataSize), mIsValid(true) {} + + virtual ~SparseTableDictContent() {} + + virtual bool isValid() const { + return mIsValid; + } + + bool isNearSizeLimit() const { + return mExpandableLookupTableBuffer.isNearSizeLimit() + || mExpandableAddressTableBuffer.isNearSizeLimit() + || mExpandableContentBuffer.isNearSizeLimit(); + } + + protected: + SparseTable *getUpdatableAddressLookupTable() { + return &mAddressLookupTable; + } + + const SparseTable *getAddressLookupTable() const { + return &mAddressLookupTable; + } + + BufferWithExtendableBuffer *getWritableContentBuffer() { + return &mExpandableContentBuffer; + } + + const BufferWithExtendableBuffer *getContentBuffer() const { + return &mExpandableContentBuffer; + } + + bool flush(const char *const dictDirPath, const char *const lookupTableFileName, + const char *const addressTableFileName, const char *const contentFileName) const; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTableDictContent); + + const MmappedBuffer::MmappedBufferPtr mLookupTableBuffer; + const MmappedBuffer::MmappedBufferPtr mAddressTableBuffer; + const MmappedBuffer::MmappedBufferPtr mContentBuffer; + BufferWithExtendableBuffer mExpandableLookupTableBuffer; + BufferWithExtendableBuffer mExpandableAddressTableBuffer; + BufferWithExtendableBuffer mExpandableContentBuffer; + SparseTable mAddressLookupTable; + const bool mIsValid; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp new file mode 100644 index 000000000..a9f841779 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" + +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +int TerminalPositionLookupTable::getTerminalPtNodePosition(const int terminalId) const { + if (terminalId < 0 || terminalId >= mSize) { + return NOT_A_DICT_POS; + } + const int terminalPos = getBuffer()->readUint( + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId)); + return (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) ? + NOT_A_DICT_POS : terminalPos; +} + +bool TerminalPositionLookupTable::setTerminalPtNodePosition( + const int terminalId, const int terminalPtNodePos) { + if (terminalId < 0) { + return NOT_A_DICT_POS; + } + while (terminalId >= mSize) { + // Write new entry. + if (!getWritableBuffer()->writeUint(Ver4DictConstants::NOT_A_TERMINAL_ADDRESS, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(mSize))) { + return false; + } + mSize++; + } + const int terminalPos = (terminalPtNodePos != NOT_A_DICT_POS) ? + terminalPtNodePos : Ver4DictConstants::NOT_A_TERMINAL_ADDRESS; + return getWritableBuffer()->writeUint(terminalPos, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId)); +} + +bool TerminalPositionLookupTable::flushToFile(const char *const dictPath) const { + // If the used buffer size is smaller than the actual buffer size, regenerate the lookup + // table and write the new table to the file. + if (getEntryPos(mSize) < getBuffer()->getTailPosition()) { + TerminalPositionLookupTable lookupTableToWrite; + for (int i = 0; i < mSize; ++i) { + const int terminalPtNodePosition = getTerminalPtNodePosition(i); + if (!lookupTableToWrite.setTerminalPtNodePosition(i, terminalPtNodePosition)) { + AKLOGE("Cannot set terminal position to lookupTableToWrite." + " terminalId: %d, position: %d", i, terminalPtNodePosition); + return false; + } + } + return lookupTableToWrite.flush(dictPath, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); + } else { + // We can simply use this lookup table because the buffer size has not been + // changed. + return flush(dictPath, Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); + } +} + +bool TerminalPositionLookupTable::runGCTerminalIds(TerminalIdMap *const terminalIdMap) { + int removedEntryCount = 0; + int nextNewTerminalId = 0; + for (int i = 0; i < mSize; ++i) { + const int terminalPos = getBuffer()->readUint( + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(i)); + if (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) { + // This entry is a garbage. + removedEntryCount++; + } else { + // Give a new terminal id to the entry. + if (!getWritableBuffer()->writeUint(terminalPos, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, + getEntryPos(nextNewTerminalId))) { + return false; + } + // Memorize the mapping to the old terminal id to the new terminal id. + terminalIdMap->insert(TerminalIdMap::value_type(i, nextNewTerminalId)); + nextNewTerminalId++; + } + } + mSize = nextNewTerminalId; + return true; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h new file mode 100644 index 000000000..eadfe0faa --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h + */ + +#ifndef LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H +#define LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H + +#include <unordered_map> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class TerminalPositionLookupTable : public SingleDictContent { + public: + typedef std::unordered_map<int, int> TerminalIdMap; + + TerminalPositionLookupTable(const char *const dictPath, const bool isUpdatable) + : SingleDictContent(dictPath, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION, isUpdatable), + mSize(getBuffer()->getTailPosition() + / Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE) {} + + TerminalPositionLookupTable() : mSize(0) {} + + int getTerminalPtNodePosition(const int terminalId) const; + + bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos); + + int getNextTerminalId() const { + return mSize; + } + + bool flushToFile(const char *const dictPath) const; + + bool runGCTerminalIds(TerminalIdMap *const terminalIdMap); + + private: + DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable); + + int getEntryPos(const int terminalId) const { + return terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; + } + + int mSize; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif // LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h new file mode 100644 index 000000000..941fda748 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! + * Do not edit this file other than updating policy's interface. + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H +#define LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H + +#include "defines.h" +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { + public: + Ver4ShortcutListPolicy(ShortcutDictContent *const shortcutDictContent, + const TerminalPositionLookupTable *const terminalPositionLookupTable) + : mShortcutDictContent(shortcutDictContent) {} + + ~Ver4ShortcutListPolicy() {} + + int getStartPos(const int pos) const { + // The first shortcut entry is located at the head position of the shortcut list. + return pos; + } + + void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, + int *const pos) const { + int probability = 0; + mShortcutDictContent->getShortcutEntryAndAdvancePosition(maxCodePointCount, + outCodePoint, outCodePointCount, &probability, outHasNext, pos); + if (outIsWhitelist) { + *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(probability); + } + } + + void skipAllShortcuts(int *const pos) const { + // Do nothing because we don't need to skip shortcut lists in ver4 dictionaries. + } + + bool addNewShortcut(const int terminalId, const int *const codePoints, const int codePointCount, + const int probability) { + const int shortcutListPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); + if (shortcutListPos == NOT_A_DICT_POS) { + // Create shortcut list. + if (!mShortcutDictContent->createNewShortcutList(terminalId)) { + AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId); + return false; + } + const int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); + return mShortcutDictContent->writeShortcutEntry(codePoints, codePointCount, probability, + false /* hasNext */, writingPos); + } + const int entryPos = mShortcutDictContent->findShortcutEntryAndGetPos(shortcutListPos, + codePoints, codePointCount); + if (entryPos == NOT_A_DICT_POS) { + // Add new entry to the shortcut list. + // Create new shortcut list. + if (!mShortcutDictContent->createNewShortcutList(terminalId)) { + AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId); + return false; + } + int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); + if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints, + codePointCount, probability, true /* hasNext */, &writingPos)) { + AKLOGE("Cannot write shortcut entry. terminal id: %d, pos: %d", terminalId, + writingPos); + return false; + } + return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos); + } + // Overwrite existing entry. + bool hasNext = false; + mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, 0 /* outCodePoint */, + 0 /* outCodePointCount */ , 0 /* probability */, &hasNext, entryPos); + if (!mShortcutDictContent->writeShortcutEntry(codePoints, + codePointCount, probability, hasNext, entryPos)) { + AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId, + entryPos); + return false; + } + return true; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4ShortcutListPolicy); + + ShortcutDictContent *const mShortcutDictContent; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif // LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp new file mode 100644 index 000000000..93f192976 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" + +#include <cerrno> +#include <cstring> +#include <sys/stat.h> +#include <sys/types.h> + +#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/file_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +/* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers( + const char *const dictPath, MmappedBuffer::MmappedBufferPtr headerBuffer, + const FormatUtils::FORMAT_VERSION formatVersion) { + if (!headerBuffer) { + ASSERT(false); + AKLOGE("The header buffer must be valid to open ver4 dict buffers."); + return Ver4DictBuffersPtr(nullptr); + } + // TODO: take only dictDirPath, and open both header and trie files in the constructor below + const bool isUpdatable = headerBuffer->isUpdatable(); + return Ver4DictBuffersPtr(new Ver4DictBuffers(dictPath, std::move(headerBuffer), isUpdatable, + formatVersion)); +} + +bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath, + const BufferWithExtendableBuffer *const headerBuffer) const { + // Create temporary directory. + const int tmpDirPathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictDirPath, + DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE); + char tmpDirPath[tmpDirPathBufSize]; + FileUtils::getFilePathWithSuffix(dictDirPath, + DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE, tmpDirPathBufSize, + tmpDirPath); + if (FileUtils::existsDir(tmpDirPath)) { + if (!FileUtils::removeDirAndFiles(tmpDirPath)) { + AKLOGE("Existing directory %s cannot be removed.", tmpDirPath); + ASSERT(false); + return false; + } + } + umask(S_IWGRP | S_IWOTH); + if (mkdir(tmpDirPath, S_IRWXU) == -1) { + AKLOGE("Cannot create directory: %s. errno: %d.", tmpDirPath, errno); + return false; + } + // Get dictionary base path. + const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */; + char dictName[dictNameBufSize]; + FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName); + const int dictPathBufSize = FileUtils::getFilePathBufSize(tmpDirPath, dictName); + char dictPath[dictPathBufSize]; + FileUtils::getFilePath(tmpDirPath, dictName, dictPathBufSize, dictPath); + + // Write header file. + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, + Ver4DictConstants::HEADER_FILE_EXTENSION, headerBuffer)) { + AKLOGE("Dictionary header file %s%s cannot be written.", tmpDirPath, + Ver4DictConstants::HEADER_FILE_EXTENSION); + return false; + } + // Write trie file. + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, + Ver4DictConstants::TRIE_FILE_EXTENSION, &mExpandableTrieBuffer)) { + AKLOGE("Dictionary trie file %s%s cannot be written.", tmpDirPath, + Ver4DictConstants::TRIE_FILE_EXTENSION); + return false; + } + // Write dictionary contents. + if (!mTerminalPositionLookupTable.flushToFile(dictPath)) { + AKLOGE("Terminal position lookup table cannot be written. %s", tmpDirPath); + return false; + } + if (!mProbabilityDictContent.flushToFile(dictPath)) { + AKLOGE("Probability dict content cannot be written. %s", tmpDirPath); + return false; + } + if (!mBigramDictContent.flushToFile(dictPath)) { + AKLOGE("Bigram dict content cannot be written. %s", tmpDirPath); + return false; + } + if (!mShortcutDictContent.flushToFile(dictPath)) { + AKLOGE("Shortcut dict content cannot be written. %s", tmpDirPath); + return false; + } + // Remove existing dictionary. + if (!FileUtils::removeDirAndFiles(dictDirPath)) { + AKLOGE("Existing directory %s cannot be removed.", dictDirPath); + ASSERT(false); + return false; + } + // Rename temporary directory. + if (rename(tmpDirPath, dictDirPath) != 0) { + AKLOGE("%s cannot be renamed to %s", tmpDirPath, dictDirPath); + ASSERT(false); + return false; + } + return true; +} + +Ver4DictBuffers::Ver4DictBuffers(const char *const dictPath, + MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable, + const FormatUtils::FORMAT_VERSION formatVersion) + : mHeaderBuffer(std::move(headerBuffer)), + mDictBuffer(MmappedBuffer::openBuffer(dictPath, + Ver4DictConstants::TRIE_FILE_EXTENSION, isUpdatable)), + mHeaderPolicy(mHeaderBuffer->getBuffer(), formatVersion), + mExpandableHeaderBuffer(mHeaderBuffer ? mHeaderBuffer->getBuffer() : nullptr, + mHeaderPolicy.getSize(), + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mExpandableTrieBuffer(mDictBuffer ? mDictBuffer->getBuffer() : nullptr, + mDictBuffer ? mDictBuffer->getBufferSize() : 0, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mTerminalPositionLookupTable(dictPath, isUpdatable), + mProbabilityDictContent(dictPath, mHeaderPolicy.hasHistoricalInfoOfWords(), isUpdatable), + mBigramDictContent(dictPath, mHeaderPolicy.hasHistoricalInfoOfWords(), isUpdatable), + mShortcutDictContent(dictPath, isUpdatable), + mIsUpdatable(isUpdatable) {} + +Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize) + : mHeaderBuffer(nullptr), mDictBuffer(nullptr), mHeaderPolicy(headerPolicy), + mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mExpandableTrieBuffer(maxTrieSize), mTerminalPositionLookupTable(), + mProbabilityDictContent(headerPolicy->hasHistoricalInfoOfWords()), + mBigramDictContent(headerPolicy->hasHistoricalInfoOfWords()), mShortcutDictContent(), + mIsUpdatable(true) {} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h new file mode 100644 index 000000000..e775be52e --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H +#define LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H + +#include <memory> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class Ver4DictBuffers { + public: + typedef std::unique_ptr<Ver4DictBuffers> Ver4DictBuffersPtr; + + static Ver4DictBuffersPtr openVer4DictBuffers(const char *const dictDirPath, + MmappedBuffer::MmappedBufferPtr headerBuffer, + const FormatUtils::FORMAT_VERSION formatVersion); + + static AK_FORCE_INLINE Ver4DictBuffersPtr createVer4DictBuffers( + const HeaderPolicy *const headerPolicy, const int maxTrieSize) { + return Ver4DictBuffersPtr(new Ver4DictBuffers(headerPolicy, maxTrieSize)); + } + + AK_FORCE_INLINE bool isValid() const { + return mHeaderBuffer && mDictBuffer && mHeaderPolicy.isValid() + && mProbabilityDictContent.isValid() && mTerminalPositionLookupTable.isValid() + && mBigramDictContent.isValid() && mShortcutDictContent.isValid(); + } + + AK_FORCE_INLINE bool isNearSizeLimit() const { + return mExpandableTrieBuffer.isNearSizeLimit() + || mTerminalPositionLookupTable.isNearSizeLimit() + || mProbabilityDictContent.isNearSizeLimit() + || mBigramDictContent.isNearSizeLimit() + || mShortcutDictContent.isNearSizeLimit(); + } + + AK_FORCE_INLINE const HeaderPolicy *getHeaderPolicy() const { + return &mHeaderPolicy; + } + + AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableHeaderBuffer() { + return &mExpandableHeaderBuffer; + } + + AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableTrieBuffer() { + return &mExpandableTrieBuffer; + } + + AK_FORCE_INLINE const BufferWithExtendableBuffer *getTrieBuffer() const { + return &mExpandableTrieBuffer; + } + + AK_FORCE_INLINE TerminalPositionLookupTable *getMutableTerminalPositionLookupTable() { + return &mTerminalPositionLookupTable; + } + + AK_FORCE_INLINE const TerminalPositionLookupTable *getTerminalPositionLookupTable() const { + return &mTerminalPositionLookupTable; + } + + AK_FORCE_INLINE ProbabilityDictContent *getMutableProbabilityDictContent() { + return &mProbabilityDictContent; + } + + AK_FORCE_INLINE const ProbabilityDictContent *getProbabilityDictContent() const { + return &mProbabilityDictContent; + } + + AK_FORCE_INLINE BigramDictContent *getMutableBigramDictContent() { + return &mBigramDictContent; + } + + AK_FORCE_INLINE const BigramDictContent *getBigramDictContent() const { + return &mBigramDictContent; + } + + AK_FORCE_INLINE ShortcutDictContent *getMutableShortcutDictContent() { + return &mShortcutDictContent; + } + + AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const { + return &mShortcutDictContent; + } + + AK_FORCE_INLINE bool isUpdatable() const { + return mIsUpdatable; + } + + bool flush(const char *const dictDirPath) const { + return flushHeaderAndDictBuffers(dictDirPath, &mExpandableHeaderBuffer); + } + + bool flushHeaderAndDictBuffers(const char *const dictDirPath, + const BufferWithExtendableBuffer *const headerBuffer) const; + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4DictBuffers); + + Ver4DictBuffers(const char *const dictDirPath, + const MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable, + const FormatUtils::FORMAT_VERSION formatVersion); + + Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize); + + const MmappedBuffer::MmappedBufferPtr mHeaderBuffer; + const MmappedBuffer::MmappedBufferPtr mDictBuffer; + const HeaderPolicy mHeaderPolicy; + BufferWithExtendableBuffer mExpandableHeaderBuffer; + BufferWithExtendableBuffer mExpandableTrieBuffer; + TerminalPositionLookupTable mTerminalPositionLookupTable; + ProbabilityDictContent mProbabilityDictContent; + BigramDictContent mBigramDictContent; + ShortcutDictContent mShortcutDictContent; + const int mIsUpdatable; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp new file mode 100644 index 000000000..81d85f495 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" + +namespace latinime { +namespace backward { +namespace v402 { + +// These values MUST match the definitions in FormatSpec.java. +const char *const Ver4DictConstants::TRIE_FILE_EXTENSION = ".trie"; +const char *const Ver4DictConstants::HEADER_FILE_EXTENSION = ".header"; +const char *const Ver4DictConstants::FREQ_FILE_EXTENSION = ".freq"; +// tat = Terminal Address Table +const char *const Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; +const char *const Ver4DictConstants::BIGRAM_FILE_EXTENSION = ".bigram_freq"; +const char *const Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION = ".bigram_lookup"; +const char *const Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION = ".bigram_index_freq"; +const char *const Ver4DictConstants::SHORTCUT_FILE_EXTENSION = ".shortcut_shortcut"; +const char *const Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION = ".shortcut_lookup"; +const char *const Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION = + ".shortcut_index_shortcut"; + +// Version 4 dictionary size is implicitly limited to 8MB due to 3-byte offsets. +const int Ver4DictConstants::MAX_DICTIONARY_SIZE = 8 * 1024 * 1024; +// Extended region size, which is not GCed region size in dict file + additional buffer size, is +// limited to 1MB to prevent from inefficient traversing. +const int Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE = 1 * 1024 * 1024; + +const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1; +const int Ver4DictConstants::PROBABILITY_SIZE = 1; +const int Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1; +const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; +const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0; +const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4; +const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4; +const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1; +const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1; + +const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16; +const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE = 4; +const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; +const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4; + +const int Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE = 3; +// Unsigned int max value of BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE-byte is used for representing +// invalid terminal ID in bigram lists. +const int Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID = + (1 << (BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE * 8)) - 1; +const int Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE = 1; +const int Ver4DictConstants::BIGRAM_PROBABILITY_MASK = 0x0F; +const int Ver4DictConstants::BIGRAM_HAS_NEXT_MASK = 0x80; +const int Ver4DictConstants::BIGRAM_LARGE_PROBABILITY_FIELD_SIZE = 1; + +const int Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1; +const int Ver4DictConstants::SHORTCUT_PROBABILITY_MASK = 0x0F; +const int Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK = 0x80; + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h new file mode 100644 index 000000000..88ebd6a75 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H +#define LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H + +#include "defines.h" + +namespace latinime { +namespace backward { +namespace v402 { + +// TODO: Create PtConstants under the pt_common and move some constant values there. +// Note that there are corresponding definitions in FormatSpec.java. +class Ver4DictConstants { + public: + static const char *const TRIE_FILE_EXTENSION; + static const char *const HEADER_FILE_EXTENSION; + static const char *const FREQ_FILE_EXTENSION; + static const char *const TERMINAL_ADDRESS_TABLE_FILE_EXTENSION; + static const char *const BIGRAM_FILE_EXTENSION; + static const char *const BIGRAM_LOOKUP_TABLE_FILE_EXTENSION; + static const char *const BIGRAM_CONTENT_TABLE_FILE_EXTENSION; + static const char *const SHORTCUT_FILE_EXTENSION; + static const char *const SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION; + static const char *const SHORTCUT_CONTENT_TABLE_FILE_EXTENSION; + + static const int MAX_DICTIONARY_SIZE; + static const int MAX_DICT_EXTENDED_REGION_SIZE; + + static const int NOT_A_TERMINAL_ID; + static const int PROBABILITY_SIZE; + static const int FLAGS_IN_PROBABILITY_FILE_SIZE; + static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; + static const int NOT_A_TERMINAL_ADDRESS; + static const int TERMINAL_ID_FIELD_SIZE; + static const int TIME_STAMP_FIELD_SIZE; + static const int WORD_LEVEL_FIELD_SIZE; + static const int WORD_COUNT_FIELD_SIZE; + + static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE; + static const int BIGRAM_ADDRESS_TABLE_DATA_SIZE; + static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE; + static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE; + + static const int BIGRAM_FLAGS_FIELD_SIZE; + static const int BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; + static const int INVALID_BIGRAM_TARGET_TERMINAL_ID; + static const int BIGRAM_PROBABILITY_MASK; + static const int BIGRAM_HAS_NEXT_MASK; + // Used when bigram list has time stamp. + static const int BIGRAM_LARGE_PROBABILITY_FIELD_SIZE; + + static const int SHORTCUT_FLAGS_FIELD_SIZE; + static const int SHORTCUT_PROBABILITY_MASK; + static const int SHORTCUT_HAS_NEXT_MASK; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4DictConstants); +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp new file mode 100644 index 000000000..82399f190 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" + +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode( + const int ptNodePos, const int siblingNodePos) const { + if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) { + // Reading invalid position because of bug or broken dictionary. + AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d", + ptNodePos, mBuffer->getTailPosition()); + ASSERT(false); + return PtNodeParams(); + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int pos = ptNodePos; + const int headPos = ptNodePos; + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const int parentPosOffset = + DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( + dictBuf, &pos); + const int parentPos = + DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); + int codePoints[MAX_WORD_LENGTH]; + const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( + dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos); + int terminalIdFieldPos = NOT_A_DICT_POS; + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + int probability = NOT_A_PROBABILITY; + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + terminalIdFieldPos = pos; + if (usesAdditionalBuffer) { + terminalIdFieldPos += mBuffer->getOriginalBufferSize(); + } + terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos); + const ProbabilityEntry probabilityEntry = + mProbabilityDictContent->getProbabilityEntry(terminalId); + if (probabilityEntry.hasHistoricalInfo()) { + probability = ForgettingCurveUtils::decodeProbability( + probabilityEntry.getHistoricalInfo(), mHeaderPolicy); + } else { + probability = probabilityEntry.getProbability(); + } + } + int childrenPosFieldPos = pos; + if (usesAdditionalBuffer) { + childrenPosFieldPos += mBuffer->getOriginalBufferSize(); + } + int childrenPos = DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition( + dictBuf, &pos); + if (usesAdditionalBuffer && childrenPos != NOT_A_DICT_POS) { + childrenPos += mBuffer->getOriginalBufferSize(); + } + if (usesAdditionalBuffer) { + pos += mBuffer->getOriginalBufferSize(); + } + // Sibling position is the tail position of original PtNode. + int newSiblingNodePos = (siblingNodePos == NOT_A_DICT_POS) ? pos : siblingNodePos; + // Read destination node if the read node is a moved node. + if (DynamicPtReadingUtils::isMoved(flags)) { + // The destination position is stored at the same place as the parent position. + return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos); + } else { + return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints, + terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos, + newSiblingNodePos); + } +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h new file mode 100644 index 000000000..4032a67fa --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H +#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" + +namespace latinime { +namespace backward { +namespace v402 { + +} // namespace v402 +} // namespace backward +class BufferWithExtendableBuffer; +namespace backward { +namespace v402 { +} // namespace v402 +} // namespace backward +class HeaderPolicy; +namespace backward { +namespace v402 { +class ProbabilityDictContent; + +/* + * This class is used for helping to read nodes of ver4 patricia trie. This class handles moved + * node and reads node attributes including probability form probabilityBuffer. + */ +class Ver4PatriciaTrieNodeReader : public PtNodeReader { + public: + Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer, + const ProbabilityDictContent *const probabilityDictContent, + const HeaderPolicy *const headerPolicy) + : mBuffer(buffer), mProbabilityDictContent(probabilityDictContent), + mHeaderPolicy(headerPolicy) {} + + ~Ver4PatriciaTrieNodeReader() {} + + virtual const PtNodeParams fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) const { + return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos, + NOT_A_DICT_POS /* siblingNodePos */); + } + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader); + + const BufferWithExtendableBuffer *const mBuffer; + const ProbabilityDictContent *const mProbabilityDictContent; + const HeaderPolicy *const mHeaderPolicy; + + const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos, + const int siblingNodePos) const; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp new file mode 100644 index 000000000..4220a9561 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp @@ -0,0 +1,429 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" + +#include "suggest/core/dictionary/property/unigram_property.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3; + +bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted( + const PtNodeParams *const toBeUpdatedPtNodeParams) { + int pos = toBeUpdatedPtNodeParams->getHeadPos(); + const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mTrieBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, + true /* isDeleted */, false /* willBecomeNonTerminal */); + int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); + // Update flags. + if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, + &writingPos)) { + return false; + } + if (toBeUpdatedPtNodeParams->isTerminal()) { + // The PtNode is a terminal. Delete entry from the terminal position lookup table. + return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( + toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */); + } else { + return true; + } +} + +bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const int movedPos, const int bigramLinkedNodePos) { + int pos = toBeUpdatedPtNodeParams->getHeadPos(); + const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mTrieBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */, + false /* isDeleted */, false /* willBecomeNonTerminal */); + int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); + // Update flags. + if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, + &writingPos)) { + return false; + } + // Update moved position, which is stored in the parent offset field. + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( + mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { + return false; + } + if (toBeUpdatedPtNodeParams->hasChildren()) { + // Update children's parent position. + mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos()); + while (!mReadingHelper.isEnd()) { + const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams()); + int parentOffsetFieldPos = childPtNodeParams.getHeadPos() + + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( + mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(), + &parentOffsetFieldPos)) { + // Parent offset cannot be written because of a bug or a broken dictionary; thus, + // we give up to update dictionary. + return false; + } + mReadingHelper.readNextSiblingNode(childPtNodeParams); + } + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal( + const PtNodeParams *const toBeUpdatedPtNodeParams) { + int pos = toBeUpdatedPtNodeParams->getHeadPos(); + const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mTrieBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, + false /* isDeleted */, true /* willBecomeNonTerminal */); + if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( + toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) { + AKLOGE("Cannot update terminal position lookup table. terminal id: %d", + toBeUpdatedPtNodeParams->getTerminalId()); + return false; + } + // Update flags. + int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); + return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, + &writingPos); +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty) { + // Update probability and historical information. + // TODO: Update other information in the unigram property. + if (!toBeUpdatedPtNodeParams->isTerminal()) { + return false; + } + const ProbabilityEntry originalProbabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId()); + const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry, + unigramProperty); + return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry); +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) { + if (!toBeUpdatedPtNodeParams->isTerminal()) { + AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode."); + return false; + } + const ProbabilityEntry originalProbabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId()); + if (originalProbabilityEntry.hasHistoricalInfo()) { + const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( + originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy); + const ProbabilityEntry probabilityEntry = + originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo); + if (!mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) { + AKLOGE("Cannot write updated probability entry. terminalId: %d", + toBeUpdatedPtNodeParams->getTerminalId()); + return false; + } + const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy); + if (!isValid) { + if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { + AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); + return false; + } + } + *outNeedsToKeepPtNode = isValid; + } else { + // No need to update probability. + *outNeedsToKeepPtNode = true; + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition( + const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) { + int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos(); + return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, + newChildrenPosition, &childrenPosFieldPos); +} + +bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newTerminalId) { + return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, + toBeUpdatedPtNodeParams->getTerminalIdFieldPos()); +} + +bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition( + const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) { + return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */, + ptNodeWritingPos); +} + + +bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( + const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty, + int *const ptNodeWritingPos) { + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId, + ptNodeWritingPos)) { + return false; + } + // Write probability. + ProbabilityEntry newProbabilityEntry; + const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom( + &newProbabilityEntry, unigramProperty); + return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(terminalId, + &probabilityEntryToWrite); +} + +bool Ver4PatriciaTrieNodeWriter::addNewBigramEntry( + const PtNodeParams *const sourcePtNodeParams, const PtNodeParams *const targetPtNodeParam, + const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) { + if (!mBigramPolicy->addNewEntry(sourcePtNodeParams->getTerminalId(), + targetPtNodeParam->getTerminalId(), bigramProperty, outAddedNewBigram)) { + AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d", + sourcePtNodeParams->getTerminalId(), targetPtNodeParam->getTerminalId()); + return false; + } + if (!sourcePtNodeParams->hasBigrams()) { + // Update has bigrams flag. + return updatePtNodeFlags(sourcePtNodeParams->getHeadPos(), + sourcePtNodeParams->isBlacklisted(), sourcePtNodeParams->isNotAWord(), + sourcePtNodeParams->isTerminal(), sourcePtNodeParams->hasShortcutTargets(), + true /* hasBigrams */, + sourcePtNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::removeBigramEntry( + const PtNodeParams *const sourcePtNodeParams, const PtNodeParams *const targetPtNodeParam) { + return mBigramPolicy->removeEntry(sourcePtNodeParams->getTerminalId(), + targetPtNodeParam->getTerminalId()); +} + +bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries( + const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) { + return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries( + sourcePtNodeParams->getTerminalId(), outBigramEntryCount); +} + +bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const DictPositionRelocationMap *const dictPositionRelocationMap, + int *const outBigramEntryCount) { + int parentPos = toBeUpdatedPtNodeParams->getParentPos(); + if (parentPos != NOT_A_DICT_POS) { + PtNodeWriter::PtNodePositionRelocationMap::const_iterator it = + dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos); + if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) { + parentPos = it->second; + } + } + int writingPos = toBeUpdatedPtNodeParams->getHeadPos() + + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; + // Write updated parent offset. + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, + parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { + return false; + } + + // Updates children position. + int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos(); + if (childrenPos != NOT_A_DICT_POS) { + PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it = + dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos); + if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) { + childrenPos = it->second; + } + } + if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) { + return false; + } + + // Counts bigram entries. + if (outBigramEntryCount) { + *outBigramEntryCount = mBigramPolicy->getBigramEntryConut( + toBeUpdatedPtNodeParams->getTerminalId()); + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability) { + if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(), + targetCodePoints, targetCodePointCount, shortcutProbability)) { + AKLOGE("Cannot add new shortuct entry. terminalId: %d", ptNodeParams->getTerminalId()); + return false; + } + if (!ptNodeParams->hasShortcutTargets()) { + // Update has shortcut targets flag. + return updatePtNodeFlags(ptNodeParams->getHeadPos(), + ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(), + ptNodeParams->isTerminal(), true /* hasShortcutTargets */, + ptNodeParams->hasBigrams(), + ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeHasBigramsAndShortcutTargetsFlags( + const PtNodeParams *const ptNodeParams) { + const bool hasBigrams = mBuffers->getBigramDictContent()->getBigramListHeadPos( + ptNodeParams->getTerminalId()) != NOT_A_DICT_POS; + const bool hasShortcutTargets = mBuffers->getShortcutDictContent()->getShortcutListHeadPos( + ptNodeParams->getTerminalId()) != NOT_A_DICT_POS; + return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isBlacklisted(), + ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), hasShortcutTargets, + hasBigrams, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); +} + +bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( + const PtNodeParams *const ptNodeParams, int *const outTerminalId, + int *const ptNodeWritingPos) { + const int nodePos = *ptNodeWritingPos; + // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the + // PtNode writing. + if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, + 0 /* nodeFlags */, ptNodeWritingPos)) { + return false; + } + // Calculate a parent offset and write the offset. + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, + ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) { + return false; + } + // Write code points + if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer, + ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) { + return false; + } + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + if (!ptNodeParams->willBecomeNonTerminal()) { + if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) { + terminalId = ptNodeParams->getTerminalId(); + } else if (ptNodeParams->isTerminal()) { + // Write terminal information using a new terminal id. + // Get a new unused terminal id. + terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId(); + } + } + const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID; + if (isTerminal) { + // Update the lookup table. + if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( + terminalId, nodePos)) { + return false; + } + // Write terminal Id. + if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId, + Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) { + return false; + } + if (outTerminalId) { + *outTerminalId = terminalId; + } + } + // Write children position + if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, + ptNodeParams->getChildrenPos(), ptNodeWritingPos)) { + return false; + } + return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(), + isTerminal, ptNodeParams->hasShortcutTargets(), ptNodeParams->hasBigrams(), + ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); +} + +const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( + const ProbabilityEntry *const originalProbabilityEntry, + const UnigramProperty *const unigramProperty) const { + // TODO: Consolidate historical info and probability. + if (mHeaderPolicy->hasHistoricalInfoOfWords()) { + const HistoricalInfo historicalInfoForUpdate(unigramProperty->getTimestamp(), + unigramProperty->getLevel(), unigramProperty->getCount()); + const HistoricalInfo updatedHistoricalInfo = + ForgettingCurveUtils::createUpdatedHistoricalInfo( + originalProbabilityEntry->getHistoricalInfo(), + unigramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy); + return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo( + &updatedHistoricalInfo); + } else { + return originalProbabilityEntry->createEntryWithUpdatedProbability( + unigramProperty->getProbability()); + } +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, + const bool isBlacklisted, const bool isNotAWord, const bool isTerminal, + const bool hasShortcutTargets, const bool hasBigrams, const bool hasMultipleChars) { + // Create node flags and write them. + PatriciaTrieReadingUtils::NodeFlags nodeFlags = + PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal, + hasShortcutTargets, hasBigrams, hasMultipleChars, + CHILDREN_POSITION_FIELD_SIZE); + if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) { + AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos); + return false; + } + return true; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h new file mode 100644 index 000000000..08226ea26 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H +#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h" + +namespace latinime { +namespace backward { +namespace v402 { + +} // namespace v402 +} // namespace backward +class BufferWithExtendableBuffer; +namespace backward { +namespace v402 { +} // namespace v402 +} // namespace backward +class HeaderPolicy; +namespace backward { +namespace v402 { +class Ver4BigramListPolicy; +class Ver4DictBuffers; +class Ver4PatriciaTrieNodeReader; +class Ver4PtNodeArrayReader; +class Ver4ShortcutListPolicy; + +/* + * This class is used for helping to writes nodes of ver4 patricia trie. + */ +class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { + public: + Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer, + Ver4DictBuffers *const buffers, const HeaderPolicy *const headerPolicy, + const PtNodeReader *const ptNodeReader, + const PtNodeArrayReader *const ptNodeArrayReader, + Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy) + : mTrieBuffer(trieBuffer), mBuffers(buffers), mHeaderPolicy(headerPolicy), + mReadingHelper(ptNodeReader, ptNodeArrayReader), mBigramPolicy(bigramPolicy), + mShortcutPolicy(shortcutPolicy) {} + + virtual ~Ver4PatriciaTrieNodeWriter() {} + + virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams); + + virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int movedPos, const int bigramLinkedNodePos); + + virtual bool markPtNodeAsWillBecomeNonTerminal( + const PtNodeParams *const toBeUpdatedPtNodeParams); + + virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty); + + virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode); + + virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newChildrenPosition); + + bool updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newTerminalId); + + virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + int *const ptNodeWritingPos); + + virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos); + + virtual bool addNewBigramEntry(const PtNodeParams *const sourcePtNodeParams, + const PtNodeParams *const targetPtNodeParam, const BigramProperty *const bigramProperty, + bool *const outAddedNewBigram); + + virtual bool removeBigramEntry(const PtNodeParams *const sourcePtNodeParams, + const PtNodeParams *const targetPtNodeParam); + + virtual bool updateAllBigramEntriesAndDeleteUselessEntries( + const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount); + + virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams, + const DictPositionRelocationMap *const dictPositionRelocationMap, + int *const outBigramEntryCount); + + virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability); + + bool updatePtNodeHasBigramsAndShortcutTargetsFlags(const PtNodeParams *const ptNodeParams); + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter); + + bool writePtNodeAndGetTerminalIdAndAdvancePosition( + const PtNodeParams *const ptNodeParams, int *const outTerminalId, + int *const ptNodeWritingPos); + + // Create updated probability entry using given unigram property. In addition to the + // probability, this method updates historical information if needed. + // TODO: Update flags belonging to the unigram property. + const ProbabilityEntry createUpdatedEntryFrom( + const ProbabilityEntry *const originalProbabilityEntry, + const UnigramProperty *const unigramProperty) const; + + bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord, + const bool isTerminal, const bool hasShortcutTargets, const bool hasBigrams, + const bool hasMultipleChars); + + static const int CHILDREN_POSITION_FIELD_SIZE; + + BufferWithExtendableBuffer *const mTrieBuffer; + Ver4DictBuffers *const mBuffers; + const HeaderPolicy *const mHeaderPolicy; + DynamicPtReadingHelper mReadingHelper; + Ver4BigramListPolicy *const mBigramPolicy; + Ver4ShortcutListPolicy *const mShortcutPolicy; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp new file mode 100644 index 000000000..805820b05 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp @@ -0,0 +1,519 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! + * Do not edit this file other than updating policy's interface. + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h" + +#include <vector> + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/dictionary/property/bigram_property.h" +#include "suggest/core/dictionary/property/unigram_property.h" +#include "suggest/core/dictionary/property/word_property.h" +#include "suggest/core/session/prev_words_info.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and +// BinaryDictionaryDecayingTests. +const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; +const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; +const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = + Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; + +void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); + while (!readingHelper.isEnd()) { + const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); + if (!ptNodeParams.isValid()) { + break; + } + bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); + if (isTerminal && mHeaderPolicy->isDecayingDict()) { + // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose + // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a + // valid terminal DicNode. + isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; + } + readingHelper.readNextSiblingNode(ptNodeParams); + if (ptNodeParams.representsNonWordInfo()) { + // Skip PtNodes that represent non-word information. + continue; + } + childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), + ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, + ptNodeParams.hasChildren(), + ptNodeParams.isBlacklisted() + || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, + ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints()); + } + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + } +} + +int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( + const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodePos(ptNodePos); + const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount( + maxCodePointCount, outCodePoints, outUnigramProbability); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); + } + return codePointCount; +} + +int Ver4PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + const int ptNodePos = + readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + } + return ptNodePos; +} + +int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, + const int bigramProbability) const { + if (mHeaderPolicy->isDecayingDict()) { + // Both probabilities are encoded. Decode them and get probability. + return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability); + } else { + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (bigramProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } else { + return bigramProbability; + } + } +} + +int Ver4PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_PROBABILITY; + } + const PtNodeParams ptNodeParams(mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { + return NOT_A_PROBABILITY; + } + return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); +} + +int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + const PtNodeParams ptNodeParams(mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted()) { + return NOT_A_DICT_POS; + } + return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( + ptNodeParams.getTerminalId()); +} + +int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + const PtNodeParams ptNodeParams(mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted()) { + return NOT_A_DICT_POS; + } + return mBuffers->getBigramDictContent()->getBigramListHeadPos( + ptNodeParams.getTerminalId()); +} + +bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length, + const UnigramProperty *const unigramProperty) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (length > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert to the dictionary, length: %d", length); + return false; + } + for (const auto &shortcut : unigramProperty->getShortcuts()) { + if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %d", + shortcut.getTargetCodePoints()->size()); + return false; + } + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + bool addedNewUnigram = false; + int codePointsToAdd[MAX_WORD_LENGTH]; + int codePointCountToAdd = length; + memmove(codePointsToAdd, word, sizeof(int) * length); + if (unigramProperty->representsBeginningOfSentence()) { + codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, + codePointCountToAdd, MAX_WORD_LENGTH); + } + if (codePointCountToAdd <= 0) { + return false; + } + if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd, + unigramProperty, &addedNewUnigram)) { + if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { + mUnigramCount++; + } + if (unigramProperty->getShortcuts().size() > 0) { + // Add shortcut target. + const int wordPos = getTerminalPtNodePositionOfWord(word, length, + false /* forceLowerCaseSearch */); + if (wordPos == NOT_A_DICT_POS) { + AKLOGE("Cannot find terminal PtNode position to add shortcut target."); + return false; + } + for (const auto &shortcut : unigramProperty->getShortcuts()) { + if (!mUpdatingHelper.addShortcutTarget(wordPos, + shortcut.getTargetCodePoints()->data(), + shortcut.getTargetCodePoints()->size(), shortcut.getProbability())) { + AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, " + "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), + shortcut.getProbability()); + return false; + } + } + } + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, + const BigramProperty *const bigramProperty) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (!prevWordsInfo->isValid()) { + AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary."); + return false; + } + if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert the ngram to the dictionary. " + "length: %d", bigramProperty->getTargetCodePoints()->size()); + return false; + } + int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, + false /* tryLowerCaseSearch */); + // TODO: Support N-gram. + if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { + if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) { + const std::vector<UnigramProperty::ShortcutProperty> shortcuts; + const UnigramProperty beginningOfSentenceUnigramProperty( + true /* representsBeginningOfSentence */, true /* isNotAWord */, + false /* isBlacklisted */, MAX_PROBABILITY /* probability */, + NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); + if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */), + prevWordsInfo->getNthPrevWordCodePointCount(1 /* n */), + &beginningOfSentenceUnigramProperty)) { + AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); + return false; + } + // Refresh Terminal PtNode positions. + prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, + false /* tryLowerCaseSearch */); + } else { + return false; + } + } + const int word1Pos = getTerminalPtNodePositionOfWord( + bigramProperty->getTargetCodePoints()->data(), + bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */); + if (word1Pos == NOT_A_DICT_POS) { + return false; + } + bool addedNewBigram = false; + if (mUpdatingHelper.addBigramWords(prevWordsPtNodePos[0], word1Pos, bigramProperty, + &addedNewBigram)) { + if (addedNewBigram) { + mBigramCount++; + } + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, + const int *const word, const int length) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (!prevWordsInfo->isValid()) { + AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary."); + return false; + } + if (length > MAX_WORD_LENGTH) { + AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length); + } + int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, + false /* tryLowerCaseSerch */); + // TODO: Support N-gram. + if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { + return false; + } + const int wordPos = getTerminalPtNodePositionOfWord(word, length, + false /* forceLowerCaseSearch */); + if (wordPos == NOT_A_DICT_POS) { + return false; + } + if (mUpdatingHelper.removeBigramWords(prevWordsPtNodePos[0], wordPos)) { + mBigramCount--; + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); + return false; + } + if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { + AKLOGE("Cannot flush the dictionary to file."); + mIsCorrupted = true; + return false; + } + return true; +} + +bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return false; + } + if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { + AKLOGE("Cannot flush the dictionary to file with GC."); + mIsCorrupted = true; + return false; + } + return true; +} + +bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); + return false; + } + if (mBuffers->isNearSizeLimit()) { + // Additional buffer size is near the limit. + return true; + } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() + > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { + // Total extended region size of the trie exceeds the limit. + return true; + } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS + && mDictBuffer->getUsedAdditionalBufferSize() > 0) { + // Needs to reduce dictionary size. + return true; + } else if (mHeaderPolicy->isDecayingDict()) { + return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount, + mHeaderPolicy); + } + return false; +} + +void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, + char *const outResult, const int maxResultLength) { + const int compareLength = queryLength + 1 /* terminator */; + if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", mUnigramCount); + } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", mBigramCount); + } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy->isDecayingDict() ? + ForgettingCurveUtils::getUnigramCountHardLimit( + mHeaderPolicy->getMaxUnigramCount()) : + static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); + } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy->isDecayingDict() ? + ForgettingCurveUtils::getBigramCountHardLimit( + mHeaderPolicy->getMaxBigramCount()) : + static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); + } +} + +const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const codePoints, + const int codePointCount) const { + const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount, + false /* forceLowerCaseSearch */); + if (ptNodePos == NOT_A_DICT_POS) { + AKLOGE("getWordProperty is called for invalid word."); + return WordProperty(); + } + const PtNodeParams ptNodeParams = mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + std::vector<int> codePointVector(ptNodeParams.getCodePoints(), + ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); + const ProbabilityEntry probabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry( + ptNodeParams.getTerminalId()); + const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); + // Fetch bigram information. + std::vector<BigramProperty> bigrams; + const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); + if (bigramListPos != NOT_A_DICT_POS) { + int bigramWord1CodePoints[MAX_WORD_LENGTH]; + const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent(); + const TerminalPositionLookupTable *const terminalPositionLookupTable = + mBuffers->getTerminalPositionLookupTable(); + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const BigramEntry bigramEntry = + bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + const int word1TerminalId = bigramEntry.getTargetTerminalId(); + const int word1TerminalPtNodePos = + terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId); + if (word1TerminalPtNodePos == NOT_A_DICT_POS) { + continue; + } + // Word (unigram) probability + int word1Probability = NOT_A_PROBABILITY; + const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( + word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints, + &word1Probability); + const std::vector<int> word1(bigramWord1CodePoints, + bigramWord1CodePoints + codePointCount); + const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); + const int probability = bigramEntry.hasHistoricalInfo() ? + ForgettingCurveUtils::decodeProbability( + bigramEntry.getHistoricalInfo(), mHeaderPolicy) : + bigramEntry.getProbability(); + bigrams.emplace_back(&word1, probability, + historicalInfo->getTimeStamp(), historicalInfo->getLevel(), + historicalInfo->getCount()); + } + } + // Fetch shortcut information. + std::vector<UnigramProperty::ShortcutProperty> shortcuts; + int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); + if (shortcutPos != NOT_A_DICT_POS) { + int shortcutTarget[MAX_WORD_LENGTH]; + const ShortcutDictContent *const shortcutDictContent = + mBuffers->getShortcutDictContent(); + bool hasNext = true; + while (hasNext) { + int shortcutTargetLength = 0; + int shortcutProbability = NOT_A_PROBABILITY; + shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, + &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); + const std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength); + shortcuts.emplace_back(&target, shortcutProbability); + } + } + const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), + ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), + historicalInfo->getTimeStamp(), historicalInfo->getLevel(), + historicalInfo->getCount(), &shortcuts); + return WordProperty(&codePointVector, &unigramProperty, &bigrams); +} + +int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { + // TODO: Return code point count like other methods. + // Null termination. + outCodePoints[0] = 0; + if (token == 0) { + mTerminalPtNodePositionsForIteratingWords.clear(); + DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( + &mTerminalPtNodePositionsForIteratingWords); + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); + } + const int terminalPtNodePositionsVectorSize = + static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); + if (token < 0 || token >= terminalPtNodePositionsVectorSize) { + AKLOGE("Given token %d is invalid.", token); + return 0; + } + const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; + int unigramProbability = NOT_A_PROBABILITY; + const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( + terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); + if (codePointCount < MAX_WORD_LENGTH) { + // Null termination. outCodePoints have to be null terminated or contain MAX_WORD_LENGTH + // code points. + outCodePoints[codePointCount] = 0; + } + const int nextToken = token + 1; + if (nextToken >= terminalPtNodePositionsVectorSize) { + // All words have been iterated. + mTerminalPtNodePositionsForIteratingWords.clear(); + return 0; + } + return nextToken; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h new file mode 100644 index 000000000..2e948ac4a --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! + * Do not edit this file other than updating policy's interface. + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H +#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H + +#include <vector> + +#include "defines.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +} // namespace v402 +} // namespace backward +class DicNode; +namespace backward { +namespace v402 { +} // namespace v402 +} // namespace backward +class DicNodeVector; +namespace backward { +namespace v402 { + +class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { + public: + Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers) + : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()), + mDictBuffer(mBuffers->getWritableTrieBuffer()), + mBigramPolicy(mBuffers->getMutableBigramDictContent(), + mBuffers->getTerminalPositionLookupTable(), mHeaderPolicy), + mShortcutPolicy(mBuffers->getMutableShortcutDictContent(), + mBuffers->getTerminalPositionLookupTable()), + mNodeReader(mDictBuffer, mBuffers->getProbabilityDictContent(), mHeaderPolicy), + mPtNodeArrayReader(mDictBuffer), + mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader, + &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy), + mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), + mWritingHelper(mBuffers.get()), + mUnigramCount(mHeaderPolicy->getUnigramCount()), + mBigramCount(mHeaderPolicy->getBigramCount()), + mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; + + AK_FORCE_INLINE int getRootPosition() const { + return 0; + } + + void createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const; + + int getCodePointsAndProbabilityAndReturnCodePointCount( + const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const; + + int getTerminalPtNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const; + + int getProbability(const int unigramProbability, const int bigramProbability) const; + + int getUnigramProbabilityOfPtNode(const int ptNodePos) const; + + int getShortcutPositionOfPtNode(const int ptNodePos) const; + + int getBigramsPositionOfPtNode(const int ptNodePos) const; + + const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { + return mHeaderPolicy; + } + + const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const { + return &mBigramPolicy; + } + + const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const { + return &mShortcutPolicy; + } + + bool addUnigramEntry(const int *const word, const int length, + const UnigramProperty *const unigramProperty); + + bool removeUnigramEntry(const int *const word, const int length) { + // Removing unigram entry is not supported. + return false; + } + + bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, + const BigramProperty *const bigramProperty); + + bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word1, + const int length1); + + bool flush(const char *const filePath); + + bool flushWithGC(const char *const filePath); + + bool needsToRunGC(const bool mindsBlockByGC) const; + + void getProperty(const char *const query, const int queryLength, char *const outResult, + const int maxResultLength); + + const WordProperty getWordProperty(const int *const codePoints, + const int codePointCount) const; + + int getNextWordAndNextToken(const int token, int *const outCodePoints); + + bool isCorrupted() const { + return mIsCorrupted; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy); + + static const char *const UNIGRAM_COUNT_QUERY; + static const char *const BIGRAM_COUNT_QUERY; + static const char *const MAX_UNIGRAM_COUNT_QUERY; + static const char *const MAX_BIGRAM_COUNT_QUERY; + // When the dictionary size is near the maximum size, we have to refuse dynamic operations to + // prevent the dictionary from overflowing. + static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; + static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; + + const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers; + const HeaderPolicy *const mHeaderPolicy; + BufferWithExtendableBuffer *const mDictBuffer; + Ver4BigramListPolicy mBigramPolicy; + Ver4ShortcutListPolicy mShortcutPolicy; + Ver4PatriciaTrieNodeReader mNodeReader; + Ver4PtNodeArrayReader mPtNodeArrayReader; + Ver4PatriciaTrieNodeWriter mNodeWriter; + DynamicPtUpdatingHelper mUpdatingHelper; + Ver4PatriciaTrieWritingHelper mWritingHelper; + int mUnigramCount; + int mBigramCount; + std::vector<int> mTerminalPtNodePositionsForIteratingWords; + mutable bool mIsCorrupted; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif // LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp new file mode 100644 index 000000000..80d531198 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" + +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +/* static */ int Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition( + const uint8_t *const buffer, int *pos) { + return ByteArrayUtils::readUint32AndAdvancePosition(buffer, pos); +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h new file mode 100644 index 000000000..3579c26d6 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H +#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H + +#include <cstdint> + +#include "defines.h" + +namespace latinime { +namespace backward { +namespace v402 { + +} // namespace v402 +} // namespace backward +class BufferWithExtendableBuffer; +namespace backward { +namespace v402 { + +class Ver4PatriciaTrieReadingUtils { + public: + static int getTerminalIdAndAdvancePosition(const uint8_t *const buffer, + int *const pos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieReadingUtils); +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp new file mode 100644 index 000000000..99eed0f67 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp @@ -0,0 +1,304 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" + +#include <cstring> +#include <queue> + +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/file_utils.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, + const int unigramCount, const int bigramCount) const { + const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); + BufferWithExtendableBuffer headerBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); + if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, + unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) { + AKLOGE("Cannot write header structure to buffer. " + "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " + "extendedRegionSize: %d", false, unigramCount, bigramCount, + extendedRegionSize); + return false; + } + return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); +} + +bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, + const char *const dictDirPath) { + const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); + Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( + Ver4DictBuffers::createVer4DictBuffers(headerPolicy, + Ver4DictConstants::MAX_DICTIONARY_SIZE)); + int unigramCount = 0; + int bigramCount = 0; + if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) { + return false; + } + BufferWithExtendableBuffer headerBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, + unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) { + return false; + } + return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); +} + +bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, + const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, + int *const outUnigramCount, int *const outBigramCount) { + Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(), + mBuffers->getProbabilityDictContent(), headerPolicy); + Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); + Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(), + mBuffers->getTerminalPositionLookupTable(), headerPolicy); + Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), + mBuffers->getTerminalPositionLookupTable()); + Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), + mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, + &shortcutPolicy); + + DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + &ptNodeWriter); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { + return false; + } + const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + .getValidUnigramCount(); + const int maxUnigramCount = headerPolicy->getMaxUnigramCount(); + if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) { + if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) { + AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, + maxUnigramCount); + return false; + } + } + + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability + traversePolicyToUpdateBigramProbability(&ptNodeWriter); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateBigramProbability)) { + return false; + } + const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); + const int maxBigramCount = headerPolicy->getMaxBigramCount(); + if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) { + if (!truncateBigrams(maxBigramCount)) { + AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount); + return false; + } + } + + // Mapping from positions in mBuffer to positions in bufferToWrite. + PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), + buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, + &shortcutPolicy); + DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, + buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); + if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { + return false; + } + + // Create policy instances for the GCed dictionary. + Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(), + buffersToWrite->getProbabilityDictContent(), headerPolicy); + Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); + Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(), + buffersToWrite->getTerminalPositionLookupTable(), headerPolicy); + Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), + buffersToWrite->getTerminalPositionLookupTable()); + Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), + buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy, + &newShortcutPolicy); + // Re-assign terminal IDs for valid terminal PtNodes. + TerminalPositionLookupTable::TerminalIdMap terminalIdMap; + if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds( + &terminalIdMap)) { + return false; + } + // Run GC for probability dict content. + if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap, + mBuffers->getProbabilityDictContent())) { + return false; + } + // Run GC for bigram dict content. + if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap, + mBuffers->getBigramDictContent(), outBigramCount)) { + return false; + } + // Run GC for shortcut dict content. + if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap, + mBuffers->getShortcutDictContent())) { + return false; + } + DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader); + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields + traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap); + if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToUpdateAllPositionFields)) { + return false; + } + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap); + if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) { + return false; + } + *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount(); + return true; +} + +bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( + const Ver4PatriciaTrieNodeReader *const ptNodeReader, + Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) { + const TerminalPositionLookupTable *const terminalPosLookupTable = + mBuffers->getTerminalPositionLookupTable(); + const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); + std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator> + priorityQueue; + for (int i = 0; i < nextTerminalId; ++i) { + const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i); + if (terminalPos == NOT_A_DICT_POS) { + continue; + } + const ProbabilityEntry probabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry(i); + const int probability = probabilityEntry.hasHistoricalInfo() ? + ForgettingCurveUtils::decodeProbability( + probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : + probabilityEntry.getProbability(); + priorityQueue.push(DictProbability(terminalPos, probability, + probabilityEntry.getHistoricalInfo()->getTimeStamp())); + } + + // Delete unigrams. + while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) { + const int ptNodePos = priorityQueue.top().getDictPos(); + priorityQueue.pop(); + const PtNodeParams ptNodeParams = + ptNodeReader->fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + if (ptNodeParams.representsNonWordInfo()) { + continue; + } + if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) { + AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos); + return false; + } + } + return true; +} + +bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) { + const TerminalPositionLookupTable *const terminalPosLookupTable = + mBuffers->getTerminalPositionLookupTable(); + const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); + std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator> + priorityQueue; + BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent(); + for (int i = 0; i < nextTerminalId; ++i) { + const int bigramListPos = bigramDictContent->getBigramListHeadPos(i); + if (bigramListPos == NOT_A_DICT_POS) { + continue; + } + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const int entryPos = readingPos; + const BigramEntry bigramEntry = + bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (!bigramEntry.isValid()) { + continue; + } + const int probability = bigramEntry.hasHistoricalInfo() ? + ForgettingCurveUtils::decodeProbability( + bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : + bigramEntry.getProbability(); + priorityQueue.push(DictProbability(entryPos, probability, + bigramEntry.getHistoricalInfo()->getTimeStamp())); + } + } + + // Delete bigrams. + while (static_cast<int>(priorityQueue.size()) > maxBigramCount) { + const int entryPos = priorityQueue.top().getDictPos(); + const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos); + const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry(); + if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) { + AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos); + return false; + } + priorityQueue.pop(); + } + return true; +} + +bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + if (!ptNodeParams->isTerminal()) { + return true; + } + TerminalPositionLookupTable::TerminalIdMap::const_iterator it = + mTerminalIdMap->find(ptNodeParams->getTerminalId()); + if (it == mTerminalIdMap->end()) { + AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", + ptNodeParams->getTerminalId(), mTerminalIdMap->size()); + return false; + } + if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { + AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second); + } + return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams); +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h new file mode 100644 index 000000000..9034ee656 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H +#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" + +namespace latinime { +namespace backward { +namespace v402 { + +} // namespace v402 +} // namespace backward +class HeaderPolicy; +namespace backward { +namespace v402 { +class Ver4DictBuffers; +class Ver4PatriciaTrieNodeReader; +class Ver4PatriciaTrieNodeWriter; + +class Ver4PatriciaTrieWritingHelper { + public: + Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers) + : mBuffers(buffers) {} + + bool writeToDictFile(const char *const dictDirPath, const int unigramCount, + const int bigramCount) const; + + // This method cannot be const because the original dictionary buffer will be updated to detect + // useless PtNodes during GC. + bool writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const dictDirPath); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper); + + class TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + : public DynamicPtReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds( + Ver4PatriciaTrieNodeWriter *const ptNodeWriter, + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap) + : mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {} + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos) { return true; } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds); + + Ver4PatriciaTrieNodeWriter *const mPtNodeWriter; + const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap; + }; + + // For truncateUnigrams() and truncateBigrams(). + class DictProbability { + public: + DictProbability(const int dictPos, const int probability, const int timestamp) + : mDictPos(dictPos), mProbability(probability), mTimestamp(timestamp) {} + + int getDictPos() const { + return mDictPos; + } + + int getProbability() const { + return mProbability; + } + + int getTimestamp() const { + return mTimestamp; + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(DictProbability); + + int mDictPos; + int mProbability; + int mTimestamp; + }; + + // For truncateUnigrams() and truncateBigrams(). + class DictProbabilityComparator { + public: + bool operator()(const DictProbability &left, const DictProbability &right) { + if (left.getProbability() != right.getProbability()) { + return left.getProbability() > right.getProbability(); + } + if (left.getTimestamp() != right.getTimestamp()) { + return left.getTimestamp() < right.getTimestamp(); + } + return left.getDictPos() > right.getDictPos(); + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(DictProbabilityComparator); + }; + + bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, + Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount, + int *const outBigramCount); + + bool truncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader, + Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount); + + bool truncateBigrams(const int maxBigramCount); + + Ver4DictBuffers *const mBuffers; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime + +#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp new file mode 100644 index 000000000..537a6d420 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp + */ + +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" + +#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +bool Ver4PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const { + if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mBuffer->getTailPosition()) { + // Reading invalid position because of a bug or a broken dictionary. + AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d", + ptNodeArrayPos, mBuffer->getTailPosition()); + ASSERT(false); + return false; + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodeArrayPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int readingPos = ptNodeArrayPos; + if (usesAdditionalBuffer) { + readingPos -= mBuffer->getOriginalBufferSize(); + } + const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + dictBuf, &readingPos); + if (usesAdditionalBuffer) { + readingPos += mBuffer->getOriginalBufferSize(); + } + if (ptNodeCountInArray < 0) { + AKLOGE("Invalid PtNode count in an array: %d.", ptNodeCountInArray); + return false; + } + *outPtNodeCount = ptNodeCountInArray; + *outFirstPtNodePos = readingPos; + return true; +} + +bool Ver4PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const { + if (forwordLinkPos < 0 || forwordLinkPos >= mBuffer->getTailPosition()) { + // Reading invalid position because of bug or broken dictionary. + AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d", + forwordLinkPos, mBuffer->getTailPosition()); + ASSERT(false); + return false; + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(forwordLinkPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int readingPos = forwordLinkPos; + if (usesAdditionalBuffer) { + readingPos -= mBuffer->getOriginalBufferSize(); + } + const int nextPtNodeArrayOffset = + DynamicPtReadingUtils::getForwardLinkPosition(dictBuf, readingPos); + if (DynamicPtReadingUtils::isValidForwardLinkPosition(nextPtNodeArrayOffset)) { + *outNextPtNodeArrayPos = forwordLinkPos + nextPtNodeArrayOffset; + } else { + *outNextPtNodeArrayPos = NOT_A_DICT_POS; + } + return true; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h new file mode 100644 index 000000000..4f8056801 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H +#define LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h" + +namespace latinime { +namespace backward { +namespace v402 { + +} // namespace v402 +} // namespace backward +class BufferWithExtendableBuffer; +namespace backward { +namespace v402 { + +class Ver4PtNodeArrayReader : public PtNodeArrayReader { + public: + Ver4PtNodeArrayReader(const BufferWithExtendableBuffer *const buffer) : mBuffer(buffer) {}; + + virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const; + virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const; + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PtNodeArrayReader); + + const BufferWithExtendableBuffer *const mBuffer; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp index be7e43b98..e4b5fa267 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp @@ -19,6 +19,9 @@ #include <climits> #include "defines.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" #include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" @@ -42,7 +45,7 @@ namespace latinime { if (isUpdatable) { AKLOGE("One file dictionaries don't support updating. path: %s", path); ASSERT(false); - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(nullptr); + return nullptr; } return newPolicyForFileDict(path, bufOffset, size); } @@ -52,26 +55,45 @@ namespace latinime { DictionaryStructureWithBufferPolicyFactory:: newPolicyForOnMemoryDict( const int formatVersion, const std::vector<int> &locale, const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { - switch (formatVersion) { + FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion); + switch (dictFormatVersion) { case FormatUtils::VERSION_4: { - HeaderPolicy headerPolicy(FormatUtils::VERSION_4, locale, attributeMap); - Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers = - Ver4DictBuffers::createVer4DictBuffers(&headerPolicy, - Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE); - if (!DynamicPtWritingUtils::writeEmptyDictionary( - dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) { - AKLOGE("Empty ver4 dictionary structure cannot be created on memory."); - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(nullptr); - } - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( - new Ver4PatriciaTriePolicy(std::move(dictBuffers))); + return newPolicyForOnMemoryV4Dict<backward::v402::Ver4DictConstants, + backward::v402::Ver4DictBuffers, + backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr, + backward::v402::Ver4PatriciaTriePolicy>( + dictFormatVersion, locale, attributeMap); + } + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + case FormatUtils::VERSION_4_DEV: { + return newPolicyForOnMemoryV4Dict<Ver4DictConstants, Ver4DictBuffers, + Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>( + dictFormatVersion, locale, attributeMap); } default: AKLOGE("DICT: dictionary format %d is not supported for on memory dictionary", formatVersion); break; } - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(nullptr); + return nullptr; +} + +template<class DictConstants, class DictBuffers, class DictBuffersPtr, class StructurePolicy> +/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr + DictionaryStructureWithBufferPolicyFactory::newPolicyForOnMemoryV4Dict( + const FormatUtils::FORMAT_VERSION formatVersion, + const std::vector<int> &locale, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { + HeaderPolicy headerPolicy(formatVersion, locale, attributeMap); + DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy, + DictConstants::MAX_DICT_EXTENDED_REGION_SIZE); + if (!DynamicPtWritingUtils::writeEmptyDictionary( + dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) { + AKLOGE("Empty ver4 dictionary structure cannot be created on memory."); + return nullptr; + } + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( + new StructurePolicy(std::move(dictBuffers))); } /* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr @@ -82,42 +104,62 @@ namespace latinime { getHeaderFilePathInDictDir(path, headerFilePathBufSize, headerFilePath); // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of // MmappedBufferPtr if the instance has the responsibility. - MmappedBuffer::MmappedBufferPtr mmappedBuffer( - MmappedBuffer::openBuffer(headerFilePath, isUpdatable)); + MmappedBuffer::MmappedBufferPtr mmappedBuffer = + MmappedBuffer::openBuffer(headerFilePath, isUpdatable); if (!mmappedBuffer) { - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(nullptr); + return nullptr; } - switch (FormatUtils::detectFormatVersion(mmappedBuffer->getBuffer(), - mmappedBuffer->getBufferSize())) { + const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::detectFormatVersion( + mmappedBuffer->getBuffer(), mmappedBuffer->getBufferSize()); + switch (formatVersion) { case FormatUtils::VERSION_2: AKLOGE("Given path is a directory but the format is version 2. path: %s", path); break; case FormatUtils::VERSION_4: { - const int dictDirPathBufSize = strlen(headerFilePath) + 1 /* terminator */; - char dictPath[dictDirPathBufSize]; - if (!FileUtils::getFilePathWithoutSuffix(headerFilePath, - Ver4DictConstants::HEADER_FILE_EXTENSION, dictDirPathBufSize, dictPath)) { - AKLOGE("Dictionary file name is not valid as a ver4 dictionary. path: %s", path); - ASSERT(false); - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(nullptr); - } - Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( - Ver4DictBuffers::openVer4DictBuffers(dictPath, std::move(mmappedBuffer))); - if (!dictBuffers || !dictBuffers->isValid()) { - AKLOGE("DICT: The dictionary doesn't satisfy ver4 format requirements. path: %s", - path); - ASSERT(false); - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(nullptr); - } - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( - new Ver4PatriciaTriePolicy(std::move(dictBuffers))); + return newPolicyForV4Dict<backward::v402::Ver4DictConstants, + backward::v402::Ver4DictBuffers, + backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr, + backward::v402::Ver4PatriciaTriePolicy>( + headerFilePath, formatVersion, std::move(mmappedBuffer)); + } + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + case FormatUtils::VERSION_4_DEV: { + return newPolicyForV4Dict<Ver4DictConstants, Ver4DictBuffers, + Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>( + headerFilePath, formatVersion, std::move(mmappedBuffer)); } default: AKLOGE("DICT: dictionary format is unknown, bad magic number. path: %s", path); break; } ASSERT(false); - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(nullptr); + return nullptr; +} + +template<class DictConstants, class DictBuffers, class DictBuffersPtr, class StructurePolicy> +/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr + DictionaryStructureWithBufferPolicyFactory::newPolicyForV4Dict( + const char *const headerFilePath, const FormatUtils::FORMAT_VERSION formatVersion, + MmappedBuffer::MmappedBufferPtr &&mmappedBuffer) { + const int dictDirPathBufSize = strlen(headerFilePath) + 1 /* terminator */; + char dictPath[dictDirPathBufSize]; + if (!FileUtils::getFilePathWithoutSuffix(headerFilePath, + DictConstants::HEADER_FILE_EXTENSION, dictDirPathBufSize, dictPath)) { + AKLOGE("Dictionary file name is not valid as a ver4 dictionary. header path: %s", + headerFilePath); + ASSERT(false); + return nullptr; + } + DictBuffersPtr dictBuffers = + DictBuffers::openVer4DictBuffers(dictPath, std::move(mmappedBuffer), formatVersion); + if (!dictBuffers || !dictBuffers->isValid()) { + AKLOGE("DICT: The dictionary doesn't satisfy ver4 format requirements. path: %s", + dictPath); + ASSERT(false); + return nullptr; + } + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( + new StructurePolicy(std::move(dictBuffers))); } /* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr @@ -128,14 +170,16 @@ namespace latinime { MmappedBuffer::MmappedBufferPtr mmappedBuffer( MmappedBuffer::openBuffer(path, bufOffset, size, false /* isUpdatable */)); if (!mmappedBuffer) { - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(nullptr); + return nullptr; } switch (FormatUtils::detectFormatVersion(mmappedBuffer->getBuffer(), mmappedBuffer->getBufferSize())) { case FormatUtils::VERSION_2: return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( new PatriciaTriePolicy(std::move(mmappedBuffer))); + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: + case FormatUtils::VERSION_4_DEV: AKLOGE("Given path is a file but the format is version 4. path: %s", path); break; default: @@ -143,7 +187,7 @@ namespace latinime { break; } ASSERT(false); - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(nullptr); + return nullptr; } /* static */ void DictionaryStructureWithBufferPolicyFactory::getHeaderFilePathInDictDir( diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h index f71447e23..768454d8d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h @@ -22,6 +22,8 @@ #include "defines.h" #include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" namespace latinime { @@ -32,16 +34,26 @@ class DictionaryStructureWithBufferPolicyFactory { const int size, const bool isUpdatable); static DictionaryStructureWithBufferPolicy::StructurePolicyPtr - newPolicyForOnMemoryDict(const int formatVersion, - const std::vector<int> &locale, + newPolicyForOnMemoryDict(const int formatVersion, const std::vector<int> &locale, const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); private: DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryStructureWithBufferPolicyFactory); + template<class DictConstants, class DictBuffers, class DictBuffersPtr, class StructurePolicy> + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr + newPolicyForOnMemoryV4Dict(const FormatUtils::FORMAT_VERSION formatVersion, + const std::vector<int> &locale, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr newPolicyForDirectoryDict(const char *const path, const bool isUpdatable); + template<class DictConstants, class DictBuffers, class DictBuffersPtr, class StructurePolicy> + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr newPolicyForV4Dict( + const char *const headerFilePath, const FormatUtils::FORMAT_VERSION formatVersion, + MmappedBuffer::MmappedBufferPtr &&mmappedBuffer); + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr newPolicyForFileDict(const char *const path, const int bufOffset, const int size); diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp index 7d0d09631..08b4e0b5e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h index 15f924a6a..15f924a6a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp index 8f42df6d2..1f00fc6ab 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp @@ -29,10 +29,10 @@ bool DynamicPtGcEventListeners // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless // children. bool isUselessPtNode = !ptNodeParams->isTerminal(); - if (ptNodeParams->isTerminal()) { + if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) { bool needsToKeepPtNode = true; - if (!mPtNodeWriter->updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(ptNodeParams, - &needsToKeepPtNode)) { + if (!mPtNodeWriter->updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + ptNodeParams, &needsToKeepPtNode)) { AKLOGE("Cannot update PtNode probability or get needs to keep PtNode after GC."); return false; } @@ -56,7 +56,7 @@ bool DynamicPtGcEventListeners } } else { mValueStack.back() += 1; - if (ptNodeParams->isTerminal()) { + if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) { mValidUnigramCount += 1; } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp index e02dd5550..9e575858a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp @@ -16,6 +16,7 @@ #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" +#include "suggest/core/dictionary/property/unigram_property.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" @@ -29,9 +30,8 @@ const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; bool DynamicPtUpdatingHelper::addUnigramWord( DynamicPtReadingHelper *const readingHelper, - const int *const wordCodePoints, const int codePointCount, const int probability, - const bool isNotAWord, const bool isBlacklisted, const int timestamp, - bool *const outAddedNewUnigram) { + const int *const wordCodePoints, const int codePointCount, + const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { int parentPos = NOT_A_DICT_POS; while (!readingHelper->isEnd()) { const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); @@ -53,20 +53,18 @@ bool DynamicPtUpdatingHelper::addUnigramWord( if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(ptNodeParams, j, wordCodePoints[matchedCodePointCount + j])) { *outAddedNewUnigram = true; - return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, isNotAWord, isBlacklisted, - probability, timestamp, wordCodePoints + matchedCodePointCount, + return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty, + wordCodePoints + matchedCodePointCount, codePointCount - matchedCodePointCount); } } // All characters are matched. if (codePointCount == readingHelper->getTotalCodePointCount(ptNodeParams)) { - return setPtNodeProbability(&ptNodeParams, isNotAWord, isBlacklisted, probability, - timestamp, outAddedNewUnigram); + return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram); } if (!ptNodeParams.hasChildren()) { *outAddedNewUnigram = true; - return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, - isNotAWord, isBlacklisted, probability, timestamp, + return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty, wordCodePoints + readingHelper->getTotalCodePointCount(ptNodeParams), codePointCount - readingHelper->getTotalCodePointCount(ptNodeParams)); } @@ -83,17 +81,17 @@ bool DynamicPtUpdatingHelper::addUnigramWord( return createAndInsertNodeIntoPtNodeArray(parentPos, wordCodePoints + readingHelper->getPrevTotalCodePointCount(), codePointCount - readingHelper->getPrevTotalCodePointCount(), - isNotAWord, isBlacklisted, probability, timestamp, &pos); + unigramProperty, &pos); } bool DynamicPtUpdatingHelper::addBigramWords(const int word0Pos, const int word1Pos, - const int probability, const int timestamp, bool *const outAddedNewBigram) { + const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) { const PtNodeParams sourcePtNodeParams( mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word0Pos)); const PtNodeParams targetPtNodeParams( mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word1Pos)); - return mPtNodeWriter->addNewBigramEntry(&sourcePtNodeParams, &targetPtNodeParams, probability, - timestamp, outAddedNewBigram); + return mPtNodeWriter->addNewBigramEntry(&sourcePtNodeParams, &targetPtNodeParams, + bigramProperty, outAddedNewBigram); } // Remove a bigram relation from word0Pos to word1Pos. @@ -115,36 +113,34 @@ bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos, bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, const int nodeCodePointCount, - const bool isNotAWord, const bool isBlacklisted, const int probability, - const int timestamp, int *const forwardLinkFieldPos) { + const UnigramProperty *const unigramProperty, int *const forwardLinkFieldPos) { const int newPtNodeArrayPos = mBuffer->getTailPosition(); if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, newPtNodeArrayPos, forwardLinkFieldPos)) { return false; } return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount, - isNotAWord, isBlacklisted, probability, timestamp); + unigramProperty); } -bool DynamicPtUpdatingHelper::setPtNodeProbability( - const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, - const bool isBlacklisted, const int probability, const int timestamp, - bool *const outAddedNewUnigram) { +bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, + const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { if (originalPtNodeParams->isTerminal()) { // Overwrites the probability. *outAddedNewUnigram = false; - return mPtNodeWriter->updatePtNodeProbability(originalPtNodeParams, probability, timestamp); + return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty); } else { // Make the node terminal and write the probability. *outAddedNewUnigram = true; const int movedPos = mBuffer->getTailPosition(); int writingPos = movedPos; const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams, - isNotAWord, isBlacklisted, true /* isTerminal */, - originalPtNodeParams->getParentPos(), originalPtNodeParams->getCodePointCount(), - originalPtNodeParams->getCodePoints(), probability)); + unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + true /* isTerminal */, originalPtNodeParams->getParentPos(), + originalPtNodeParams->getCodePointCount(), originalPtNodeParams->getCodePoints(), + unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, - timestamp, &writingPos)) { + unigramProperty, &writingPos)) { return false; } if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) { @@ -155,31 +151,30 @@ bool DynamicPtUpdatingHelper::setPtNodeProbability( } bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode( - const PtNodeParams *const parentPtNodeParams, const bool isNotAWord, - const bool isBlacklisted, const int probability, const int timestamp, + const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty, const int *const codePoints, const int codePointCount) { const int newPtNodeArrayPos = mBuffer->getTailPosition(); if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) { return false; } return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints, - codePointCount, isNotAWord, isBlacklisted, probability, timestamp); + codePointCount, unigramProperty); } bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount, - const bool isNotAWord, const bool isBlacklisted, const int probability, - const int timestamp) { + const UnigramProperty *const unigramProperty) { int writingPos = mBuffer->getTailPosition(); if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, 1 /* arraySize */, &writingPos)) { return false; } const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( - isNotAWord, isBlacklisted, true /* isTerminal */, - parentPtNodePos, nodeCodePointCount, nodeCodePoints, probability)); - if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, timestamp, - &writingPos)) { + unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), true /* isTerminal */, + parentPtNodePos, nodeCodePointCount, nodeCodePoints, + unigramProperty->getProbability())); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, + unigramProperty, &writingPos)) { return false; } if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, @@ -192,13 +187,13 @@ bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( // Returns whether the dictionary updating was succeeded or not. bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, - const bool isNotAWord, const bool isBlacklisted, const int probabilityOfNewPtNode, - const int timestamp, const int *const newNodeCodePoints, const int newNodeCodePointCount) { + const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints, + const int newNodeCodePointCount) { // When addsExtraChild is true, split the reallocating PtNode and add new child. // Reallocating PtNode: abcde, newNode: abcxy. // abc (1st, not terminal) __ de (2nd) // \_ xy (extra child, terminal) - // Otherwise, this method makes 1st part terminal and write probabilityOfNewPtNode. + // Otherwise, this method makes 1st part terminal and write information in unigramProperty. // Reallocating PtNode: abcde, newNode: abc. // abc (1st, terminal) __ de (2nd) const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount; @@ -216,11 +211,12 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( } } else { const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( - isNotAWord, isBlacklisted, true /* isTerminal */, - reallocatingPtNodeParams->getParentPos(), overlappingCodePointCount, - reallocatingPtNodeParams->getCodePoints(), probabilityOfNewPtNode)); + unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + true /* isTerminal */, reallocatingPtNodeParams->getParentPos(), + overlappingCodePointCount, reallocatingPtNodeParams->getCodePoints(), + unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, - timestamp, &writingPos)) { + unigramProperty, &writingPos)) { return false; } } @@ -244,11 +240,12 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( } if (addsExtraChild) { const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( - isNotAWord, isBlacklisted, true /* isTerminal */, - firstPartOfReallocatedPtNodePos, newNodeCodePointCount - overlappingCodePointCount, - newNodeCodePoints + overlappingCodePointCount, probabilityOfNewPtNode)); + unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + true /* isTerminal */, firstPartOfReallocatedPtNodePos, + newNodeCodePointCount - overlappingCodePointCount, + newNodeCodePoints + overlappingCodePointCount, unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams, - timestamp, &writingPos)) { + unigramProperty, &writingPos)) { return false; } } @@ -269,8 +266,8 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( } const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams( - const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, - const bool isBlacklisted, const bool isTerminal, const int parentPos, + const PtNodeParams *const originalPtNodeParams, + const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, const int parentPos, const int codePointCount, const int *const codePoints, const int probability) const { const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( isBlacklisted, isNotAWord, isTerminal, originalPtNodeParams->hasShortcutTargets(), diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h index 9b2815263..f10d15a9b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h @@ -22,10 +22,12 @@ namespace latinime { +class BigramProperty; class BufferWithExtendableBuffer; class DynamicPtReadingHelper; class PtNodeReader; class PtNodeWriter; +class UnigramProperty; class DynamicPtUpdatingHelper { public: @@ -37,13 +39,12 @@ class DynamicPtUpdatingHelper { // Add a word to the dictionary. If the word already exists, update the probability. bool addUnigramWord(DynamicPtReadingHelper *const readingHelper, - const int *const wordCodePoints, const int codePointCount, const int probability, - const bool isNotAWord, const bool isBlacklisted, const int timestamp, - bool *const outAddedNewUnigram); + const int *const wordCodePoints, const int codePointCount, + const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram); // Add a bigram relation from word0Pos to word1Pos. - bool addBigramWords(const int word0Pos, const int word1Pos, const int probability, - const int timestamp, bool *const outAddedNewBigram); + bool addBigramWords(const int word0Pos, const int word1Pos, + const BigramProperty *const bigramProperty, bool *const outAddedNewBigram); // Remove a bigram relation from word0Pos to word1Pos. bool removeBigramWords(const int word0Pos, const int word1Pos); @@ -62,25 +63,22 @@ class DynamicPtUpdatingHelper { PtNodeWriter *const mPtNodeWriter; bool createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, - const int nodeCodePointCount, const bool isNotAWord, const bool isBlacklisted, - const int probability, const int timestamp, int *const forwardLinkFieldPos); + const int nodeCodePointCount, const UnigramProperty *const unigramProperty, + int *const forwardLinkFieldPos); - bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, - const bool isBlacklisted, const int probability, const int timestamp, - bool *const outAddedNewUnigram); + bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, + const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram); bool createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams *const parentPtNodeParams, - const bool isNotAWord, const bool isBlacklisted, const int probability, - const int timestamp, const int *const codePoints, const int codePointCount); + const UnigramProperty *const unigramProperty, const int *const codePoints, + const int codePointCount); bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, const int *const nodeCodePoints, - const int nodeCodePointCount, const bool isNotAWord, const bool isBlacklisted, - const int probability, const int timestamp); + const int nodeCodePointCount, const UnigramProperty *const unigramProperty); bool reallocatePtNodeAndAddNewPtNodes( const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, - const bool isNotAWord, const bool isBlacklisted, const int probabilityOfNewPtNode, - const int timestamp, const int *const newNodeCodePoints, + const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints, const int newNodeCodePointCount); const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams, diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h index 91192fc57..b2e60a837 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h @@ -23,6 +23,7 @@ #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "utils/char_utils.h" namespace latinime { @@ -158,6 +159,16 @@ class PtNodeParams { return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags); } + AK_FORCE_INLINE bool representsNonWordInfo() const { + return getCodePointCount() > 0 && !CharUtils::isInUnicodeSpace(getCodePoints()[0]) + && isNotAWord(); + } + + AK_FORCE_INLINE int representsBeginningOfSentence() const { + return getCodePointCount() > 0 && getCodePoints()[0] == CODE_POINT_BEGINNING_OF_SENTENCE + && isNotAWord(); + } + // Parent node position AK_FORCE_INLINE int getParentPos() const { return mParentPos; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h index e843f074a..a8029f73f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h @@ -24,6 +24,9 @@ namespace latinime { +class BigramProperty; +class UnigramProperty; + // Interface class used to write PtNode information. class PtNodeWriter { public: @@ -51,8 +54,8 @@ class PtNodeWriter { virtual bool markPtNodeAsWillBecomeNonTerminal( const PtNodeParams *const toBeUpdatedPtNodeParams) = 0; - virtual bool updatePtNodeProbability(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int probability, const int timestamp) = 0; + virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty) = 0; virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( const PtNodeParams *const toBeUpdatedPtNodeParams, @@ -65,10 +68,10 @@ class PtNodeWriter { int *const ptNodeWritingPos) = 0; virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, - const int timestamp, int *const ptNodeWritingPos) = 0; + const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) = 0; virtual bool addNewBigramEntry(const PtNodeParams *const sourcePtNodeParams, - const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp, + const PtNodeParams *const targetPtNodeParam, const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) = 0; virtual bool removeBigramEntry(const PtNodeParams *const sourcePtNodeParams, diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp index 847dcdee5..91c76941c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" @@ -44,7 +44,7 @@ const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15; } /* static */ int ShortcutListReadingUtils::readShortcutTarget( - const uint8_t *const dictRoot, const int maxLength, int *const outWord, int *const pos) { + const uint8_t *const dictRoot, const int maxLength, int *const outWord, int *const pos) { return ByteArrayUtils::readStringAndAdvancePosition(dictRoot, maxLength, outWord, pos); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h index d065bf7fd..d065bf7fd 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h index a898e2afc..00bb502dc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h @@ -21,7 +21,7 @@ #include "defines.h" #include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp index b3af1f47a..a6a470c4e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp @@ -24,6 +24,7 @@ #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h" +#include "utils/char_utils.h" namespace latinime { @@ -318,12 +319,15 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(), getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); - childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, - PatriciaTrieReadingUtils::isTerminal(flags), - PatriciaTrieReadingUtils::hasChildrenInFlags(flags), - PatriciaTrieReadingUtils::isBlacklisted(flags) - || PatriciaTrieReadingUtils::isNotAWord(flags), - mergedNodeCodePointCount, mergedNodeCodePoints); + // Skip PtNodes don't start with Unicode code point because they represent non-word information. + if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) { + childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, + PatriciaTrieReadingUtils::isTerminal(flags), + PatriciaTrieReadingUtils::hasChildrenInFlags(flags), + PatriciaTrieReadingUtils::isBlacklisted(flags) + || PatriciaTrieReadingUtils::isNotAWord(flags), + mergedNodeCodePointCount, mergedNodeCodePoints); + } return siblingPos; } @@ -379,8 +383,8 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin shortcuts.emplace_back(&shortcutTarget, shortcutProbability); } } - const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(), - ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), + const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), + ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); return WordProperty(&codePointVector, &unigramProperty, &bigrams); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h index 85f46603e..dce94363a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h @@ -22,9 +22,9 @@ #include "defines.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/bigram/bigram_list_policy.h" #include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h" #include "suggest/policyimpl/dictionary/utils/format_utils.h" @@ -81,35 +81,43 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { return &mShortcutListPolicy; } - bool addUnigramWord(const int *const word, const int length, + bool addUnigramEntry(const int *const word, const int length, const UnigramProperty *const unigramProperty) { // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); + AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); return false; } - bool addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability, const int timestamp) { + bool removeUnigramEntry(const int *const word, const int length) { // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); + AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); return false; } - bool removeBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1) { + bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, + const BigramProperty *const bigramProperty) { // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); + AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); return false; } - void flush(const char *const filePath) { + bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, + const int length) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); + return false; + } + + bool flush(const char *const filePath) { // This method should not be called for non-updatable dictionary. AKLOGI("Warning: flush() is called for non-updatable dictionary."); + return false; } - void flushWithGC(const char *const filePath) { + bool flushWithGC(const char *const filePath) { // This method should not be called for non-updatable dictionary. AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return false; } bool needsToRunGC(const bool mindsBlockByGC) const { diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h index 6d2b4778c..8e16ccc05 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h @@ -21,7 +21,7 @@ #include "defines.h" #include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp index 5df2096a4..7a52fd180 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp @@ -14,10 +14,11 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/core/dictionary/property/bigram_property.h" #include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" #include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h" #include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" @@ -49,13 +50,18 @@ void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const out } bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId, - const int newProbability, const int timestamp, bool *const outAddedNewEntry) { + const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) { + // 1. The word has no bigrams yet. + // 2. The word has bigrams, and there is the target in the list. + // 3. The word has bigrams, and there is an invalid entry that can be reclaimed. + // 4. The word has bigrams. We have to append new bigram entry to the list. + // 5. Same as 4, but the list is the last entry of the content file. if (outAddedNewEntry) { *outAddedNewEntry = false; } const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); if (bigramListPos == NOT_A_DICT_POS) { - // Updating PtNode doesn't have a bigram list. + // Case 1. PtNode that doesn't have a bigram list. // Create new bigram list. if (!mBigramDictContent->createNewBigramList(terminalId)) { return false; @@ -63,7 +69,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, newTargetTerminalId); const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry, - newProbability, timestamp); + bigramProperty); // Write an entry. const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) { @@ -75,42 +81,55 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget return true; } - const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos); - if (entryPosToUpdate != NOT_A_DICT_POS) { - // Overwrite existing entry. - const BigramEntry originalBigramEntry = - mBigramDictContent->getBigramEntry(entryPosToUpdate); - if (!originalBigramEntry.isValid()) { - // Reuse invalid entry. - if (outAddedNewEntry) { - *outAddedNewEntry = true; + int tailEntryPos = NOT_A_DICT_POS; + const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos, + &tailEntryPos); + if (tailEntryPos != NOT_A_DICT_POS || entryPosToUpdate == NOT_A_DICT_POS) { + // Case 4, 5. + // Add new entry to the bigram list. + if (tailEntryPos == NOT_A_DICT_POS) { + // Case 4. Create new bigram list. + if (!mBigramDictContent->createNewBigramList(terminalId)) { + return false; + } + const int destPos = mBigramDictContent->getBigramListHeadPos(terminalId); + // Copy existing bigram list. + if (!mBigramDictContent->copyBigramList(bigramListPos, destPos, &tailEntryPos)) { + return false; } } - const BigramEntry updatedBigramEntry = - originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId); + // Write new entry at the tail position of the bigram content. + const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, + newTargetTerminalId); const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( - &updatedBigramEntry, newProbability, timestamp); - return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate); + &newBigramEntry, bigramProperty); + if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) { + return false; + } + // Update has next flag of the tail entry. + if (!updateHasNextFlag(true /* hasNext */, tailEntryPos)) { + return false; + } + if (outAddedNewEntry) { + *outAddedNewEntry = true; + } + return true; } - // Add new entry to the bigram list. - // Create new bigram list. - if (!mBigramDictContent->createNewBigramList(terminalId)) { - return false; + // Case 2. Overwrite the existing entry. Case 3. Reclaim and reuse the existing invalid entry. + const BigramEntry originalBigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate); + if (!originalBigramEntry.isValid()) { + // Case 3. Reuse the existing invalid entry. outAddedNewEntry is false when an existing + // entry is updated. + if (outAddedNewEntry) { + *outAddedNewEntry = true; + } } - // Write new entry at a head position of the bigram list. - int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); - const BigramEntry newBigramEntry(true /* hasNext */, NOT_A_PROBABILITY, newTargetTerminalId); + const BigramEntry updatedBigramEntry = + originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId); const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( - &newBigramEntry, newProbability, timestamp); - if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(&bigramEntryToWrite, &writingPos)) { - return false; - } - if (outAddedNewEntry) { - *outAddedNewEntry = true; - } - // Append existing entries by copying. - return mBigramDictContent->copyBigramList(bigramListPos, writingPos); + &updatedBigramEntry, bigramProperty); + return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate); } bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) { @@ -119,7 +138,8 @@ bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTer // Bigram list doesn't exist. return false; } - const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos); + const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos, + nullptr /* outTailEntryPos */); if (entryPosToUpdate == NOT_A_DICT_POS) { // Bigram entry doesn't exist. return false; @@ -204,7 +224,10 @@ int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) { } int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, - const int bigramListPos) const { + const int bigramListPos, int *const outTailEntryPos) const { + if (outTailEntryPos) { + *outTailEntryPos = NOT_A_DICT_POS; + } bool hasNext = true; int invalidEntryPos = NOT_A_DICT_POS; int readingPos = bigramListPos; @@ -220,23 +243,36 @@ int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, // Invalid entry that can be reused is found. invalidEntryPos = entryPos; } + if (!hasNext && mBigramDictContent->isContentTailPos(readingPos)) { + if (outTailEntryPos) { + *outTailEntryPos = entryPos; + } + } } return invalidEntryPos; } const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom( - const BigramEntry *const originalBigramEntry, const int newProbability, - const int timestamp) const { + const BigramEntry *const originalBigramEntry, + const BigramProperty *const bigramProperty) const { // TODO: Consolidate historical info and probability. if (mHeaderPolicy->hasHistoricalInfoOfWords()) { + const HistoricalInfo historicalInfoForUpdate(bigramProperty->getTimestamp(), + bigramProperty->getLevel(), bigramProperty->getCount()); const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalBigramEntry->getHistoricalInfo(), newProbability, timestamp, - mHeaderPolicy); + originalBigramEntry->getHistoricalInfo(), bigramProperty->getProbability(), + &historicalInfoForUpdate, mHeaderPolicy); return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo); } else { - return originalBigramEntry->updateProbabilityAndGetEntry(newProbability); + return originalBigramEntry->updateProbabilityAndGetEntry(bigramProperty->getProbability()); } } +bool Ver4BigramListPolicy::updateHasNextFlag(const bool hasNext, const int bigramEntryPos) { + const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(bigramEntryPos); + const BigramEntry updatedBigramEntry = bigramEntry.updateHasNextAndGetEntry(hasNext); + return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, bigramEntryPos); +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h index 5b6c5a173..1613941c4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h @@ -24,6 +24,7 @@ namespace latinime { class BigramDictContent; +class BigramProperty; class HeaderPolicy; class TerminalPositionLookupTable; @@ -43,8 +44,8 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { // Do nothing because we don't need to skip bigram lists in ver4 dictionaries. } - bool addNewEntry(const int terminalId, const int newTargetTerminalId, const int newProbability, - const int timestamp, bool *const outAddedNewEntry); + bool addNewEntry(const int terminalId, const int newTargetTerminalId, + const BigramProperty *const bigramProperty, bool *const outAddedNewEntry); bool removeEntry(const int terminalId, const int targetTerminalId); @@ -56,10 +57,13 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { private: DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy); - int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos) const; + int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos, + int *const outTailEntryPos) const; const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry, - const int newProbability, const int timestamp) const; + const BigramProperty *const bigramProperty) const; + + bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos); BigramDictContent *const mBigramDictContent; const TerminalPositionLookupTable *const mTerminalPositionLookupTable; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp index 279f5b33a..e1ceaee49 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp @@ -23,9 +23,11 @@ namespace latinime { const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( int *const bigramEntryPos) const { const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer(); - if (*bigramEntryPos < 0 || *bigramEntryPos >= bigramListBuffer->getTailPosition()) { - AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bufSize: %d", - *bigramEntryPos, bigramListBuffer->getTailPosition()); + const int bigramEntryTailPos = (*bigramEntryPos) + getBigramEntrySize(); + if (*bigramEntryPos < 0 || bigramEntryTailPos > bigramListBuffer->getTailPosition()) { + AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bigramEntryTailPos: %d, " + "bufSize: %d", *bigramEntryPos, bigramEntryTailPos, + bigramListBuffer->getTailPosition()); ASSERT(false); return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY, Ver4DictConstants::NOT_A_TERMINAL_ID); @@ -38,8 +40,6 @@ const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( int level = 0; int count = 0; if (mHasHistoricalInfo) { - probability = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::PROBABILITY_SIZE, bigramEntryPos); timestamp = bigramListBuffer->readUintAndAdvancePosition( Ver4DictConstants::TIME_STAMP_FIELD_SIZE, bigramEntryPos); level = bigramListBuffer->readUintAndAdvancePosition( @@ -47,7 +47,8 @@ const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( count = bigramListBuffer->readUintAndAdvancePosition( Ver4DictConstants::WORD_COUNT_FIELD_SIZE, bigramEntryPos); } else { - probability = bigramFlags & Ver4DictConstants::BIGRAM_PROBABILITY_MASK; + probability = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::PROBABILITY_SIZE, bigramEntryPos); } const int encodedTargetTerminalId = bigramListBuffer->readUintAndAdvancePosition( Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, bigramEntryPos); @@ -65,21 +66,13 @@ const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( bool BigramDictContent::writeBigramEntryAndAdvancePosition( const BigramEntry *const bigramEntryToWrite, int *const entryWritingPos) { BufferWithExtendableBuffer *const bigramListBuffer = getWritableContentBuffer(); - const int bigramFlags = createAndGetBigramFlags( - mHasHistoricalInfo ? 0 : bigramEntryToWrite->getProbability(), - bigramEntryToWrite->hasNext()); + const int bigramFlags = createAndGetBigramFlags(bigramEntryToWrite->hasNext()); if (!bigramListBuffer->writeUintAndAdvancePosition(bigramFlags, Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, entryWritingPos)) { AKLOGE("Cannot write bigram flags. pos: %d, flags: %x", *entryWritingPos, bigramFlags); return false; } if (mHasHistoricalInfo) { - if (!bigramListBuffer->writeUintAndAdvancePosition(bigramEntryToWrite->getProbability(), - Ver4DictConstants::PROBABILITY_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram probability. pos: %d, probability: %d", *entryWritingPos, - bigramEntryToWrite->getProbability()); - return false; - } const HistoricalInfo *const historicalInfo = bigramEntryToWrite->getHistoricalInfo(); if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getTimeStamp(), Ver4DictConstants::TIME_STAMP_FIELD_SIZE, entryWritingPos)) { @@ -99,6 +92,13 @@ bool BigramDictContent::writeBigramEntryAndAdvancePosition( historicalInfo->getCount()); return false; } + } else { + if (!bigramListBuffer->writeUintAndAdvancePosition(bigramEntryToWrite->getProbability(), + Ver4DictConstants::PROBABILITY_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram probability. pos: %d, probability: %d", *entryWritingPos, + bigramEntryToWrite->getProbability()); + return false; + } } const int targetTerminalIdToWrite = (bigramEntryToWrite->getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) ? @@ -113,13 +113,17 @@ bool BigramDictContent::writeBigramEntryAndAdvancePosition( return true; } -bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos) { +bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos, + int *const outTailEntryPos) { int readingPos = bigramListPos; int writingPos = toPos; bool hasNext = true; while (hasNext) { const BigramEntry bigramEntry = getBigramEntryAndAdvancePosition(&readingPos); hasNext = bigramEntry.hasNext(); + if (!hasNext) { + *outTailEntryPos = writingPos; + } if (!writeBigramEntryAndAdvancePosition(&bigramEntry, &writingPos)) { AKLOGE("Cannot write bigram entry to copy. pos: %d", writingPos); return false; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h index ba2a05209..52447a336 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h @@ -58,6 +58,11 @@ class BigramDictContent : public SparseTableDictContent { return addressLookupTable->get(terminalId); } + bool writeBigramEntryAtTail(const BigramEntry *const bigramEntryToWrite) { + int writingPos = getContentBuffer()->getTailPosition(); + return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); + } + bool writeBigramEntry(const BigramEntry *const bigramEntryToWrite, const int entryWritingPos) { int writingPos = entryWritingPos; return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); @@ -71,7 +76,7 @@ class BigramDictContent : public SparseTableDictContent { return getUpdatableAddressLookupTable()->set(terminalId, bigramListPos); } - bool copyBigramList(const int bigramListPos, const int toPos); + bool copyBigramList(const int bigramListPos, const int toPos, int *const outTailEntryPos); bool flushToFile(const char *const dictPath) const { return flush(dictPath, Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION, @@ -83,12 +88,29 @@ class BigramDictContent : public SparseTableDictContent { const BigramDictContent *const originalBigramDictContent, int *const outBigramEntryCount); + bool isContentTailPos(const int pos) const { + return pos == getContentBuffer()->getTailPosition(); + } + private: DISALLOW_COPY_AND_ASSIGN(BigramDictContent); - int createAndGetBigramFlags(const int probability, const bool hasNext) const { - return (probability & Ver4DictConstants::BIGRAM_PROBABILITY_MASK) - | (hasNext ? Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0); + int createAndGetBigramFlags(const bool hasNext) const { + return hasNext ? Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0; + } + + int getBigramEntrySize() const { + if (mHasHistoricalInfo) { + return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE + + Ver4DictConstants::TIME_STAMP_FIELD_SIZE + + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE + + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; + } else { + return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE + + Ver4DictConstants::PROBABILITY_SIZE + + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; + } } bool runGCBigramList(const int bigramListPos, diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h index fe984615c..790273541 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h @@ -19,7 +19,7 @@ #include "defines.h" #include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h" #include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp index 95f654498..5aa6b9a92 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp @@ -27,7 +27,8 @@ namespace latinime { /* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers( - const char *const dictPath, MmappedBuffer::MmappedBufferPtr headerBuffer) { + const char *const dictPath, MmappedBuffer::MmappedBufferPtr headerBuffer, + const FormatUtils::FORMAT_VERSION formatVersion) { if (!headerBuffer) { ASSERT(false); AKLOGE("The header buffer must be valid to open ver4 dict buffers."); @@ -35,7 +36,8 @@ namespace latinime { } // TODO: take only dictDirPath, and open both header and trie files in the constructor below const bool isUpdatable = headerBuffer->isUpdatable(); - return Ver4DictBuffersPtr(new Ver4DictBuffers(dictPath, std::move(headerBuffer), isUpdatable)); + return Ver4DictBuffersPtr(new Ver4DictBuffers(dictPath, std::move(headerBuffer), isUpdatable, + formatVersion)); } bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath, @@ -54,6 +56,7 @@ bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath, return false; } } + umask(S_IWGRP | S_IWOTH); if (mkdir(tmpDirPath, S_IRWXU) == -1) { AKLOGE("Cannot create directory: %s. errno: %d.", tmpDirPath, errno); return false; @@ -113,11 +116,12 @@ bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath, } Ver4DictBuffers::Ver4DictBuffers(const char *const dictPath, - MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable) + MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable, + const FormatUtils::FORMAT_VERSION formatVersion) : mHeaderBuffer(std::move(headerBuffer)), mDictBuffer(MmappedBuffer::openBuffer(dictPath, Ver4DictConstants::TRIE_FILE_EXTENSION, isUpdatable)), - mHeaderPolicy(mHeaderBuffer->getBuffer(), FormatUtils::VERSION_4), + mHeaderPolicy(mHeaderBuffer->getBuffer(), formatVersion), mExpandableHeaderBuffer(mHeaderBuffer ? mHeaderBuffer->getBuffer() : nullptr, mHeaderPolicy.getSize(), BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h index fc41432f4..df177c14a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h @@ -36,7 +36,8 @@ class Ver4DictBuffers { typedef std::unique_ptr<Ver4DictBuffers> Ver4DictBuffersPtr; static Ver4DictBuffersPtr openVer4DictBuffers(const char *const dictDirPath, - MmappedBuffer::MmappedBufferPtr headerBuffer); + MmappedBuffer::MmappedBufferPtr headerBuffer, + const FormatUtils::FORMAT_VERSION formatVersion); static AK_FORCE_INLINE Ver4DictBuffersPtr createVer4DictBuffers( const HeaderPolicy *const headerPolicy, const int maxTrieSize) { @@ -120,7 +121,8 @@ class Ver4DictBuffers { DISALLOW_COPY_AND_ASSIGN(Ver4DictBuffers); Ver4DictBuffers(const char *const dictDirPath, - const MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable); + const MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable, + const FormatUtils::FORMAT_VERSION formatVersion); Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp index 67420a252..0a435e91c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp @@ -95,4 +95,4 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce } } -} +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp index 38ff42fee..f89d3d7a0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -16,13 +16,14 @@ #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" -#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" +#include "suggest/core/dictionary/property/unigram_property.h" #include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" +#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" @@ -75,7 +76,7 @@ bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved( PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); const PatriciaTrieReadingUtils::NodeFlags updatedFlags = DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */, - false /* isDeleted */, false /* willBecomeNonTerminal */); + false /* isDeleted */, false /* willBecomeNonTerminal */); int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); // Update flags. if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, @@ -133,9 +134,11 @@ bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal( &writingPos); } -bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbability( - const PtNodeParams *const toBeUpdatedPtNodeParams, const int newProbability, - const int timestamp) { +bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty) { + // Update probability and historical information. + // TODO: Update other information in the unigram property. if (!toBeUpdatedPtNodeParams->isTerminal()) { return false; } @@ -143,7 +146,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbability( mBuffers->getProbabilityDictContent()->getProbabilityEntry( toBeUpdatedPtNodeParams->getTerminalId()); const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry, - newProbability, timestamp); + unigramProperty); return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry); } @@ -204,7 +207,8 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition( bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( - const PtNodeParams *const ptNodeParams, const int timestamp, int *const ptNodeWritingPos) { + const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty, + int *const ptNodeWritingPos) { int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId, ptNodeWritingPos)) { @@ -213,17 +217,16 @@ bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( // Write probability. ProbabilityEntry newProbabilityEntry; const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom( - &newProbabilityEntry, ptNodeParams->getProbability(), timestamp); + &newProbabilityEntry, unigramProperty); return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(terminalId, &probabilityEntryToWrite); } bool Ver4PatriciaTrieNodeWriter::addNewBigramEntry( - const PtNodeParams *const sourcePtNodeParams, - const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp, - bool *const outAddedNewBigram) { + const PtNodeParams *const sourcePtNodeParams, const PtNodeParams *const targetPtNodeParam, + const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) { if (!mBigramPolicy->addNewEntry(sourcePtNodeParams->getTerminalId(), - targetPtNodeParam->getTerminalId(), probability, timestamp, outAddedNewBigram)) { + targetPtNodeParam->getTerminalId(), bigramProperty, outAddedNewBigram)) { AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d", sourcePtNodeParams->getTerminalId(), targetPtNodeParam->getTerminalId()); return false; @@ -379,18 +382,21 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( } const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( - const ProbabilityEntry *const originalProbabilityEntry, const int newProbability, - const int timestamp) const { + const ProbabilityEntry *const originalProbabilityEntry, + const UnigramProperty *const unigramProperty) const { // TODO: Consolidate historical info and probability. if (mHeaderPolicy->hasHistoricalInfoOfWords()) { + const HistoricalInfo historicalInfoForUpdate(unigramProperty->getTimestamp(), + unigramProperty->getLevel(), unigramProperty->getCount()); const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalProbabilityEntry->getHistoricalInfo(), newProbability, timestamp, - mHeaderPolicy); + originalProbabilityEntry->getHistoricalInfo(), + unigramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy); return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo( &updatedHistoricalInfo); } else { - return originalProbabilityEntry->createEntryWithUpdatedProbability(newProbability); + return originalProbabilityEntry->createEntryWithUpdatedProbability( + unigramProperty->getProbability()); } } @@ -409,4 +415,4 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, return true; } -} +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h index b2b0504a1..e90bc44c0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h @@ -57,8 +57,8 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { virtual bool markPtNodeAsWillBecomeNonTerminal( const PtNodeParams *const toBeUpdatedPtNodeParams); - virtual bool updatePtNodeProbability(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int newProbability, const int timestamp); + virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty); virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode); @@ -73,10 +73,10 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { int *const ptNodeWritingPos); virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, - const int timestamp, int *const ptNodeWritingPos); + const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos); virtual bool addNewBigramEntry(const PtNodeParams *const sourcePtNodeParams, - const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp, + const PtNodeParams *const targetPtNodeParam, const BigramProperty *const bigramProperty, bool *const outAddedNewBigram); virtual bool removeBigramEntry(const PtNodeParams *const sourcePtNodeParams, @@ -102,11 +102,12 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { const PtNodeParams *const ptNodeParams, int *const outTerminalId, int *const ptNodeWritingPos); - // Create updated probability entry using given probability and timestamp. In addition to the + // Create updated probability entry using given unigram property. In addition to the // probability, this method updates historical information if needed. + // TODO: Update flags belonging to the unigram property. const ProbabilityEntry createUpdatedEntryFrom( - const ProbabilityEntry *const originalProbabilityEntry, const int newProbability, - const int timestamp) const; + const ProbabilityEntry *const originalProbabilityEntry, + const UnigramProperty *const unigramProperty) const; bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets, const bool hasBigrams, diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index bbfd22e59..aec3b8ea3 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -23,6 +23,7 @@ #include "suggest/core/dictionary/property/bigram_property.h" #include "suggest/core/dictionary/property/unigram_property.h" #include "suggest/core/dictionary/property/word_property.h" +#include "suggest/core/session/prev_words_info.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" @@ -59,13 +60,17 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d // valid terminal DicNode. isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; } + readingHelper.readNextSiblingNode(ptNodeParams); + if (ptNodeParams.representsNonWordInfo()) { + // Skip PtNodes that represent non-word information. + continue; + } childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, ptNodeParams.hasChildren(), ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints()); - readingHelper.readNextSiblingNode(ptNodeParams); } if (readingHelper.isError()) { mIsCorrupted = true; @@ -111,9 +116,7 @@ int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, } else if (bigramProbability == NOT_A_PROBABILITY) { return ProbabilityUtils::backoff(unigramProbability); } else { - // bigramProbability is a bigram probability delta. - return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, - bigramProbability); + return bigramProbability; } } } @@ -153,10 +156,10 @@ int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) cons ptNodeParams.getTerminalId()); } -bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int length, +bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length, const UnigramProperty *const unigramProperty) { if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); + AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); return false; } if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { @@ -178,11 +181,19 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); readingHelper.initWithPtNodeArrayPos(getRootPosition()); bool addedNewUnigram = false; - if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length, - unigramProperty->getProbability(), unigramProperty->isNotAWord(), - unigramProperty->isBlacklisted(), unigramProperty->getTimestamp(), - &addedNewUnigram)) { - if (addedNewUnigram) { + int codePointsToAdd[MAX_WORD_LENGTH]; + int codePointCountToAdd = length; + memmove(codePointsToAdd, word, sizeof(int) * length); + if (unigramProperty->representsBeginningOfSentence()) { + codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, + codePointCountToAdd, MAX_WORD_LENGTH); + } + if (codePointCountToAdd <= 0) { + return false; + } + if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd, + unigramProperty, &addedNewUnigram)) { + if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { mUnigramCount++; } if (unigramProperty->getShortcuts().size() > 0) { @@ -210,11 +221,15 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len } } -bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int length0, - const int *const word1, const int length1, const int probability, - const int timestamp) { +bool Ver4PatriciaTriePolicy::removeUnigramEntry(const int *const word, const int length) { + // TODO: Implement. + return false; +} + +bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, + const BigramProperty *const bigramProperty) { if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); + AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); return false; } if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { @@ -222,23 +237,47 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le mDictBuffer->getTailPosition()); return false; } - if (length0 > MAX_WORD_LENGTH || length1 > MAX_WORD_LENGTH) { - AKLOGE("Either src word or target word is too long to insert the bigram to the dictionary. " - "length0: %d, length1: %d", length0, length1); + if (!prevWordsInfo->isValid()) { + AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary."); return false; } - const int word0Pos = getTerminalPtNodePositionOfWord(word0, length0, - false /* forceLowerCaseSearch */); - if (word0Pos == NOT_A_DICT_POS) { + if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert the ngram to the dictionary. " + "length: %d", bigramProperty->getTargetCodePoints()->size()); return false; } - const int word1Pos = getTerminalPtNodePositionOfWord(word1, length1, - false /* forceLowerCaseSearch */); + int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, + false /* tryLowerCaseSearch */); + // TODO: Support N-gram. + if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { + if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) { + const std::vector<UnigramProperty::ShortcutProperty> shortcuts; + const UnigramProperty beginningOfSentenceUnigramProperty( + true /* representsBeginningOfSentence */, true /* isNotAWord */, + false /* isBlacklisted */, MAX_PROBABILITY /* probability */, + NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); + if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */), + prevWordsInfo->getNthPrevWordCodePointCount(1 /* n */), + &beginningOfSentenceUnigramProperty)) { + AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); + return false; + } + // Refresh Terminal PtNode positions. + prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, + false /* tryLowerCaseSearch */); + } else { + return false; + } + } + const int word1Pos = getTerminalPtNodePositionOfWord( + bigramProperty->getTargetCodePoints()->data(), + bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */); if (word1Pos == NOT_A_DICT_POS) { return false; } bool addedNewBigram = false; - if (mUpdatingHelper.addBigramWords(word0Pos, word1Pos, probability, timestamp, + if (mUpdatingHelper.addBigramWords(prevWordsPtNodePos[0], word1Pos, bigramProperty, &addedNewBigram)) { if (addedNewBigram) { mBigramCount++; @@ -249,10 +288,10 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le } } -bool Ver4PatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0, - const int *const word1, const int length1) { +bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, + const int *const word, const int length) { if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); + AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); return false; } if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { @@ -260,22 +299,26 @@ bool Ver4PatriciaTriePolicy::removeBigramWords(const int *const word0, const int mDictBuffer->getTailPosition()); return false; } - if (length0 > MAX_WORD_LENGTH || length1 > MAX_WORD_LENGTH) { - AKLOGE("Either src word or target word is too long to remove the bigram to from the " - "dictionary. length0: %d, length1: %d", length0, length1); + if (!prevWordsInfo->isValid()) { + AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary."); return false; } - const int word0Pos = getTerminalPtNodePositionOfWord(word0, length0, - false /* forceLowerCaseSearch */); - if (word0Pos == NOT_A_DICT_POS) { + if (length > MAX_WORD_LENGTH) { + AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length); + } + int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, + false /* tryLowerCaseSerch */); + // TODO: Support N-gram. + if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { return false; } - const int word1Pos = getTerminalPtNodePositionOfWord(word1, length1, + const int wordPos = getTerminalPtNodePositionOfWord(word, length, false /* forceLowerCaseSearch */); - if (word1Pos == NOT_A_DICT_POS) { + if (wordPos == NOT_A_DICT_POS) { return false; } - if (mUpdatingHelper.removeBigramWords(word0Pos, word1Pos)) { + if (mUpdatingHelper.removeBigramWords(prevWordsPtNodePos[0], wordPos)) { mBigramCount--; return true; } else { @@ -283,26 +326,30 @@ bool Ver4PatriciaTriePolicy::removeBigramWords(const int *const word0, const int } } -void Ver4PatriciaTriePolicy::flush(const char *const filePath) { +bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); - return; + return false; } if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { AKLOGE("Cannot flush the dictionary to file."); mIsCorrupted = true; + return false; } + return true; } -void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { +bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); - return; + return false; } if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { AKLOGE("Cannot flush the dictionary to file with GC."); mIsCorrupted = true; + return false; } + return true; } bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { @@ -396,7 +443,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code const int probability = bigramEntry.hasHistoricalInfo() ? ForgettingCurveUtils::decodeProbability( bigramEntry.getHistoricalInfo(), mHeaderPolicy) : - getProbability(word1Probability, bigramEntry.getProbability()); + bigramEntry.getProbability(); bigrams.emplace_back(&word1, probability, historicalInfo->getTimeStamp(), historicalInfo->getLevel(), historicalInfo->getCount()); @@ -419,14 +466,17 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code shortcuts.emplace_back(&target, shortcutProbability); } } - const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(), - ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), + const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), + ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), historicalInfo->getTimeStamp(), historicalInfo->getLevel(), historicalInfo->getCount(), &shortcuts); return WordProperty(&codePointVector, &unigramProperty, &bigrams); } int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { + // TODO: Return code point count like other methods. + // Null termination. + outCodePoints[0] = 0; if (token == 0) { mTerminalPtNodePositionsForIteratingWords.clear(); DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( @@ -443,8 +493,13 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const } const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; int unigramProbability = NOT_A_PROBABILITY; - getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH, - outCodePoints, &unigramProbability); + const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( + terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); + if (codePointCount < MAX_WORD_LENGTH) { + // Null termination. outCodePoints have to be null terminated or contain MAX_WORD_LENGTH + // code points. + outCodePoints[codePointCount] = 0; + } const int nextToken = token + 1; if (nextToken >= terminalPtNodePositionsVectorSize) { // All words have been iterated. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h index 8f981def5..0a20965f3 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -21,10 +21,10 @@ #include "defines.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" #include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" +#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" @@ -90,18 +90,20 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { return &mShortcutPolicy; } - bool addUnigramWord(const int *const word, const int length, + bool addUnigramEntry(const int *const word, const int length, const UnigramProperty *const unigramProperty); - bool addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability, const int timestamp); + bool removeUnigramEntry(const int *const word, const int length); - bool removeBigramWords(const int *const word0, const int length0, const int *const word1, + bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, + const BigramProperty *const bigramProperty); + + bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word1, const int length1); - void flush(const char *const filePath); + bool flush(const char *const filePath); - void flushWithGC(const char *const filePath); + bool flushWithGC(const char *const filePath); bool needsToRunGC(const bool mindsBlockByGC) const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp index 12298d967..e868ddf6f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp @@ -19,9 +19,9 @@ #include <cstring> #include <queue> -#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" #include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" @@ -213,13 +213,16 @@ bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( // Delete unigrams. while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) { const int ptNodePos = priorityQueue.top().getDictPos(); + priorityQueue.pop(); const PtNodeParams ptNodeParams = ptNodeReader->fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + if (ptNodeParams.representsNonWordInfo()) { + continue; + } if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) { AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos); return false; } - priorityQueue.pop(); } return true; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h index 23cbe3aa3..a2e88a46c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h @@ -37,13 +37,13 @@ class BufferWithExtendableBuffer { BufferWithExtendableBuffer(uint8_t *const originalBuffer, const int originalBufferSize, const int maxAdditionalBufferSize) : mOriginalBuffer(originalBuffer), mOriginalBufferSize(originalBufferSize), - mAdditionalBuffer(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP), mUsedAdditionalBufferSize(0), + mAdditionalBuffer(0), mUsedAdditionalBufferSize(0), mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} // Without original buffer. BufferWithExtendableBuffer(const int maxAdditionalBufferSize) : mOriginalBuffer(0), mOriginalBufferSize(0), - mAdditionalBuffer(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP), mUsedAdditionalBufferSize(0), + mAdditionalBuffer(0), mUsedAdditionalBufferSize(0), mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} AK_FORCE_INLINE int getTailPosition() const { diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp index 87fa5994c..4da339b0a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp @@ -17,8 +17,13 @@ #include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" #include <cstdio> +#include <errno.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/types.h> #include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" @@ -34,9 +39,18 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = const int dictVersion, const std::vector<int> localeAsCodePointVector, const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { TimeKeeper::setCurrentTime(); - switch (dictVersion) { + const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion); + switch (formatVersion) { case FormatUtils::VERSION_4: - return createEmptyV4DictFile(filePath, localeAsCodePointVector, attributeMap); + return createEmptyV4DictFile<backward::v402::Ver4DictConstants, + backward::v402::Ver4DictBuffers, + backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr>( + filePath, localeAsCodePointVector, attributeMap, formatVersion); + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + case FormatUtils::VERSION_4_DEV: + return createEmptyV4DictFile<Ver4DictConstants, Ver4DictBuffers, + Ver4DictBuffers::Ver4DictBuffersPtr>( + filePath, localeAsCodePointVector, attributeMap, formatVersion); default: AKLOGE("Cannot create dictionary %s because format version %d is not supported.", filePath, dictVersion); @@ -44,13 +58,14 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = } } +template<class DictConstants, class DictBuffers, class DictBuffersPtr> /* static */ bool DictFileWritingUtils::createEmptyV4DictFile(const char *const dirPath, const std::vector<int> localeAsCodePointVector, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { - HeaderPolicy headerPolicy(FormatUtils::VERSION_4, localeAsCodePointVector, attributeMap); - Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( - Ver4DictBuffers::createVer4DictBuffers(&headerPolicy, - Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE)); + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap, + const FormatUtils::FORMAT_VERSION formatVersion) { + HeaderPolicy headerPolicy(formatVersion, localeAsCodePointVector, attributeMap); + DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy, + DictConstants::MAX_DICT_EXTENDED_REGION_SIZE); headerPolicy.fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, 0 /* unigramCount */, 0 /* bigramCount */, 0 /* extendedRegionSize */, dictBuffers->getWritableHeaderBuffer()); @@ -96,9 +111,15 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = /* static */ bool DictFileWritingUtils::flushBufferToFile(const char *const filePath, const BufferWithExtendableBuffer *const buffer) { - FILE *const file = fopen(filePath, "wb"); + const int fd = open(filePath, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (fd == -1) { + AKLOGE("File %s cannot be opened. errno: %d", filePath, errno); + ASSERT(false); + return false; + } + FILE *const file = fdopen(fd, "wb"); if (!file) { - AKLOGE("File %s cannot be opened.", filePath); + AKLOGE("fdopen failed for the file %s. errno: %d", filePath, errno); ASSERT(false); return false; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h index 54ec651f7..5df5856d2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h @@ -21,6 +21,7 @@ #include "defines.h" #include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" namespace latinime { @@ -44,9 +45,16 @@ class DictFileWritingUtils { private: DISALLOW_IMPLICIT_CONSTRUCTORS(DictFileWritingUtils); + static bool createEmptyV401DictFile(const char *const filePath, + const std::vector<int> localeAsCodePointVector, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap, + const FormatUtils::FORMAT_VERSION formatVersion); + + template<class DictConstants, class DictBuffers, class DictBuffersPtr> static bool createEmptyV4DictFile(const char *const filePath, const std::vector<int> localeAsCodePointVector, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap, + const FormatUtils::FORMAT_VERSION formatVersion); static bool flushBufferToFile(const char *const filePath, const BufferWithExtendableBuffer *const buffer); diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp index c7d3df984..fed0ae77e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp @@ -30,7 +30,7 @@ const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8; const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; const int ForgettingCurveUtils::MAX_LEVEL = 3; -const int ForgettingCurveUtils::MIN_VALID_LEVEL = 1; +const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 1; const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 15; const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 14; @@ -41,25 +41,34 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT // TODO: Revise the logic to decide the initial probability depending on the given probability. /* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo( - const HistoricalInfo *const originalHistoricalInfo, - const int newProbability, const int timestamp, const HeaderPolicy *const headerPolicy) { + const HistoricalInfo *const originalHistoricalInfo, const int newProbability, + const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy) { + const int timestamp = newHistoricalInfo->getTimeStamp(); if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) { - return HistoricalInfo(timestamp, MIN_VALID_LEVEL /* level */, 0 /* count */); - } else if (!originalHistoricalInfo->isValid()) { + // Add entry as a valid word. + const int level = clampToVisibleEntryLevelRange(newHistoricalInfo->getLevel()); + const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy); + return HistoricalInfo(timestamp, level, count); + } else if (!originalHistoricalInfo->isValid() + || originalHistoricalInfo->getLevel() < newHistoricalInfo->getLevel() + || (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel() + && originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) { // Initial information. - return HistoricalInfo(timestamp, 0 /* level */, 1 /* count */); + const int level = clampToValidLevelRange(newHistoricalInfo->getLevel()); + const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy); + return HistoricalInfo(timestamp, level, count); } else { const int updatedCount = originalHistoricalInfo->getCount() + 1; if (updatedCount >= headerPolicy->getForgettingCurveOccurrencesToLevelUp()) { // The count exceeds the max value the level can be incremented. if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) { // The level is already max. - return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), - originalHistoricalInfo->getCount()); + return HistoricalInfo(timestamp, + originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount()); } else { // Level up. - return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel() + 1, - 0 /* count */); + return HistoricalInfo(timestamp, + originalHistoricalInfo->getLevel() + 1, 0 /* count */); } } else { return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), updatedCount); @@ -73,8 +82,8 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT headerPolicy->getForgettingCurveDurationToLevelDown()); return sProbabilityTable.getProbability( headerPolicy->getForgettingCurveProbabilityValuesTableId(), - std::min(std::max(historicalInfo->getLevel(), 0), MAX_LEVEL), - std::min(std::max(elapsedTimeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT)); + clampToValidLevelRange(historicalInfo->getLevel()), + clampToValidTimeStepCountRange(elapsedTimeStepCount)); } /* static */ int ForgettingCurveUtils::getProbability(const int unigramProbability, @@ -155,6 +164,23 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT return elapsedTimeInSeconds / timeStepDurationInSeconds; } +/* static */ int ForgettingCurveUtils::clampToVisibleEntryLevelRange(const int level) { + return std::min(std::max(level, MIN_VISIBLE_LEVEL), MAX_LEVEL); +} + +/* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count, + const HeaderPolicy *const headerPolicy) { + return std::min(std::max(count, 0), headerPolicy->getForgettingCurveOccurrencesToLevelUp() - 1); +} + +/* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) { + return std::min(std::max(level, 0), MAX_LEVEL); +} + +/* static */ int ForgettingCurveUtils::clampToValidTimeStepCountRange(const int timeStepCount) { + return std::min(std::max(timeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT); +} + const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4; const int ForgettingCurveUtils::ProbabilityTable::WEAK_PROBABILITY_TABLE_ID = 0; const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = 1; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h index bb8690939..3ff80aeec 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h @@ -30,7 +30,7 @@ class ForgettingCurveUtils { public: static const HistoricalInfo createUpdatedHistoricalInfo( const HistoricalInfo *const originalHistoricalInfo, const int newProbability, - const int timestamp, const HeaderPolicy *const headerPolicy); + const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy); static const HistoricalInfo createHistoricalInfoToSave( const HistoricalInfo *const originalHistoricalInfo, @@ -93,7 +93,7 @@ class ForgettingCurveUtils { static const int DECAY_INTERVAL_SECONDS; static const int MAX_LEVEL; - static const int MIN_VALID_LEVEL; + static const int MIN_VISIBLE_LEVEL; static const int MAX_ELAPSED_TIME_STEP_COUNT; static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; @@ -103,8 +103,11 @@ class ForgettingCurveUtils { static const ProbabilityTable sProbabilityTable; static int backoff(const int unigramProbability); - static int getElapsedTimeStepCount(const int timestamp, const int durationToLevelDown); + static int clampToVisibleEntryLevelRange(const int level); + static int clampToValidLevelRange(const int level); + static int clampToValidCountRange(const int count, const HeaderPolicy *const headerPolicy); + static int clampToValidTimeStepCountRange(const int timeStepCount); }; } // namespace latinime #endif /* LATINIME_FORGETTING_CURVE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp index cd3c403fa..1916ea560 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp @@ -25,6 +25,20 @@ const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE; // Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12 const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; +/* static */ FormatUtils::FORMAT_VERSION FormatUtils::getFormatVersion(const int formatVersion) { + switch (formatVersion) { + case VERSION_2: + return VERSION_2; + case VERSION_4_ONLY_FOR_TESTING: + return VERSION_4_ONLY_FOR_TESTING; + case VERSION_4: + return VERSION_4; + case VERSION_4_DEV: + return VERSION_4_DEV; + default: + return UNKNOWN_VERSION; + } +} /* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion( const uint8_t *const dict, const int dictSize) { // The magic number is stored big-endian. @@ -36,7 +50,7 @@ const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; const uint32_t magicNumber = ByteArrayUtils::readUint32(dict, 0); switch (magicNumber) { case MAGIC_NUMBER: - // Version 2 header is as follows: + // The layout of the header is as follows: // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE // Dictionary format version number (2 bytes) // Options (2 bytes) @@ -44,13 +58,7 @@ const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; // Conceptually this converts the hardcoded value of the bytes in the file into // the symbolic value we use in the code. But we want the constants to be the // same so we use them for both here. - if (ByteArrayUtils::readUint16(dict, 4) == VERSION_2) { - return VERSION_2; - } else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_4) { - return VERSION_4; - } else { - return UNKNOWN_VERSION; - } + return getFormatVersion(ByteArrayUtils::readUint16(dict, 4)); default: return UNKNOWN_VERSION; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h index 759b1c9b2..55ad5799f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h @@ -31,7 +31,9 @@ class FormatUtils { enum FORMAT_VERSION { // These MUST have the same values as the relevant constants in FormatSpec.java. VERSION_2 = 2, - VERSION_4 = 401, + VERSION_4_ONLY_FOR_TESTING = 399, + VERSION_4 = 402, + VERSION_4_DEV = 403, UNKNOWN_VERSION = -1 }; @@ -39,6 +41,7 @@ class FormatUtils { // unsupported or obsolete dictionary formats. static const uint32_t MAGIC_NUMBER; + static FORMAT_VERSION getFormatVersion(const int formatVersion); static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize); private: diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp index d3e0c237f..4a126ff85 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp @@ -33,7 +33,7 @@ namespace latinime { const int mmapFd = open(path, O_RDONLY); if (mmapFd < 0) { AKLOGE("DICT: Can't open the source. path=%s errno=%d", path, errno); - return MmappedBufferPtr(nullptr); + return nullptr; } const int pagesize = sysconf(_SC_PAGESIZE); const int offset = bufferOffset % pagesize; @@ -45,13 +45,13 @@ namespace latinime { if (mmappedBuffer == MAP_FAILED) { AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno); close(mmapFd); - return MmappedBufferPtr(nullptr); + return nullptr; } uint8_t *const buffer = static_cast<uint8_t *>(mmappedBuffer) + offset; if (!buffer) { AKLOGE("DICT: buffer is null"); close(mmapFd); - return MmappedBufferPtr(nullptr); + return nullptr; } return MmappedBufferPtr(new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize, mmapFd, isUpdatable)); @@ -61,7 +61,7 @@ namespace latinime { const char *const path, const bool isUpdatable) { const int fileSize = FileUtils::getFileSize(path); if (fileSize == -1) { - return MmappedBufferPtr(nullptr); + return nullptr; } else if (fileSize == 0) { return MmappedBufferPtr(new MmappedBuffer(isUpdatable)); } else { @@ -76,7 +76,7 @@ namespace latinime { const int filePathLength = snprintf(filePath, filePathBufferSize, "%s%s", dirPath, fileName); if (filePathLength >= filePathBufferSize) { - return MmappedBufferPtr(nullptr); + return nullptr; } return openBuffer(filePath, isUpdatable); } diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp index fa9600c74..3fc566e7a 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp @@ -37,6 +37,7 @@ const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.1524f; const float ScoringParams::PROXIMITY_COST = 0.0694f; const float ScoringParams::FIRST_CHAR_PROXIMITY_COST = 0.072f; const float ScoringParams::FIRST_PROXIMITY_COST = 0.07788f; +const float ScoringParams::INTENTIONAL_OMISSION_COST = 0.1f; const float ScoringParams::OMISSION_COST = 0.467f; const float ScoringParams::OMISSION_COST_SAME_CHAR = 0.345f; const float ScoringParams::OMISSION_COST_FIRST_CHAR = 0.5256f; diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.h b/native/jni/src/suggest/policyimpl/typing/scoring_params.h index b66962019..b12de6d87 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.h +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.h @@ -44,6 +44,7 @@ class ScoringParams { static const float PROXIMITY_COST; static const float FIRST_CHAR_PROXIMITY_COST; static const float FIRST_PROXIMITY_COST; + static const float INTENTIONAL_OMISSION_COST; static const float OMISSION_COST; static const float OMISSION_COST_SAME_CHAR; static const float OMISSION_COST_FIRST_CHAR; diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h index 0ba439b47..84077174d 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h @@ -54,12 +54,15 @@ class TypingWeighting : public Weighting { float getOmissionCost(const DicNode *const parentDicNode, const DicNode *const dicNode) const { const bool isZeroCostOmission = parentDicNode->isZeroCostOmission(); + const bool isIntentionalOmission = parentDicNode->canBeIntentionalOmission(); const bool sameCodePoint = dicNode->isSameNodeCodePoint(parentDicNode); // If the traversal omitted the first letter then the dicNode should now be on the second. const bool isFirstLetterOmission = dicNode->getNodeCodePointCount() == 2; float cost = 0.0f; if (isZeroCostOmission) { cost = 0.0f; + } else if (isIntentionalOmission) { + cost = ScoringParams::INTENTIONAL_OMISSION_COST; } else if (isFirstLetterOmission) { cost = ScoringParams::OMISSION_COST_FIRST_CHAR; } else { diff --git a/native/jni/src/utils/char_utils.cpp b/native/jni/src/utils/char_utils.cpp index adc474b4c..b17e0847d 100644 --- a/native/jni/src/utils/char_utils.cpp +++ b/native/jni/src/utils/char_utils.cpp @@ -22,6 +22,9 @@ namespace latinime { +const int CharUtils::MIN_UNICODE_CODE_POINT = 0; +const int CharUtils::MAX_UNICODE_CODE_POINT = 0x10FFFF; + struct LatinCapitalSmallPair { unsigned short capital; unsigned short small; diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h index 239419d5b..f28ed5682 100644 --- a/native/jni/src/utils/char_utils.h +++ b/native/jni/src/utils/char_utils.h @@ -18,6 +18,7 @@ #define LATINIME_CHAR_UTILS_H #include <cctype> +#include <cstring> #include <vector> #include "defines.h" @@ -86,12 +87,32 @@ class CharUtils { return spaceCount; } + static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) { + return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT; + } + static unsigned short latin_tolower(const unsigned short c); static const std::vector<int> EMPTY_STRING; + // Returns updated code point count. Returns 0 when the code points cannot be marked as a + // Beginning-of-Sentence. + static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints, + const int codePointCount, const int maxCodePoint) { + if (codePointCount >= maxCodePoint) { + // the code points cannot be marked as a Beginning-of-Sentence. + return 0; + } + memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount); + codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE; + return codePointCount + 1; + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); + static const int MIN_UNICODE_CODE_POINT; + static const int MAX_UNICODE_CODE_POINT; + /** * Table mapping most combined Latin, Greek, and Cyrillic characters * to their base characters. If c is in range, BASE_CHARS[c] == c diff --git a/native/jni/src/utils/jni_data_utils.cpp b/native/jni/src/utils/jni_data_utils.cpp new file mode 100644 index 000000000..5555293d5 --- /dev/null +++ b/native/jni/src/utils/jni_data_utils.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/jni_data_utils.h" + +namespace latinime { + +const int JniDataUtils::CODE_POINT_REPLACEMENT_CHARACTER = 0xFFFD; +const int JniDataUtils::CODE_POINT_NULL = 0; + +} // namespace latinime diff --git a/native/jni/src/utils/jni_data_utils.h b/native/jni/src/utils/jni_data_utils.h index 2ce02dc05..67a66fdfe 100644 --- a/native/jni/src/utils/jni_data_utils.h +++ b/native/jni/src/utils/jni_data_utils.h @@ -23,6 +23,7 @@ #include "jni.h" #include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" +#include "utils/char_utils.h" namespace latinime { @@ -65,8 +66,44 @@ class JniDataUtils { return attributeMap; } + static void outputCodePoints(JNIEnv *env, jintArray intArrayToOutputCodePoints, const int start, + const int maxLength, const int *const codePoints, const int codePointCount, + const bool needsNullTermination) { + const int outputCodePointCount = std::min(maxLength, codePointCount); + int outputCodePonts[outputCodePointCount]; + for (int i = 0; i < outputCodePointCount; ++i) { + const int codePoint = codePoints[i]; + if (!CharUtils::isInUnicodeSpace(codePoint)) { + outputCodePonts[i] = CODE_POINT_REPLACEMENT_CHARACTER; + } else if (codePoint >= 0x01 && codePoint <= 0x1F) { + // Control code. + outputCodePonts[i] = CODE_POINT_REPLACEMENT_CHARACTER; + } else { + outputCodePonts[i] = codePoint; + } + } + env->SetIntArrayRegion(intArrayToOutputCodePoints, start, outputCodePointCount, + outputCodePonts); + if (needsNullTermination && outputCodePointCount < maxLength) { + env->SetIntArrayRegion(intArrayToOutputCodePoints, start + outputCodePointCount, + 1 /* len */, &CODE_POINT_NULL); + } + } + + static void putIntToArray(JNIEnv *env, jintArray array, const int index, const int value) { + env->SetIntArrayRegion(array, index, 1 /* len */, &value); + } + + static void putFloatToArray(JNIEnv *env, jfloatArray array, const int index, + const float value) { + env->SetFloatArrayRegion(array, index, 1 /* len */, &value); + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(JniDataUtils); + + static const int CODE_POINT_REPLACEMENT_CHARACTER; + static const int CODE_POINT_NULL; }; } // namespace latinime #endif // LATINIME_JNI_DATA_UTILS_H |