diff options
Diffstat (limited to 'native/jni/src')
42 files changed, 366 insertions, 223 deletions
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index 761063f8a..a80c97530 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -293,13 +293,6 @@ static inline void prof_out(void) { #define M_PI_F 3.14159265f #define MAX_PERCENTILE 100 -// Number of base-10 digits in the largest integer + 1 to leave room for a zero terminator. -// As such, this is the maximum number of characters will be needed to represent an int as a -// string, including the terminator; this is used as the size of a string buffer large enough to -// hold any value that is intended to fit in an integer, e.g. in the code that reads the header -// of the binary dictionary where a {key,value} string pair scheme is used. -#define LARGEST_INT_DIGIT_COUNT 11 - #define NOT_A_CODE_POINT (-1) #define NOT_A_DISTANCE (-1) #define NOT_A_COORDINATE (-1) diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index e288413a3..fdc893653 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -88,11 +88,10 @@ void Dictionary::addUnigramWord(const int *const word, const int length, mDictionaryStructureWithBufferPolicy->addUnigramWord(word, length, unigramProperty); } -void Dictionary::addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability, const int timestamp) { +void Dictionary::addBigramWords(const int *const word0, const int length0, + const BigramProperty *const bigramProperty) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->addBigramWords(word0, length0, word1, length1, - probability, timestamp); + mDictionaryStructureWithBufferPolicy->addBigramWords(word0, length0, bigramProperty); } void Dictionary::removeBigramWords(const int *const word0, const int length0, diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index b6149b338..f0a7e5b6a 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -76,8 +76,8 @@ class Dictionary { void addUnigramWord(const int *const codePoints, const int codePointCount, const UnigramProperty *const unigramProperty); - void addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability, const int timestamp); + void addBigramWords(const int *const word0, const int length0, + const BigramProperty *const bigramProperty); void removeBigramWords(const int *const word0, const int length0, const int *const word1, const int length1); diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.h b/native/jni/src/suggest/core/dictionary/property/word_property.h index 5519a917c..aa3e0b68a 100644 --- a/native/jni/src/suggest/core/dictionary/property/word_property.h +++ b/native/jni/src/suggest/core/dictionary/property/word_property.h @@ -42,6 +42,14 @@ class WordProperty { jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities, jobject outShortcutTargets, jobject outShortcutProbabilities) const; + const UnigramProperty *getUnigramProperty() const { + return &mUnigramProperty; + } + + const std::vector<BigramProperty> *getBigramProperties() const { + return &mBigrams; + } + private: // Default copy constructor is used for using as a return value. DISALLOW_ASSIGNMENT_OPERATOR(WordProperty); diff --git a/native/jni/src/suggest/core/layout/proximity_info.cpp b/native/jni/src/suggest/core/layout/proximity_info.cpp index c40a2bdca..4c75a188e 100644 --- a/native/jni/src/suggest/core/layout/proximity_info.cpp +++ b/native/jni/src/suggest/core/layout/proximity_info.cpp @@ -226,7 +226,7 @@ int ProximityInfo::getKeyCenterXOfKeyIdG( // When the referencePointY is NOT_A_COORDINATE, this method calculates the return value without // using the line segment. int ProximityInfo::getKeyCenterYOfKeyIdG( - const int keyId, const int referencePointY, const bool isGeometric) const { + const int keyId, const int referencePointY, const bool isGeometric) const { // TODO: Remove "isGeometric" and have separate "proximity_info"s for gesture and typing. if (keyId < 0) { return 0; diff --git a/native/jni/src/suggest/core/layout/proximity_info_state_utils.h b/native/jni/src/suggest/core/layout/proximity_info_state_utils.h index 71e83a80c..211a79737 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_state_utils.h +++ b/native/jni/src/suggest/core/layout/proximity_info_state_utils.h @@ -56,7 +56,7 @@ class ProximityInfoStateUtils { const std::vector<int> *const sampledLengthCache, const std::vector<int> *const sampledInputIndice, std::vector<float> *sampledSpeedRates, std::vector<float> *sampledDirections); - static void refreshBeelineSpeedRates(const int mostCommonKeyWidth, const float averageSpeed, + static void refreshBeelineSpeedRates(const int mostCommonKeyWidth, const float averageSpeed, const int inputSize, const int *const xCoordinates, const int *const yCoordinates, const int *times, const int sampledInputSize, const std::vector<int> *const sampledInputXs, diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h index a8dab9fcd..845e629e6 100644 --- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h @@ -49,6 +49,8 @@ class DictionaryHeaderStructurePolicy { virtual bool shouldBoostExactMatches() const = 0; + virtual const std::vector<int> *getLocale() const = 0; + protected: DictionaryHeaderStructurePolicy() {} diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h index 807f9b8dd..ce5a49f83 100644 --- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h @@ -73,8 +73,8 @@ class DictionaryStructureWithBufferPolicy { const UnigramProperty *const unigramProperty) = 0; // Returns whether the update was success or not. - virtual bool addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability, const int timestamp) = 0; + virtual bool addBigramWords(const int *const word0, const int length0, + const BigramProperty *const bigramProperty) = 0; // Returns whether the update was success or not. virtual bool removeBigramWords(const int *const word0, const int length0, diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 251a71941..da24302c2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -139,6 +139,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { switch (mDictFormatVersion) { case FormatUtils::VERSION_2: return FormatUtils::VERSION_2; + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + return FormatUtils::VERSION_4_ONLY_FOR_TESTING; case FormatUtils::VERSION_4: return FormatUtils::VERSION_4; default: @@ -238,6 +240,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { const int unigramCount, const int bigramCount, const int extendedRegionSize, DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const; + AK_FORCE_INLINE const std::vector<int> *getLocale() const { + return &mLocale; + } + private: DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp index d20accfbc..2a9028a9e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp @@ -26,6 +26,13 @@ namespace latinime { +// Number of base-10 digits in the largest integer + 1 to leave room for a zero terminator. +// As such, this is the maximum number of characters will be needed to represent an int as a +// string, including the terminator; this is used as the size of a string buffer large enough to +// hold any value that is intended to fit in an integer, e.g. in the code that reads the header +// of the binary dictionary where a {key,value} string pair scheme is used. +const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11; + const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256; const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256; @@ -91,8 +98,9 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; case FormatUtils::VERSION_2: // Version 2 dictionary writing is not supported. return false; + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: - return buffer->writeUintAndAdvancePosition(FormatUtils::VERSION_4 /* data */, + return buffer->writeUintAndAdvancePosition(version /* data */, HEADER_DICTIONARY_VERSION_SIZE, writingPos); default: return false; @@ -154,8 +162,8 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; /* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, const int value) { AttributeMap::mapped_type valueVector; - char charBuf[LARGEST_INT_DIGIT_COUNT + 1]; - snprintf(charBuf, LARGEST_INT_DIGIT_COUNT + 1, "%d", value); + char charBuf[LARGEST_INT_DIGIT_COUNT]; + snprintf(charBuf, sizeof(charBuf), "%d", value); insertCharactersIntoVector(charBuf, &valueVector); (*headerAttributes)[*key] = valueVector; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h index a6b4c4e14..9b90488fc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h @@ -92,6 +92,7 @@ class HeaderReadWriteUtils { private: DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils); + static const int LARGEST_INT_DIGIT_COUNT; static const int MAX_ATTRIBUTE_KEY_LENGTH; static const int MAX_ATTRIBUTE_VALUE_LENGTH; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp index be7e43b98..c4d18608c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp @@ -52,9 +52,11 @@ namespace latinime { DictionaryStructureWithBufferPolicyFactory:: newPolicyForOnMemoryDict( const int formatVersion, const std::vector<int> &locale, const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { - switch (formatVersion) { + FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion); + switch (dictFormatVersion) { + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: { - HeaderPolicy headerPolicy(FormatUtils::VERSION_4, locale, attributeMap); + HeaderPolicy headerPolicy(dictFormatVersion, locale, attributeMap); Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers = Ver4DictBuffers::createVer4DictBuffers(&headerPolicy, Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE); @@ -87,11 +89,13 @@ namespace latinime { if (!mmappedBuffer) { return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(nullptr); } - switch (FormatUtils::detectFormatVersion(mmappedBuffer->getBuffer(), - mmappedBuffer->getBufferSize())) { + const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::detectFormatVersion( + mmappedBuffer->getBuffer(), mmappedBuffer->getBufferSize()); + switch (formatVersion) { case FormatUtils::VERSION_2: AKLOGE("Given path is a directory but the format is version 2. path: %s", path); break; + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: { const int dictDirPathBufSize = strlen(headerFilePath) + 1 /* terminator */; char dictPath[dictDirPathBufSize]; @@ -102,7 +106,8 @@ namespace latinime { return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(nullptr); } Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( - Ver4DictBuffers::openVer4DictBuffers(dictPath, std::move(mmappedBuffer))); + Ver4DictBuffers::openVer4DictBuffers(dictPath, std::move(mmappedBuffer), + formatVersion)); if (!dictBuffers || !dictBuffers->isValid()) { AKLOGE("DICT: The dictionary doesn't satisfy ver4 format requirements. path: %s", path); @@ -135,6 +140,7 @@ namespace latinime { case FormatUtils::VERSION_2: return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( new PatriciaTriePolicy(std::move(mmappedBuffer))); + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: AKLOGE("Given path is a file but the format is version 4. path: %s", path); break; diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp index 7d0d09631..08b4e0b5e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h index 15f924a6a..15f924a6a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp index e02dd5550..9e575858a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp @@ -16,6 +16,7 @@ #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" +#include "suggest/core/dictionary/property/unigram_property.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" @@ -29,9 +30,8 @@ const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; bool DynamicPtUpdatingHelper::addUnigramWord( DynamicPtReadingHelper *const readingHelper, - const int *const wordCodePoints, const int codePointCount, const int probability, - const bool isNotAWord, const bool isBlacklisted, const int timestamp, - bool *const outAddedNewUnigram) { + const int *const wordCodePoints, const int codePointCount, + const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { int parentPos = NOT_A_DICT_POS; while (!readingHelper->isEnd()) { const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); @@ -53,20 +53,18 @@ bool DynamicPtUpdatingHelper::addUnigramWord( if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(ptNodeParams, j, wordCodePoints[matchedCodePointCount + j])) { *outAddedNewUnigram = true; - return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, isNotAWord, isBlacklisted, - probability, timestamp, wordCodePoints + matchedCodePointCount, + return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty, + wordCodePoints + matchedCodePointCount, codePointCount - matchedCodePointCount); } } // All characters are matched. if (codePointCount == readingHelper->getTotalCodePointCount(ptNodeParams)) { - return setPtNodeProbability(&ptNodeParams, isNotAWord, isBlacklisted, probability, - timestamp, outAddedNewUnigram); + return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram); } if (!ptNodeParams.hasChildren()) { *outAddedNewUnigram = true; - return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, - isNotAWord, isBlacklisted, probability, timestamp, + return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty, wordCodePoints + readingHelper->getTotalCodePointCount(ptNodeParams), codePointCount - readingHelper->getTotalCodePointCount(ptNodeParams)); } @@ -83,17 +81,17 @@ bool DynamicPtUpdatingHelper::addUnigramWord( return createAndInsertNodeIntoPtNodeArray(parentPos, wordCodePoints + readingHelper->getPrevTotalCodePointCount(), codePointCount - readingHelper->getPrevTotalCodePointCount(), - isNotAWord, isBlacklisted, probability, timestamp, &pos); + unigramProperty, &pos); } bool DynamicPtUpdatingHelper::addBigramWords(const int word0Pos, const int word1Pos, - const int probability, const int timestamp, bool *const outAddedNewBigram) { + const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) { const PtNodeParams sourcePtNodeParams( mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word0Pos)); const PtNodeParams targetPtNodeParams( mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word1Pos)); - return mPtNodeWriter->addNewBigramEntry(&sourcePtNodeParams, &targetPtNodeParams, probability, - timestamp, outAddedNewBigram); + return mPtNodeWriter->addNewBigramEntry(&sourcePtNodeParams, &targetPtNodeParams, + bigramProperty, outAddedNewBigram); } // Remove a bigram relation from word0Pos to word1Pos. @@ -115,36 +113,34 @@ bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos, bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, const int nodeCodePointCount, - const bool isNotAWord, const bool isBlacklisted, const int probability, - const int timestamp, int *const forwardLinkFieldPos) { + const UnigramProperty *const unigramProperty, int *const forwardLinkFieldPos) { const int newPtNodeArrayPos = mBuffer->getTailPosition(); if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, newPtNodeArrayPos, forwardLinkFieldPos)) { return false; } return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount, - isNotAWord, isBlacklisted, probability, timestamp); + unigramProperty); } -bool DynamicPtUpdatingHelper::setPtNodeProbability( - const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, - const bool isBlacklisted, const int probability, const int timestamp, - bool *const outAddedNewUnigram) { +bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, + const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { if (originalPtNodeParams->isTerminal()) { // Overwrites the probability. *outAddedNewUnigram = false; - return mPtNodeWriter->updatePtNodeProbability(originalPtNodeParams, probability, timestamp); + return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty); } else { // Make the node terminal and write the probability. *outAddedNewUnigram = true; const int movedPos = mBuffer->getTailPosition(); int writingPos = movedPos; const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams, - isNotAWord, isBlacklisted, true /* isTerminal */, - originalPtNodeParams->getParentPos(), originalPtNodeParams->getCodePointCount(), - originalPtNodeParams->getCodePoints(), probability)); + unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + true /* isTerminal */, originalPtNodeParams->getParentPos(), + originalPtNodeParams->getCodePointCount(), originalPtNodeParams->getCodePoints(), + unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, - timestamp, &writingPos)) { + unigramProperty, &writingPos)) { return false; } if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) { @@ -155,31 +151,30 @@ bool DynamicPtUpdatingHelper::setPtNodeProbability( } bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode( - const PtNodeParams *const parentPtNodeParams, const bool isNotAWord, - const bool isBlacklisted, const int probability, const int timestamp, + const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty, const int *const codePoints, const int codePointCount) { const int newPtNodeArrayPos = mBuffer->getTailPosition(); if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) { return false; } return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints, - codePointCount, isNotAWord, isBlacklisted, probability, timestamp); + codePointCount, unigramProperty); } bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount, - const bool isNotAWord, const bool isBlacklisted, const int probability, - const int timestamp) { + const UnigramProperty *const unigramProperty) { int writingPos = mBuffer->getTailPosition(); if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, 1 /* arraySize */, &writingPos)) { return false; } const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( - isNotAWord, isBlacklisted, true /* isTerminal */, - parentPtNodePos, nodeCodePointCount, nodeCodePoints, probability)); - if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, timestamp, - &writingPos)) { + unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), true /* isTerminal */, + parentPtNodePos, nodeCodePointCount, nodeCodePoints, + unigramProperty->getProbability())); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, + unigramProperty, &writingPos)) { return false; } if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, @@ -192,13 +187,13 @@ bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( // Returns whether the dictionary updating was succeeded or not. bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, - const bool isNotAWord, const bool isBlacklisted, const int probabilityOfNewPtNode, - const int timestamp, const int *const newNodeCodePoints, const int newNodeCodePointCount) { + const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints, + const int newNodeCodePointCount) { // When addsExtraChild is true, split the reallocating PtNode and add new child. // Reallocating PtNode: abcde, newNode: abcxy. // abc (1st, not terminal) __ de (2nd) // \_ xy (extra child, terminal) - // Otherwise, this method makes 1st part terminal and write probabilityOfNewPtNode. + // Otherwise, this method makes 1st part terminal and write information in unigramProperty. // Reallocating PtNode: abcde, newNode: abc. // abc (1st, terminal) __ de (2nd) const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount; @@ -216,11 +211,12 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( } } else { const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( - isNotAWord, isBlacklisted, true /* isTerminal */, - reallocatingPtNodeParams->getParentPos(), overlappingCodePointCount, - reallocatingPtNodeParams->getCodePoints(), probabilityOfNewPtNode)); + unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + true /* isTerminal */, reallocatingPtNodeParams->getParentPos(), + overlappingCodePointCount, reallocatingPtNodeParams->getCodePoints(), + unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, - timestamp, &writingPos)) { + unigramProperty, &writingPos)) { return false; } } @@ -244,11 +240,12 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( } if (addsExtraChild) { const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( - isNotAWord, isBlacklisted, true /* isTerminal */, - firstPartOfReallocatedPtNodePos, newNodeCodePointCount - overlappingCodePointCount, - newNodeCodePoints + overlappingCodePointCount, probabilityOfNewPtNode)); + unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + true /* isTerminal */, firstPartOfReallocatedPtNodePos, + newNodeCodePointCount - overlappingCodePointCount, + newNodeCodePoints + overlappingCodePointCount, unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams, - timestamp, &writingPos)) { + unigramProperty, &writingPos)) { return false; } } @@ -269,8 +266,8 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( } const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams( - const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, - const bool isBlacklisted, const bool isTerminal, const int parentPos, + const PtNodeParams *const originalPtNodeParams, + const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, const int parentPos, const int codePointCount, const int *const codePoints, const int probability) const { const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( isBlacklisted, isNotAWord, isTerminal, originalPtNodeParams->hasShortcutTargets(), diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h index 9b2815263..f10d15a9b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h @@ -22,10 +22,12 @@ namespace latinime { +class BigramProperty; class BufferWithExtendableBuffer; class DynamicPtReadingHelper; class PtNodeReader; class PtNodeWriter; +class UnigramProperty; class DynamicPtUpdatingHelper { public: @@ -37,13 +39,12 @@ class DynamicPtUpdatingHelper { // Add a word to the dictionary. If the word already exists, update the probability. bool addUnigramWord(DynamicPtReadingHelper *const readingHelper, - const int *const wordCodePoints, const int codePointCount, const int probability, - const bool isNotAWord, const bool isBlacklisted, const int timestamp, - bool *const outAddedNewUnigram); + const int *const wordCodePoints, const int codePointCount, + const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram); // Add a bigram relation from word0Pos to word1Pos. - bool addBigramWords(const int word0Pos, const int word1Pos, const int probability, - const int timestamp, bool *const outAddedNewBigram); + bool addBigramWords(const int word0Pos, const int word1Pos, + const BigramProperty *const bigramProperty, bool *const outAddedNewBigram); // Remove a bigram relation from word0Pos to word1Pos. bool removeBigramWords(const int word0Pos, const int word1Pos); @@ -62,25 +63,22 @@ class DynamicPtUpdatingHelper { PtNodeWriter *const mPtNodeWriter; bool createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, - const int nodeCodePointCount, const bool isNotAWord, const bool isBlacklisted, - const int probability, const int timestamp, int *const forwardLinkFieldPos); + const int nodeCodePointCount, const UnigramProperty *const unigramProperty, + int *const forwardLinkFieldPos); - bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, - const bool isBlacklisted, const int probability, const int timestamp, - bool *const outAddedNewUnigram); + bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, + const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram); bool createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams *const parentPtNodeParams, - const bool isNotAWord, const bool isBlacklisted, const int probability, - const int timestamp, const int *const codePoints, const int codePointCount); + const UnigramProperty *const unigramProperty, const int *const codePoints, + const int codePointCount); bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, const int *const nodeCodePoints, - const int nodeCodePointCount, const bool isNotAWord, const bool isBlacklisted, - const int probability, const int timestamp); + const int nodeCodePointCount, const UnigramProperty *const unigramProperty); bool reallocatePtNodeAndAddNewPtNodes( const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, - const bool isNotAWord, const bool isBlacklisted, const int probabilityOfNewPtNode, - const int timestamp, const int *const newNodeCodePoints, + const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints, const int newNodeCodePointCount); const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams, diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h index e843f074a..a8029f73f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h @@ -24,6 +24,9 @@ namespace latinime { +class BigramProperty; +class UnigramProperty; + // Interface class used to write PtNode information. class PtNodeWriter { public: @@ -51,8 +54,8 @@ class PtNodeWriter { virtual bool markPtNodeAsWillBecomeNonTerminal( const PtNodeParams *const toBeUpdatedPtNodeParams) = 0; - virtual bool updatePtNodeProbability(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int probability, const int timestamp) = 0; + virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty) = 0; virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( const PtNodeParams *const toBeUpdatedPtNodeParams, @@ -65,10 +68,10 @@ class PtNodeWriter { int *const ptNodeWritingPos) = 0; virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, - const int timestamp, int *const ptNodeWritingPos) = 0; + const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) = 0; virtual bool addNewBigramEntry(const PtNodeParams *const sourcePtNodeParams, - const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp, + const PtNodeParams *const targetPtNodeParam, const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) = 0; virtual bool removeBigramEntry(const PtNodeParams *const sourcePtNodeParams, diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp index 847dcdee5..91c76941c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" @@ -44,7 +44,7 @@ const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15; } /* static */ int ShortcutListReadingUtils::readShortcutTarget( - const uint8_t *const dictRoot, const int maxLength, int *const outWord, int *const pos) { + const uint8_t *const dictRoot, const int maxLength, int *const outWord, int *const pos) { return ByteArrayUtils::readStringAndAdvancePosition(dictRoot, maxLength, outWord, pos); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h index d065bf7fd..d065bf7fd 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h index a898e2afc..00bb502dc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h @@ -21,7 +21,7 @@ #include "defines.h" #include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h index 85f46603e..54d1e0f6d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h @@ -22,9 +22,9 @@ #include "defines.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/bigram/bigram_list_policy.h" #include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h" #include "suggest/policyimpl/dictionary/utils/format_utils.h" @@ -88,8 +88,8 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { return false; } - bool addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability, const int timestamp) { + bool addBigramWords(const int *const word0, const int length0, + const BigramProperty *const bigramProperty) { // This method should not be called for non-updatable dictionary. AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); return false; diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h index 6d2b4778c..8e16ccc05 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h @@ -21,7 +21,7 @@ #include "defines.h" #include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp index 5df2096a4..7a52fd180 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp @@ -14,10 +14,11 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/core/dictionary/property/bigram_property.h" #include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" #include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h" #include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" @@ -49,13 +50,18 @@ void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const out } bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId, - const int newProbability, const int timestamp, bool *const outAddedNewEntry) { + const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) { + // 1. The word has no bigrams yet. + // 2. The word has bigrams, and there is the target in the list. + // 3. The word has bigrams, and there is an invalid entry that can be reclaimed. + // 4. The word has bigrams. We have to append new bigram entry to the list. + // 5. Same as 4, but the list is the last entry of the content file. if (outAddedNewEntry) { *outAddedNewEntry = false; } const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); if (bigramListPos == NOT_A_DICT_POS) { - // Updating PtNode doesn't have a bigram list. + // Case 1. PtNode that doesn't have a bigram list. // Create new bigram list. if (!mBigramDictContent->createNewBigramList(terminalId)) { return false; @@ -63,7 +69,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, newTargetTerminalId); const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry, - newProbability, timestamp); + bigramProperty); // Write an entry. const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) { @@ -75,42 +81,55 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget return true; } - const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos); - if (entryPosToUpdate != NOT_A_DICT_POS) { - // Overwrite existing entry. - const BigramEntry originalBigramEntry = - mBigramDictContent->getBigramEntry(entryPosToUpdate); - if (!originalBigramEntry.isValid()) { - // Reuse invalid entry. - if (outAddedNewEntry) { - *outAddedNewEntry = true; + int tailEntryPos = NOT_A_DICT_POS; + const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos, + &tailEntryPos); + if (tailEntryPos != NOT_A_DICT_POS || entryPosToUpdate == NOT_A_DICT_POS) { + // Case 4, 5. + // Add new entry to the bigram list. + if (tailEntryPos == NOT_A_DICT_POS) { + // Case 4. Create new bigram list. + if (!mBigramDictContent->createNewBigramList(terminalId)) { + return false; + } + const int destPos = mBigramDictContent->getBigramListHeadPos(terminalId); + // Copy existing bigram list. + if (!mBigramDictContent->copyBigramList(bigramListPos, destPos, &tailEntryPos)) { + return false; } } - const BigramEntry updatedBigramEntry = - originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId); + // Write new entry at the tail position of the bigram content. + const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, + newTargetTerminalId); const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( - &updatedBigramEntry, newProbability, timestamp); - return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate); + &newBigramEntry, bigramProperty); + if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) { + return false; + } + // Update has next flag of the tail entry. + if (!updateHasNextFlag(true /* hasNext */, tailEntryPos)) { + return false; + } + if (outAddedNewEntry) { + *outAddedNewEntry = true; + } + return true; } - // Add new entry to the bigram list. - // Create new bigram list. - if (!mBigramDictContent->createNewBigramList(terminalId)) { - return false; + // Case 2. Overwrite the existing entry. Case 3. Reclaim and reuse the existing invalid entry. + const BigramEntry originalBigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate); + if (!originalBigramEntry.isValid()) { + // Case 3. Reuse the existing invalid entry. outAddedNewEntry is false when an existing + // entry is updated. + if (outAddedNewEntry) { + *outAddedNewEntry = true; + } } - // Write new entry at a head position of the bigram list. - int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); - const BigramEntry newBigramEntry(true /* hasNext */, NOT_A_PROBABILITY, newTargetTerminalId); + const BigramEntry updatedBigramEntry = + originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId); const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( - &newBigramEntry, newProbability, timestamp); - if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(&bigramEntryToWrite, &writingPos)) { - return false; - } - if (outAddedNewEntry) { - *outAddedNewEntry = true; - } - // Append existing entries by copying. - return mBigramDictContent->copyBigramList(bigramListPos, writingPos); + &updatedBigramEntry, bigramProperty); + return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate); } bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) { @@ -119,7 +138,8 @@ bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTer // Bigram list doesn't exist. return false; } - const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos); + const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos, + nullptr /* outTailEntryPos */); if (entryPosToUpdate == NOT_A_DICT_POS) { // Bigram entry doesn't exist. return false; @@ -204,7 +224,10 @@ int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) { } int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, - const int bigramListPos) const { + const int bigramListPos, int *const outTailEntryPos) const { + if (outTailEntryPos) { + *outTailEntryPos = NOT_A_DICT_POS; + } bool hasNext = true; int invalidEntryPos = NOT_A_DICT_POS; int readingPos = bigramListPos; @@ -220,23 +243,36 @@ int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, // Invalid entry that can be reused is found. invalidEntryPos = entryPos; } + if (!hasNext && mBigramDictContent->isContentTailPos(readingPos)) { + if (outTailEntryPos) { + *outTailEntryPos = entryPos; + } + } } return invalidEntryPos; } const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom( - const BigramEntry *const originalBigramEntry, const int newProbability, - const int timestamp) const { + const BigramEntry *const originalBigramEntry, + const BigramProperty *const bigramProperty) const { // TODO: Consolidate historical info and probability. if (mHeaderPolicy->hasHistoricalInfoOfWords()) { + const HistoricalInfo historicalInfoForUpdate(bigramProperty->getTimestamp(), + bigramProperty->getLevel(), bigramProperty->getCount()); const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalBigramEntry->getHistoricalInfo(), newProbability, timestamp, - mHeaderPolicy); + originalBigramEntry->getHistoricalInfo(), bigramProperty->getProbability(), + &historicalInfoForUpdate, mHeaderPolicy); return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo); } else { - return originalBigramEntry->updateProbabilityAndGetEntry(newProbability); + return originalBigramEntry->updateProbabilityAndGetEntry(bigramProperty->getProbability()); } } +bool Ver4BigramListPolicy::updateHasNextFlag(const bool hasNext, const int bigramEntryPos) { + const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(bigramEntryPos); + const BigramEntry updatedBigramEntry = bigramEntry.updateHasNextAndGetEntry(hasNext); + return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, bigramEntryPos); +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h index 5b6c5a173..1613941c4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h @@ -24,6 +24,7 @@ namespace latinime { class BigramDictContent; +class BigramProperty; class HeaderPolicy; class TerminalPositionLookupTable; @@ -43,8 +44,8 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { // Do nothing because we don't need to skip bigram lists in ver4 dictionaries. } - bool addNewEntry(const int terminalId, const int newTargetTerminalId, const int newProbability, - const int timestamp, bool *const outAddedNewEntry); + bool addNewEntry(const int terminalId, const int newTargetTerminalId, + const BigramProperty *const bigramProperty, bool *const outAddedNewEntry); bool removeEntry(const int terminalId, const int targetTerminalId); @@ -56,10 +57,13 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { private: DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy); - int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos) const; + int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos, + int *const outTailEntryPos) const; const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry, - const int newProbability, const int timestamp) const; + const BigramProperty *const bigramProperty) const; + + bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos); BigramDictContent *const mBigramDictContent; const TerminalPositionLookupTable *const mTerminalPositionLookupTable; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp index 279f5b33a..56f19dbae 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp @@ -113,13 +113,17 @@ bool BigramDictContent::writeBigramEntryAndAdvancePosition( return true; } -bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos) { +bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos, + int *const outTailEntryPos) { int readingPos = bigramListPos; int writingPos = toPos; bool hasNext = true; while (hasNext) { const BigramEntry bigramEntry = getBigramEntryAndAdvancePosition(&readingPos); hasNext = bigramEntry.hasNext(); + if (!hasNext) { + *outTailEntryPos = writingPos; + } if (!writeBigramEntryAndAdvancePosition(&bigramEntry, &writingPos)) { AKLOGE("Cannot write bigram entry to copy. pos: %d", writingPos); return false; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h index ba2a05209..944e0f9e2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h @@ -58,6 +58,11 @@ class BigramDictContent : public SparseTableDictContent { return addressLookupTable->get(terminalId); } + bool writeBigramEntryAtTail(const BigramEntry *const bigramEntryToWrite) { + int writingPos = getContentBuffer()->getTailPosition(); + return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); + } + bool writeBigramEntry(const BigramEntry *const bigramEntryToWrite, const int entryWritingPos) { int writingPos = entryWritingPos; return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); @@ -71,7 +76,7 @@ class BigramDictContent : public SparseTableDictContent { return getUpdatableAddressLookupTable()->set(terminalId, bigramListPos); } - bool copyBigramList(const int bigramListPos, const int toPos); + bool copyBigramList(const int bigramListPos, const int toPos, int *const outTailEntryPos); bool flushToFile(const char *const dictPath) const { return flush(dictPath, Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION, @@ -83,6 +88,10 @@ class BigramDictContent : public SparseTableDictContent { const BigramDictContent *const originalBigramDictContent, int *const outBigramEntryCount); + bool isContentTailPos(const int pos) const { + return pos == getContentBuffer()->getTailPosition(); + } + private: DISALLOW_COPY_AND_ASSIGN(BigramDictContent); diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h index fe984615c..790273541 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h @@ -19,7 +19,7 @@ #include "defines.h" #include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h" #include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp index 95f654498..77ed38b89 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp @@ -27,7 +27,8 @@ namespace latinime { /* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers( - const char *const dictPath, MmappedBuffer::MmappedBufferPtr headerBuffer) { + const char *const dictPath, MmappedBuffer::MmappedBufferPtr headerBuffer, + const FormatUtils::FORMAT_VERSION formatVersion) { if (!headerBuffer) { ASSERT(false); AKLOGE("The header buffer must be valid to open ver4 dict buffers."); @@ -35,7 +36,8 @@ namespace latinime { } // TODO: take only dictDirPath, and open both header and trie files in the constructor below const bool isUpdatable = headerBuffer->isUpdatable(); - return Ver4DictBuffersPtr(new Ver4DictBuffers(dictPath, std::move(headerBuffer), isUpdatable)); + return Ver4DictBuffersPtr(new Ver4DictBuffers(dictPath, std::move(headerBuffer), isUpdatable, + formatVersion)); } bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath, @@ -113,11 +115,12 @@ bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath, } Ver4DictBuffers::Ver4DictBuffers(const char *const dictPath, - MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable) + MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable, + const FormatUtils::FORMAT_VERSION formatVersion) : mHeaderBuffer(std::move(headerBuffer)), mDictBuffer(MmappedBuffer::openBuffer(dictPath, Ver4DictConstants::TRIE_FILE_EXTENSION, isUpdatable)), - mHeaderPolicy(mHeaderBuffer->getBuffer(), FormatUtils::VERSION_4), + mHeaderPolicy(mHeaderBuffer->getBuffer(), formatVersion), mExpandableHeaderBuffer(mHeaderBuffer ? mHeaderBuffer->getBuffer() : nullptr, mHeaderPolicy.getSize(), BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h index fc41432f4..df177c14a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h @@ -36,7 +36,8 @@ class Ver4DictBuffers { typedef std::unique_ptr<Ver4DictBuffers> Ver4DictBuffersPtr; static Ver4DictBuffersPtr openVer4DictBuffers(const char *const dictDirPath, - MmappedBuffer::MmappedBufferPtr headerBuffer); + MmappedBuffer::MmappedBufferPtr headerBuffer, + const FormatUtils::FORMAT_VERSION formatVersion); static AK_FORCE_INLINE Ver4DictBuffersPtr createVer4DictBuffers( const HeaderPolicy *const headerPolicy, const int maxTrieSize) { @@ -120,7 +121,8 @@ class Ver4DictBuffers { DISALLOW_COPY_AND_ASSIGN(Ver4DictBuffers); Ver4DictBuffers(const char *const dictDirPath, - const MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable); + const MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable, + const FormatUtils::FORMAT_VERSION formatVersion); Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp index 67420a252..0a435e91c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp @@ -95,4 +95,4 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce } } -} +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp index 38ff42fee..f89d3d7a0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -16,13 +16,14 @@ #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" -#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" +#include "suggest/core/dictionary/property/unigram_property.h" #include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" +#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" @@ -75,7 +76,7 @@ bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved( PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); const PatriciaTrieReadingUtils::NodeFlags updatedFlags = DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */, - false /* isDeleted */, false /* willBecomeNonTerminal */); + false /* isDeleted */, false /* willBecomeNonTerminal */); int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); // Update flags. if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, @@ -133,9 +134,11 @@ bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal( &writingPos); } -bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbability( - const PtNodeParams *const toBeUpdatedPtNodeParams, const int newProbability, - const int timestamp) { +bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty) { + // Update probability and historical information. + // TODO: Update other information in the unigram property. if (!toBeUpdatedPtNodeParams->isTerminal()) { return false; } @@ -143,7 +146,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbability( mBuffers->getProbabilityDictContent()->getProbabilityEntry( toBeUpdatedPtNodeParams->getTerminalId()); const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry, - newProbability, timestamp); + unigramProperty); return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry); } @@ -204,7 +207,8 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition( bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( - const PtNodeParams *const ptNodeParams, const int timestamp, int *const ptNodeWritingPos) { + const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty, + int *const ptNodeWritingPos) { int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId, ptNodeWritingPos)) { @@ -213,17 +217,16 @@ bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( // Write probability. ProbabilityEntry newProbabilityEntry; const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom( - &newProbabilityEntry, ptNodeParams->getProbability(), timestamp); + &newProbabilityEntry, unigramProperty); return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(terminalId, &probabilityEntryToWrite); } bool Ver4PatriciaTrieNodeWriter::addNewBigramEntry( - const PtNodeParams *const sourcePtNodeParams, - const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp, - bool *const outAddedNewBigram) { + const PtNodeParams *const sourcePtNodeParams, const PtNodeParams *const targetPtNodeParam, + const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) { if (!mBigramPolicy->addNewEntry(sourcePtNodeParams->getTerminalId(), - targetPtNodeParam->getTerminalId(), probability, timestamp, outAddedNewBigram)) { + targetPtNodeParam->getTerminalId(), bigramProperty, outAddedNewBigram)) { AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d", sourcePtNodeParams->getTerminalId(), targetPtNodeParam->getTerminalId()); return false; @@ -379,18 +382,21 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( } const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( - const ProbabilityEntry *const originalProbabilityEntry, const int newProbability, - const int timestamp) const { + const ProbabilityEntry *const originalProbabilityEntry, + const UnigramProperty *const unigramProperty) const { // TODO: Consolidate historical info and probability. if (mHeaderPolicy->hasHistoricalInfoOfWords()) { + const HistoricalInfo historicalInfoForUpdate(unigramProperty->getTimestamp(), + unigramProperty->getLevel(), unigramProperty->getCount()); const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalProbabilityEntry->getHistoricalInfo(), newProbability, timestamp, - mHeaderPolicy); + originalProbabilityEntry->getHistoricalInfo(), + unigramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy); return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo( &updatedHistoricalInfo); } else { - return originalProbabilityEntry->createEntryWithUpdatedProbability(newProbability); + return originalProbabilityEntry->createEntryWithUpdatedProbability( + unigramProperty->getProbability()); } } @@ -409,4 +415,4 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, return true; } -} +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h index b2b0504a1..e90bc44c0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h @@ -57,8 +57,8 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { virtual bool markPtNodeAsWillBecomeNonTerminal( const PtNodeParams *const toBeUpdatedPtNodeParams); - virtual bool updatePtNodeProbability(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int newProbability, const int timestamp); + virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty); virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode); @@ -73,10 +73,10 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { int *const ptNodeWritingPos); virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, - const int timestamp, int *const ptNodeWritingPos); + const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos); virtual bool addNewBigramEntry(const PtNodeParams *const sourcePtNodeParams, - const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp, + const PtNodeParams *const targetPtNodeParam, const BigramProperty *const bigramProperty, bool *const outAddedNewBigram); virtual bool removeBigramEntry(const PtNodeParams *const sourcePtNodeParams, @@ -102,11 +102,12 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { const PtNodeParams *const ptNodeParams, int *const outTerminalId, int *const ptNodeWritingPos); - // Create updated probability entry using given probability and timestamp. In addition to the + // Create updated probability entry using given unigram property. In addition to the // probability, this method updates historical information if needed. + // TODO: Update flags belonging to the unigram property. const ProbabilityEntry createUpdatedEntryFrom( - const ProbabilityEntry *const originalProbabilityEntry, const int newProbability, - const int timestamp) const; + const ProbabilityEntry *const originalProbabilityEntry, + const UnigramProperty *const unigramProperty) const; bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets, const bool hasBigrams, diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index f8587f53a..8373dc549 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -183,9 +183,7 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len readingHelper.initWithPtNodeArrayPos(getRootPosition()); bool addedNewUnigram = false; if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length, - unigramProperty->getProbability(), unigramProperty->isNotAWord(), - unigramProperty->isBlacklisted(), unigramProperty->getTimestamp(), - &addedNewUnigram)) { + unigramProperty, &addedNewUnigram)) { if (addedNewUnigram) { mUnigramCount++; } @@ -215,8 +213,7 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len } bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int length0, - const int *const word1, const int length1, const int probability, - const int timestamp) { + const BigramProperty *const bigramProperty) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); return false; @@ -226,9 +223,10 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le mDictBuffer->getTailPosition()); return false; } - if (length0 > MAX_WORD_LENGTH || length1 > MAX_WORD_LENGTH) { + if (length0 > MAX_WORD_LENGTH + || bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { AKLOGE("Either src word or target word is too long to insert the bigram to the dictionary. " - "length0: %d, length1: %d", length0, length1); + "length0: %d, length1: %d", length0, bigramProperty->getTargetCodePoints()->size()); return false; } const int word0Pos = getTerminalPtNodePositionOfWord(word0, length0, @@ -236,14 +234,14 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le if (word0Pos == NOT_A_DICT_POS) { return false; } - const int word1Pos = getTerminalPtNodePositionOfWord(word1, length1, - false /* forceLowerCaseSearch */); + const int word1Pos = getTerminalPtNodePositionOfWord( + bigramProperty->getTargetCodePoints()->data(), + bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */); if (word1Pos == NOT_A_DICT_POS) { return false; } bool addedNewBigram = false; - if (mUpdatingHelper.addBigramWords(word0Pos, word1Pos, probability, timestamp, - &addedNewBigram)) { + if (mUpdatingHelper.addBigramWords(word0Pos, word1Pos, bigramProperty, &addedNewBigram)) { if (addedNewBigram) { mBigramCount++; } @@ -431,6 +429,9 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code } int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { + // TODO: Return code point count like other methods. + // Null termination. + outCodePoints[0] = 0; if (token == 0) { mTerminalPtNodePositionsForIteratingWords.clear(); DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( @@ -447,8 +448,13 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const } const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; int unigramProbability = NOT_A_PROBABILITY; - getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH, - outCodePoints, &unigramProbability); + const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( + terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); + if (codePointCount < MAX_WORD_LENGTH) { + // Null termination. outCodePoints have to be null terminated or contain MAX_WORD_LENGTH + // code points. + outCodePoints[codePointCount] = 0; + } const int nextToken = token + 1; if (nextToken >= terminalPtNodePositionsVectorSize) { // All words have been iterated. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h index 8f981def5..b78576484 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -21,10 +21,10 @@ #include "defines.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" #include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" +#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" @@ -93,8 +93,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { bool addUnigramWord(const int *const word, const int length, const UnigramProperty *const unigramProperty); - bool addBigramWords(const int *const word0, const int length0, const int *const word1, - const int length1, const int probability, const int timestamp); + bool addBigramWords(const int *const word0, const int length0, + const BigramProperty *const bigramProperty); bool removeBigramWords(const int *const word0, const int length0, const int *const word1, const int length1); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp index 12298d967..f31c50253 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp @@ -19,9 +19,9 @@ #include <cstring> #include <queue> -#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" #include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h index 23cbe3aa3..a2e88a46c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h @@ -37,13 +37,13 @@ class BufferWithExtendableBuffer { BufferWithExtendableBuffer(uint8_t *const originalBuffer, const int originalBufferSize, const int maxAdditionalBufferSize) : mOriginalBuffer(originalBuffer), mOriginalBufferSize(originalBufferSize), - mAdditionalBuffer(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP), mUsedAdditionalBufferSize(0), + mAdditionalBuffer(0), mUsedAdditionalBufferSize(0), mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} // Without original buffer. BufferWithExtendableBuffer(const int maxAdditionalBufferSize) : mOriginalBuffer(0), mOriginalBufferSize(0), - mAdditionalBuffer(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP), mUsedAdditionalBufferSize(0), + mAdditionalBuffer(0), mUsedAdditionalBufferSize(0), mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} AK_FORCE_INLINE int getTailPosition() const { diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp index 87fa5994c..7bc7b0a48 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp @@ -34,9 +34,12 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = const int dictVersion, const std::vector<int> localeAsCodePointVector, const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { TimeKeeper::setCurrentTime(); - switch (dictVersion) { + const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion); + switch (formatVersion) { + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: - return createEmptyV4DictFile(filePath, localeAsCodePointVector, attributeMap); + return createEmptyV4DictFile(filePath, localeAsCodePointVector, attributeMap, + formatVersion); default: AKLOGE("Cannot create dictionary %s because format version %d is not supported.", filePath, dictVersion); @@ -46,8 +49,9 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = /* static */ bool DictFileWritingUtils::createEmptyV4DictFile(const char *const dirPath, const std::vector<int> localeAsCodePointVector, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { - HeaderPolicy headerPolicy(FormatUtils::VERSION_4, localeAsCodePointVector, attributeMap); + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap, + const FormatUtils::FORMAT_VERSION formatVersion) { + HeaderPolicy headerPolicy(formatVersion, localeAsCodePointVector, attributeMap); Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( Ver4DictBuffers::createVer4DictBuffers(&headerPolicy, Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE)); diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h index 54ec651f7..a822989db 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h @@ -21,6 +21,7 @@ #include "defines.h" #include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" namespace latinime { @@ -46,7 +47,8 @@ class DictFileWritingUtils { static bool createEmptyV4DictFile(const char *const filePath, const std::vector<int> localeAsCodePointVector, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap, + const FormatUtils::FORMAT_VERSION formatVersion); static bool flushBufferToFile(const char *const filePath, const BufferWithExtendableBuffer *const buffer); diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp index c7d3df984..fed0ae77e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp @@ -30,7 +30,7 @@ const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8; const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; const int ForgettingCurveUtils::MAX_LEVEL = 3; -const int ForgettingCurveUtils::MIN_VALID_LEVEL = 1; +const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 1; const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 15; const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 14; @@ -41,25 +41,34 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT // TODO: Revise the logic to decide the initial probability depending on the given probability. /* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo( - const HistoricalInfo *const originalHistoricalInfo, - const int newProbability, const int timestamp, const HeaderPolicy *const headerPolicy) { + const HistoricalInfo *const originalHistoricalInfo, const int newProbability, + const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy) { + const int timestamp = newHistoricalInfo->getTimeStamp(); if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) { - return HistoricalInfo(timestamp, MIN_VALID_LEVEL /* level */, 0 /* count */); - } else if (!originalHistoricalInfo->isValid()) { + // Add entry as a valid word. + const int level = clampToVisibleEntryLevelRange(newHistoricalInfo->getLevel()); + const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy); + return HistoricalInfo(timestamp, level, count); + } else if (!originalHistoricalInfo->isValid() + || originalHistoricalInfo->getLevel() < newHistoricalInfo->getLevel() + || (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel() + && originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) { // Initial information. - return HistoricalInfo(timestamp, 0 /* level */, 1 /* count */); + const int level = clampToValidLevelRange(newHistoricalInfo->getLevel()); + const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy); + return HistoricalInfo(timestamp, level, count); } else { const int updatedCount = originalHistoricalInfo->getCount() + 1; if (updatedCount >= headerPolicy->getForgettingCurveOccurrencesToLevelUp()) { // The count exceeds the max value the level can be incremented. if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) { // The level is already max. - return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), - originalHistoricalInfo->getCount()); + return HistoricalInfo(timestamp, + originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount()); } else { // Level up. - return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel() + 1, - 0 /* count */); + return HistoricalInfo(timestamp, + originalHistoricalInfo->getLevel() + 1, 0 /* count */); } } else { return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), updatedCount); @@ -73,8 +82,8 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT headerPolicy->getForgettingCurveDurationToLevelDown()); return sProbabilityTable.getProbability( headerPolicy->getForgettingCurveProbabilityValuesTableId(), - std::min(std::max(historicalInfo->getLevel(), 0), MAX_LEVEL), - std::min(std::max(elapsedTimeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT)); + clampToValidLevelRange(historicalInfo->getLevel()), + clampToValidTimeStepCountRange(elapsedTimeStepCount)); } /* static */ int ForgettingCurveUtils::getProbability(const int unigramProbability, @@ -155,6 +164,23 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT return elapsedTimeInSeconds / timeStepDurationInSeconds; } +/* static */ int ForgettingCurveUtils::clampToVisibleEntryLevelRange(const int level) { + return std::min(std::max(level, MIN_VISIBLE_LEVEL), MAX_LEVEL); +} + +/* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count, + const HeaderPolicy *const headerPolicy) { + return std::min(std::max(count, 0), headerPolicy->getForgettingCurveOccurrencesToLevelUp() - 1); +} + +/* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) { + return std::min(std::max(level, 0), MAX_LEVEL); +} + +/* static */ int ForgettingCurveUtils::clampToValidTimeStepCountRange(const int timeStepCount) { + return std::min(std::max(timeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT); +} + const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4; const int ForgettingCurveUtils::ProbabilityTable::WEAK_PROBABILITY_TABLE_ID = 0; const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = 1; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h index bb8690939..3ff80aeec 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h @@ -30,7 +30,7 @@ class ForgettingCurveUtils { public: static const HistoricalInfo createUpdatedHistoricalInfo( const HistoricalInfo *const originalHistoricalInfo, const int newProbability, - const int timestamp, const HeaderPolicy *const headerPolicy); + const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy); static const HistoricalInfo createHistoricalInfoToSave( const HistoricalInfo *const originalHistoricalInfo, @@ -93,7 +93,7 @@ class ForgettingCurveUtils { static const int DECAY_INTERVAL_SECONDS; static const int MAX_LEVEL; - static const int MIN_VALID_LEVEL; + static const int MIN_VISIBLE_LEVEL; static const int MAX_ELAPSED_TIME_STEP_COUNT; static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; @@ -103,8 +103,11 @@ class ForgettingCurveUtils { static const ProbabilityTable sProbabilityTable; static int backoff(const int unigramProbability); - static int getElapsedTimeStepCount(const int timestamp, const int durationToLevelDown); + static int clampToVisibleEntryLevelRange(const int level); + static int clampToValidLevelRange(const int level); + static int clampToValidCountRange(const int count, const HeaderPolicy *const headerPolicy); + static int clampToValidTimeStepCountRange(const int timeStepCount); }; } // namespace latinime #endif /* LATINIME_FORGETTING_CURVE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp index cd3c403fa..a8518cdca 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp @@ -25,6 +25,18 @@ const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE; // Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12 const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; +/* static */ FormatUtils::FORMAT_VERSION FormatUtils::getFormatVersion(const int formatVersion) { + switch (formatVersion) { + case VERSION_2: + return VERSION_2; + case VERSION_4_ONLY_FOR_TESTING: + return VERSION_4_ONLY_FOR_TESTING; + case VERSION_4: + return VERSION_4; + default: + return UNKNOWN_VERSION; + } +} /* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion( const uint8_t *const dict, const int dictSize) { // The magic number is stored big-endian. @@ -46,6 +58,8 @@ const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; // same so we use them for both here. if (ByteArrayUtils::readUint16(dict, 4) == VERSION_2) { return VERSION_2; + } else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_4_ONLY_FOR_TESTING) { + return VERSION_4_ONLY_FOR_TESTING; } else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_4) { return VERSION_4; } else { diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h index 759b1c9b2..20dfb9d8c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h @@ -31,6 +31,7 @@ class FormatUtils { enum FORMAT_VERSION { // These MUST have the same values as the relevant constants in FormatSpec.java. VERSION_2 = 2, + VERSION_4_ONLY_FOR_TESTING = 399, VERSION_4 = 401, UNKNOWN_VERSION = -1 }; @@ -39,6 +40,7 @@ class FormatUtils { // unsupported or obsolete dictionary formats. static const uint32_t MAGIC_NUMBER; + static FORMAT_VERSION getFormatVersion(const int formatVersion); static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize); private: |