diff options
Diffstat (limited to 'native')
16 files changed, 301 insertions, 117 deletions
diff --git a/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp b/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp index e1b35340b..bc4ca8e9e 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp +++ b/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp @@ -992,7 +992,16 @@ namespace latinime { } } if (character != NOT_AN_INDEX) { - codePointBuf[index] = proximityInfo->getCodePointOf(character); + const int codePoint = proximityInfo->getCodePointOf(character); + if (codePoint == NOT_A_CODE_POINT) { + AKLOGE("Key index(%d) is not found. Cannot construct most probable string", + character); + ASSERT(false); + // Make the length zero, which means most probable string won't be used. + index = 0; + break; + } + codePointBuf[index] = codePoint; index++; } sumLogProbability += minLogProbability; diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h index 59748c80d..a8dab9fcd 100644 --- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h @@ -44,8 +44,6 @@ class DictionaryHeaderStructurePolicy { virtual float getMultiWordCostMultiplier() const = 0; - virtual int getLastDecayedTime() const = 0; - virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const = 0; diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp index cd2243025..5df2096a4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp @@ -37,7 +37,8 @@ void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const out if (outProbability) { if (bigramEntry.hasHistoricalInfo()) { *outProbability = - ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo()); + ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo(), + mHeaderPolicy); } else { *outProbability = bigramEntry.getProbability(); } @@ -160,8 +161,8 @@ bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const i } } else if (bigramEntry.hasHistoricalInfo()) { const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( - bigramEntry.getHistoricalInfo()); - if (ForgettingCurveUtils::needsToKeep(&historicalInfo)) { + bigramEntry.getHistoricalInfo(), mHeaderPolicy); + if (ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy)) { const BigramEntry updatedBigramEntry = bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo); if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { @@ -230,7 +231,8 @@ const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom( if (mHeaderPolicy->hasHistoricalInfoOfWords()) { const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalBigramEntry->getHistoricalInfo(), newProbability, timestamp); + originalBigramEntry->getHistoricalInfo(), newProbability, timestamp, + mHeaderPolicy); return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo); } else { return originalBigramEntry->updateProbabilityAndGetEntry(newProbability); diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp index 3ce57d910..7c7b05ca8 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp @@ -18,7 +18,7 @@ namespace latinime { -// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader. +// Note that these are corresponding definitions in Java side in DictionaryHeader. const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE"; const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = "REQUIRES_GERMAN_UMLAUT_PROCESSING"; @@ -33,8 +33,26 @@ const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE // count. const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO"; const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration +const char *const HeaderPolicy::FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY = + "FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP"; +const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = + "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; +const char *const HeaderPolicy::FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY = + "FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS"; + +const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT"; +const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT"; + const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; +const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP = 4; +const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 0; +// 4 days +const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS = + 4 * 24 * 60 * 60; + +const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000; +const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 10000; // Used for logging. Question mark is used to indicate that the key is not found. void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue, diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index fc347618c..66824245e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -52,7 +52,20 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( - &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)) {} + &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), + mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY, + DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)), + mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, + DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), + mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY, + DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)), + mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), + mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {} // Constructs header information using an attribute map. HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, @@ -71,8 +84,20 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( - &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)) { - } + &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), + mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY, + DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)), + mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, + DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), + mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY, + DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)), + mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), + mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {} // Temporary dummy header. HeaderPolicy() @@ -80,7 +105,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), - mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false) {} + mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), + mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0), + mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {} ~HeaderPolicy() {} @@ -159,6 +186,26 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return &mAttributeMap; } + AK_FORCE_INLINE int getForgettingCurveOccurrencesToLevelUp() const { + return mForgettingCurveOccurrencesToLevelUp; + } + + AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const { + return mForgettingCurveProbabilityValuesTableId; + } + + AK_FORCE_INLINE int getForgettingCurveDurationToLevelDown() const { + return mForgettingCurveDurationToLevelDown; + } + + AK_FORCE_INLINE int getMaxUnigramCount() const { + return mMaxUnigramCount; + } + + AK_FORCE_INLINE int getMaxBigramCount() const { + return mMaxBigramCount; + } + void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const; @@ -183,8 +230,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { static const char *const EXTENDED_REGION_SIZE_KEY; static const char *const HAS_HISTORICAL_INFO_KEY; static const char *const LOCALE_KEY; + static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; + static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; + static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; + static const char *const MAX_UNIGRAM_COUNT_KEY; + static const char *const MAX_BIGRAM_COUNT_KEY; static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; + static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP; + static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; + static const int DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS; + static const int DEFAULT_MAX_UNIGRAM_COUNT; + static const int DEFAULT_MAX_BIGRAM_COUNT; const FormatUtils::FORMAT_VERSION mDictFormatVersion; const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; @@ -200,6 +257,11 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { const int mBigramCount; const int mExtendedRegionSize; const bool mHasHistoricalInfoOfWords; + const int mForgettingCurveOccurrencesToLevelUp; + const int mForgettingCurveProbabilityValuesTableId; + const int mForgettingCurveDurationToLevelDown; + const int mMaxUnigramCount; + const int mMaxBigramCount; const std::vector<int> readLocale() const; float readMultipleWordCostMultiplier() const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp index cb9d450ec..279f5b33a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp @@ -23,6 +23,13 @@ namespace latinime { const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( int *const bigramEntryPos) const { const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer(); + if (*bigramEntryPos < 0 || *bigramEntryPos >= bigramListBuffer->getTailPosition()) { + AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bufSize: %d", + *bigramEntryPos, bigramListBuffer->getTailPosition()); + ASSERT(false); + return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY, + Ver4DictConstants::NOT_A_TERMINAL_ID); + } const int bigramFlags = bigramListBuffer->readUintAndAdvancePosition( Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, bigramEntryPos); const bool hasNext = (bigramFlags & Ver4DictConstants::BIGRAM_HAS_NEXT_MASK) != 0; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp index 29972a4e8..64d7bc0a5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp @@ -24,6 +24,19 @@ void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePo int *const outCodePoint, int *const outCodePointCount, int *const outProbability, bool *const outhasNext, int *const shortcutEntryPos) const { const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer(); + if (*shortcutEntryPos < 0 || *shortcutEntryPos >= shortcutListBuffer->getTailPosition()) { + AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d", + *shortcutEntryPos, shortcutListBuffer->getTailPosition()); + ASSERT(false); + if (outhasNext) { + *outhasNext = false; + } + if (outCodePointCount) { + *outCodePointCount = 0; + } + return; + } + const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition( Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); if (outProbability) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp index 17fc9483b..f149781f4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp @@ -65,7 +65,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce mProbabilityDictContent->getProbabilityEntry(terminalId); if (probabilityEntry.hasHistoricalInfo()) { probability = ForgettingCurveUtils::decodeProbability( - probabilityEntry.getHistoricalInfo()); + probabilityEntry.getHistoricalInfo(), mHeaderPolicy); } else { probability = probabilityEntry.getProbability(); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h index 9d932457c..1db9ea026 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h @@ -26,6 +26,7 @@ namespace latinime { class BufferWithExtendableBuffer; +class HeaderPolicy; class ProbabilityDictContent; /* @@ -35,8 +36,10 @@ class ProbabilityDictContent; class Ver4PatriciaTrieNodeReader : public PtNodeReader { public: Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer, - const ProbabilityDictContent *const probabilityDictContent) - : mBuffer(buffer), mProbabilityDictContent(probabilityDictContent) {} + const ProbabilityDictContent *const probabilityDictContent, + const HeaderPolicy *const headerPolicy) + : mBuffer(buffer), mProbabilityDictContent(probabilityDictContent), + mHeaderPolicy(headerPolicy) {} ~Ver4PatriciaTrieNodeReader() {} @@ -50,6 +53,7 @@ class Ver4PatriciaTrieNodeReader : public PtNodeReader { const BufferWithExtendableBuffer *const mBuffer; const ProbabilityDictContent *const mProbabilityDictContent; + const HeaderPolicy *const mHeaderPolicy; const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos, const int siblingNodePos) const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp index 32576cf0a..f24c2e1af 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -159,7 +159,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeA toBeUpdatedPtNodeParams->getTerminalId()); if (originalProbabilityEntry.hasHistoricalInfo()) { const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( - originalProbabilityEntry.getHistoricalInfo()); + originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy); const ProbabilityEntry probabilityEntry = originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo); if (!mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( @@ -168,7 +168,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeA toBeUpdatedPtNodeParams->getTerminalId()); return false; } - const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo); + const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy); if (!isValid) { if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); @@ -382,10 +382,11 @@ const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( const ProbabilityEntry *const originalProbabilityEntry, const int newProbability, const int timestamp) const { // TODO: Consolidate historical info and probability. - if (mBuffers->getHeaderPolicy()->hasHistoricalInfoOfWords()) { + if (mHeaderPolicy->hasHistoricalInfoOfWords()) { const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalProbabilityEntry->getHistoricalInfo(), newProbability, timestamp); + originalProbabilityEntry->getHistoricalInfo(), newProbability, timestamp, + mHeaderPolicy); return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo( &updatedHistoricalInfo); } else { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h index 66845bbd6..f01b3af0e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h @@ -28,6 +28,7 @@ namespace latinime { class BufferWithExtendableBuffer; +class HeaderPolicy; class Ver4BigramListPolicy; class Ver4DictBuffers; class Ver4PatriciaTrieNodeReader; @@ -40,10 +41,11 @@ class Ver4ShortcutListPolicy; class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { public: Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer, - Ver4DictBuffers *const buffers, const PtNodeReader *const ptNodeReader, + Ver4DictBuffers *const buffers, const HeaderPolicy *const headerPolicy, + const PtNodeReader *const ptNodeReader, const PtNodeArrayReader *const ptNodeArrayReader, Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy) - : mTrieBuffer(trieBuffer), mBuffers(buffers), + : mTrieBuffer(trieBuffer), mBuffers(buffers), mHeaderPolicy(headerPolicy), mReadingHelper(ptNodeReader, ptNodeArrayReader), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {} @@ -116,6 +118,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { BufferWithExtendableBuffer *const mTrieBuffer; Ver4DictBuffers *const mBuffers; + const HeaderPolicy *const mHeaderPolicy; DynamicPtReadingHelper mReadingHelper; Ver4BigramListPolicy *const mBigramPolicy; Ver4ShortcutListPolicy *const mShortcutPolicy; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index b5d80be1d..4d1b0dadb 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -329,11 +329,15 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer snprintf(outResult, maxResultLength, "%d", mBigramCount); } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", - mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT : + mHeaderPolicy->isDecayingDict() ? + ForgettingCurveUtils::getUnigramCountHardLimit( + mHeaderPolicy->getMaxUnigramCount()) : static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", - mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT : + mHeaderPolicy->isDecayingDict() ? + ForgettingCurveUtils::getBigramCountHardLimit( + mHeaderPolicy->getMaxBigramCount()) : static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); } } @@ -382,7 +386,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code bigramWord1CodePoints + codePointCount); const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); const int probability = bigramEntry.hasHistoricalInfo() ? - ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo()) : + ForgettingCurveUtils::decodeProbability( + bigramEntry.getHistoricalInfo(), mHeaderPolicy) : bigramEntry.getProbability(); bigrams.push_back(WordProperty::BigramProperty(&word1, probability, historicalInfo->getTimeStamp(), historicalInfo->getLevel(), diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h index 7796e2ddc..639c153a1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -47,10 +47,10 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { mBuffers.get()->getTerminalPositionLookupTable(), mHeaderPolicy), mShortcutPolicy(mBuffers.get()->getMutableShortcutDictContent(), mBuffers.get()->getTerminalPositionLookupTable()), - mNodeReader(mDictBuffer, mBuffers.get()->getProbabilityDictContent()), + mNodeReader(mDictBuffer, mBuffers.get()->getProbabilityDictContent(), mHeaderPolicy), mPtNodeArrayReader(mDictBuffer), - mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mPtNodeArrayReader, - &mBigramPolicy, &mShortcutPolicy), + mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader, + &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy), mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), mWritingHelper(mBuffers.get()), mUnigramCount(mHeaderPolicy->getUnigramCount()), diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp index 93053c38d..3907c84a0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp @@ -74,14 +74,15 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount, int *const outBigramCount) { Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(), - mBuffers->getProbabilityDictContent()); + mBuffers->getProbabilityDictContent(), headerPolicy); Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(), mBuffers->getTerminalPositionLookupTable(), headerPolicy); Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), mBuffers->getTerminalPositionLookupTable()); Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), - mBuffers, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, &shortcutPolicy); + mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, + &shortcutPolicy); DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader); readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); @@ -95,12 +96,11 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, } const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted .getValidUnigramCount(); - if (headerPolicy->isDecayingDict() - && unigramCount > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) { - if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, - ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC)) { + const int maxUnigramCount = headerPolicy->getMaxUnigramCount(); + if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) { + if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) { AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, - ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC); + maxUnigramCount); return false; } } @@ -113,11 +113,10 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, return false; } const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); - if (headerPolicy->isDecayingDict() - && bigramCount > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) { - if (!truncateBigrams(ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC)) { - AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, - ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC); + const int maxBigramCount = headerPolicy->getMaxBigramCount(); + if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) { + if (!truncateBigrams(maxBigramCount)) { + AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount); return false; } } @@ -126,7 +125,8 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), - buffersToWrite, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, &shortcutPolicy); + buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, + &shortcutPolicy); DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); @@ -137,14 +137,14 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, // Create policy instances for the GCed dictionary. Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(), - buffersToWrite->getProbabilityDictContent()); + buffersToWrite->getProbabilityDictContent(), headerPolicy); Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(), buffersToWrite->getTerminalPositionLookupTable(), headerPolicy); Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), buffersToWrite->getTerminalPositionLookupTable()); Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), - buffersToWrite, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy, + buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy, &newShortcutPolicy); // Re-assign terminal IDs for valid terminal PtNodes. TerminalPositionLookupTable::TerminalIdMap terminalIdMap; @@ -202,8 +202,9 @@ bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( const ProbabilityEntry probabilityEntry = mBuffers->getProbabilityDictContent()->getProbabilityEntry(i); const int probability = probabilityEntry.hasHistoricalInfo() ? - ForgettingCurveUtils::decodeProbability(probabilityEntry.getHistoricalInfo()) : - probabilityEntry.getProbability(); + ForgettingCurveUtils::decodeProbability( + probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : + probabilityEntry.getProbability(); priorityQueue.push(DictProbability(terminalPos, probability, probabilityEntry.getHistoricalInfo()->getTimeStamp())); } @@ -245,8 +246,9 @@ bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) { continue; } const int probability = bigramEntry.hasHistoricalInfo() ? - ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo()) : - bigramEntry.getProbability(); + ForgettingCurveUtils::decodeProbability( + bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : + bigramEntry.getProbability(); priorityQueue.push(DictProbability(entryPos, probability, bigramEntry.getHistoricalInfo()->getTimeStamp())); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp index d58d25989..35e05d77a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp @@ -19,33 +19,29 @@ #include <cmath> #include <stdlib.h> -#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h" #include "utils/time_keeper.h" namespace latinime { -const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT = 12000; -const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000; -const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000; -const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000; - -const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127; +const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8; const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; const int ForgettingCurveUtils::MAX_LEVEL = 3; -const int ForgettingCurveUtils::MAX_COUNT = 3; const int ForgettingCurveUtils::MIN_VALID_LEVEL = 1; -const int ForgettingCurveUtils::TIME_STEP_DURATION_IN_SECONDS = 6 * 60 * 60; const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 15; const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 14; +const float ForgettingCurveUtils::UNIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2; +const float ForgettingCurveUtils::BIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2; + const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; // TODO: Revise the logic to decide the initial probability depending on the given probability. /* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo( const HistoricalInfo *const originalHistoricalInfo, - const int newProbability, const int timestamp) { + const int newProbability, const int timestamp, const HeaderPolicy *const headerPolicy) { if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) { return HistoricalInfo(timestamp, MIN_VALID_LEVEL /* level */, 0 /* count */); } else if (!originalHistoricalInfo->isValid()) { @@ -53,7 +49,7 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT return HistoricalInfo(timestamp, 0 /* level */, 1 /* count */); } else { const int updatedCount = originalHistoricalInfo->getCount() + 1; - if (updatedCount > MAX_COUNT) { + if (updatedCount >= headerPolicy->getForgettingCurveOccurrencesToLevelUp()) { // The count exceeds the max value the level can be incremented. if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) { // The level is already max. @@ -71,9 +67,11 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT } /* static */ int ForgettingCurveUtils::decodeProbability( - const HistoricalInfo *const historicalInfo) { - const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp()); - return sProbabilityTable.getProbability(historicalInfo->getLevel(), + const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) { + const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp(), + headerPolicy->getForgettingCurveDurationToLevelDown()); + return sProbabilityTable.getProbability( + headerPolicy->getForgettingCurveProbabilityValuesTableId(), historicalInfo->getLevel(), min(max(elapsedTimeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT)); } @@ -82,24 +80,31 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT if (unigramProbability == NOT_A_PROBABILITY) { return NOT_A_PROBABILITY; } else if (bigramProbability == NOT_A_PROBABILITY) { - return min(backoff(unigramProbability), MAX_COMPUTED_PROBABILITY); + return min(backoff(unigramProbability), MAX_PROBABILITY); } else { - return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY); + // TODO: Investigate better way to handle bigram probability. + return min(max(unigramProbability, bigramProbability + MULTIPLIER_TWO_IN_PROBABILITY_SCALE), + MAX_PROBABILITY); } } -/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo) { +/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo, + const HeaderPolicy *const headerPolicy) { return historicalInfo->getLevel() > 0 - || getElapsedTimeStepCount(historicalInfo->getTimeStamp()) - < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; + || getElapsedTimeStepCount(historicalInfo->getTimeStamp(), + headerPolicy->getForgettingCurveDurationToLevelDown()) + < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; } /* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave( - const HistoricalInfo *const originalHistoricalInfo) { + const HistoricalInfo *const originalHistoricalInfo, + const HeaderPolicy *const headerPolicy) { if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) { return HistoricalInfo(); } - const int elapsedTimeStep = getElapsedTimeStepCount(originalHistoricalInfo->getTimeStamp()); + const int durationToLevelDownInSeconds = headerPolicy->getForgettingCurveDurationToLevelDown(); + const int elapsedTimeStep = getElapsedTimeStepCount( + originalHistoricalInfo->getTimeStamp(), durationToLevelDownInSeconds); if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) { // No need to update historical info. return *originalHistoricalInfo; @@ -108,19 +113,18 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1); const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ? originalHistoricalInfo->getLevel() : maxLevelDownAmonut; - const int adjustedTimestamp = originalHistoricalInfo->getTimeStamp() + - levelDownAmount * (MAX_ELAPSED_TIME_STEP_COUNT + 1) * TIME_STEP_DURATION_IN_SECONDS; - return HistoricalInfo(adjustedTimestamp, + const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimeStamp() + + levelDownAmount * durationToLevelDownInSeconds; + return HistoricalInfo(adjustedTimestampInSeconds, originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */); } /* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, - const int unigramCount, const int bigramCount, - const DictionaryHeaderStructurePolicy *const headerPolicy) { - if (unigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) { + const int unigramCount, const int bigramCount, const HeaderPolicy *const headerPolicy) { + if (unigramCount >= getUnigramCountHardLimit(headerPolicy->getMaxUnigramCount())) { // Unigram count exceeds the limit. return true; - } else if (bigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) { + } else if (bigramCount >= getBigramCountHardLimit(headerPolicy->getMaxBigramCount())) { // Bigram count exceeds the limit. return true; } @@ -137,37 +141,71 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT // See comments in ProbabilityUtils::backoff(). /* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) { - if (unigramProbability == NOT_A_PROBABILITY) { - return NOT_A_PROBABILITY; - } else { - return max(unigramProbability - 8, 0); - } + // See TODO comments in ForgettingCurveUtils::getProbability(). + return unigramProbability; } -/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp) { - return (TimeKeeper::peekCurrentTime() - timestamp) / TIME_STEP_DURATION_IN_SECONDS; +/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp, + const int durationToLevelDownInSeconds) { + const int elapsedTimeInSeconds = TimeKeeper::peekCurrentTime() - timestamp; + const int timeStepDurationInSeconds = + durationToLevelDownInSeconds / (MAX_ELAPSED_TIME_STEP_COUNT + 1); + return elapsedTimeInSeconds / timeStepDurationInSeconds; } -ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTable() { - mTable.resize(MAX_LEVEL + 1); - for (int level = 0; level <= MAX_LEVEL; ++level) { - mTable[level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1); - const float initialProbability = - static_cast<float>(MAX_COMPUTED_PROBABILITY / (1 << (MAX_LEVEL - level))); - for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; ++timeStepCount) { - if (level == 0) { - mTable[level][timeStepCount] = NOT_A_PROBABILITY; - continue; +const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4; +const int ForgettingCurveUtils::ProbabilityTable::WEAK_PROBABILITY_TABLE_ID = 0; +const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = 1; +const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2; +const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3; +const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127; +const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 32; +const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 35; +const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 40; + + +ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() { + mTables.resize(PROBABILITY_TABLE_COUNT); + for (int tableId = 0; tableId < PROBABILITY_TABLE_COUNT; ++tableId) { + mTables[tableId].resize(MAX_LEVEL + 1); + for (int level = 0; level <= MAX_LEVEL; ++level) { + mTables[tableId][level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1); + const float initialProbability = getBaseProbabilityForLevel(tableId, level); + const float endProbability = getBaseProbabilityForLevel(tableId, level - 1); + for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; + ++timeStepCount) { + if (level == 0) { + mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY; + continue; + } + const float probability = initialProbability + * powf(initialProbability / endProbability, + -1.0f * static_cast<float>(timeStepCount) + / static_cast<float>(MAX_ELAPSED_TIME_STEP_COUNT + 1)); + mTables[tableId][level][timeStepCount] = + min(max(static_cast<int>(probability), 1), MAX_PROBABILITY); } - const int elapsedTime = timeStepCount * TIME_STEP_DURATION_IN_SECONDS; - const float probability = initialProbability - * powf(2.0f, -1.0f * static_cast<float>(elapsedTime) - / static_cast<float>(TIME_STEP_DURATION_IN_SECONDS - * (MAX_ELAPSED_TIME_STEP_COUNT + 1))); - mTable[level][timeStepCount] = - min(max(static_cast<int>(probability), 1), MAX_COMPUTED_PROBABILITY); } } } +/* static */ int ForgettingCurveUtils::ProbabilityTable::getBaseProbabilityForLevel( + const int tableId, const int level) { + if (tableId == WEAK_PROBABILITY_TABLE_ID) { + // Max probability is 127. + return static_cast<float>(WEAK_MAX_PROBABILITY / (1 << (MAX_LEVEL - level))); + } else if (tableId == MODEST_PROBABILITY_TABLE_ID) { + // Max probability is 128. + return static_cast<float>(MODEST_BASE_PROBABILITY * (level + 1)); + } else if (tableId == STRONG_PROBABILITY_TABLE_ID) { + // Max probability is 140. + return static_cast<float>(STRONG_BASE_PROBABILITY * (level + 1)); + } else if (tableId == AGGRESSIVE_PROBABILITY_TABLE_ID) { + // Max probability is 160. + return static_cast<float>(AGGRESSIVE_BASE_PROBABILITY * (level + 1)); + } else { + return NOT_A_PROBABILITY; + } +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h index b37353455..bb8690939 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h @@ -24,31 +24,39 @@ namespace latinime { -class DictionaryHeaderStructurePolicy; +class HeaderPolicy; class ForgettingCurveUtils { public: - static const int MAX_UNIGRAM_COUNT; - static const int MAX_UNIGRAM_COUNT_AFTER_GC; - static const int MAX_BIGRAM_COUNT; - static const int MAX_BIGRAM_COUNT_AFTER_GC; - static const HistoricalInfo createUpdatedHistoricalInfo( const HistoricalInfo *const originalHistoricalInfo, const int newProbability, - const int timestamp); + const int timestamp, const HeaderPolicy *const headerPolicy); static const HistoricalInfo createHistoricalInfoToSave( - const HistoricalInfo *const originalHistoricalInfo); + const HistoricalInfo *const originalHistoricalInfo, + const HeaderPolicy *const headerPolicy); - static int decodeProbability(const HistoricalInfo *const historicalInfo); + static int decodeProbability(const HistoricalInfo *const historicalInfo, + const HeaderPolicy *const headerPolicy); static int getProbability(const int encodedUnigramProbability, const int encodedBigramProbability); - static bool needsToKeep(const HistoricalInfo *const historicalInfo); + static bool needsToKeep(const HistoricalInfo *const historicalInfo, + const HeaderPolicy *const headerPolicy); static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount, - const int bigramCount, const DictionaryHeaderStructurePolicy *const headerPolicy); + const int bigramCount, const HeaderPolicy *const headerPolicy); + + AK_FORCE_INLINE static int getUnigramCountHardLimit(const int maxUnigramCount) { + return static_cast<int>(static_cast<float>(maxUnigramCount) + * UNIGRAM_COUNT_HARD_LIMIT_WEIGHT); + } + + AK_FORCE_INLINE static int getBigramCountHardLimit(const int maxBigramCount) { + return static_cast<int>(static_cast<float>(maxBigramCount) + * BIGRAM_COUNT_HARD_LIMIT_WEIGHT); + } private: DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils); @@ -57,32 +65,46 @@ class ForgettingCurveUtils { public: ProbabilityTable(); - int getProbability(const int level, const int elapsedTimeStepCount) const { - return mTable[level][elapsedTimeStepCount]; + int getProbability(const int tableId, const int level, + const int elapsedTimeStepCount) const { + return mTables[tableId][level][elapsedTimeStepCount]; } private: DISALLOW_COPY_AND_ASSIGN(ProbabilityTable); - std::vector<std::vector<int> > mTable; + static const int PROBABILITY_TABLE_COUNT; + static const int WEAK_PROBABILITY_TABLE_ID; + static const int MODEST_PROBABILITY_TABLE_ID; + static const int STRONG_PROBABILITY_TABLE_ID; + static const int AGGRESSIVE_PROBABILITY_TABLE_ID; + + static const int WEAK_MAX_PROBABILITY; + static const int MODEST_BASE_PROBABILITY; + static const int STRONG_BASE_PROBABILITY; + static const int AGGRESSIVE_BASE_PROBABILITY; + + std::vector<std::vector<std::vector<int> > > mTables; + + static int getBaseProbabilityForLevel(const int tableId, const int level); }; - static const int MAX_COMPUTED_PROBABILITY; + static const int MULTIPLIER_TWO_IN_PROBABILITY_SCALE; static const int DECAY_INTERVAL_SECONDS; static const int MAX_LEVEL; - static const int MAX_COUNT; static const int MIN_VALID_LEVEL; - static const int TIME_STEP_DURATION_IN_SECONDS; static const int MAX_ELAPSED_TIME_STEP_COUNT; static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; - static const int HALF_LIFE_TIME_IN_SECONDS; + + static const float UNIGRAM_COUNT_HARD_LIMIT_WEIGHT; + static const float BIGRAM_COUNT_HARD_LIMIT_WEIGHT; static const ProbabilityTable sProbabilityTable; static int backoff(const int unigramProbability); - static int getElapsedTimeStepCount(const int timestamp); + static int getElapsedTimeStepCount(const int timestamp, const int durationToLevelDown); }; } // namespace latinime #endif /* LATINIME_FORGETTING_CURVE_UTILS_H */ |