diff options
Diffstat (limited to 'native')
16 files changed, 239 insertions, 164 deletions
diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 38159b0f3..8f21c50ec 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -67,7 +67,6 @@ static jboolean latinime_BinaryDictionary_createEmptyDictFile(JNIEnv *env, jclas valueChars[valueUtf8Length] = '\0'; HeaderReadWriteUtils::AttributeMap::mapped_type value; HeaderReadWriteUtils::insertCharactersIntoVector(valueChars, &value); - attributeMap[key] = value; } diff --git a/native/jni/src/suggest/core/dictionary/shortcut_utils.h b/native/jni/src/suggest/core/dictionary/shortcut_utils.h index 461d7b454..9ccef020f 100644 --- a/native/jni/src/suggest/core/dictionary/shortcut_utils.h +++ b/native/jni/src/suggest/core/dictionary/shortcut_utils.h @@ -44,7 +44,7 @@ class ShortcutUtils { shortcutScore = finalScore; // Protection against int underflow shortcutScore = max(S_INT_MIN + 1, shortcutScore) - 1; - kind = Dictionary::KIND_CORRECTION; + kind = Dictionary::KIND_SHORTCUT; } outputTypes[outputWordIndex] = kind; frequencies[outputWordIndex] = shortcutScore; diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h index a6829b476..5492c6070 100644 --- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h @@ -37,6 +37,8 @@ class DictionaryHeaderStructurePolicy { virtual float getMultiWordCostMultiplier() const = 0; + virtual int getLastDecayedTime() const = 0; + virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const = 0; diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp index 9c3dc393f..b1170e251 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp @@ -43,7 +43,7 @@ void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const } *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags); *outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags); - if (mIsDecayingDict && !ForgettingCurveUtils::isValidBigram(*outProbability)) { + if (mIsDecayingDict && !ForgettingCurveUtils::isValidEncodedProbability(*outProbability)) { // This bigram is too weak to output. *outBigramPos = NOT_A_DICT_POS; } else { @@ -261,8 +261,8 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg const int originalProbability = BigramListReadWriteUtils::getProbabilityFromFlags( bigramFlags); const int probabilityToWrite = mIsDecayingDict ? - ForgettingCurveUtils::getUpdatedBigramProbabilityDelta( - originalProbability, probability) : probability; + ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, + probability) : probability; const BigramListReadWriteUtils::BigramFlags updatedFlags = BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probabilityToWrite); @@ -294,7 +294,7 @@ bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, con int *const writingPos) { // hasNext is false because we are adding a new bigram entry at the end of the bigram list. const int probabilityToWrite = mIsDecayingDict ? - ForgettingCurveUtils::getUpdatedBigramProbabilityDelta(NOT_A_PROBABILITY, probability) : + ForgettingCurveUtils::getUpdatedEncodedProbability(NOT_A_PROBABILITY, probability) : probability; return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos, probabilityToWrite, false /* hasNext */, writingPos); @@ -360,14 +360,14 @@ int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos( } bool DynamicBigramListPolicy::updateProbabilityForDecay( - BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos, + const BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const { *outRemoved = false; if (mIsDecayingDict) { // Update bigram probability for decaying. - const int newProbability = ForgettingCurveUtils::getBigramProbabilityDeltaToSave( - BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags)); - if (ForgettingCurveUtils::isValidBigram(newProbability)) { + const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave( + BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags), mHeaderPolicy); + if (ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { // Write new probability. const BigramListReadWriteUtils::BigramFlags updatedBigramFlags = BigramListReadWriteUtils::setProbabilityInFlags( diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h index b358b4ed5..0504b59d5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h @@ -27,6 +27,7 @@ namespace latinime { class BufferWithExtendableBuffer; +class DictionaryHeaderStructurePolicy; class DictionaryShortcutsStructurePolicy; /* @@ -34,10 +35,12 @@ class DictionaryShortcutsStructurePolicy; */ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { public: - DynamicBigramListPolicy(BufferWithExtendableBuffer *const buffer, + DynamicBigramListPolicy(const DictionaryHeaderStructurePolicy *const headerPolicy, + BufferWithExtendableBuffer *const buffer, const DictionaryShortcutsStructurePolicy *const shortcutPolicy, const bool isDecayingDict) - : mBuffer(buffer), mShortcutPolicy(shortcutPolicy), mIsDecayingDict(isDecayingDict) {} + : mHeaderPolicy(headerPolicy), mBuffer(buffer), mShortcutPolicy(shortcutPolicy), + mIsDecayingDict(isDecayingDict) {} ~DynamicBigramListPolicy() {} @@ -74,6 +77,7 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { static const int CONTINUING_BIGRAM_LINK_COUNT_LIMIT; static const int BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT; + const DictionaryHeaderStructurePolicy *const mHeaderPolicy; BufferWithExtendableBuffer *const mBuffer; const DictionaryShortcutsStructurePolicy *const mShortcutPolicy; const bool mIsDecayingDict; @@ -81,7 +85,7 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { // Follow bigram link and return the position of bigram target PtNode that is currently valid. int followBigramLinkAndGetCurrentBigramPtNodePos(const int originalBigramPos) const; - bool updateProbabilityForDecay(BigramListReadWriteUtils::BigramFlags bigramFlags, + bool updateProbabilityForDecay(const BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const; }; } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp index 26ea6ab68..5724c5d88 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp @@ -16,6 +16,7 @@ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" namespace latinime { @@ -29,15 +30,16 @@ bool DynamicPatriciaTrieGcEventListeners bool isUselessPtNode = !node->isTerminal(); if (node->isTerminal() && mIsDecayingDict) { const int newProbability = - ForgettingCurveUtils::getUnigramProbabilityToSave(node->getProbability()); + ForgettingCurveUtils::getEncodedProbabilityToSave(node->getProbability(), + mHeaderPolicy); int writingPos = node->getProbabilityFieldPos(); // Update probability. if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition( mBuffer, newProbability, &writingPos)) { return false; } - if (!ForgettingCurveUtils::isValidUnigram(newProbability)) { - isUselessPtNode = false; + if (!ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { + isUselessPtNode = true; } } if (mChildrenValue > 0) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h index 463715af5..9755120b0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h @@ -29,6 +29,8 @@ namespace latinime { +class DictionaryHeaderStructurePolicy; + class DynamicPatriciaTrieGcEventListeners { public: // Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or @@ -38,10 +40,12 @@ class DynamicPatriciaTrieGcEventListeners { : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { public: TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + const DictionaryHeaderStructurePolicy *const headerPolicy, DynamicPatriciaTrieWritingHelper *const writingHelper, BufferWithExtendableBuffer *const buffer, const bool isDecayingDict) - : mWritingHelper(writingHelper), mBuffer(buffer), mIsDecayingDict(isDecayingDict), - mValueStack(), mChildrenValue(0), mValidUnigramCount(0) {} + : mHeaderPolicy(headerPolicy), mWritingHelper(writingHelper), mBuffer(buffer), + mIsDecayingDict(isDecayingDict), mValueStack(), mChildrenValue(0), + mValidUnigramCount(0) {} ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {}; @@ -56,6 +60,7 @@ class DynamicPatriciaTrieGcEventListeners { bool onDescend(const int ptNodeArrayPos) { mValueStack.push_back(0); + mChildrenValue = 0; return true; } @@ -72,9 +77,10 @@ class DynamicPatriciaTrieGcEventListeners { DISALLOW_IMPLICIT_CONSTRUCTORS( TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted); + const DictionaryHeaderStructurePolicy *const mHeaderPolicy; DynamicPatriciaTrieWritingHelper *const mWritingHelper; BufferWithExtendableBuffer *const mBuffer; - const int mIsDecayingDict; + const bool mIsDecayingDict; std::vector<int> mValueStack; int mChildrenValue; int mValidUnigramCount; @@ -85,7 +91,8 @@ class DynamicPatriciaTrieGcEventListeners { class TraversePolicyToUpdateBigramProbability : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { public: - TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy) + TraversePolicyToUpdateBigramProbability( + DynamicBigramListPolicy *const bigramPolicy) : mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {} bool onAscend() { return true; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp index 60d0db0c0..a8ea69f3c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp @@ -37,12 +37,13 @@ namespace latinime { // BinaryDictionaryDecayingTests. const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; +const char *const DynamicPatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; +const char *const DynamicPatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY = "SET_NEEDS_TO_DECAY_FOR_TESTING"; const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024; const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024; -const int DynamicPatriciaTriePolicy::DECAY_INTERVAL_FOR_DECAYING_DICTS = 2 * 60 * 60; void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const { @@ -314,15 +315,15 @@ void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); return; } - const bool runGCwithDecay = needsToDecay(); - DynamicBigramListPolicy bigramListPolicyForGC(&mBufferWithExtendableBuffer, - &mShortcutListPolicy, runGCwithDecay); + const bool needsToDecay = mHeaderPolicy.isDecayingDict() + && (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( + false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy)); + DynamicBigramListPolicy bigramListPolicyForGC(&mHeaderPolicy, &mBufferWithExtendableBuffer, + &mShortcutListPolicy, needsToDecay); DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &bigramListPolicyForGC, &mShortcutListPolicy, runGCwithDecay); + &bigramListPolicyForGC, &mShortcutListPolicy, needsToDecay); writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy); - if (runGCwithDecay) { - mNeedsToDecayForTesting = false; - } + mNeedsToDecayForTesting = false; } bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { @@ -344,16 +345,8 @@ bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { // Needs to reduce dictionary size. return true; } else if (mHeaderPolicy.isDecayingDict()) { - if (mUnigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) { - // Unigram count exceeds the limit. - return true; - } else if (mBigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) { - // Bigram count exceeds the limit. - return true; - } else if (mindsBlockByGC && needsToDecay()) { - // Time to update probabilities for decaying. - return true; - } + return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( + mindsBlockByGC, mUnigramCount, mBigramCount, &mHeaderPolicy); } return false; } @@ -364,14 +357,17 @@ void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const snprintf(outResult, maxResultLength, "%d", mUnigramCount); } else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) { snprintf(outResult, maxResultLength, "%d", mBigramCount); + } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, maxResultLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT : + static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE)); + } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, maxResultLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT : + static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE)); } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, maxResultLength) == 0) { mNeedsToDecayForTesting = true; } } -bool DynamicPatriciaTriePolicy::needsToDecay() const { - return mHeaderPolicy.isDecayingDict() && (mNeedsToDecayForTesting - || mHeaderPolicy.getLastDecayedTime() + DECAY_INTERVAL_FOR_DECAYING_DICTS < time(0)); -} - } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h index c3bbe9977..be97ee1a5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h @@ -37,7 +37,7 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(), mBuffer->getBufferSize() - mHeaderPolicy.getSize()), mShortcutListPolicy(&mBufferWithExtendableBuffer), - mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy, + mBigramListPolicy(&mHeaderPolicy, &mBufferWithExtendableBuffer, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()), mUnigramCount(mHeaderPolicy.getUnigramCount()), mBigramCount(mHeaderPolicy.getBigramCount()), mNeedsToDecayForTesting(false) {} @@ -102,10 +102,11 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { static const char *const UNIGRAM_COUNT_QUERY; static const char *const BIGRAM_COUNT_QUERY; + static const char *const MAX_UNIGRAM_COUNT_QUERY; + static const char *const MAX_BIGRAM_COUNT_QUERY; static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY; static const int MAX_DICT_EXTENDED_REGION_SIZE; static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; - static const int DECAY_INTERVAL_FOR_DECAYING_DICTS; const MmappedBuffer *const mBuffer; const HeaderPolicy mHeaderPolicy; @@ -115,8 +116,6 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { int mUnigramCount; int mBigramCount; int mNeedsToDecayForTesting; - - bool needsToDecay() const; }; } // namespace latinime #endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp index 601ee663b..f108c219f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp @@ -93,6 +93,12 @@ bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreor if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { return false; } + if (isEnd()) { + // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array. + if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + } pushReadingStateToStack(); while (!isEnd()) { if (alreadyVisitedAllPtNodesInArray) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h index 512a4d818..a71c06971 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h @@ -279,7 +279,9 @@ class DynamicPatriciaTrieReadingHelper { } else { mReadingState = mReadingStateStack.back(); mReadingStateStack.pop_back(); - fetchPtNodeInfo(); + if (!isEnd()) { + fetchPtNodeInfo(); + } } } }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp index 9dc2abe20..052558bfc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp @@ -165,7 +165,10 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNod MAX_DICTIONARY_SIZE); int unigramCount = 0; int bigramCount = 0; - if (!runGC(rootPtNodeArrayPos, &newDictBuffer, &unigramCount, &bigramCount)) { + if (mNeedsToDecay) { + ForgettingCurveUtils::sTimeKeeper.setCurrentTime(); + } + if (!runGC(rootPtNodeArrayPos, headerPolicy, &newDictBuffer, &unigramCount, &bigramCount)) { return; } BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); @@ -237,7 +240,8 @@ bool DynamicPatriciaTrieWritingHelper::markNodeAsMovedAndSetPosition( int parentOffsetFieldPos = nodeReader->getHeadPos() + DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE; if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( - mBuffer, movedPos, nodeReader->getHeadPos(), &parentOffsetFieldPos)) { + mBuffer, bigramLinkedNodePos, nodeReader->getHeadPos(), + &parentOffsetFieldPos)) { // Parent offset cannot be written because of a bug or a broken dictionary; thus, // we give up to update dictionary. return false; @@ -481,14 +485,14 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes( } bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, - BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount, - int *const outBigramCount) { + const HeaderPolicy *const headerPolicy, BufferWithExtendableBuffer *const bufferToWrite, + int *const outUnigramCount, int *const outBigramCount) { DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); DynamicPatriciaTrieGcEventListeners ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( - this, mBuffer, mNeedsToDecay); + headerPolicy, this, mBuffer, mNeedsToDecay); if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { return false; @@ -505,7 +509,6 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, &traversePolicyToUpdateBigramProbability)) { return false; } - if (mNeedsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount() > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) { // TODO: Remove more bigrams. @@ -524,7 +527,7 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, // Create policy instance for the GCed dictionary. DynamicShortcutListPolicy newDictShortcutPolicy(bufferToWrite); - DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy, + DynamicBigramListPolicy newDictBigramPolicy(headerPolicy, bufferToWrite, &newDictShortcutPolicy, mNeedsToDecay); // Create reading helper for the GCed dictionary. DynamicPatriciaTrieReadingHelper newDictReadingHelper(bufferToWrite, &newDictBigramPolicy, @@ -545,7 +548,7 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, int DynamicPatriciaTrieWritingHelper::getUpdatedProbability(const int originalProbability, const int newProbability) { if (mNeedsToDecay) { - return ForgettingCurveUtils::getUpdatedUnigramProbability(originalProbability, + return ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, newProbability); } else { return newProbability; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h index 0caf29120..ca8664729 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h @@ -128,8 +128,9 @@ class DynamicPatriciaTrieWritingHelper { const int probabilityOfNewPtNode, const int *const newNodeCodePoints, const int newNodeCodePointCount); - bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite, - int *const outUnigramCount, int *const outBigramCount); + bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, + BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount, + int *const outBigramCount); int getUpdatedProbability(const int originalProbability, const int newProbability); }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp index c525a8ab5..1632fd072 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp @@ -14,8 +14,13 @@ * limitations under the License. */ +#include <cmath> +#include <ctime> +#include <stdlib.h> + #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h" namespace latinime { @@ -26,106 +31,125 @@ const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000; const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000; const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127; -const int ForgettingCurveUtils::MAX_UNIGRAM_PROBABILITY = 120; -const int ForgettingCurveUtils::MIN_VALID_UNIGRAM_PROBABILITY = 24; -const int ForgettingCurveUtils::UNIGRAM_PROBABILITY_STEP = 8; -const int ForgettingCurveUtils::MAX_BIGRAM_PROBABILITY_DELTA = 15; -const int ForgettingCurveUtils::MIN_VALID_BIGRAM_PROBABILITY_DELTA = 3; -const int ForgettingCurveUtils::BIGRAM_PROBABILITY_DELTA_STEP = 1; +const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15; +const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3; +const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1; +// Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected +// duration of the decay is approximately 66hours. +const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f; +const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; + +const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; +ForgettingCurveUtils::TimeKeeper ForgettingCurveUtils::sTimeKeeper; + +void ForgettingCurveUtils::TimeKeeper::setCurrentTime() { + mCurrentTime = time(0); +} /* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability, - const int encodedBigramProbabilityDelta) { + const int encodedBigramProbability) { if (encodedUnigramProbability == NOT_A_PROBABILITY) { return NOT_A_PROBABILITY; - } else if (encodedBigramProbabilityDelta == NOT_A_PROBABILITY) { - const int rawProbability = ProbabilityUtils::backoff(decodeUnigramProbability( - encodedUnigramProbability)); - return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY); + } else if (encodedBigramProbability == NOT_A_PROBABILITY) { + return backoff(decodeProbability(encodedUnigramProbability)); } else { - const int rawProbability = ProbabilityUtils::computeProbabilityForBigram( - decodeUnigramProbability(encodedUnigramProbability), - decodeBigramProbabilityDelta(encodedBigramProbabilityDelta)); - return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY); + const int unigramProbability = decodeProbability(encodedUnigramProbability); + const int bigramProbability = decodeProbability(encodedBigramProbability); + return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY); } } -/* static */ int ForgettingCurveUtils::getUpdatedUnigramProbability( +// Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding +// (i.e. unigram probability + bigram probability delta). +/* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability( const int originalEncodedProbability, const int newProbability) { if (originalEncodedProbability == NOT_A_PROBABILITY) { - // The unigram is not in this dictionary. + // The bigram relation is not in this dictionary. if (newProbability == NOT_A_PROBABILITY) { - // The unigram is not in other dictionaries. + // The bigram target is not in other dictionaries. return 0; } else { - return MIN_VALID_UNIGRAM_PROBABILITY; + return MIN_VALID_ENCODED_PROBABILITY; } } else { if (newProbability != NOT_A_PROBABILITY - && originalEncodedProbability < MIN_VALID_UNIGRAM_PROBABILITY) { - return MIN_VALID_UNIGRAM_PROBABILITY; + && originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) { + return MIN_VALID_ENCODED_PROBABILITY; } - return min(originalEncodedProbability + UNIGRAM_PROBABILITY_STEP, MAX_UNIGRAM_PROBABILITY); + return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY); } } -/* static */ int ForgettingCurveUtils::getUnigramProbabilityToSave(const int encodedProbability) { - return max(encodedProbability - UNIGRAM_PROBABILITY_STEP, 0); -} - -/* static */ int ForgettingCurveUtils::getBigramProbabilityDeltaToSave( - const int encodedProbabilityDelta) { - return max(encodedProbabilityDelta - BIGRAM_PROBABILITY_DELTA_STEP, 0); +/* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) { + return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY; } -/* static */ int ForgettingCurveUtils::getUpdatedBigramProbabilityDelta( - const int originalEncodedProbabilityDelta, const int newProbability) { - if (originalEncodedProbabilityDelta == NOT_A_PROBABILITY) { - // The bigram relation is not in this dictionary. - if (newProbability == NOT_A_PROBABILITY) { - // The bigram target is not in other dictionaries. - return 0; - } else { - return MIN_VALID_BIGRAM_PROBABILITY_DELTA; +/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability, + const DictionaryHeaderStructurePolicy *const headerPolicy) { + const int elapsedTime = sTimeKeeper.peekCurrentTime() - headerPolicy->getLastDecayedTime(); + const int decayIterationCount = max(elapsedTime / DECAY_INTERVAL_SECONDS, 1); + int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0); + // TODO: Implement the decay in more proper way. + for (int i = 0; i < decayIterationCount; ++i) { + const float currentRate = static_cast<float>(currentEncodedProbability) + / static_cast<float>(MAX_ENCODED_PROBABILITY); + const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate; + const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX); + if (thresholdToDecay < randValue) { + currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, + 0); } - } else { - if (newProbability != NOT_A_PROBABILITY - && originalEncodedProbabilityDelta < MIN_VALID_BIGRAM_PROBABILITY_DELTA) { - return MIN_VALID_BIGRAM_PROBABILITY_DELTA; - } - return min(originalEncodedProbabilityDelta + BIGRAM_PROBABILITY_DELTA_STEP, - MAX_BIGRAM_PROBABILITY_DELTA); } + return currentEncodedProbability; } -/* static */ int ForgettingCurveUtils::isValidUnigram(const int encodedUnigramProbability) { - return encodedUnigramProbability >= MIN_VALID_UNIGRAM_PROBABILITY; -} - -/* static */ int ForgettingCurveUtils::isValidBigram(const int encodedBigramProbabilityDelta) { - return encodedBigramProbabilityDelta >= MIN_VALID_BIGRAM_PROBABILITY_DELTA; +/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, + const int unigramCount, const int bigramCount, + const DictionaryHeaderStructurePolicy *const headerPolicy) { + if (unigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) { + // Unigram count exceeds the limit. + return true; + } else if (bigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) { + // Bigram count exceeds the limit. + return true; + } + if (mindsBlockByDecay) { + return false; + } + if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS < time(0)) { + // Time to decay. + return true; + } + return false; } -/* static */ int ForgettingCurveUtils::decodeUnigramProbability(const int encodedProbability) { - const int probability = encodedProbability - MIN_VALID_UNIGRAM_PROBABILITY; - if (probability < 0) { +/* static */ int ForgettingCurveUtils::decodeProbability(const int encodedProbability) { + if (encodedProbability < MIN_VALID_ENCODED_PROBABILITY) { return NOT_A_PROBABILITY; } else { - return min(probability, MAX_UNIGRAM_PROBABILITY); + return min(sProbabilityTable.getProbability(encodedProbability), MAX_ENCODED_PROBABILITY); } } -/* static */ int ForgettingCurveUtils::decodeBigramProbabilityDelta( - const int encodedProbabilityDelta) { - const int probabilityDelta = encodedProbabilityDelta - MIN_VALID_BIGRAM_PROBABILITY_DELTA; - if (probabilityDelta < 0) { +// See comments in ProbabilityUtils::backoff(). +/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) { + if (unigramProbability == NOT_A_PROBABILITY) { return NOT_A_PROBABILITY; } else { - return min(probabilityDelta, MAX_BIGRAM_PROBABILITY_DELTA); + return max(unigramProbability - 8, 0); } } -/* static */ int ForgettingCurveUtils::getDecayedProbability(const int rawProbability) { - return rawProbability; +ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTable() { + // Table entry is as follows: + // 1, 1, 1, 2, 3, 5, 6, 9, 13, 18, 25, 34, 48, 66, 91, 127. + // Note that first MIN_VALID_ENCODED_PROBABILITY values are not used. + mTable.resize(MAX_ENCODED_PROBABILITY + 1); + for (int i = 0; i <= MAX_ENCODED_PROBABILITY; ++i) { + const int probability = static_cast<int>(powf(static_cast<float>(MAX_COMPUTED_PROBABILITY), + static_cast<float>(i) / static_cast<float>(MAX_ENCODED_PROBABILITY))); + mTable[i] = min(MAX_COMPUTED_PROBABILITY, max(0, probability)); + } } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h index 11ffb2e2f..2ad423874 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h @@ -17,54 +17,84 @@ #ifndef LATINIME_FORGETTING_CURVE_UTILS_H #define LATINIME_FORGETTING_CURVE_UTILS_H +#include <vector> + #include "defines.h" namespace latinime { +class DictionaryHeaderStructurePolicy; + // TODO: Check the elapsed time and decrease the probability depending on the time. Time field is // required to introduced to each terminal PtNode and bigram entry. // TODO: Quit using bigram probability to indicate the delta. -// TODO: Quit using bigram probability delta. class ForgettingCurveUtils { public: + class TimeKeeper { + public: + TimeKeeper() : mCurrentTime(0) {} + void setCurrentTime(); + int peekCurrentTime() const { return mCurrentTime; }; + + private: + DISALLOW_COPY_AND_ASSIGN(TimeKeeper); + + int mCurrentTime; + }; + static const int MAX_UNIGRAM_COUNT; static const int MAX_UNIGRAM_COUNT_AFTER_GC; static const int MAX_BIGRAM_COUNT; static const int MAX_BIGRAM_COUNT_AFTER_GC; - static int getProbability(const int encodedUnigramProbability, - const int encodedBigramProbabilityDelta); + static TimeKeeper sTimeKeeper; - static int getUpdatedUnigramProbability(const int originalEncodedProbability, - const int newProbability); + static int getProbability(const int encodedUnigramProbability, + const int encodedBigramProbability); - static int getUpdatedBigramProbabilityDelta(const int originalEncodedProbabilityDelta, + static int getUpdatedEncodedProbability(const int originalEncodedProbability, const int newProbability); - static int isValidUnigram(const int encodedUnigramProbability); - - static int isValidBigram(const int encodedProbabilityDelta); + static int isValidEncodedProbability(const int encodedProbability); - static int getUnigramProbabilityToSave(const int encodedProbability); + static int getEncodedProbabilityToSave(const int encodedProbability, + const DictionaryHeaderStructurePolicy *const headerPolicy); - static int getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta); + static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount, + const int bigramCount, const DictionaryHeaderStructurePolicy *const headerPolicy); private: DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils); + class ProbabilityTable { + public: + ProbabilityTable(); + + int getProbability(const int encodedProbability) const { + if (encodedProbability < 0 || encodedProbability > static_cast<int>(mTable.size())) { + return NOT_A_PROBABILITY; + } + return mTable[encodedProbability]; + } + + private: + DISALLOW_COPY_AND_ASSIGN(ProbabilityTable); + + std::vector<int> mTable; + }; + static const int MAX_COMPUTED_PROBABILITY; - static const int MAX_UNIGRAM_PROBABILITY; - static const int MIN_VALID_UNIGRAM_PROBABILITY; - static const int UNIGRAM_PROBABILITY_STEP; - static const int MAX_BIGRAM_PROBABILITY_DELTA; - static const int MIN_VALID_BIGRAM_PROBABILITY_DELTA; - static const int BIGRAM_PROBABILITY_DELTA_STEP; + static const int MAX_ENCODED_PROBABILITY; + static const int MIN_VALID_ENCODED_PROBABILITY; + static const int ENCODED_PROBABILITY_STEP; + static const float MIN_PROBABILITY_TO_DECAY; + static const int DECAY_INTERVAL_SECONDS; - static int decodeUnigramProbability(const int encodedProbability); + static const ProbabilityTable sProbabilityTable; - static int decodeBigramProbabilityDelta(const int encodedProbability); + static int decodeProbability(const int encodedProbability); - static int getDecayedProbability(const int rawProbability); + static int backoff(const int unigramProbability); }; } // namespace latinime #endif /* LATINIME_FORGETTING_CURVE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp index ecceb60d3..104eb2a7a 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp @@ -27,30 +27,30 @@ const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE = 170; const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT = 310; const int ScoringParams::THRESHOLD_SHORT_WORD_LENGTH = 4; -const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.132f; -const float ScoringParams::PROXIMITY_COST = 0.095f; -const float ScoringParams::FIRST_CHAR_PROXIMITY_COST = 0.102f; -const float ScoringParams::FIRST_PROXIMITY_COST = 0.019f; -const float ScoringParams::OMISSION_COST = 0.458f; -const float ScoringParams::OMISSION_COST_SAME_CHAR = 0.491f; -const float ScoringParams::OMISSION_COST_FIRST_CHAR = 0.582f; -const float ScoringParams::INSERTION_COST = 0.730f; -const float ScoringParams::TERMINAL_INSERTION_COST = 0.93f; -const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.586f; -const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.70f; -const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.623f; -const float ScoringParams::TRANSPOSITION_COST = 0.526f; -const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.319f; -const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.380f; -const float ScoringParams::SUBSTITUTION_COST = 0.383f; -const float ScoringParams::COST_NEW_WORD = 0.042f; -const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.25f; -const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.123f; -const float ScoringParams::COST_FIRST_LOOKAHEAD = 0.545f; -const float ScoringParams::COST_LOOKAHEAD = 0.073f; -const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.093f; -const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.041f; -const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.447f; +const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.1524f; +const float ScoringParams::PROXIMITY_COST = 0.0694f; +const float ScoringParams::FIRST_CHAR_PROXIMITY_COST = 0.072f; +const float ScoringParams::FIRST_PROXIMITY_COST = 0.07788f; +const float ScoringParams::OMISSION_COST = 0.4676f; +const float ScoringParams::OMISSION_COST_SAME_CHAR = 0.399f; +const float ScoringParams::OMISSION_COST_FIRST_CHAR = 0.5256f; +const float ScoringParams::INSERTION_COST = 0.7248f; +const float ScoringParams::TERMINAL_INSERTION_COST = 0.8128f; +const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.5508f; +const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.674f; +const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.639f; +const float ScoringParams::TRANSPOSITION_COST = 0.5608f; +const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.339f; +const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.4576f; +const float ScoringParams::SUBSTITUTION_COST = 0.3806f; +const float ScoringParams::COST_NEW_WORD = 0.0312f; +const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.3224f; +const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.1214f; +const float ScoringParams::COST_FIRST_LOOKAHEAD = 0.4836f; +const float ScoringParams::COST_LOOKAHEAD = 0.00624f; +const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.06836f; +const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.0362f; +const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.4182f; const float ScoringParams::TYPING_BASE_OUTPUT_SCORE = 1.0f; const float ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT = 0.1f; const float ScoringParams::NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT = 0.045f; |