diff options
Diffstat (limited to 'native/jni/src')
33 files changed, 522 insertions, 324 deletions
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index c920f64b4..742e388e4 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -298,9 +298,19 @@ static inline void prof_out(void) { #define NOT_AN_INDEX (-1) #define NOT_A_PROBABILITY (-1) #define NOT_A_DICT_POS (S_INT_MIN) + // A special value to mean the first word confidence makes no sense in this case, // e.g. this is not a multi-word suggestion. -#define NOT_A_FIRST_WORD_CONFIDENCE (S_INT_MIN) +#define NOT_A_FIRST_WORD_CONFIDENCE (S_INT_MAX) +// How high the confidence needs to be for us to auto-commit. Arbitrary. +// This needs to be the same as CONFIDENCE_FOR_AUTO_COMMIT in BinaryDictionary.java +#define CONFIDENCE_FOR_AUTO_COMMIT (1000000) +// 80% of the full confidence +#define DISTANCE_WEIGHT_FOR_AUTO_COMMIT (80 * CONFIDENCE_FOR_AUTO_COMMIT / 100) +// 100% of the full confidence +#define LENGTH_WEIGHT_FOR_AUTO_COMMIT (CONFIDENCE_FOR_AUTO_COMMIT) +// 80% of the full confidence +#define SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT (80 * CONFIDENCE_FOR_AUTO_COMMIT / 100) #define KEYCODE_SPACE ' ' #define KEYCODE_SINGLE_QUOTE '\'' diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h index 9099e8285..49cfdecac 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node.h +++ b/native/jni/src/suggest/core/dicnode/dic_node.h @@ -271,7 +271,7 @@ class DicNode { return isTerminalNodes && currentNodeDepth > 0 && currentNodeDepth == terminalNodeDepth; } - bool shouldBeFilterdBySafetyNetForBigram() const { + bool shouldBeFilteredBySafetyNetForBigram() const { const uint16_t currentDepth = getNodeCodePointCount(); const int prevWordLen = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength() - mDicNodeState.mDicNodeStatePrevWord.getPrevWordStart() - 1; @@ -321,6 +321,16 @@ class DicNode { DUMP_WORD_AND_SCORE("OUTPUT"); } + // "Total" in this context (and other methods in this class) means the whole suggestion. When + // this represents a multi-word suggestion, the referenced PtNode (in mDicNodeState) is only + // the one that corresponds to the last word of the suggestion, and all the previous words + // are concatenated together in mPrevWord - which contains a space at the end. + int getTotalNodeSpaceCount() const { + if (isFirstWord()) return 0; + return CharUtils::getSpaceCount(mDicNodeState.mDicNodeStatePrevWord.mPrevWord, + mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength()); + } + int getSecondWordFirstInputIndex(const ProximityInfoState *const pInfoState) const { const int inputIndex = mDicNodeState.mDicNodeStatePrevWord.getSecondWordFirstInputIndex(); if (inputIndex == NOT_AN_INDEX) { diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index 5969b31cc..59ead1894 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -129,7 +129,7 @@ bool Dictionary::needsToRunGC(const bool mindsBlockByGC) { } void Dictionary::getProperty(const char *const query, char *const outResult, - const int maxResultLength) const { + const int maxResultLength) { return mDictionaryStructureWithBufferPolicy->getProperty(query, outResult, maxResultLength); } diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index 43d3b964d..0195d5bf0 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -84,7 +84,7 @@ class Dictionary { bool needsToRunGC(const bool mindsBlockByGC); void getProperty(const char *const query, char *const outResult, - const int maxResultLength) const; + const int maxResultLength); const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const { return mDictionaryStructureWithBufferPolicy; diff --git a/native/jni/src/suggest/core/dictionary/shortcut_utils.h b/native/jni/src/suggest/core/dictionary/shortcut_utils.h index 461d7b454..9ccef020f 100644 --- a/native/jni/src/suggest/core/dictionary/shortcut_utils.h +++ b/native/jni/src/suggest/core/dictionary/shortcut_utils.h @@ -44,7 +44,7 @@ class ShortcutUtils { shortcutScore = finalScore; // Protection against int underflow shortcutScore = max(S_INT_MIN + 1, shortcutScore) - 1; - kind = Dictionary::KIND_CORRECTION; + kind = Dictionary::KIND_SHORTCUT; } outputTypes[outputWordIndex] = kind; frequencies[outputWordIndex] = shortcutScore; diff --git a/native/jni/src/suggest/core/layout/proximity_info_params.cpp b/native/jni/src/suggest/core/layout/proximity_info_params.cpp index 0e887f700..49df10301 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_params.cpp +++ b/native/jni/src/suggest/core/layout/proximity_info_params.cpp @@ -69,13 +69,13 @@ const float ProximityInfoParams::STRAIGHT_ANGLE_THRESHOLD = M_PI_F * 15.0f / 180 const float ProximityInfoParams::SKIP_CORNER_PROBABILITY = 0.4f; const float ProximityInfoParams::SPEED_MARGIN = 0.1f; const float ProximityInfoParams::CENTER_VALUE_OF_NORMALIZED_DISTRIBUTION = 0.0f; -// TODO: The variance is critical for accuracy; thus, adjusting these parameter by machine +// TODO: The variance is critical for accuracy; thus, adjusting these parameters by machine // learning or something would be efficient. -const float ProximityInfoParams::SPEEDxANGLE_WEIGHT_FOR_STANDARD_DIVIATION = 0.3f; -const float ProximityInfoParams::MAX_SPEEDxANGLE_RATE_FOR_STANDERD_DIVIATION = 0.25f; -const float ProximityInfoParams::SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DIVIATION = 0.5f; -const float ProximityInfoParams::MAX_SPEEDxNEAREST_RATE_FOR_STANDERD_DIVIATION = 0.15f; -const float ProximityInfoParams::MIN_STANDERD_DIVIATION = 0.37f; +const float ProximityInfoParams::SPEEDxANGLE_WEIGHT_FOR_STANDARD_DEVIATION = 0.3f; +const float ProximityInfoParams::MAX_SPEEDxANGLE_RATE_FOR_STANDARD_DEVIATION = 0.25f; +const float ProximityInfoParams::SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DEVIATION = 0.5f; +const float ProximityInfoParams::MAX_SPEEDxNEAREST_RATE_FOR_STANDARD_DEVIATION = 0.15f; +const float ProximityInfoParams::MIN_STANDARD_DEVIATION = 0.37f; const float ProximityInfoParams::PREV_DISTANCE_WEIGHT = 0.5f; const float ProximityInfoParams::NEXT_DISTANCE_WEIGHT = 0.6f; diff --git a/native/jni/src/suggest/core/layout/proximity_info_params.h b/native/jni/src/suggest/core/layout/proximity_info_params.h index 4e47f7308..ae1f82c22 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_params.h +++ b/native/jni/src/suggest/core/layout/proximity_info_params.h @@ -73,11 +73,11 @@ class ProximityInfoParams { static const float SKIP_CORNER_PROBABILITY; static const float SPEED_MARGIN; static const float CENTER_VALUE_OF_NORMALIZED_DISTRIBUTION; - static const float SPEEDxANGLE_WEIGHT_FOR_STANDARD_DIVIATION; - static const float MAX_SPEEDxANGLE_RATE_FOR_STANDERD_DIVIATION; - static const float SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DIVIATION; - static const float MAX_SPEEDxNEAREST_RATE_FOR_STANDERD_DIVIATION; - static const float MIN_STANDERD_DIVIATION; + static const float SPEEDxANGLE_WEIGHT_FOR_STANDARD_DEVIATION; + static const float MAX_SPEEDxANGLE_RATE_FOR_STANDARD_DEVIATION; + static const float SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DEVIATION; + static const float MAX_SPEEDxNEAREST_RATE_FOR_STANDARD_DEVIATION; + static const float MIN_STANDARD_DEVIATION; static const float PREV_DISTANCE_WEIGHT; static const float NEXT_DISTANCE_WEIGHT; diff --git a/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp b/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp index 904671f7f..e1b35340b 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp +++ b/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp @@ -708,13 +708,13 @@ namespace latinime { const float inputCharProbability = 1.0f - skipProbability; const float speedxAngleRate = min(speedRate * currentAngle / M_PI_F - * ProximityInfoParams::SPEEDxANGLE_WEIGHT_FOR_STANDARD_DIVIATION, - ProximityInfoParams::MAX_SPEEDxANGLE_RATE_FOR_STANDERD_DIVIATION); + * ProximityInfoParams::SPEEDxANGLE_WEIGHT_FOR_STANDARD_DEVIATION, + ProximityInfoParams::MAX_SPEEDxANGLE_RATE_FOR_STANDARD_DEVIATION); const float speedxNearestKeyDistanceRate = min(speedRate * nearestKeyDistance - * ProximityInfoParams::SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DIVIATION, - ProximityInfoParams::MAX_SPEEDxNEAREST_RATE_FOR_STANDERD_DIVIATION); + * ProximityInfoParams::SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DEVIATION, + ProximityInfoParams::MAX_SPEEDxNEAREST_RATE_FOR_STANDARD_DEVIATION); const float sigma = speedxAngleRate + speedxNearestKeyDistanceRate - + ProximityInfoParams::MIN_STANDERD_DIVIATION; + + ProximityInfoParams::MIN_STANDARD_DEVIATION; ProximityInfoUtils::NormalDistribution distribution(ProximityInfoParams::CENTER_VALUE_OF_NORMALIZED_DISTRIBUTION, sigma); diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h index a6829b476..5492c6070 100644 --- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h @@ -37,6 +37,8 @@ class DictionaryHeaderStructurePolicy { virtual float getMultiWordCostMultiplier() const = 0; + virtual int getLastDecayedTime() const = 0; + virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const = 0; diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h index c7ffef0d5..41f82049f 100644 --- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h @@ -80,8 +80,10 @@ class DictionaryStructureWithBufferPolicy { virtual bool needsToRunGC(const bool mindsBlockByGC) const = 0; + // Currently, this method is used only for testing. You may want to consider creating new + // dedicated method instead of this if you want to use this in the production. virtual void getProperty(const char *const query, char *const outResult, - const int maxResultLength) const = 0; + const int maxResultLength) = 0; protected: DictionaryStructureWithBufferPolicy() {} diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp index 51cfba17a..73ccebc88 100644 --- a/native/jni/src/suggest/core/suggest.cpp +++ b/native/jni/src/suggest/core/suggest.cpp @@ -166,7 +166,11 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen // TODO: have partial commit work even with multiple pointers. const bool outputSecondWordFirstLetterInputIndex = traverseSession->isOnlyOnePointerUsed(0 /* pointerId */); - outputAutoCommitFirstWordConfidence[0] = computeFirstWordConfidence(); + if (terminalSize > 0) { + // If we have no suggestions, don't write this + outputAutoCommitFirstWordConfidence[0] = + computeFirstWordConfidence(&terminals[0]); + } // Output suggestion results here for (int terminalIndex = 0; terminalIndex < terminalSize && outputWordIndex < MAX_RESULTS; @@ -255,9 +259,55 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen return outputWordIndex; } -int Suggest::computeFirstWordConfidence() const { - // TODO: implement this. - return NOT_A_FIRST_WORD_CONFIDENCE; +int Suggest::computeFirstWordConfidence(const DicNode *const terminalDicNode) const { + // Get the number of spaces in the first suggestion + const int spaceCount = terminalDicNode->getTotalNodeSpaceCount(); + // Get the number of characters in the first suggestion + const int length = terminalDicNode->getTotalNodeCodePointCount(); + // Get the distance for the first word of the suggestion + const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord(); + + // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000. + // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or + // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means + // we are very confident. + // Expected space count is 1 ~ 5 + static const int MIN_EXPECTED_SPACE_COUNT = 1; + static const int MAX_EXPECTED_SPACE_COUNT = 5; + // Expected length is about 4 ~ 30 + static const int MIN_EXPECTED_LENGTH = 4; + static const int MAX_EXPECTED_LENGTH = 30; + // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0 + static const float MIN_EXPECTED_DISTANCE = 0.0; + static const float MAX_EXPECTED_DISTANCE = 2.0; + // This is not strict: it's where most stuff will be falling, but it's still fine if it's + // outside these values. We want to output a value that reflects all of these. Each factor + // contributes a bit. + + // We need at least a space. + if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE; + + // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0 + // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the + // weight of the distance. Clamp to avoid overflows. + const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE + : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance; + const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT + * (MAX_EXPECTED_DISTANCE - clampedDistance) + / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE); + // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no + // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the + // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp. + const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT + * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH); + // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no + // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the + // weight of the space count. + const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT + * (spaceCount - MIN_EXPECTED_SPACE_COUNT) + / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT); + + return distanceContribution + lengthContribution + spaceContribution; } /** @@ -395,7 +445,7 @@ void Suggest::processTerminalDicNode( if (!dicNode->isTerminalWordNode()) { return; } - if (dicNode->shouldBeFilterdBySafetyNetForBigram()) { + if (dicNode->shouldBeFilteredBySafetyNetForBigram()) { return; } // Create a non-cached node here. diff --git a/native/jni/src/suggest/core/suggest.h b/native/jni/src/suggest/core/suggest.h index 0e8bd1195..b20343d29 100644 --- a/native/jni/src/suggest/core/suggest.h +++ b/native/jni/src/suggest/core/suggest.h @@ -58,7 +58,7 @@ class Suggest : public SuggestInterface { int outputSuggestions(DicTraverseSession *traverseSession, int *frequencies, int *outputCodePoints, int *outputIndicesToPartialCommit, int *outputTypes, int *outputAutoCommitFirstWordConfidence) const; - int computeFirstWordConfidence() const; + int computeFirstWordConfidence(const DicNode *const terminalDicNode) const; void initializeSearch(DicTraverseSession *traverseSession, int commitPoint) const; void expandCurrentDicNodes(DicTraverseSession *traverseSession) const; void processTerminalDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp index 67a085de3..b1170e251 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp @@ -20,7 +20,7 @@ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/decaying_utils.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" namespace latinime { @@ -43,7 +43,7 @@ void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const } *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags); *outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags); - if (mIsDecayingDict && !DecayingUtils::isValidBigram(*outProbability)) { + if (mIsDecayingDict && !ForgettingCurveUtils::isValidEncodedProbability(*outProbability)) { // This bigram is too weak to output. *outBigramPos = NOT_A_DICT_POS; } else { @@ -261,8 +261,8 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg const int originalProbability = BigramListReadWriteUtils::getProbabilityFromFlags( bigramFlags); const int probabilityToWrite = mIsDecayingDict ? - DecayingUtils::getUpdatedBigramProbabilityDelta( - originalProbability, probability) : probability; + ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, + probability) : probability; const BigramListReadWriteUtils::BigramFlags updatedFlags = BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probabilityToWrite); @@ -294,7 +294,7 @@ bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, con int *const writingPos) { // hasNext is false because we are adding a new bigram entry at the end of the bigram list. const int probabilityToWrite = mIsDecayingDict ? - DecayingUtils::getUpdatedBigramProbabilityDelta(NOT_A_PROBABILITY, probability) : + ForgettingCurveUtils::getUpdatedEncodedProbability(NOT_A_PROBABILITY, probability) : probability; return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos, probabilityToWrite, false /* hasNext */, writingPos); @@ -360,14 +360,14 @@ int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos( } bool DynamicBigramListPolicy::updateProbabilityForDecay( - BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos, + const BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const { *outRemoved = false; if (mIsDecayingDict) { // Update bigram probability for decaying. - const int newProbability = DecayingUtils::getBigramProbabilityDeltaToSave( - BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags)); - if (DecayingUtils::isValidBigram(newProbability)) { + const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave( + BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags), mHeaderPolicy); + if (ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { // Write new probability. const BigramListReadWriteUtils::BigramFlags updatedBigramFlags = BigramListReadWriteUtils::setProbabilityInFlags( diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h index b358b4ed5..0504b59d5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h @@ -27,6 +27,7 @@ namespace latinime { class BufferWithExtendableBuffer; +class DictionaryHeaderStructurePolicy; class DictionaryShortcutsStructurePolicy; /* @@ -34,10 +35,12 @@ class DictionaryShortcutsStructurePolicy; */ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { public: - DynamicBigramListPolicy(BufferWithExtendableBuffer *const buffer, + DynamicBigramListPolicy(const DictionaryHeaderStructurePolicy *const headerPolicy, + BufferWithExtendableBuffer *const buffer, const DictionaryShortcutsStructurePolicy *const shortcutPolicy, const bool isDecayingDict) - : mBuffer(buffer), mShortcutPolicy(shortcutPolicy), mIsDecayingDict(isDecayingDict) {} + : mHeaderPolicy(headerPolicy), mBuffer(buffer), mShortcutPolicy(shortcutPolicy), + mIsDecayingDict(isDecayingDict) {} ~DynamicBigramListPolicy() {} @@ -74,6 +77,7 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { static const int CONTINUING_BIGRAM_LINK_COUNT_LIMIT; static const int BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT; + const DictionaryHeaderStructurePolicy *const mHeaderPolicy; BufferWithExtendableBuffer *const mBuffer; const DictionaryShortcutsStructurePolicy *const mShortcutPolicy; const bool mIsDecayingDict; @@ -81,7 +85,7 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { // Follow bigram link and return the position of bigram target PtNode that is currently valid. int followBigramLinkAndGetCurrentBigramPtNodePos(const int originalBigramPos) const; - bool updateProbabilityForDecay(BigramListReadWriteUtils::BigramFlags bigramFlags, + bool updateProbabilityForDecay(const BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const; }; } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp index 081163a4d..5724c5d88 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp @@ -16,7 +16,8 @@ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h" -#include "suggest/policyimpl/dictionary/utils/decaying_utils.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" namespace latinime { @@ -29,15 +30,16 @@ bool DynamicPatriciaTrieGcEventListeners bool isUselessPtNode = !node->isTerminal(); if (node->isTerminal() && mIsDecayingDict) { const int newProbability = - DecayingUtils::getUnigramProbabilityToSave(node->getProbability()); + ForgettingCurveUtils::getEncodedProbabilityToSave(node->getProbability(), + mHeaderPolicy); int writingPos = node->getProbabilityFieldPos(); // Update probability. if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition( mBuffer, newProbability, &writingPos)) { return false; } - if (!DecayingUtils::isValidUnigram(newProbability)) { - isUselessPtNode = false; + if (!ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { + isUselessPtNode = true; } } if (mChildrenValue > 0) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h index 463715af5..9755120b0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h @@ -29,6 +29,8 @@ namespace latinime { +class DictionaryHeaderStructurePolicy; + class DynamicPatriciaTrieGcEventListeners { public: // Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or @@ -38,10 +40,12 @@ class DynamicPatriciaTrieGcEventListeners { : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { public: TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + const DictionaryHeaderStructurePolicy *const headerPolicy, DynamicPatriciaTrieWritingHelper *const writingHelper, BufferWithExtendableBuffer *const buffer, const bool isDecayingDict) - : mWritingHelper(writingHelper), mBuffer(buffer), mIsDecayingDict(isDecayingDict), - mValueStack(), mChildrenValue(0), mValidUnigramCount(0) {} + : mHeaderPolicy(headerPolicy), mWritingHelper(writingHelper), mBuffer(buffer), + mIsDecayingDict(isDecayingDict), mValueStack(), mChildrenValue(0), + mValidUnigramCount(0) {} ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {}; @@ -56,6 +60,7 @@ class DynamicPatriciaTrieGcEventListeners { bool onDescend(const int ptNodeArrayPos) { mValueStack.push_back(0); + mChildrenValue = 0; return true; } @@ -72,9 +77,10 @@ class DynamicPatriciaTrieGcEventListeners { DISALLOW_IMPLICIT_CONSTRUCTORS( TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted); + const DictionaryHeaderStructurePolicy *const mHeaderPolicy; DynamicPatriciaTrieWritingHelper *const mWritingHelper; BufferWithExtendableBuffer *const mBuffer; - const int mIsDecayingDict; + const bool mIsDecayingDict; std::vector<int> mValueStack; int mChildrenValue; int mValidUnigramCount; @@ -85,7 +91,8 @@ class DynamicPatriciaTrieGcEventListeners { class TraversePolicyToUpdateBigramProbability : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { public: - TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy) + TraversePolicyToUpdateBigramProbability( + DynamicBigramListPolicy *const bigramPolicy) : mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {} bool onAscend() { return true; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp index 0d8c92768..a8ea69f3c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp @@ -28,17 +28,22 @@ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/decaying_utils.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h" namespace latinime { +// Note that these are corresponding definitions in Java side in BinaryDictionaryTests and +// BinaryDictionaryDecayingTests. const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; +const char *const DynamicPatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; +const char *const DynamicPatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; +const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY = + "SET_NEEDS_TO_DECAY_FOR_TESTING"; const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024; const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024; -const int DynamicPatriciaTriePolicy::MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING = 2 * 60 * 60; void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const { @@ -150,7 +155,7 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability, const int bigramProbability) const { if (mHeaderPolicy.isDecayingDict()) { - return DecayingUtils::getProbability(unigramProbability, bigramProbability); + return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability); } else { if (unigramProbability == NOT_A_PROBABILITY) { return NOT_A_PROBABILITY; @@ -301,7 +306,7 @@ void DynamicPatriciaTriePolicy::flush(const char *const filePath) { return; } DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); + &mBigramListPolicy, &mShortcutListPolicy, false /* needsToDecay */); writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount); } @@ -310,9 +315,15 @@ void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); return; } + const bool needsToDecay = mHeaderPolicy.isDecayingDict() + && (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( + false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy)); + DynamicBigramListPolicy bigramListPolicyForGC(&mHeaderPolicy, &mBufferWithExtendableBuffer, + &mShortcutListPolicy, needsToDecay); DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); + &bigramListPolicyForGC, &mShortcutListPolicy, needsToDecay); writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy); + mNeedsToDecayForTesting = false; } bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { @@ -334,27 +345,28 @@ bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { // Needs to reduce dictionary size. return true; } else if (mHeaderPolicy.isDecayingDict()) { - if (mUnigramCount >= DecayingUtils::MAX_UNIGRAM_COUNT) { - // Unigram count exceeds the limit. - return true; - } else if (mBigramCount >= DecayingUtils::MAX_BIGRAM_COUNT) { - // Bigram count exceeds the limit. - return true; - } else if (mindsBlockByGC && mHeaderPolicy.getLastUpdatedTime() - + MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING < time(0)) { - // Time to update probabilities for decaying. - return true; - } + return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( + mindsBlockByGC, mUnigramCount, mBigramCount, &mHeaderPolicy); } return false; } void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult, - const int maxResultLength) const { + const int maxResultLength) { if (strncmp(query, UNIGRAM_COUNT_QUERY, maxResultLength) == 0) { snprintf(outResult, maxResultLength, "%d", mUnigramCount); } else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) { snprintf(outResult, maxResultLength, "%d", mBigramCount); + } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, maxResultLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT : + static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE)); + } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, maxResultLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT : + static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE)); + } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, maxResultLength) == 0) { + mNeedsToDecayForTesting = true; } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h index d3150c6fc..be97ee1a5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h @@ -37,10 +37,10 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(), mBuffer->getBufferSize() - mHeaderPolicy.getSize()), mShortcutListPolicy(&mBufferWithExtendableBuffer), - mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy, + mBigramListPolicy(&mHeaderPolicy, &mBufferWithExtendableBuffer, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()), mUnigramCount(mHeaderPolicy.getUnigramCount()), - mBigramCount(mHeaderPolicy.getBigramCount()) {} + mBigramCount(mHeaderPolicy.getBigramCount()), mNeedsToDecayForTesting(false) {} ~DynamicPatriciaTriePolicy() { delete mBuffer; @@ -95,16 +95,18 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { bool needsToRunGC(const bool mindsBlockByGC) const; void getProperty(const char *const query, char *const outResult, - const int maxResultLength) const; + const int maxResultLength); private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy); - static const char*const UNIGRAM_COUNT_QUERY; - static const char*const BIGRAM_COUNT_QUERY; + static const char *const UNIGRAM_COUNT_QUERY; + static const char *const BIGRAM_COUNT_QUERY; + static const char *const MAX_UNIGRAM_COUNT_QUERY; + static const char *const MAX_BIGRAM_COUNT_QUERY; + static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY; static const int MAX_DICT_EXTENDED_REGION_SIZE; static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; - static const int MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING; const MmappedBuffer *const mBuffer; const HeaderPolicy mHeaderPolicy; @@ -113,6 +115,7 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { DynamicBigramListPolicy mBigramListPolicy; int mUnigramCount; int mBigramCount; + int mNeedsToDecayForTesting; }; } // namespace latinime #endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp index 601ee663b..f108c219f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp @@ -93,6 +93,12 @@ bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreor if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { return false; } + if (isEnd()) { + // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array. + if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + } pushReadingStateToStack(); while (!isEnd()) { if (alreadyVisitedAllPtNodesInArray) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h index 512a4d818..a71c06971 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h @@ -279,7 +279,9 @@ class DynamicPatriciaTrieReadingHelper { } else { mReadingState = mReadingStateStack.back(); mReadingStateStack.pop_back(); - fetchPtNodeInfo(); + if (!isEnd()) { + fetchPtNodeInfo(); + } } } }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp index 28124d251..052558bfc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp @@ -25,8 +25,8 @@ #include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/utils/decaying_utils.h" #include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" #include "utils/hash_map_compat.h" namespace latinime { @@ -153,7 +153,7 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileNam const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + mBuffer->getUsedAdditionalBufferSize(); if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */, - unigramCount, bigramCount, extendedRegionSize)) { + false /* updatesLastDecayedTime */, unigramCount, bigramCount, extendedRegionSize)) { return; } DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer); @@ -165,12 +165,15 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNod MAX_DICTIONARY_SIZE); int unigramCount = 0; int bigramCount = 0; - if (!runGC(rootPtNodeArrayPos, &newDictBuffer, &unigramCount, &bigramCount)) { + if (mNeedsToDecay) { + ForgettingCurveUtils::sTimeKeeper.setCurrentTime(); + } + if (!runGC(rootPtNodeArrayPos, headerPolicy, &newDictBuffer, &unigramCount, &bigramCount)) { return; } BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */, - unigramCount, bigramCount, 0 /* extendedRegionSize */)) { + mNeedsToDecay, unigramCount, bigramCount, 0 /* extendedRegionSize */)) { return; } DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer); @@ -237,7 +240,8 @@ bool DynamicPatriciaTrieWritingHelper::markNodeAsMovedAndSetPosition( int parentOffsetFieldPos = nodeReader->getHeadPos() + DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE; if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( - mBuffer, movedPos, nodeReader->getHeadPos(), &parentOffsetFieldPos)) { + mBuffer, bigramLinkedNodePos, nodeReader->getHeadPos(), + &parentOffsetFieldPos)) { // Parent offset cannot be written because of a bug or a broken dictionary; thus, // we give up to update dictionary. return false; @@ -481,20 +485,20 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes( } bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, - BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount, - int *const outBigramCount) { + const HeaderPolicy *const headerPolicy, BufferWithExtendableBuffer *const bufferToWrite, + int *const outUnigramCount, int *const outBigramCount) { DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); DynamicPatriciaTrieGcEventListeners ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( - this, mBuffer, mIsDecayingDict); + headerPolicy, this, mBuffer, mNeedsToDecay); if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { return false; } - if (mIsDecayingDict && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - .getValidUnigramCount() > DecayingUtils::MAX_UNIGRAM_COUNT_AFTER_GC) { + if (mNeedsToDecay && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + .getValidUnigramCount() > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) { // TODO: Remove more unigrams. } @@ -505,9 +509,8 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, &traversePolicyToUpdateBigramProbability)) { return false; } - - if (mIsDecayingDict && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount() - > DecayingUtils::MAX_BIGRAM_COUNT_AFTER_GC) { + if (mNeedsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount() + > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) { // TODO: Remove more bigrams. } @@ -524,8 +527,8 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, // Create policy instance for the GCed dictionary. DynamicShortcutListPolicy newDictShortcutPolicy(bufferToWrite); - DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy, - mIsDecayingDict); + DynamicBigramListPolicy newDictBigramPolicy(headerPolicy, bufferToWrite, &newDictShortcutPolicy, + mNeedsToDecay); // Create reading helper for the GCed dictionary. DynamicPatriciaTrieReadingHelper newDictReadingHelper(bufferToWrite, &newDictBigramPolicy, &newDictShortcutPolicy); @@ -544,8 +547,9 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, int DynamicPatriciaTrieWritingHelper::getUpdatedProbability(const int originalProbability, const int newProbability) { - if (mIsDecayingDict) { - return DecayingUtils::getUpdatedUnigramProbability(originalProbability, newProbability); + if (mNeedsToDecay) { + return ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, + newProbability); } else { return newProbability; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h index ecee2cdbf..ca8664729 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h @@ -51,9 +51,9 @@ class DynamicPatriciaTrieWritingHelper { DynamicPatriciaTrieWritingHelper(BufferWithExtendableBuffer *const buffer, DynamicBigramListPolicy *const bigramPolicy, - DynamicShortcutListPolicy *const shortcutPolicy, const bool isDecayingDict) + DynamicShortcutListPolicy *const shortcutPolicy, const bool needsToDecay) : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy), - mIsDecayingDict(isDecayingDict) {} + mNeedsToDecay(needsToDecay) {} ~DynamicPatriciaTrieWritingHelper() {} @@ -94,7 +94,7 @@ class DynamicPatriciaTrieWritingHelper { BufferWithExtendableBuffer *const mBuffer; DynamicBigramListPolicy *const mBigramPolicy; DynamicShortcutListPolicy *const mShortcutPolicy; - const bool mIsDecayingDict; + const bool mNeedsToDecay; bool markNodeAsMovedAndSetPosition(const DynamicPatriciaTrieNodeReader *const nodeToUpdate, const int movedPos, const int bigramLinkedNodePos); @@ -128,8 +128,9 @@ class DynamicPatriciaTrieWritingHelper { const int probabilityOfNewPtNode, const int *const newNodeCodePoints, const int newNodeCodePointCount); - bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite, - int *const outUnigramCount, int *const outBigramCount); + bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, + BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount, + int *const outBigramCount); int getUpdatedProbability(const int originalProbability, const int newProbability); }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp index 9ce9994dd..eb072fbaf 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp @@ -23,6 +23,7 @@ const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WOR // TODO: Change attribute string to "IS_DECAYING_DICT". const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date"; +const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT"; const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT"; const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; @@ -63,8 +64,8 @@ float HeaderPolicy::readMultipleWordCostMultiplier() const { } bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite, - const bool updatesLastUpdatedTime, const int unigramCount, const int bigramCount, - const int extendedRegionSize) const { + const bool updatesLastUpdatedTime, const bool updatesLastDecayedTime, + const int unigramCount, const int bigramCount, const int extendedRegionSize) const { int writingPos = 0; if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion, &writingPos)) { @@ -90,6 +91,11 @@ bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferT HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, LAST_UPDATED_TIME_KEY, time(0)); } + if (updatesLastDecayedTime) { + // Set current time as a last updated time. + HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, LAST_DECAYED_TIME_KEY, + time(0)); + } if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite, &writingPos)) { return false; diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 4261667fa..a9c7805a8 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -40,6 +40,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { IS_DECAYING_DICT_KEY, false /* defaultValue */)), mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)), + mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + LAST_DECAYED_TIME_KEY, time(0) /* defaultValue */)), mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, UNIGRAM_COUNT_KEY, 0 /* defaultValue */)), mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, @@ -58,6 +60,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { IS_DECAYING_DICT_KEY, false /* defaultValue */)), mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)), + mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)), mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0) {} ~HeaderPolicy() {} @@ -90,6 +94,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mLastUpdatedTime; } + AK_FORCE_INLINE int getLastDecayedTime() const { + return mLastDecayedTime; + } + AK_FORCE_INLINE int getUnigramCount() const { return mUnigramCount; } @@ -106,8 +114,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { int *outValue, int outValueSize) const; bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite, - const bool updatesLastUpdatedTime, const int unigramCount, - const int bigramCount, const int extendedRegionSize) const; + const bool updatesLastUpdatedTime, const bool updatesLastDecayedTime, + const int unigramCount, const int bigramCount, const int extendedRegionSize) const; private: DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy); @@ -115,6 +123,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; static const char *const IS_DECAYING_DICT_KEY; static const char *const LAST_UPDATED_TIME_KEY; + static const char *const LAST_DECAYED_TIME_KEY; static const char *const UNIGRAM_COUNT_KEY; static const char *const BIGRAM_COUNT_KEY; static const char *const EXTENDED_REGION_SIZE_KEY; @@ -128,6 +137,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { const float mMultiWordCostMultiplier; const bool mIsDecayingDict; const int mLastUpdatedTime; + const int mLastDecayedTime; const int mUnigramCount; const int mBigramCount; const int mExtendedRegionSize; diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h index 8d88c68e8..0f8662aea 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h @@ -114,7 +114,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { } void getProperty(const char *const query, char *const outResult, - const int maxResultLength) const { + const int maxResultLength) { // getProperty is not supported for this class. if (maxResultLength > 0) { outResult[0] = '\0'; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp deleted file mode 100644 index 942a74238..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/utils/decaying_utils.h" - -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" - -namespace latinime { - -const int DecayingUtils::MAX_UNIGRAM_COUNT = 12000; -const int DecayingUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000; -const int DecayingUtils::MAX_BIGRAM_COUNT = 12000; -const int DecayingUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000; - -const int DecayingUtils::MAX_COMPUTED_PROBABILITY = 127; -const int DecayingUtils::MAX_UNIGRAM_PROBABILITY = 120; -const int DecayingUtils::MIN_VALID_UNIGRAM_PROBABILITY = 24; -const int DecayingUtils::UNIGRAM_PROBABILITY_STEP = 8; -const int DecayingUtils::MAX_BIGRAM_PROBABILITY_DELTA = 15; -const int DecayingUtils::MIN_VALID_BIGRAM_PROBABILITY_DELTA = 3; -const int DecayingUtils::BIGRAM_PROBABILITY_DELTA_STEP = 1; - -/* static */ int DecayingUtils::getProbability(const int encodedUnigramProbability, - const int encodedBigramProbabilityDelta) { - if (encodedUnigramProbability == NOT_A_PROBABILITY) { - return NOT_A_PROBABILITY; - } else if (encodedBigramProbabilityDelta == NOT_A_PROBABILITY) { - const int rawProbability = ProbabilityUtils::backoff(decodeUnigramProbability( - encodedUnigramProbability)); - return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY); - } else { - const int rawProbability = ProbabilityUtils::computeProbabilityForBigram( - decodeUnigramProbability(encodedUnigramProbability), - decodeBigramProbabilityDelta(encodedBigramProbabilityDelta)); - return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY); - } -} - -/* static */ int DecayingUtils::getUpdatedUnigramProbability(const int originalEncodedProbability, - const int newProbability) { - if (originalEncodedProbability == NOT_A_PROBABILITY) { - // The unigram is not in this dictionary. - if (newProbability == NOT_A_PROBABILITY) { - // The unigram is not in other dictionaries. - return 0; - } else { - return MIN_VALID_UNIGRAM_PROBABILITY; - } - } else { - if (newProbability != NOT_A_PROBABILITY - && originalEncodedProbability < MIN_VALID_UNIGRAM_PROBABILITY) { - return MIN_VALID_UNIGRAM_PROBABILITY; - } - return min(originalEncodedProbability + UNIGRAM_PROBABILITY_STEP, MAX_UNIGRAM_PROBABILITY); - } -} - -/* static */ int DecayingUtils::getUnigramProbabilityToSave(const int encodedProbability) { - return max(encodedProbability - UNIGRAM_PROBABILITY_STEP, 0); -} - -/* static */ int DecayingUtils::getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta) { - return max(encodedProbabilityDelta - BIGRAM_PROBABILITY_DELTA_STEP, 0); -} - -/* static */ int DecayingUtils::getUpdatedBigramProbabilityDelta( - const int originalEncodedProbabilityDelta, const int newProbability) { - if (originalEncodedProbabilityDelta == NOT_A_PROBABILITY) { - // The bigram relation is not in this dictionary. - if (newProbability == NOT_A_PROBABILITY) { - // The bigram target is not in other dictionaries. - return 0; - } else { - return MIN_VALID_BIGRAM_PROBABILITY_DELTA; - } - } else { - if (newProbability != NOT_A_PROBABILITY - && originalEncodedProbabilityDelta < MIN_VALID_BIGRAM_PROBABILITY_DELTA) { - return MIN_VALID_BIGRAM_PROBABILITY_DELTA; - } - return min(originalEncodedProbabilityDelta + BIGRAM_PROBABILITY_DELTA_STEP, - MAX_BIGRAM_PROBABILITY_DELTA); - } -} - -/* static */ int DecayingUtils::isValidUnigram(const int encodedUnigramProbability) { - return encodedUnigramProbability >= MIN_VALID_UNIGRAM_PROBABILITY; -} - -/* static */ int DecayingUtils::isValidBigram(const int encodedBigramProbabilityDelta) { - return encodedBigramProbabilityDelta >= MIN_VALID_BIGRAM_PROBABILITY_DELTA; -} - -/* static */ int DecayingUtils::decodeUnigramProbability(const int encodedProbability) { - const int probability = encodedProbability - MIN_VALID_UNIGRAM_PROBABILITY; - if (probability < 0) { - return NOT_A_PROBABILITY; - } else { - return min(probability, MAX_UNIGRAM_PROBABILITY); - } -} - -/* static */ int DecayingUtils::decodeBigramProbabilityDelta(const int encodedProbabilityDelta) { - const int probabilityDelta = encodedProbabilityDelta - MIN_VALID_BIGRAM_PROBABILITY_DELTA; - if (probabilityDelta < 0) { - return NOT_A_PROBABILITY; - } else { - return min(probabilityDelta, MAX_BIGRAM_PROBABILITY_DELTA); - } -} - -/* static */ int DecayingUtils::getDecayedProbability(const int rawProbability) { - return rawProbability; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h deleted file mode 100644 index 1ca03918f..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DECAYING_UTILS_H -#define LATINIME_DECAYING_UTILS_H - -#include "defines.h" - -namespace latinime { - -// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is -// required to introduced to each terminal PtNode and bigram entry. -// TODO: Quit using bigram probability to indicate the delta. -// TODO: Quit using bigram probability delta. -class DecayingUtils { - public: - static const int MAX_UNIGRAM_COUNT; - static const int MAX_UNIGRAM_COUNT_AFTER_GC; - static const int MAX_BIGRAM_COUNT; - static const int MAX_BIGRAM_COUNT_AFTER_GC; - - static int getProbability(const int encodedUnigramProbability, - const int encodedBigramProbabilityDelta); - - static int getUpdatedUnigramProbability(const int originalEncodedProbability, - const int newProbability); - - static int getUpdatedBigramProbabilityDelta(const int originalEncodedProbabilityDelta, - const int newProbability); - - static int isValidUnigram(const int encodedUnigramProbability); - - static int isValidBigram(const int encodedProbabilityDelta); - - static int getUnigramProbabilityToSave(const int encodedProbability); - - static int getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DecayingUtils); - - static const int MAX_COMPUTED_PROBABILITY; - static const int MAX_UNIGRAM_PROBABILITY; - static const int MIN_VALID_UNIGRAM_PROBABILITY; - static const int UNIGRAM_PROBABILITY_STEP; - static const int MAX_BIGRAM_PROBABILITY_DELTA; - static const int MIN_VALID_BIGRAM_PROBABILITY_DELTA; - static const int BIGRAM_PROBABILITY_DELTA_STEP; - - static int decodeUnigramProbability(const int encodedProbability); - - static int decodeBigramProbabilityDelta(const int encodedProbability); - - static int getDecayedProbability(const int rawProbability); -}; -} // namespace latinime -#endif /* LATINIME_DECAYING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp index f22e94c6a..994826fa8 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp @@ -44,7 +44,8 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap); headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */, - 0 /* unigramCount */, 0 /* bigramCount */, 0 /* extendedRegionSize */); + true /* updatesLastDecayedTime */, 0 /* unigramCount */, 0 /* bigramCount */, + 0 /* extendedRegionSize */); BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) { return false; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp new file mode 100644 index 000000000..1632fd072 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp @@ -0,0 +1,155 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cmath> +#include <ctime> +#include <stdlib.h> + +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + +#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" + +namespace latinime { + +const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT = 12000; +const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000; +const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000; +const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000; + +const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127; +const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15; +const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3; +const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1; +// Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected +// duration of the decay is approximately 66hours. +const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f; +const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; + +const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; +ForgettingCurveUtils::TimeKeeper ForgettingCurveUtils::sTimeKeeper; + +void ForgettingCurveUtils::TimeKeeper::setCurrentTime() { + mCurrentTime = time(0); +} + +/* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability, + const int encodedBigramProbability) { + if (encodedUnigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (encodedBigramProbability == NOT_A_PROBABILITY) { + return backoff(decodeProbability(encodedUnigramProbability)); + } else { + const int unigramProbability = decodeProbability(encodedUnigramProbability); + const int bigramProbability = decodeProbability(encodedBigramProbability); + return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY); + } +} + +// Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding +// (i.e. unigram probability + bigram probability delta). +/* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability( + const int originalEncodedProbability, const int newProbability) { + if (originalEncodedProbability == NOT_A_PROBABILITY) { + // The bigram relation is not in this dictionary. + if (newProbability == NOT_A_PROBABILITY) { + // The bigram target is not in other dictionaries. + return 0; + } else { + return MIN_VALID_ENCODED_PROBABILITY; + } + } else { + if (newProbability != NOT_A_PROBABILITY + && originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) { + return MIN_VALID_ENCODED_PROBABILITY; + } + return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY); + } +} + +/* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) { + return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY; +} + +/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability, + const DictionaryHeaderStructurePolicy *const headerPolicy) { + const int elapsedTime = sTimeKeeper.peekCurrentTime() - headerPolicy->getLastDecayedTime(); + const int decayIterationCount = max(elapsedTime / DECAY_INTERVAL_SECONDS, 1); + int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0); + // TODO: Implement the decay in more proper way. + for (int i = 0; i < decayIterationCount; ++i) { + const float currentRate = static_cast<float>(currentEncodedProbability) + / static_cast<float>(MAX_ENCODED_PROBABILITY); + const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate; + const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX); + if (thresholdToDecay < randValue) { + currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, + 0); + } + } + return currentEncodedProbability; +} + +/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, + const int unigramCount, const int bigramCount, + const DictionaryHeaderStructurePolicy *const headerPolicy) { + if (unigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) { + // Unigram count exceeds the limit. + return true; + } else if (bigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) { + // Bigram count exceeds the limit. + return true; + } + if (mindsBlockByDecay) { + return false; + } + if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS < time(0)) { + // Time to decay. + return true; + } + return false; +} + +/* static */ int ForgettingCurveUtils::decodeProbability(const int encodedProbability) { + if (encodedProbability < MIN_VALID_ENCODED_PROBABILITY) { + return NOT_A_PROBABILITY; + } else { + return min(sProbabilityTable.getProbability(encodedProbability), MAX_ENCODED_PROBABILITY); + } +} + +// See comments in ProbabilityUtils::backoff(). +/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) { + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else { + return max(unigramProbability - 8, 0); + } +} + +ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTable() { + // Table entry is as follows: + // 1, 1, 1, 2, 3, 5, 6, 9, 13, 18, 25, 34, 48, 66, 91, 127. + // Note that first MIN_VALID_ENCODED_PROBABILITY values are not used. + mTable.resize(MAX_ENCODED_PROBABILITY + 1); + for (int i = 0; i <= MAX_ENCODED_PROBABILITY; ++i) { + const int probability = static_cast<int>(powf(static_cast<float>(MAX_COMPUTED_PROBABILITY), + static_cast<float>(i) / static_cast<float>(MAX_ENCODED_PROBABILITY))); + mTable[i] = min(MAX_COMPUTED_PROBABILITY, max(0, probability)); + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h new file mode 100644 index 000000000..2ad423874 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_FORGETTING_CURVE_UTILS_H +#define LATINIME_FORGETTING_CURVE_UTILS_H + +#include <vector> + +#include "defines.h" + +namespace latinime { + +class DictionaryHeaderStructurePolicy; + +// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is +// required to introduced to each terminal PtNode and bigram entry. +// TODO: Quit using bigram probability to indicate the delta. +class ForgettingCurveUtils { + public: + class TimeKeeper { + public: + TimeKeeper() : mCurrentTime(0) {} + void setCurrentTime(); + int peekCurrentTime() const { return mCurrentTime; }; + + private: + DISALLOW_COPY_AND_ASSIGN(TimeKeeper); + + int mCurrentTime; + }; + + static const int MAX_UNIGRAM_COUNT; + static const int MAX_UNIGRAM_COUNT_AFTER_GC; + static const int MAX_BIGRAM_COUNT; + static const int MAX_BIGRAM_COUNT_AFTER_GC; + + static TimeKeeper sTimeKeeper; + + static int getProbability(const int encodedUnigramProbability, + const int encodedBigramProbability); + + static int getUpdatedEncodedProbability(const int originalEncodedProbability, + const int newProbability); + + static int isValidEncodedProbability(const int encodedProbability); + + static int getEncodedProbabilityToSave(const int encodedProbability, + const DictionaryHeaderStructurePolicy *const headerPolicy); + + static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount, + const int bigramCount, const DictionaryHeaderStructurePolicy *const headerPolicy); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils); + + class ProbabilityTable { + public: + ProbabilityTable(); + + int getProbability(const int encodedProbability) const { + if (encodedProbability < 0 || encodedProbability > static_cast<int>(mTable.size())) { + return NOT_A_PROBABILITY; + } + return mTable[encodedProbability]; + } + + private: + DISALLOW_COPY_AND_ASSIGN(ProbabilityTable); + + std::vector<int> mTable; + }; + + static const int MAX_COMPUTED_PROBABILITY; + static const int MAX_ENCODED_PROBABILITY; + static const int MIN_VALID_ENCODED_PROBABILITY; + static const int ENCODED_PROBABILITY_STEP; + static const float MIN_PROBABILITY_TO_DECAY; + static const int DECAY_INTERVAL_SECONDS; + + static const ProbabilityTable sProbabilityTable; + + static int decodeProbability(const int encodedProbability); + + static int backoff(const int unigramProbability); +}; +} // namespace latinime +#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp index ecceb60d3..104eb2a7a 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp @@ -27,30 +27,30 @@ const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE = 170; const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT = 310; const int ScoringParams::THRESHOLD_SHORT_WORD_LENGTH = 4; -const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.132f; -const float ScoringParams::PROXIMITY_COST = 0.095f; -const float ScoringParams::FIRST_CHAR_PROXIMITY_COST = 0.102f; -const float ScoringParams::FIRST_PROXIMITY_COST = 0.019f; -const float ScoringParams::OMISSION_COST = 0.458f; -const float ScoringParams::OMISSION_COST_SAME_CHAR = 0.491f; -const float ScoringParams::OMISSION_COST_FIRST_CHAR = 0.582f; -const float ScoringParams::INSERTION_COST = 0.730f; -const float ScoringParams::TERMINAL_INSERTION_COST = 0.93f; -const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.586f; -const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.70f; -const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.623f; -const float ScoringParams::TRANSPOSITION_COST = 0.526f; -const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.319f; -const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.380f; -const float ScoringParams::SUBSTITUTION_COST = 0.383f; -const float ScoringParams::COST_NEW_WORD = 0.042f; -const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.25f; -const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.123f; -const float ScoringParams::COST_FIRST_LOOKAHEAD = 0.545f; -const float ScoringParams::COST_LOOKAHEAD = 0.073f; -const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.093f; -const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.041f; -const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.447f; +const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.1524f; +const float ScoringParams::PROXIMITY_COST = 0.0694f; +const float ScoringParams::FIRST_CHAR_PROXIMITY_COST = 0.072f; +const float ScoringParams::FIRST_PROXIMITY_COST = 0.07788f; +const float ScoringParams::OMISSION_COST = 0.4676f; +const float ScoringParams::OMISSION_COST_SAME_CHAR = 0.399f; +const float ScoringParams::OMISSION_COST_FIRST_CHAR = 0.5256f; +const float ScoringParams::INSERTION_COST = 0.7248f; +const float ScoringParams::TERMINAL_INSERTION_COST = 0.8128f; +const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.5508f; +const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.674f; +const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.639f; +const float ScoringParams::TRANSPOSITION_COST = 0.5608f; +const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.339f; +const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.4576f; +const float ScoringParams::SUBSTITUTION_COST = 0.3806f; +const float ScoringParams::COST_NEW_WORD = 0.0312f; +const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.3224f; +const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.1214f; +const float ScoringParams::COST_FIRST_LOOKAHEAD = 0.4836f; +const float ScoringParams::COST_LOOKAHEAD = 0.00624f; +const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.06836f; +const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.0362f; +const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.4182f; const float ScoringParams::TYPING_BASE_OUTPUT_SCORE = 1.0f; const float ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT = 0.1f; const float ScoringParams::NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT = 0.045f; diff --git a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h index 89e53f441..007c19e0a 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h @@ -101,7 +101,7 @@ class TypingTraversal : public Traversal { } const int16_t pointIndex = dicNode->getInputIndex(0); return pointIndex <= inputSize && !dicNode->isTotalInputSizeExceedingLimit() - && !dicNode->shouldBeFilterdBySafetyNetForBigram(); + && !dicNode->shouldBeFilteredBySafetyNetForBigram(); } AK_FORCE_INLINE bool shouldDepthLevelCache( diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h index 2e735a81c..41663c81a 100644 --- a/native/jni/src/utils/char_utils.h +++ b/native/jni/src/utils/char_utils.h @@ -75,6 +75,16 @@ class CharUtils { return c; } + static AK_FORCE_INLINE int getSpaceCount(const int *const codePointBuffer, const int length) { + int spaceCount = 0; + for (int i = 0; i < length; ++i) { + if (codePointBuffer[i] == KEYCODE_SPACE) { + ++spaceCount; + } + } + return spaceCount; + } + static unsigned short latin_tolower(const unsigned short c); private: |