aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni/src
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src')
-rw-r--r--native/jni/src/defines.h12
-rw-r--r--native/jni/src/suggest/core/dicnode/dic_node.h12
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary.cpp2
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary.h2
-rw-r--r--native/jni/src/suggest/core/dictionary/shortcut_utils.h2
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info_params.cpp12
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info_params.h10
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp10
-rw-r--r--native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h2
-rw-r--r--native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h4
-rw-r--r--native/jni/src/suggest/core/suggest.cpp60
-rw-r--r--native/jni/src/suggest/core/suggest.h2
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp18
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h10
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp10
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h15
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp46
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h15
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp6
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp38
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h11
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp10
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h14
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h2
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp129
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h70
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp3
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp155
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h100
-rw-r--r--native/jni/src/suggest/policyimpl/typing/scoring_params.cpp48
-rw-r--r--native/jni/src/suggest/policyimpl/typing/typing_traversal.h2
-rw-r--r--native/jni/src/utils/char_utils.h10
33 files changed, 522 insertions, 324 deletions
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h
index c920f64b4..742e388e4 100644
--- a/native/jni/src/defines.h
+++ b/native/jni/src/defines.h
@@ -298,9 +298,19 @@ static inline void prof_out(void) {
#define NOT_AN_INDEX (-1)
#define NOT_A_PROBABILITY (-1)
#define NOT_A_DICT_POS (S_INT_MIN)
+
// A special value to mean the first word confidence makes no sense in this case,
// e.g. this is not a multi-word suggestion.
-#define NOT_A_FIRST_WORD_CONFIDENCE (S_INT_MIN)
+#define NOT_A_FIRST_WORD_CONFIDENCE (S_INT_MAX)
+// How high the confidence needs to be for us to auto-commit. Arbitrary.
+// This needs to be the same as CONFIDENCE_FOR_AUTO_COMMIT in BinaryDictionary.java
+#define CONFIDENCE_FOR_AUTO_COMMIT (1000000)
+// 80% of the full confidence
+#define DISTANCE_WEIGHT_FOR_AUTO_COMMIT (80 * CONFIDENCE_FOR_AUTO_COMMIT / 100)
+// 100% of the full confidence
+#define LENGTH_WEIGHT_FOR_AUTO_COMMIT (CONFIDENCE_FOR_AUTO_COMMIT)
+// 80% of the full confidence
+#define SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT (80 * CONFIDENCE_FOR_AUTO_COMMIT / 100)
#define KEYCODE_SPACE ' '
#define KEYCODE_SINGLE_QUOTE '\''
diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h
index 9099e8285..49cfdecac 100644
--- a/native/jni/src/suggest/core/dicnode/dic_node.h
+++ b/native/jni/src/suggest/core/dicnode/dic_node.h
@@ -271,7 +271,7 @@ class DicNode {
return isTerminalNodes && currentNodeDepth > 0 && currentNodeDepth == terminalNodeDepth;
}
- bool shouldBeFilterdBySafetyNetForBigram() const {
+ bool shouldBeFilteredBySafetyNetForBigram() const {
const uint16_t currentDepth = getNodeCodePointCount();
const int prevWordLen = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength()
- mDicNodeState.mDicNodeStatePrevWord.getPrevWordStart() - 1;
@@ -321,6 +321,16 @@ class DicNode {
DUMP_WORD_AND_SCORE("OUTPUT");
}
+ // "Total" in this context (and other methods in this class) means the whole suggestion. When
+ // this represents a multi-word suggestion, the referenced PtNode (in mDicNodeState) is only
+ // the one that corresponds to the last word of the suggestion, and all the previous words
+ // are concatenated together in mPrevWord - which contains a space at the end.
+ int getTotalNodeSpaceCount() const {
+ if (isFirstWord()) return 0;
+ return CharUtils::getSpaceCount(mDicNodeState.mDicNodeStatePrevWord.mPrevWord,
+ mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength());
+ }
+
int getSecondWordFirstInputIndex(const ProximityInfoState *const pInfoState) const {
const int inputIndex = mDicNodeState.mDicNodeStatePrevWord.getSecondWordFirstInputIndex();
if (inputIndex == NOT_AN_INDEX) {
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp
index 5969b31cc..59ead1894 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.cpp
+++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp
@@ -129,7 +129,7 @@ bool Dictionary::needsToRunGC(const bool mindsBlockByGC) {
}
void Dictionary::getProperty(const char *const query, char *const outResult,
- const int maxResultLength) const {
+ const int maxResultLength) {
return mDictionaryStructureWithBufferPolicy->getProperty(query, outResult, maxResultLength);
}
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h
index 43d3b964d..0195d5bf0 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.h
+++ b/native/jni/src/suggest/core/dictionary/dictionary.h
@@ -84,7 +84,7 @@ class Dictionary {
bool needsToRunGC(const bool mindsBlockByGC);
void getProperty(const char *const query, char *const outResult,
- const int maxResultLength) const;
+ const int maxResultLength);
const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const {
return mDictionaryStructureWithBufferPolicy;
diff --git a/native/jni/src/suggest/core/dictionary/shortcut_utils.h b/native/jni/src/suggest/core/dictionary/shortcut_utils.h
index 461d7b454..9ccef020f 100644
--- a/native/jni/src/suggest/core/dictionary/shortcut_utils.h
+++ b/native/jni/src/suggest/core/dictionary/shortcut_utils.h
@@ -44,7 +44,7 @@ class ShortcutUtils {
shortcutScore = finalScore;
// Protection against int underflow
shortcutScore = max(S_INT_MIN + 1, shortcutScore) - 1;
- kind = Dictionary::KIND_CORRECTION;
+ kind = Dictionary::KIND_SHORTCUT;
}
outputTypes[outputWordIndex] = kind;
frequencies[outputWordIndex] = shortcutScore;
diff --git a/native/jni/src/suggest/core/layout/proximity_info_params.cpp b/native/jni/src/suggest/core/layout/proximity_info_params.cpp
index 0e887f700..49df10301 100644
--- a/native/jni/src/suggest/core/layout/proximity_info_params.cpp
+++ b/native/jni/src/suggest/core/layout/proximity_info_params.cpp
@@ -69,13 +69,13 @@ const float ProximityInfoParams::STRAIGHT_ANGLE_THRESHOLD = M_PI_F * 15.0f / 180
const float ProximityInfoParams::SKIP_CORNER_PROBABILITY = 0.4f;
const float ProximityInfoParams::SPEED_MARGIN = 0.1f;
const float ProximityInfoParams::CENTER_VALUE_OF_NORMALIZED_DISTRIBUTION = 0.0f;
-// TODO: The variance is critical for accuracy; thus, adjusting these parameter by machine
+// TODO: The variance is critical for accuracy; thus, adjusting these parameters by machine
// learning or something would be efficient.
-const float ProximityInfoParams::SPEEDxANGLE_WEIGHT_FOR_STANDARD_DIVIATION = 0.3f;
-const float ProximityInfoParams::MAX_SPEEDxANGLE_RATE_FOR_STANDERD_DIVIATION = 0.25f;
-const float ProximityInfoParams::SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DIVIATION = 0.5f;
-const float ProximityInfoParams::MAX_SPEEDxNEAREST_RATE_FOR_STANDERD_DIVIATION = 0.15f;
-const float ProximityInfoParams::MIN_STANDERD_DIVIATION = 0.37f;
+const float ProximityInfoParams::SPEEDxANGLE_WEIGHT_FOR_STANDARD_DEVIATION = 0.3f;
+const float ProximityInfoParams::MAX_SPEEDxANGLE_RATE_FOR_STANDARD_DEVIATION = 0.25f;
+const float ProximityInfoParams::SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DEVIATION = 0.5f;
+const float ProximityInfoParams::MAX_SPEEDxNEAREST_RATE_FOR_STANDARD_DEVIATION = 0.15f;
+const float ProximityInfoParams::MIN_STANDARD_DEVIATION = 0.37f;
const float ProximityInfoParams::PREV_DISTANCE_WEIGHT = 0.5f;
const float ProximityInfoParams::NEXT_DISTANCE_WEIGHT = 0.6f;
diff --git a/native/jni/src/suggest/core/layout/proximity_info_params.h b/native/jni/src/suggest/core/layout/proximity_info_params.h
index 4e47f7308..ae1f82c22 100644
--- a/native/jni/src/suggest/core/layout/proximity_info_params.h
+++ b/native/jni/src/suggest/core/layout/proximity_info_params.h
@@ -73,11 +73,11 @@ class ProximityInfoParams {
static const float SKIP_CORNER_PROBABILITY;
static const float SPEED_MARGIN;
static const float CENTER_VALUE_OF_NORMALIZED_DISTRIBUTION;
- static const float SPEEDxANGLE_WEIGHT_FOR_STANDARD_DIVIATION;
- static const float MAX_SPEEDxANGLE_RATE_FOR_STANDERD_DIVIATION;
- static const float SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DIVIATION;
- static const float MAX_SPEEDxNEAREST_RATE_FOR_STANDERD_DIVIATION;
- static const float MIN_STANDERD_DIVIATION;
+ static const float SPEEDxANGLE_WEIGHT_FOR_STANDARD_DEVIATION;
+ static const float MAX_SPEEDxANGLE_RATE_FOR_STANDARD_DEVIATION;
+ static const float SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DEVIATION;
+ static const float MAX_SPEEDxNEAREST_RATE_FOR_STANDARD_DEVIATION;
+ static const float MIN_STANDARD_DEVIATION;
static const float PREV_DISTANCE_WEIGHT;
static const float NEXT_DISTANCE_WEIGHT;
diff --git a/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp b/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp
index 904671f7f..e1b35340b 100644
--- a/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp
+++ b/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp
@@ -708,13 +708,13 @@ namespace latinime {
const float inputCharProbability = 1.0f - skipProbability;
const float speedxAngleRate = min(speedRate * currentAngle / M_PI_F
- * ProximityInfoParams::SPEEDxANGLE_WEIGHT_FOR_STANDARD_DIVIATION,
- ProximityInfoParams::MAX_SPEEDxANGLE_RATE_FOR_STANDERD_DIVIATION);
+ * ProximityInfoParams::SPEEDxANGLE_WEIGHT_FOR_STANDARD_DEVIATION,
+ ProximityInfoParams::MAX_SPEEDxANGLE_RATE_FOR_STANDARD_DEVIATION);
const float speedxNearestKeyDistanceRate = min(speedRate * nearestKeyDistance
- * ProximityInfoParams::SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DIVIATION,
- ProximityInfoParams::MAX_SPEEDxNEAREST_RATE_FOR_STANDERD_DIVIATION);
+ * ProximityInfoParams::SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DEVIATION,
+ ProximityInfoParams::MAX_SPEEDxNEAREST_RATE_FOR_STANDARD_DEVIATION);
const float sigma = speedxAngleRate + speedxNearestKeyDistanceRate
- + ProximityInfoParams::MIN_STANDERD_DIVIATION;
+ + ProximityInfoParams::MIN_STANDARD_DEVIATION;
ProximityInfoUtils::NormalDistribution
distribution(ProximityInfoParams::CENTER_VALUE_OF_NORMALIZED_DISTRIBUTION, sigma);
diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h
index a6829b476..5492c6070 100644
--- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h
+++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h
@@ -37,6 +37,8 @@ class DictionaryHeaderStructurePolicy {
virtual float getMultiWordCostMultiplier() const = 0;
+ virtual int getLastDecayedTime() const = 0;
+
virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue,
int outValueSize) const = 0;
diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
index c7ffef0d5..41f82049f 100644
--- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
+++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
@@ -80,8 +80,10 @@ class DictionaryStructureWithBufferPolicy {
virtual bool needsToRunGC(const bool mindsBlockByGC) const = 0;
+ // Currently, this method is used only for testing. You may want to consider creating new
+ // dedicated method instead of this if you want to use this in the production.
virtual void getProperty(const char *const query, char *const outResult,
- const int maxResultLength) const = 0;
+ const int maxResultLength) = 0;
protected:
DictionaryStructureWithBufferPolicy() {}
diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp
index 51cfba17a..73ccebc88 100644
--- a/native/jni/src/suggest/core/suggest.cpp
+++ b/native/jni/src/suggest/core/suggest.cpp
@@ -166,7 +166,11 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen
// TODO: have partial commit work even with multiple pointers.
const bool outputSecondWordFirstLetterInputIndex =
traverseSession->isOnlyOnePointerUsed(0 /* pointerId */);
- outputAutoCommitFirstWordConfidence[0] = computeFirstWordConfidence();
+ if (terminalSize > 0) {
+ // If we have no suggestions, don't write this
+ outputAutoCommitFirstWordConfidence[0] =
+ computeFirstWordConfidence(&terminals[0]);
+ }
// Output suggestion results here
for (int terminalIndex = 0; terminalIndex < terminalSize && outputWordIndex < MAX_RESULTS;
@@ -255,9 +259,55 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen
return outputWordIndex;
}
-int Suggest::computeFirstWordConfidence() const {
- // TODO: implement this.
- return NOT_A_FIRST_WORD_CONFIDENCE;
+int Suggest::computeFirstWordConfidence(const DicNode *const terminalDicNode) const {
+ // Get the number of spaces in the first suggestion
+ const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();
+ // Get the number of characters in the first suggestion
+ const int length = terminalDicNode->getTotalNodeCodePointCount();
+ // Get the distance for the first word of the suggestion
+ const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();
+
+ // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.
+ // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or
+ // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means
+ // we are very confident.
+ // Expected space count is 1 ~ 5
+ static const int MIN_EXPECTED_SPACE_COUNT = 1;
+ static const int MAX_EXPECTED_SPACE_COUNT = 5;
+ // Expected length is about 4 ~ 30
+ static const int MIN_EXPECTED_LENGTH = 4;
+ static const int MAX_EXPECTED_LENGTH = 30;
+ // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0
+ static const float MIN_EXPECTED_DISTANCE = 0.0;
+ static const float MAX_EXPECTED_DISTANCE = 2.0;
+ // This is not strict: it's where most stuff will be falling, but it's still fine if it's
+ // outside these values. We want to output a value that reflects all of these. Each factor
+ // contributes a bit.
+
+ // We need at least a space.
+ if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE;
+
+ // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0
+ // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the
+ // weight of the distance. Clamp to avoid overflows.
+ const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE
+ : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;
+ const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT
+ * (MAX_EXPECTED_DISTANCE - clampedDistance)
+ / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);
+ // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no
+ // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the
+ // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp.
+ const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT
+ * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);
+ // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no
+ // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the
+ // weight of the space count.
+ const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT
+ * (spaceCount - MIN_EXPECTED_SPACE_COUNT)
+ / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);
+
+ return distanceContribution + lengthContribution + spaceContribution;
}
/**
@@ -395,7 +445,7 @@ void Suggest::processTerminalDicNode(
if (!dicNode->isTerminalWordNode()) {
return;
}
- if (dicNode->shouldBeFilterdBySafetyNetForBigram()) {
+ if (dicNode->shouldBeFilteredBySafetyNetForBigram()) {
return;
}
// Create a non-cached node here.
diff --git a/native/jni/src/suggest/core/suggest.h b/native/jni/src/suggest/core/suggest.h
index 0e8bd1195..b20343d29 100644
--- a/native/jni/src/suggest/core/suggest.h
+++ b/native/jni/src/suggest/core/suggest.h
@@ -58,7 +58,7 @@ class Suggest : public SuggestInterface {
int outputSuggestions(DicTraverseSession *traverseSession, int *frequencies,
int *outputCodePoints, int *outputIndicesToPartialCommit, int *outputTypes,
int *outputAutoCommitFirstWordConfidence) const;
- int computeFirstWordConfidence() const;
+ int computeFirstWordConfidence(const DicNode *const terminalDicNode) const;
void initializeSearch(DicTraverseSession *traverseSession, int commitPoint) const;
void expandCurrentDicNodes(DicTraverseSession *traverseSession) const;
void processTerminalDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
index 67a085de3..b1170e251 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
@@ -20,7 +20,7 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
+#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
namespace latinime {
@@ -43,7 +43,7 @@ void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const
}
*outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags);
*outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags);
- if (mIsDecayingDict && !DecayingUtils::isValidBigram(*outProbability)) {
+ if (mIsDecayingDict && !ForgettingCurveUtils::isValidEncodedProbability(*outProbability)) {
// This bigram is too weak to output.
*outBigramPos = NOT_A_DICT_POS;
} else {
@@ -261,8 +261,8 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
const int originalProbability = BigramListReadWriteUtils::getProbabilityFromFlags(
bigramFlags);
const int probabilityToWrite = mIsDecayingDict ?
- DecayingUtils::getUpdatedBigramProbabilityDelta(
- originalProbability, probability) : probability;
+ ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability,
+ probability) : probability;
const BigramListReadWriteUtils::BigramFlags updatedFlags =
BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags,
probabilityToWrite);
@@ -294,7 +294,7 @@ bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, con
int *const writingPos) {
// hasNext is false because we are adding a new bigram entry at the end of the bigram list.
const int probabilityToWrite = mIsDecayingDict ?
- DecayingUtils::getUpdatedBigramProbabilityDelta(NOT_A_PROBABILITY, probability) :
+ ForgettingCurveUtils::getUpdatedEncodedProbability(NOT_A_PROBABILITY, probability) :
probability;
return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos,
probabilityToWrite, false /* hasNext */, writingPos);
@@ -360,14 +360,14 @@ int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos(
}
bool DynamicBigramListPolicy::updateProbabilityForDecay(
- BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos,
+ const BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos,
int *const bigramEntryPos, bool *const outRemoved) const {
*outRemoved = false;
if (mIsDecayingDict) {
// Update bigram probability for decaying.
- const int newProbability = DecayingUtils::getBigramProbabilityDeltaToSave(
- BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags));
- if (DecayingUtils::isValidBigram(newProbability)) {
+ const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave(
+ BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags), mHeaderPolicy);
+ if (ForgettingCurveUtils::isValidEncodedProbability(newProbability)) {
// Write new probability.
const BigramListReadWriteUtils::BigramFlags updatedBigramFlags =
BigramListReadWriteUtils::setProbabilityInFlags(
diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h
index b358b4ed5..0504b59d5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h
@@ -27,6 +27,7 @@
namespace latinime {
class BufferWithExtendableBuffer;
+class DictionaryHeaderStructurePolicy;
class DictionaryShortcutsStructurePolicy;
/*
@@ -34,10 +35,12 @@ class DictionaryShortcutsStructurePolicy;
*/
class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
public:
- DynamicBigramListPolicy(BufferWithExtendableBuffer *const buffer,
+ DynamicBigramListPolicy(const DictionaryHeaderStructurePolicy *const headerPolicy,
+ BufferWithExtendableBuffer *const buffer,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
const bool isDecayingDict)
- : mBuffer(buffer), mShortcutPolicy(shortcutPolicy), mIsDecayingDict(isDecayingDict) {}
+ : mHeaderPolicy(headerPolicy), mBuffer(buffer), mShortcutPolicy(shortcutPolicy),
+ mIsDecayingDict(isDecayingDict) {}
~DynamicBigramListPolicy() {}
@@ -74,6 +77,7 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
static const int CONTINUING_BIGRAM_LINK_COUNT_LIMIT;
static const int BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT;
+ const DictionaryHeaderStructurePolicy *const mHeaderPolicy;
BufferWithExtendableBuffer *const mBuffer;
const DictionaryShortcutsStructurePolicy *const mShortcutPolicy;
const bool mIsDecayingDict;
@@ -81,7 +85,7 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
// Follow bigram link and return the position of bigram target PtNode that is currently valid.
int followBigramLinkAndGetCurrentBigramPtNodePos(const int originalBigramPos) const;
- bool updateProbabilityForDecay(BigramListReadWriteUtils::BigramFlags bigramFlags,
+ bool updateProbabilityForDecay(const BigramListReadWriteUtils::BigramFlags bigramFlags,
const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const;
};
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
index 081163a4d..5724c5d88 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
@@ -16,7 +16,8 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h"
-#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
+#include "suggest/core/policy/dictionary_header_structure_policy.h"
+#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
namespace latinime {
@@ -29,15 +30,16 @@ bool DynamicPatriciaTrieGcEventListeners
bool isUselessPtNode = !node->isTerminal();
if (node->isTerminal() && mIsDecayingDict) {
const int newProbability =
- DecayingUtils::getUnigramProbabilityToSave(node->getProbability());
+ ForgettingCurveUtils::getEncodedProbabilityToSave(node->getProbability(),
+ mHeaderPolicy);
int writingPos = node->getProbabilityFieldPos();
// Update probability.
if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(
mBuffer, newProbability, &writingPos)) {
return false;
}
- if (!DecayingUtils::isValidUnigram(newProbability)) {
- isUselessPtNode = false;
+ if (!ForgettingCurveUtils::isValidEncodedProbability(newProbability)) {
+ isUselessPtNode = true;
}
}
if (mChildrenValue > 0) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
index 463715af5..9755120b0 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
@@ -29,6 +29,8 @@
namespace latinime {
+class DictionaryHeaderStructurePolicy;
+
class DynamicPatriciaTrieGcEventListeners {
public:
// Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or
@@ -38,10 +40,12 @@ class DynamicPatriciaTrieGcEventListeners {
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
public:
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
+ const DictionaryHeaderStructurePolicy *const headerPolicy,
DynamicPatriciaTrieWritingHelper *const writingHelper,
BufferWithExtendableBuffer *const buffer, const bool isDecayingDict)
- : mWritingHelper(writingHelper), mBuffer(buffer), mIsDecayingDict(isDecayingDict),
- mValueStack(), mChildrenValue(0), mValidUnigramCount(0) {}
+ : mHeaderPolicy(headerPolicy), mWritingHelper(writingHelper), mBuffer(buffer),
+ mIsDecayingDict(isDecayingDict), mValueStack(), mChildrenValue(0),
+ mValidUnigramCount(0) {}
~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
@@ -56,6 +60,7 @@ class DynamicPatriciaTrieGcEventListeners {
bool onDescend(const int ptNodeArrayPos) {
mValueStack.push_back(0);
+ mChildrenValue = 0;
return true;
}
@@ -72,9 +77,10 @@ class DynamicPatriciaTrieGcEventListeners {
DISALLOW_IMPLICIT_CONSTRUCTORS(
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
+ const DictionaryHeaderStructurePolicy *const mHeaderPolicy;
DynamicPatriciaTrieWritingHelper *const mWritingHelper;
BufferWithExtendableBuffer *const mBuffer;
- const int mIsDecayingDict;
+ const bool mIsDecayingDict;
std::vector<int> mValueStack;
int mChildrenValue;
int mValidUnigramCount;
@@ -85,7 +91,8 @@ class DynamicPatriciaTrieGcEventListeners {
class TraversePolicyToUpdateBigramProbability
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
public:
- TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy)
+ TraversePolicyToUpdateBigramProbability(
+ DynamicBigramListPolicy *const bigramPolicy)
: mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {}
bool onAscend() { return true; }
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
index 0d8c92768..a8ea69f3c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
@@ -28,17 +28,22 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
+#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
namespace latinime {
+// Note that these are corresponding definitions in Java side in BinaryDictionaryTests and
+// BinaryDictionaryDecayingTests.
const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
+const char *const DynamicPatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
+const char *const DynamicPatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
+const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY =
+ "SET_NEEDS_TO_DECAY_FOR_TESTING";
const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024;
const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024;
-const int DynamicPatriciaTriePolicy::MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING = 2 * 60 * 60;
void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
DicNodeVector *const childDicNodes) const {
@@ -150,7 +155,7 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in
int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability,
const int bigramProbability) const {
if (mHeaderPolicy.isDecayingDict()) {
- return DecayingUtils::getProbability(unigramProbability, bigramProbability);
+ return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability);
} else {
if (unigramProbability == NOT_A_PROBABILITY) {
return NOT_A_PROBABILITY;
@@ -301,7 +306,7 @@ void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
return;
}
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
- &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
+ &mBigramListPolicy, &mShortcutListPolicy, false /* needsToDecay */);
writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount);
}
@@ -310,9 +315,15 @@ void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
return;
}
+ const bool needsToDecay = mHeaderPolicy.isDecayingDict()
+ && (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
+ false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy));
+ DynamicBigramListPolicy bigramListPolicyForGC(&mHeaderPolicy, &mBufferWithExtendableBuffer,
+ &mShortcutListPolicy, needsToDecay);
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
- &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
+ &bigramListPolicyForGC, &mShortcutListPolicy, needsToDecay);
writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy);
+ mNeedsToDecayForTesting = false;
}
bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
@@ -334,27 +345,28 @@ bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
// Needs to reduce dictionary size.
return true;
} else if (mHeaderPolicy.isDecayingDict()) {
- if (mUnigramCount >= DecayingUtils::MAX_UNIGRAM_COUNT) {
- // Unigram count exceeds the limit.
- return true;
- } else if (mBigramCount >= DecayingUtils::MAX_BIGRAM_COUNT) {
- // Bigram count exceeds the limit.
- return true;
- } else if (mindsBlockByGC && mHeaderPolicy.getLastUpdatedTime()
- + MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING < time(0)) {
- // Time to update probabilities for decaying.
- return true;
- }
+ return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
+ mindsBlockByGC, mUnigramCount, mBigramCount, &mHeaderPolicy);
}
return false;
}
void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult,
- const int maxResultLength) const {
+ const int maxResultLength) {
if (strncmp(query, UNIGRAM_COUNT_QUERY, maxResultLength) == 0) {
snprintf(outResult, maxResultLength, "%d", mUnigramCount);
} else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
snprintf(outResult, maxResultLength, "%d", mBigramCount);
+ } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, maxResultLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT :
+ static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE));
+ } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT :
+ static_cast<int>(DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE));
+ } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, maxResultLength) == 0) {
+ mNeedsToDecayForTesting = true;
}
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
index d3150c6fc..be97ee1a5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
@@ -37,10 +37,10 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(),
mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
mShortcutListPolicy(&mBufferWithExtendableBuffer),
- mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy,
+ mBigramListPolicy(&mHeaderPolicy, &mBufferWithExtendableBuffer, &mShortcutListPolicy,
mHeaderPolicy.isDecayingDict()),
mUnigramCount(mHeaderPolicy.getUnigramCount()),
- mBigramCount(mHeaderPolicy.getBigramCount()) {}
+ mBigramCount(mHeaderPolicy.getBigramCount()), mNeedsToDecayForTesting(false) {}
~DynamicPatriciaTriePolicy() {
delete mBuffer;
@@ -95,16 +95,18 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
bool needsToRunGC(const bool mindsBlockByGC) const;
void getProperty(const char *const query, char *const outResult,
- const int maxResultLength) const;
+ const int maxResultLength);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy);
- static const char*const UNIGRAM_COUNT_QUERY;
- static const char*const BIGRAM_COUNT_QUERY;
+ static const char *const UNIGRAM_COUNT_QUERY;
+ static const char *const BIGRAM_COUNT_QUERY;
+ static const char *const MAX_UNIGRAM_COUNT_QUERY;
+ static const char *const MAX_BIGRAM_COUNT_QUERY;
+ static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY;
static const int MAX_DICT_EXTENDED_REGION_SIZE;
static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
- static const int MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING;
const MmappedBuffer *const mBuffer;
const HeaderPolicy mHeaderPolicy;
@@ -113,6 +115,7 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
DynamicBigramListPolicy mBigramListPolicy;
int mUnigramCount;
int mBigramCount;
+ int mNeedsToDecayForTesting;
};
} // namespace latinime
#endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp
index 601ee663b..f108c219f 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp
@@ -93,6 +93,12 @@ bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreor
if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
return false;
}
+ if (isEnd()) {
+ // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array.
+ if (!listener->onReadingPtNodeArrayTail()) {
+ return false;
+ }
+ }
pushReadingStateToStack();
while (!isEnd()) {
if (alreadyVisitedAllPtNodesInArray) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h
index 512a4d818..a71c06971 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h
@@ -279,7 +279,9 @@ class DynamicPatriciaTrieReadingHelper {
} else {
mReadingState = mReadingStateStack.back();
mReadingStateStack.pop_back();
- fetchPtNodeInfo();
+ if (!isEnd()) {
+ fetchPtNodeInfo();
+ }
}
}
};
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
index 28124d251..052558bfc 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
@@ -25,8 +25,8 @@
#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h"
-#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
+#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
#include "utils/hash_map_compat.h"
namespace latinime {
@@ -153,7 +153,7 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileNam
const int extendedRegionSize = headerPolicy->getExtendedRegionSize() +
mBuffer->getUsedAdditionalBufferSize();
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */,
- unigramCount, bigramCount, extendedRegionSize)) {
+ false /* updatesLastDecayedTime */, unigramCount, bigramCount, extendedRegionSize)) {
return;
}
DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer);
@@ -165,12 +165,15 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNod
MAX_DICTIONARY_SIZE);
int unigramCount = 0;
int bigramCount = 0;
- if (!runGC(rootPtNodeArrayPos, &newDictBuffer, &unigramCount, &bigramCount)) {
+ if (mNeedsToDecay) {
+ ForgettingCurveUtils::sTimeKeeper.setCurrentTime();
+ }
+ if (!runGC(rootPtNodeArrayPos, headerPolicy, &newDictBuffer, &unigramCount, &bigramCount)) {
return;
}
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
- unigramCount, bigramCount, 0 /* extendedRegionSize */)) {
+ mNeedsToDecay, unigramCount, bigramCount, 0 /* extendedRegionSize */)) {
return;
}
DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer);
@@ -237,7 +240,8 @@ bool DynamicPatriciaTrieWritingHelper::markNodeAsMovedAndSetPosition(
int parentOffsetFieldPos = nodeReader->getHeadPos()
+ DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE;
if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(
- mBuffer, movedPos, nodeReader->getHeadPos(), &parentOffsetFieldPos)) {
+ mBuffer, bigramLinkedNodePos, nodeReader->getHeadPos(),
+ &parentOffsetFieldPos)) {
// Parent offset cannot be written because of a bug or a broken dictionary; thus,
// we give up to update dictionary.
return false;
@@ -481,20 +485,20 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes(
}
bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
- BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount,
- int *const outBigramCount) {
+ const HeaderPolicy *const headerPolicy, BufferWithExtendableBuffer *const bufferToWrite,
+ int *const outUnigramCount, int *const outBigramCount) {
DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy);
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
DynamicPatriciaTrieGcEventListeners
::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
- this, mBuffer, mIsDecayingDict);
+ headerPolicy, this, mBuffer, mNeedsToDecay);
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
&traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
return false;
}
- if (mIsDecayingDict && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
- .getValidUnigramCount() > DecayingUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
+ if (mNeedsToDecay && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+ .getValidUnigramCount() > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
// TODO: Remove more unigrams.
}
@@ -505,9 +509,8 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
&traversePolicyToUpdateBigramProbability)) {
return false;
}
-
- if (mIsDecayingDict && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
- > DecayingUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
+ if (mNeedsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
+ > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
// TODO: Remove more bigrams.
}
@@ -524,8 +527,8 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
// Create policy instance for the GCed dictionary.
DynamicShortcutListPolicy newDictShortcutPolicy(bufferToWrite);
- DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy,
- mIsDecayingDict);
+ DynamicBigramListPolicy newDictBigramPolicy(headerPolicy, bufferToWrite, &newDictShortcutPolicy,
+ mNeedsToDecay);
// Create reading helper for the GCed dictionary.
DynamicPatriciaTrieReadingHelper newDictReadingHelper(bufferToWrite, &newDictBigramPolicy,
&newDictShortcutPolicy);
@@ -544,8 +547,9 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
int DynamicPatriciaTrieWritingHelper::getUpdatedProbability(const int originalProbability,
const int newProbability) {
- if (mIsDecayingDict) {
- return DecayingUtils::getUpdatedUnigramProbability(originalProbability, newProbability);
+ if (mNeedsToDecay) {
+ return ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability,
+ newProbability);
} else {
return newProbability;
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h
index ecee2cdbf..ca8664729 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h
@@ -51,9 +51,9 @@ class DynamicPatriciaTrieWritingHelper {
DynamicPatriciaTrieWritingHelper(BufferWithExtendableBuffer *const buffer,
DynamicBigramListPolicy *const bigramPolicy,
- DynamicShortcutListPolicy *const shortcutPolicy, const bool isDecayingDict)
+ DynamicShortcutListPolicy *const shortcutPolicy, const bool needsToDecay)
: mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy),
- mIsDecayingDict(isDecayingDict) {}
+ mNeedsToDecay(needsToDecay) {}
~DynamicPatriciaTrieWritingHelper() {}
@@ -94,7 +94,7 @@ class DynamicPatriciaTrieWritingHelper {
BufferWithExtendableBuffer *const mBuffer;
DynamicBigramListPolicy *const mBigramPolicy;
DynamicShortcutListPolicy *const mShortcutPolicy;
- const bool mIsDecayingDict;
+ const bool mNeedsToDecay;
bool markNodeAsMovedAndSetPosition(const DynamicPatriciaTrieNodeReader *const nodeToUpdate,
const int movedPos, const int bigramLinkedNodePos);
@@ -128,8 +128,9 @@ class DynamicPatriciaTrieWritingHelper {
const int probabilityOfNewPtNode, const int *const newNodeCodePoints,
const int newNodeCodePointCount);
- bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite,
- int *const outUnigramCount, int *const outBigramCount);
+ bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy,
+ BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount,
+ int *const outBigramCount);
int getUpdatedProbability(const int originalProbability, const int newProbability);
};
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
index 9ce9994dd..eb072fbaf 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
@@ -23,6 +23,7 @@ const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WOR
// TODO: Change attribute string to "IS_DECAYING_DICT".
const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE";
const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date";
+const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME";
const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT";
const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT";
const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
@@ -63,8 +64,8 @@ float HeaderPolicy::readMultipleWordCostMultiplier() const {
}
bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
- const bool updatesLastUpdatedTime, const int unigramCount, const int bigramCount,
- const int extendedRegionSize) const {
+ const bool updatesLastUpdatedTime, const bool updatesLastDecayedTime,
+ const int unigramCount, const int bigramCount, const int extendedRegionSize) const {
int writingPos = 0;
if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion,
&writingPos)) {
@@ -90,6 +91,11 @@ bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferT
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, LAST_UPDATED_TIME_KEY,
time(0));
}
+ if (updatesLastDecayedTime) {
+ // Set current time as a last updated time.
+ HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, LAST_DECAYED_TIME_KEY,
+ time(0));
+ }
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite,
&writingPos)) {
return false;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
index 4261667fa..a9c7805a8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
@@ -40,6 +40,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
IS_DECAYING_DICT_KEY, false /* defaultValue */)),
mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
+ mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ LAST_DECAYED_TIME_KEY, time(0) /* defaultValue */)),
mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
UNIGRAM_COUNT_KEY, 0 /* defaultValue */)),
mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
@@ -58,6 +60,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
IS_DECAYING_DICT_KEY, false /* defaultValue */)),
mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
+ mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0) {}
~HeaderPolicy() {}
@@ -90,6 +94,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return mLastUpdatedTime;
}
+ AK_FORCE_INLINE int getLastDecayedTime() const {
+ return mLastDecayedTime;
+ }
+
AK_FORCE_INLINE int getUnigramCount() const {
return mUnigramCount;
}
@@ -106,8 +114,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
int *outValue, int outValueSize) const;
bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
- const bool updatesLastUpdatedTime, const int unigramCount,
- const int bigramCount, const int extendedRegionSize) const;
+ const bool updatesLastUpdatedTime, const bool updatesLastDecayedTime,
+ const int unigramCount, const int bigramCount, const int extendedRegionSize) const;
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy);
@@ -115,6 +123,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
static const char *const IS_DECAYING_DICT_KEY;
static const char *const LAST_UPDATED_TIME_KEY;
+ static const char *const LAST_DECAYED_TIME_KEY;
static const char *const UNIGRAM_COUNT_KEY;
static const char *const BIGRAM_COUNT_KEY;
static const char *const EXTENDED_REGION_SIZE_KEY;
@@ -128,6 +137,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
const float mMultiWordCostMultiplier;
const bool mIsDecayingDict;
const int mLastUpdatedTime;
+ const int mLastDecayedTime;
const int mUnigramCount;
const int mBigramCount;
const int mExtendedRegionSize;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h
index 8d88c68e8..0f8662aea 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h
@@ -114,7 +114,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
}
void getProperty(const char *const query, char *const outResult,
- const int maxResultLength) const {
+ const int maxResultLength) {
// getProperty is not supported for this class.
if (maxResultLength > 0) {
outResult[0] = '\0';
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp
deleted file mode 100644
index 942a74238..000000000
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (C) 2013, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
-
-#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
-
-namespace latinime {
-
-const int DecayingUtils::MAX_UNIGRAM_COUNT = 12000;
-const int DecayingUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000;
-const int DecayingUtils::MAX_BIGRAM_COUNT = 12000;
-const int DecayingUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
-
-const int DecayingUtils::MAX_COMPUTED_PROBABILITY = 127;
-const int DecayingUtils::MAX_UNIGRAM_PROBABILITY = 120;
-const int DecayingUtils::MIN_VALID_UNIGRAM_PROBABILITY = 24;
-const int DecayingUtils::UNIGRAM_PROBABILITY_STEP = 8;
-const int DecayingUtils::MAX_BIGRAM_PROBABILITY_DELTA = 15;
-const int DecayingUtils::MIN_VALID_BIGRAM_PROBABILITY_DELTA = 3;
-const int DecayingUtils::BIGRAM_PROBABILITY_DELTA_STEP = 1;
-
-/* static */ int DecayingUtils::getProbability(const int encodedUnigramProbability,
- const int encodedBigramProbabilityDelta) {
- if (encodedUnigramProbability == NOT_A_PROBABILITY) {
- return NOT_A_PROBABILITY;
- } else if (encodedBigramProbabilityDelta == NOT_A_PROBABILITY) {
- const int rawProbability = ProbabilityUtils::backoff(decodeUnigramProbability(
- encodedUnigramProbability));
- return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY);
- } else {
- const int rawProbability = ProbabilityUtils::computeProbabilityForBigram(
- decodeUnigramProbability(encodedUnigramProbability),
- decodeBigramProbabilityDelta(encodedBigramProbabilityDelta));
- return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY);
- }
-}
-
-/* static */ int DecayingUtils::getUpdatedUnigramProbability(const int originalEncodedProbability,
- const int newProbability) {
- if (originalEncodedProbability == NOT_A_PROBABILITY) {
- // The unigram is not in this dictionary.
- if (newProbability == NOT_A_PROBABILITY) {
- // The unigram is not in other dictionaries.
- return 0;
- } else {
- return MIN_VALID_UNIGRAM_PROBABILITY;
- }
- } else {
- if (newProbability != NOT_A_PROBABILITY
- && originalEncodedProbability < MIN_VALID_UNIGRAM_PROBABILITY) {
- return MIN_VALID_UNIGRAM_PROBABILITY;
- }
- return min(originalEncodedProbability + UNIGRAM_PROBABILITY_STEP, MAX_UNIGRAM_PROBABILITY);
- }
-}
-
-/* static */ int DecayingUtils::getUnigramProbabilityToSave(const int encodedProbability) {
- return max(encodedProbability - UNIGRAM_PROBABILITY_STEP, 0);
-}
-
-/* static */ int DecayingUtils::getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta) {
- return max(encodedProbabilityDelta - BIGRAM_PROBABILITY_DELTA_STEP, 0);
-}
-
-/* static */ int DecayingUtils::getUpdatedBigramProbabilityDelta(
- const int originalEncodedProbabilityDelta, const int newProbability) {
- if (originalEncodedProbabilityDelta == NOT_A_PROBABILITY) {
- // The bigram relation is not in this dictionary.
- if (newProbability == NOT_A_PROBABILITY) {
- // The bigram target is not in other dictionaries.
- return 0;
- } else {
- return MIN_VALID_BIGRAM_PROBABILITY_DELTA;
- }
- } else {
- if (newProbability != NOT_A_PROBABILITY
- && originalEncodedProbabilityDelta < MIN_VALID_BIGRAM_PROBABILITY_DELTA) {
- return MIN_VALID_BIGRAM_PROBABILITY_DELTA;
- }
- return min(originalEncodedProbabilityDelta + BIGRAM_PROBABILITY_DELTA_STEP,
- MAX_BIGRAM_PROBABILITY_DELTA);
- }
-}
-
-/* static */ int DecayingUtils::isValidUnigram(const int encodedUnigramProbability) {
- return encodedUnigramProbability >= MIN_VALID_UNIGRAM_PROBABILITY;
-}
-
-/* static */ int DecayingUtils::isValidBigram(const int encodedBigramProbabilityDelta) {
- return encodedBigramProbabilityDelta >= MIN_VALID_BIGRAM_PROBABILITY_DELTA;
-}
-
-/* static */ int DecayingUtils::decodeUnigramProbability(const int encodedProbability) {
- const int probability = encodedProbability - MIN_VALID_UNIGRAM_PROBABILITY;
- if (probability < 0) {
- return NOT_A_PROBABILITY;
- } else {
- return min(probability, MAX_UNIGRAM_PROBABILITY);
- }
-}
-
-/* static */ int DecayingUtils::decodeBigramProbabilityDelta(const int encodedProbabilityDelta) {
- const int probabilityDelta = encodedProbabilityDelta - MIN_VALID_BIGRAM_PROBABILITY_DELTA;
- if (probabilityDelta < 0) {
- return NOT_A_PROBABILITY;
- } else {
- return min(probabilityDelta, MAX_BIGRAM_PROBABILITY_DELTA);
- }
-}
-
-/* static */ int DecayingUtils::getDecayedProbability(const int rawProbability) {
- return rawProbability;
-}
-
-} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h
deleted file mode 100644
index 1ca03918f..000000000
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2013, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LATINIME_DECAYING_UTILS_H
-#define LATINIME_DECAYING_UTILS_H
-
-#include "defines.h"
-
-namespace latinime {
-
-// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is
-// required to introduced to each terminal PtNode and bigram entry.
-// TODO: Quit using bigram probability to indicate the delta.
-// TODO: Quit using bigram probability delta.
-class DecayingUtils {
- public:
- static const int MAX_UNIGRAM_COUNT;
- static const int MAX_UNIGRAM_COUNT_AFTER_GC;
- static const int MAX_BIGRAM_COUNT;
- static const int MAX_BIGRAM_COUNT_AFTER_GC;
-
- static int getProbability(const int encodedUnigramProbability,
- const int encodedBigramProbabilityDelta);
-
- static int getUpdatedUnigramProbability(const int originalEncodedProbability,
- const int newProbability);
-
- static int getUpdatedBigramProbabilityDelta(const int originalEncodedProbabilityDelta,
- const int newProbability);
-
- static int isValidUnigram(const int encodedUnigramProbability);
-
- static int isValidBigram(const int encodedProbabilityDelta);
-
- static int getUnigramProbabilityToSave(const int encodedProbability);
-
- static int getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta);
-
- private:
- DISALLOW_IMPLICIT_CONSTRUCTORS(DecayingUtils);
-
- static const int MAX_COMPUTED_PROBABILITY;
- static const int MAX_UNIGRAM_PROBABILITY;
- static const int MIN_VALID_UNIGRAM_PROBABILITY;
- static const int UNIGRAM_PROBABILITY_STEP;
- static const int MAX_BIGRAM_PROBABILITY_DELTA;
- static const int MIN_VALID_BIGRAM_PROBABILITY_DELTA;
- static const int BIGRAM_PROBABILITY_DELTA_STEP;
-
- static int decodeUnigramProbability(const int encodedProbability);
-
- static int decodeBigramProbabilityDelta(const int encodedProbability);
-
- static int getDecayedProbability(const int rawProbability);
-};
-} // namespace latinime
-#endif /* LATINIME_DECAYING_UTILS_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
index f22e94c6a..994826fa8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
@@ -44,7 +44,8 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE =
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap);
headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
- 0 /* unigramCount */, 0 /* bigramCount */, 0 /* extendedRegionSize */);
+ true /* updatesLastDecayedTime */, 0 /* unigramCount */, 0 /* bigramCount */,
+ 0 /* extendedRegionSize */);
BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) {
return false;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
new file mode 100644
index 000000000..1632fd072
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cmath>
+#include <ctime>
+#include <stdlib.h>
+
+#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
+
+#include "suggest/core/policy/dictionary_header_structure_policy.h"
+#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
+
+namespace latinime {
+
+const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT = 12000;
+const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000;
+const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000;
+const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
+
+const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127;
+const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15;
+const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3;
+const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1;
+// Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected
+// duration of the decay is approximately 66hours.
+const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f;
+const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
+
+const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable;
+ForgettingCurveUtils::TimeKeeper ForgettingCurveUtils::sTimeKeeper;
+
+void ForgettingCurveUtils::TimeKeeper::setCurrentTime() {
+ mCurrentTime = time(0);
+}
+
+/* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability,
+ const int encodedBigramProbability) {
+ if (encodedUnigramProbability == NOT_A_PROBABILITY) {
+ return NOT_A_PROBABILITY;
+ } else if (encodedBigramProbability == NOT_A_PROBABILITY) {
+ return backoff(decodeProbability(encodedUnigramProbability));
+ } else {
+ const int unigramProbability = decodeProbability(encodedUnigramProbability);
+ const int bigramProbability = decodeProbability(encodedBigramProbability);
+ return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY);
+ }
+}
+
+// Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding
+// (i.e. unigram probability + bigram probability delta).
+/* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability(
+ const int originalEncodedProbability, const int newProbability) {
+ if (originalEncodedProbability == NOT_A_PROBABILITY) {
+ // The bigram relation is not in this dictionary.
+ if (newProbability == NOT_A_PROBABILITY) {
+ // The bigram target is not in other dictionaries.
+ return 0;
+ } else {
+ return MIN_VALID_ENCODED_PROBABILITY;
+ }
+ } else {
+ if (newProbability != NOT_A_PROBABILITY
+ && originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) {
+ return MIN_VALID_ENCODED_PROBABILITY;
+ }
+ return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY);
+ }
+}
+
+/* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) {
+ return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY;
+}
+
+/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability,
+ const DictionaryHeaderStructurePolicy *const headerPolicy) {
+ const int elapsedTime = sTimeKeeper.peekCurrentTime() - headerPolicy->getLastDecayedTime();
+ const int decayIterationCount = max(elapsedTime / DECAY_INTERVAL_SECONDS, 1);
+ int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0);
+ // TODO: Implement the decay in more proper way.
+ for (int i = 0; i < decayIterationCount; ++i) {
+ const float currentRate = static_cast<float>(currentEncodedProbability)
+ / static_cast<float>(MAX_ENCODED_PROBABILITY);
+ const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate;
+ const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+ if (thresholdToDecay < randValue) {
+ currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP,
+ 0);
+ }
+ }
+ return currentEncodedProbability;
+}
+
+/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay,
+ const int unigramCount, const int bigramCount,
+ const DictionaryHeaderStructurePolicy *const headerPolicy) {
+ if (unigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) {
+ // Unigram count exceeds the limit.
+ return true;
+ } else if (bigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) {
+ // Bigram count exceeds the limit.
+ return true;
+ }
+ if (mindsBlockByDecay) {
+ return false;
+ }
+ if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS < time(0)) {
+ // Time to decay.
+ return true;
+ }
+ return false;
+}
+
+/* static */ int ForgettingCurveUtils::decodeProbability(const int encodedProbability) {
+ if (encodedProbability < MIN_VALID_ENCODED_PROBABILITY) {
+ return NOT_A_PROBABILITY;
+ } else {
+ return min(sProbabilityTable.getProbability(encodedProbability), MAX_ENCODED_PROBABILITY);
+ }
+}
+
+// See comments in ProbabilityUtils::backoff().
+/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) {
+ if (unigramProbability == NOT_A_PROBABILITY) {
+ return NOT_A_PROBABILITY;
+ } else {
+ return max(unigramProbability - 8, 0);
+ }
+}
+
+ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTable() {
+ // Table entry is as follows:
+ // 1, 1, 1, 2, 3, 5, 6, 9, 13, 18, 25, 34, 48, 66, 91, 127.
+ // Note that first MIN_VALID_ENCODED_PROBABILITY values are not used.
+ mTable.resize(MAX_ENCODED_PROBABILITY + 1);
+ for (int i = 0; i <= MAX_ENCODED_PROBABILITY; ++i) {
+ const int probability = static_cast<int>(powf(static_cast<float>(MAX_COMPUTED_PROBABILITY),
+ static_cast<float>(i) / static_cast<float>(MAX_ENCODED_PROBABILITY)));
+ mTable[i] = min(MAX_COMPUTED_PROBABILITY, max(0, probability));
+ }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
new file mode 100644
index 000000000..2ad423874
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_FORGETTING_CURVE_UTILS_H
+#define LATINIME_FORGETTING_CURVE_UTILS_H
+
+#include <vector>
+
+#include "defines.h"
+
+namespace latinime {
+
+class DictionaryHeaderStructurePolicy;
+
+// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is
+// required to introduced to each terminal PtNode and bigram entry.
+// TODO: Quit using bigram probability to indicate the delta.
+class ForgettingCurveUtils {
+ public:
+ class TimeKeeper {
+ public:
+ TimeKeeper() : mCurrentTime(0) {}
+ void setCurrentTime();
+ int peekCurrentTime() const { return mCurrentTime; };
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(TimeKeeper);
+
+ int mCurrentTime;
+ };
+
+ static const int MAX_UNIGRAM_COUNT;
+ static const int MAX_UNIGRAM_COUNT_AFTER_GC;
+ static const int MAX_BIGRAM_COUNT;
+ static const int MAX_BIGRAM_COUNT_AFTER_GC;
+
+ static TimeKeeper sTimeKeeper;
+
+ static int getProbability(const int encodedUnigramProbability,
+ const int encodedBigramProbability);
+
+ static int getUpdatedEncodedProbability(const int originalEncodedProbability,
+ const int newProbability);
+
+ static int isValidEncodedProbability(const int encodedProbability);
+
+ static int getEncodedProbabilityToSave(const int encodedProbability,
+ const DictionaryHeaderStructurePolicy *const headerPolicy);
+
+ static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount,
+ const int bigramCount, const DictionaryHeaderStructurePolicy *const headerPolicy);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils);
+
+ class ProbabilityTable {
+ public:
+ ProbabilityTable();
+
+ int getProbability(const int encodedProbability) const {
+ if (encodedProbability < 0 || encodedProbability > static_cast<int>(mTable.size())) {
+ return NOT_A_PROBABILITY;
+ }
+ return mTable[encodedProbability];
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(ProbabilityTable);
+
+ std::vector<int> mTable;
+ };
+
+ static const int MAX_COMPUTED_PROBABILITY;
+ static const int MAX_ENCODED_PROBABILITY;
+ static const int MIN_VALID_ENCODED_PROBABILITY;
+ static const int ENCODED_PROBABILITY_STEP;
+ static const float MIN_PROBABILITY_TO_DECAY;
+ static const int DECAY_INTERVAL_SECONDS;
+
+ static const ProbabilityTable sProbabilityTable;
+
+ static int decodeProbability(const int encodedProbability);
+
+ static int backoff(const int unigramProbability);
+};
+} // namespace latinime
+#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */
diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp
index ecceb60d3..104eb2a7a 100644
--- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp
+++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp
@@ -27,30 +27,30 @@ const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE = 170;
const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT = 310;
const int ScoringParams::THRESHOLD_SHORT_WORD_LENGTH = 4;
-const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.132f;
-const float ScoringParams::PROXIMITY_COST = 0.095f;
-const float ScoringParams::FIRST_CHAR_PROXIMITY_COST = 0.102f;
-const float ScoringParams::FIRST_PROXIMITY_COST = 0.019f;
-const float ScoringParams::OMISSION_COST = 0.458f;
-const float ScoringParams::OMISSION_COST_SAME_CHAR = 0.491f;
-const float ScoringParams::OMISSION_COST_FIRST_CHAR = 0.582f;
-const float ScoringParams::INSERTION_COST = 0.730f;
-const float ScoringParams::TERMINAL_INSERTION_COST = 0.93f;
-const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.586f;
-const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.70f;
-const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.623f;
-const float ScoringParams::TRANSPOSITION_COST = 0.526f;
-const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.319f;
-const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.380f;
-const float ScoringParams::SUBSTITUTION_COST = 0.383f;
-const float ScoringParams::COST_NEW_WORD = 0.042f;
-const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.25f;
-const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.123f;
-const float ScoringParams::COST_FIRST_LOOKAHEAD = 0.545f;
-const float ScoringParams::COST_LOOKAHEAD = 0.073f;
-const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.093f;
-const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.041f;
-const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.447f;
+const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.1524f;
+const float ScoringParams::PROXIMITY_COST = 0.0694f;
+const float ScoringParams::FIRST_CHAR_PROXIMITY_COST = 0.072f;
+const float ScoringParams::FIRST_PROXIMITY_COST = 0.07788f;
+const float ScoringParams::OMISSION_COST = 0.4676f;
+const float ScoringParams::OMISSION_COST_SAME_CHAR = 0.399f;
+const float ScoringParams::OMISSION_COST_FIRST_CHAR = 0.5256f;
+const float ScoringParams::INSERTION_COST = 0.7248f;
+const float ScoringParams::TERMINAL_INSERTION_COST = 0.8128f;
+const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.5508f;
+const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.674f;
+const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.639f;
+const float ScoringParams::TRANSPOSITION_COST = 0.5608f;
+const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.339f;
+const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.4576f;
+const float ScoringParams::SUBSTITUTION_COST = 0.3806f;
+const float ScoringParams::COST_NEW_WORD = 0.0312f;
+const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.3224f;
+const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.1214f;
+const float ScoringParams::COST_FIRST_LOOKAHEAD = 0.4836f;
+const float ScoringParams::COST_LOOKAHEAD = 0.00624f;
+const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.06836f;
+const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.0362f;
+const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.4182f;
const float ScoringParams::TYPING_BASE_OUTPUT_SCORE = 1.0f;
const float ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT = 0.1f;
const float ScoringParams::NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT = 0.045f;
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h
index 89e53f441..007c19e0a 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h
+++ b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h
@@ -101,7 +101,7 @@ class TypingTraversal : public Traversal {
}
const int16_t pointIndex = dicNode->getInputIndex(0);
return pointIndex <= inputSize && !dicNode->isTotalInputSizeExceedingLimit()
- && !dicNode->shouldBeFilterdBySafetyNetForBigram();
+ && !dicNode->shouldBeFilteredBySafetyNetForBigram();
}
AK_FORCE_INLINE bool shouldDepthLevelCache(
diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h
index 2e735a81c..41663c81a 100644
--- a/native/jni/src/utils/char_utils.h
+++ b/native/jni/src/utils/char_utils.h
@@ -75,6 +75,16 @@ class CharUtils {
return c;
}
+ static AK_FORCE_INLINE int getSpaceCount(const int *const codePointBuffer, const int length) {
+ int spaceCount = 0;
+ for (int i = 0; i < length; ++i) {
+ if (codePointBuffer[i] == KEYCODE_SPACE) {
+ ++spaceCount;
+ }
+ }
+ return spaceCount;
+ }
+
static unsigned short latin_tolower(const unsigned short c);
private: