aboutsummaryrefslogtreecommitdiffstats
path: root/native
diff options
context:
space:
mode:
Diffstat (limited to 'native')
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp11
-rw-r--r--native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h2
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp10
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp20
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h70
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp7
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp13
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp2
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h8
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp9
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h7
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp11
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h6
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp40
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp142
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h60
16 files changed, 301 insertions, 117 deletions
diff --git a/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp b/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp
index e1b35340b..bc4ca8e9e 100644
--- a/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp
+++ b/native/jni/src/suggest/core/layout/proximity_info_state_utils.cpp
@@ -992,7 +992,16 @@ namespace latinime {
}
}
if (character != NOT_AN_INDEX) {
- codePointBuf[index] = proximityInfo->getCodePointOf(character);
+ const int codePoint = proximityInfo->getCodePointOf(character);
+ if (codePoint == NOT_A_CODE_POINT) {
+ AKLOGE("Key index(%d) is not found. Cannot construct most probable string",
+ character);
+ ASSERT(false);
+ // Make the length zero, which means most probable string won't be used.
+ index = 0;
+ break;
+ }
+ codePointBuf[index] = codePoint;
index++;
}
sumLogProbability += minLogProbability;
diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h
index 59748c80d..a8dab9fcd 100644
--- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h
+++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h
@@ -44,8 +44,6 @@ class DictionaryHeaderStructurePolicy {
virtual float getMultiWordCostMultiplier() const = 0;
- virtual int getLastDecayedTime() const = 0;
-
virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue,
int outValueSize) const = 0;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp
index cd2243025..5df2096a4 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp
@@ -37,7 +37,8 @@ void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const out
if (outProbability) {
if (bigramEntry.hasHistoricalInfo()) {
*outProbability =
- ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo());
+ ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo(),
+ mHeaderPolicy);
} else {
*outProbability = bigramEntry.getProbability();
}
@@ -160,8 +161,8 @@ bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const i
}
} else if (bigramEntry.hasHistoricalInfo()) {
const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
- bigramEntry.getHistoricalInfo());
- if (ForgettingCurveUtils::needsToKeep(&historicalInfo)) {
+ bigramEntry.getHistoricalInfo(), mHeaderPolicy);
+ if (ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy)) {
const BigramEntry updatedBigramEntry =
bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo);
if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
@@ -230,7 +231,8 @@ const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom(
if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
const HistoricalInfo updatedHistoricalInfo =
ForgettingCurveUtils::createUpdatedHistoricalInfo(
- originalBigramEntry->getHistoricalInfo(), newProbability, timestamp);
+ originalBigramEntry->getHistoricalInfo(), newProbability, timestamp,
+ mHeaderPolicy);
return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo);
} else {
return originalBigramEntry->updateProbabilityAndGetEntry(newProbability);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
index 3ce57d910..7c7b05ca8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
@@ -18,7 +18,7 @@
namespace latinime {
-// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader.
+// Note that these are corresponding definitions in Java side in DictionaryHeader.
const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY =
"REQUIRES_GERMAN_UMLAUT_PROCESSING";
@@ -33,8 +33,26 @@ const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE
// count.
const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO";
const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration
+const char *const HeaderPolicy::FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY =
+ "FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP";
+const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
+ "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
+const char *const HeaderPolicy::FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY =
+ "FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS";
+
+const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT";
+const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT";
+
const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
+const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP = 4;
+const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 0;
+// 4 days
+const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS =
+ 4 * 24 * 60 * 60;
+
+const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000;
+const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 10000;
// Used for logging. Question mark is used to indicate that the key is not found.
void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
index fc347618c..66824245e 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
@@ -52,7 +52,20 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)),
mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
- &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)) {}
+ &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
+ mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY,
+ DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
+ mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
+ DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
+ mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
+ DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
+ mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
+ mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
// Constructs header information using an attribute map.
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
@@ -71,8 +84,20 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0),
mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
- &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)) {
- }
+ &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
+ mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY,
+ DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
+ mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
+ DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
+ mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
+ DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
+ mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
+ mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
// Temporary dummy header.
HeaderPolicy()
@@ -80,7 +105,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f),
mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
- mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false) {}
+ mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
+ mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
+ mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {}
~HeaderPolicy() {}
@@ -159,6 +186,26 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return &mAttributeMap;
}
+ AK_FORCE_INLINE int getForgettingCurveOccurrencesToLevelUp() const {
+ return mForgettingCurveOccurrencesToLevelUp;
+ }
+
+ AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const {
+ return mForgettingCurveProbabilityValuesTableId;
+ }
+
+ AK_FORCE_INLINE int getForgettingCurveDurationToLevelDown() const {
+ return mForgettingCurveDurationToLevelDown;
+ }
+
+ AK_FORCE_INLINE int getMaxUnigramCount() const {
+ return mMaxUnigramCount;
+ }
+
+ AK_FORCE_INLINE int getMaxBigramCount() const {
+ return mMaxBigramCount;
+ }
+
void readHeaderValueOrQuestionMark(const char *const key,
int *outValue, int outValueSize) const;
@@ -183,8 +230,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
static const char *const EXTENDED_REGION_SIZE_KEY;
static const char *const HAS_HISTORICAL_INFO_KEY;
static const char *const LOCALE_KEY;
+ static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY;
+ static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY;
+ static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY;
+ static const char *const MAX_UNIGRAM_COUNT_KEY;
+ static const char *const MAX_BIGRAM_COUNT_KEY;
static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
+ static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP;
+ static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
+ static const int DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS;
+ static const int DEFAULT_MAX_UNIGRAM_COUNT;
+ static const int DEFAULT_MAX_BIGRAM_COUNT;
const FormatUtils::FORMAT_VERSION mDictFormatVersion;
const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
@@ -200,6 +257,11 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
const int mBigramCount;
const int mExtendedRegionSize;
const bool mHasHistoricalInfoOfWords;
+ const int mForgettingCurveOccurrencesToLevelUp;
+ const int mForgettingCurveProbabilityValuesTableId;
+ const int mForgettingCurveDurationToLevelDown;
+ const int mMaxUnigramCount;
+ const int mMaxBigramCount;
const std::vector<int> readLocale() const;
float readMultipleWordCostMultiplier() const;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp
index cb9d450ec..279f5b33a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp
@@ -23,6 +23,13 @@ namespace latinime {
const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition(
int *const bigramEntryPos) const {
const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer();
+ if (*bigramEntryPos < 0 || *bigramEntryPos >= bigramListBuffer->getTailPosition()) {
+ AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bufSize: %d",
+ *bigramEntryPos, bigramListBuffer->getTailPosition());
+ ASSERT(false);
+ return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
+ Ver4DictConstants::NOT_A_TERMINAL_ID);
+ }
const int bigramFlags = bigramListBuffer->readUintAndAdvancePosition(
Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, bigramEntryPos);
const bool hasNext = (bigramFlags & Ver4DictConstants::BIGRAM_HAS_NEXT_MASK) != 0;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp
index 29972a4e8..64d7bc0a5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp
@@ -24,6 +24,19 @@ void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePo
int *const outCodePoint, int *const outCodePointCount, int *const outProbability,
bool *const outhasNext, int *const shortcutEntryPos) const {
const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer();
+ if (*shortcutEntryPos < 0 || *shortcutEntryPos >= shortcutListBuffer->getTailPosition()) {
+ AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d",
+ *shortcutEntryPos, shortcutListBuffer->getTailPosition());
+ ASSERT(false);
+ if (outhasNext) {
+ *outhasNext = false;
+ }
+ if (outCodePointCount) {
+ *outCodePointCount = 0;
+ }
+ return;
+ }
+
const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition(
Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos);
if (outProbability) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
index 17fc9483b..f149781f4 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
@@ -65,7 +65,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
mProbabilityDictContent->getProbabilityEntry(terminalId);
if (probabilityEntry.hasHistoricalInfo()) {
probability = ForgettingCurveUtils::decodeProbability(
- probabilityEntry.getHistoricalInfo());
+ probabilityEntry.getHistoricalInfo(), mHeaderPolicy);
} else {
probability = probabilityEntry.getProbability();
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h
index 9d932457c..1db9ea026 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h
@@ -26,6 +26,7 @@
namespace latinime {
class BufferWithExtendableBuffer;
+class HeaderPolicy;
class ProbabilityDictContent;
/*
@@ -35,8 +36,10 @@ class ProbabilityDictContent;
class Ver4PatriciaTrieNodeReader : public PtNodeReader {
public:
Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer,
- const ProbabilityDictContent *const probabilityDictContent)
- : mBuffer(buffer), mProbabilityDictContent(probabilityDictContent) {}
+ const ProbabilityDictContent *const probabilityDictContent,
+ const HeaderPolicy *const headerPolicy)
+ : mBuffer(buffer), mProbabilityDictContent(probabilityDictContent),
+ mHeaderPolicy(headerPolicy) {}
~Ver4PatriciaTrieNodeReader() {}
@@ -50,6 +53,7 @@ class Ver4PatriciaTrieNodeReader : public PtNodeReader {
const BufferWithExtendableBuffer *const mBuffer;
const ProbabilityDictContent *const mProbabilityDictContent;
+ const HeaderPolicy *const mHeaderPolicy;
const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos,
const int siblingNodePos) const;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
index 32576cf0a..f24c2e1af 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
@@ -159,7 +159,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeA
toBeUpdatedPtNodeParams->getTerminalId());
if (originalProbabilityEntry.hasHistoricalInfo()) {
const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
- originalProbabilityEntry.getHistoricalInfo());
+ originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy);
const ProbabilityEntry probabilityEntry =
originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo);
if (!mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(
@@ -168,7 +168,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeA
toBeUpdatedPtNodeParams->getTerminalId());
return false;
}
- const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo);
+ const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy);
if (!isValid) {
if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
@@ -382,10 +382,11 @@ const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom(
const ProbabilityEntry *const originalProbabilityEntry, const int newProbability,
const int timestamp) const {
// TODO: Consolidate historical info and probability.
- if (mBuffers->getHeaderPolicy()->hasHistoricalInfoOfWords()) {
+ if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
const HistoricalInfo updatedHistoricalInfo =
ForgettingCurveUtils::createUpdatedHistoricalInfo(
- originalProbabilityEntry->getHistoricalInfo(), newProbability, timestamp);
+ originalProbabilityEntry->getHistoricalInfo(), newProbability, timestamp,
+ mHeaderPolicy);
return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo(
&updatedHistoricalInfo);
} else {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
index 66845bbd6..f01b3af0e 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
@@ -28,6 +28,7 @@
namespace latinime {
class BufferWithExtendableBuffer;
+class HeaderPolicy;
class Ver4BigramListPolicy;
class Ver4DictBuffers;
class Ver4PatriciaTrieNodeReader;
@@ -40,10 +41,11 @@ class Ver4ShortcutListPolicy;
class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
public:
Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer,
- Ver4DictBuffers *const buffers, const PtNodeReader *const ptNodeReader,
+ Ver4DictBuffers *const buffers, const HeaderPolicy *const headerPolicy,
+ const PtNodeReader *const ptNodeReader,
const PtNodeArrayReader *const ptNodeArrayReader,
Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy)
- : mTrieBuffer(trieBuffer), mBuffers(buffers),
+ : mTrieBuffer(trieBuffer), mBuffers(buffers), mHeaderPolicy(headerPolicy),
mReadingHelper(ptNodeReader, ptNodeArrayReader), mBigramPolicy(bigramPolicy),
mShortcutPolicy(shortcutPolicy) {}
@@ -116,6 +118,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
BufferWithExtendableBuffer *const mTrieBuffer;
Ver4DictBuffers *const mBuffers;
+ const HeaderPolicy *const mHeaderPolicy;
DynamicPtReadingHelper mReadingHelper;
Ver4BigramListPolicy *const mBigramPolicy;
Ver4ShortcutListPolicy *const mShortcutPolicy;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
index b5d80be1d..4d1b0dadb 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
@@ -329,11 +329,15 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer
snprintf(outResult, maxResultLength, "%d", mBigramCount);
} else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
snprintf(outResult, maxResultLength, "%d",
- mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT :
+ mHeaderPolicy->isDecayingDict() ?
+ ForgettingCurveUtils::getUnigramCountHardLimit(
+ mHeaderPolicy->getMaxUnigramCount()) :
static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
} else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
snprintf(outResult, maxResultLength, "%d",
- mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT :
+ mHeaderPolicy->isDecayingDict() ?
+ ForgettingCurveUtils::getBigramCountHardLimit(
+ mHeaderPolicy->getMaxBigramCount()) :
static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
}
}
@@ -382,7 +386,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
bigramWord1CodePoints + codePointCount);
const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo();
const int probability = bigramEntry.hasHistoricalInfo() ?
- ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo()) :
+ ForgettingCurveUtils::decodeProbability(
+ bigramEntry.getHistoricalInfo(), mHeaderPolicy) :
bigramEntry.getProbability();
bigrams.push_back(WordProperty::BigramProperty(&word1, probability,
historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
index 7796e2ddc..639c153a1 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
@@ -47,10 +47,10 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
mBuffers.get()->getTerminalPositionLookupTable(), mHeaderPolicy),
mShortcutPolicy(mBuffers.get()->getMutableShortcutDictContent(),
mBuffers.get()->getTerminalPositionLookupTable()),
- mNodeReader(mDictBuffer, mBuffers.get()->getProbabilityDictContent()),
+ mNodeReader(mDictBuffer, mBuffers.get()->getProbabilityDictContent(), mHeaderPolicy),
mPtNodeArrayReader(mDictBuffer),
- mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mPtNodeArrayReader,
- &mBigramPolicy, &mShortcutPolicy),
+ mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader,
+ &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy),
mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
mWritingHelper(mBuffers.get()),
mUnigramCount(mHeaderPolicy->getUnigramCount()),
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
index 93053c38d..3907c84a0 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
@@ -74,14 +74,15 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
int *const outUnigramCount, int *const outBigramCount) {
Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
- mBuffers->getProbabilityDictContent());
+ mBuffers->getProbabilityDictContent(), headerPolicy);
Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
mBuffers->getTerminalPositionLookupTable(), headerPolicy);
Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
mBuffers->getTerminalPositionLookupTable());
Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
- mBuffers, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, &shortcutPolicy);
+ mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
+ &shortcutPolicy);
DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
@@ -95,12 +96,11 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
}
const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
.getValidUnigramCount();
- if (headerPolicy->isDecayingDict()
- && unigramCount > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
- if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter,
- ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC)) {
+ const int maxUnigramCount = headerPolicy->getMaxUnigramCount();
+ if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
+ if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
- ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC);
+ maxUnigramCount);
return false;
}
}
@@ -113,11 +113,10 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
return false;
}
const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
- if (headerPolicy->isDecayingDict()
- && bigramCount > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
- if (!truncateBigrams(ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC)) {
- AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount,
- ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC);
+ const int maxBigramCount = headerPolicy->getMaxBigramCount();
+ if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
+ if (!truncateBigrams(maxBigramCount)) {
+ AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
return false;
}
}
@@ -126,7 +125,8 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
- buffersToWrite, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, &shortcutPolicy);
+ buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
+ &shortcutPolicy);
DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
@@ -137,14 +137,14 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
// Create policy instances for the GCed dictionary.
Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
- buffersToWrite->getProbabilityDictContent());
+ buffersToWrite->getProbabilityDictContent(), headerPolicy);
Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
buffersToWrite->getTerminalPositionLookupTable());
Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
- buffersToWrite, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
+ buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
&newShortcutPolicy);
// Re-assign terminal IDs for valid terminal PtNodes.
TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
@@ -202,8 +202,9 @@ bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
const ProbabilityEntry probabilityEntry =
mBuffers->getProbabilityDictContent()->getProbabilityEntry(i);
const int probability = probabilityEntry.hasHistoricalInfo() ?
- ForgettingCurveUtils::decodeProbability(probabilityEntry.getHistoricalInfo()) :
- probabilityEntry.getProbability();
+ ForgettingCurveUtils::decodeProbability(
+ probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
+ probabilityEntry.getProbability();
priorityQueue.push(DictProbability(terminalPos, probability,
probabilityEntry.getHistoricalInfo()->getTimeStamp()));
}
@@ -245,8 +246,9 @@ bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
continue;
}
const int probability = bigramEntry.hasHistoricalInfo() ?
- ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo()) :
- bigramEntry.getProbability();
+ ForgettingCurveUtils::decodeProbability(
+ bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
+ bigramEntry.getProbability();
priorityQueue.push(DictProbability(entryPos, probability,
bigramEntry.getHistoricalInfo()->getTimeStamp()));
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
index d58d25989..35e05d77a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
@@ -19,33 +19,29 @@
#include <cmath>
#include <stdlib.h>
-#include "suggest/core/policy/dictionary_header_structure_policy.h"
+#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
#include "utils/time_keeper.h"
namespace latinime {
-const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT = 12000;
-const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000;
-const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000;
-const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
-
-const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127;
+const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8;
const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
const int ForgettingCurveUtils::MAX_LEVEL = 3;
-const int ForgettingCurveUtils::MAX_COUNT = 3;
const int ForgettingCurveUtils::MIN_VALID_LEVEL = 1;
-const int ForgettingCurveUtils::TIME_STEP_DURATION_IN_SECONDS = 6 * 60 * 60;
const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 15;
const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 14;
+const float ForgettingCurveUtils::UNIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2;
+const float ForgettingCurveUtils::BIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2;
+
const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable;
// TODO: Revise the logic to decide the initial probability depending on the given probability.
/* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo(
const HistoricalInfo *const originalHistoricalInfo,
- const int newProbability, const int timestamp) {
+ const int newProbability, const int timestamp, const HeaderPolicy *const headerPolicy) {
if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) {
return HistoricalInfo(timestamp, MIN_VALID_LEVEL /* level */, 0 /* count */);
} else if (!originalHistoricalInfo->isValid()) {
@@ -53,7 +49,7 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
return HistoricalInfo(timestamp, 0 /* level */, 1 /* count */);
} else {
const int updatedCount = originalHistoricalInfo->getCount() + 1;
- if (updatedCount > MAX_COUNT) {
+ if (updatedCount >= headerPolicy->getForgettingCurveOccurrencesToLevelUp()) {
// The count exceeds the max value the level can be incremented.
if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) {
// The level is already max.
@@ -71,9 +67,11 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
}
/* static */ int ForgettingCurveUtils::decodeProbability(
- const HistoricalInfo *const historicalInfo) {
- const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp());
- return sProbabilityTable.getProbability(historicalInfo->getLevel(),
+ const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) {
+ const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp(),
+ headerPolicy->getForgettingCurveDurationToLevelDown());
+ return sProbabilityTable.getProbability(
+ headerPolicy->getForgettingCurveProbabilityValuesTableId(), historicalInfo->getLevel(),
min(max(elapsedTimeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT));
}
@@ -82,24 +80,31 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
if (unigramProbability == NOT_A_PROBABILITY) {
return NOT_A_PROBABILITY;
} else if (bigramProbability == NOT_A_PROBABILITY) {
- return min(backoff(unigramProbability), MAX_COMPUTED_PROBABILITY);
+ return min(backoff(unigramProbability), MAX_PROBABILITY);
} else {
- return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY);
+ // TODO: Investigate better way to handle bigram probability.
+ return min(max(unigramProbability, bigramProbability + MULTIPLIER_TWO_IN_PROBABILITY_SCALE),
+ MAX_PROBABILITY);
}
}
-/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo) {
+/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo,
+ const HeaderPolicy *const headerPolicy) {
return historicalInfo->getLevel() > 0
- || getElapsedTimeStepCount(historicalInfo->getTimeStamp())
- < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
+ || getElapsedTimeStepCount(historicalInfo->getTimeStamp(),
+ headerPolicy->getForgettingCurveDurationToLevelDown())
+ < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
}
/* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave(
- const HistoricalInfo *const originalHistoricalInfo) {
+ const HistoricalInfo *const originalHistoricalInfo,
+ const HeaderPolicy *const headerPolicy) {
if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) {
return HistoricalInfo();
}
- const int elapsedTimeStep = getElapsedTimeStepCount(originalHistoricalInfo->getTimeStamp());
+ const int durationToLevelDownInSeconds = headerPolicy->getForgettingCurveDurationToLevelDown();
+ const int elapsedTimeStep = getElapsedTimeStepCount(
+ originalHistoricalInfo->getTimeStamp(), durationToLevelDownInSeconds);
if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) {
// No need to update historical info.
return *originalHistoricalInfo;
@@ -108,19 +113,18 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ?
originalHistoricalInfo->getLevel() : maxLevelDownAmonut;
- const int adjustedTimestamp = originalHistoricalInfo->getTimeStamp() +
- levelDownAmount * (MAX_ELAPSED_TIME_STEP_COUNT + 1) * TIME_STEP_DURATION_IN_SECONDS;
- return HistoricalInfo(adjustedTimestamp,
+ const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimeStamp() +
+ levelDownAmount * durationToLevelDownInSeconds;
+ return HistoricalInfo(adjustedTimestampInSeconds,
originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */);
}
/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay,
- const int unigramCount, const int bigramCount,
- const DictionaryHeaderStructurePolicy *const headerPolicy) {
- if (unigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) {
+ const int unigramCount, const int bigramCount, const HeaderPolicy *const headerPolicy) {
+ if (unigramCount >= getUnigramCountHardLimit(headerPolicy->getMaxUnigramCount())) {
// Unigram count exceeds the limit.
return true;
- } else if (bigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) {
+ } else if (bigramCount >= getBigramCountHardLimit(headerPolicy->getMaxBigramCount())) {
// Bigram count exceeds the limit.
return true;
}
@@ -137,37 +141,71 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
// See comments in ProbabilityUtils::backoff().
/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) {
- if (unigramProbability == NOT_A_PROBABILITY) {
- return NOT_A_PROBABILITY;
- } else {
- return max(unigramProbability - 8, 0);
- }
+ // See TODO comments in ForgettingCurveUtils::getProbability().
+ return unigramProbability;
}
-/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp) {
- return (TimeKeeper::peekCurrentTime() - timestamp) / TIME_STEP_DURATION_IN_SECONDS;
+/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp,
+ const int durationToLevelDownInSeconds) {
+ const int elapsedTimeInSeconds = TimeKeeper::peekCurrentTime() - timestamp;
+ const int timeStepDurationInSeconds =
+ durationToLevelDownInSeconds / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
+ return elapsedTimeInSeconds / timeStepDurationInSeconds;
}
-ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTable() {
- mTable.resize(MAX_LEVEL + 1);
- for (int level = 0; level <= MAX_LEVEL; ++level) {
- mTable[level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1);
- const float initialProbability =
- static_cast<float>(MAX_COMPUTED_PROBABILITY / (1 << (MAX_LEVEL - level)));
- for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; ++timeStepCount) {
- if (level == 0) {
- mTable[level][timeStepCount] = NOT_A_PROBABILITY;
- continue;
+const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4;
+const int ForgettingCurveUtils::ProbabilityTable::WEAK_PROBABILITY_TABLE_ID = 0;
+const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = 1;
+const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2;
+const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3;
+const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127;
+const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 32;
+const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 35;
+const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 40;
+
+
+ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() {
+ mTables.resize(PROBABILITY_TABLE_COUNT);
+ for (int tableId = 0; tableId < PROBABILITY_TABLE_COUNT; ++tableId) {
+ mTables[tableId].resize(MAX_LEVEL + 1);
+ for (int level = 0; level <= MAX_LEVEL; ++level) {
+ mTables[tableId][level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1);
+ const float initialProbability = getBaseProbabilityForLevel(tableId, level);
+ const float endProbability = getBaseProbabilityForLevel(tableId, level - 1);
+ for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT;
+ ++timeStepCount) {
+ if (level == 0) {
+ mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY;
+ continue;
+ }
+ const float probability = initialProbability
+ * powf(initialProbability / endProbability,
+ -1.0f * static_cast<float>(timeStepCount)
+ / static_cast<float>(MAX_ELAPSED_TIME_STEP_COUNT + 1));
+ mTables[tableId][level][timeStepCount] =
+ min(max(static_cast<int>(probability), 1), MAX_PROBABILITY);
}
- const int elapsedTime = timeStepCount * TIME_STEP_DURATION_IN_SECONDS;
- const float probability = initialProbability
- * powf(2.0f, -1.0f * static_cast<float>(elapsedTime)
- / static_cast<float>(TIME_STEP_DURATION_IN_SECONDS
- * (MAX_ELAPSED_TIME_STEP_COUNT + 1)));
- mTable[level][timeStepCount] =
- min(max(static_cast<int>(probability), 1), MAX_COMPUTED_PROBABILITY);
}
}
}
+/* static */ int ForgettingCurveUtils::ProbabilityTable::getBaseProbabilityForLevel(
+ const int tableId, const int level) {
+ if (tableId == WEAK_PROBABILITY_TABLE_ID) {
+ // Max probability is 127.
+ return static_cast<float>(WEAK_MAX_PROBABILITY / (1 << (MAX_LEVEL - level)));
+ } else if (tableId == MODEST_PROBABILITY_TABLE_ID) {
+ // Max probability is 128.
+ return static_cast<float>(MODEST_BASE_PROBABILITY * (level + 1));
+ } else if (tableId == STRONG_PROBABILITY_TABLE_ID) {
+ // Max probability is 140.
+ return static_cast<float>(STRONG_BASE_PROBABILITY * (level + 1));
+ } else if (tableId == AGGRESSIVE_PROBABILITY_TABLE_ID) {
+ // Max probability is 160.
+ return static_cast<float>(AGGRESSIVE_BASE_PROBABILITY * (level + 1));
+ } else {
+ return NOT_A_PROBABILITY;
+ }
+}
+
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
index b37353455..bb8690939 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
@@ -24,31 +24,39 @@
namespace latinime {
-class DictionaryHeaderStructurePolicy;
+class HeaderPolicy;
class ForgettingCurveUtils {
public:
- static const int MAX_UNIGRAM_COUNT;
- static const int MAX_UNIGRAM_COUNT_AFTER_GC;
- static const int MAX_BIGRAM_COUNT;
- static const int MAX_BIGRAM_COUNT_AFTER_GC;
-
static const HistoricalInfo createUpdatedHistoricalInfo(
const HistoricalInfo *const originalHistoricalInfo, const int newProbability,
- const int timestamp);
+ const int timestamp, const HeaderPolicy *const headerPolicy);
static const HistoricalInfo createHistoricalInfoToSave(
- const HistoricalInfo *const originalHistoricalInfo);
+ const HistoricalInfo *const originalHistoricalInfo,
+ const HeaderPolicy *const headerPolicy);
- static int decodeProbability(const HistoricalInfo *const historicalInfo);
+ static int decodeProbability(const HistoricalInfo *const historicalInfo,
+ const HeaderPolicy *const headerPolicy);
static int getProbability(const int encodedUnigramProbability,
const int encodedBigramProbability);
- static bool needsToKeep(const HistoricalInfo *const historicalInfo);
+ static bool needsToKeep(const HistoricalInfo *const historicalInfo,
+ const HeaderPolicy *const headerPolicy);
static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount,
- const int bigramCount, const DictionaryHeaderStructurePolicy *const headerPolicy);
+ const int bigramCount, const HeaderPolicy *const headerPolicy);
+
+ AK_FORCE_INLINE static int getUnigramCountHardLimit(const int maxUnigramCount) {
+ return static_cast<int>(static_cast<float>(maxUnigramCount)
+ * UNIGRAM_COUNT_HARD_LIMIT_WEIGHT);
+ }
+
+ AK_FORCE_INLINE static int getBigramCountHardLimit(const int maxBigramCount) {
+ return static_cast<int>(static_cast<float>(maxBigramCount)
+ * BIGRAM_COUNT_HARD_LIMIT_WEIGHT);
+ }
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils);
@@ -57,32 +65,46 @@ class ForgettingCurveUtils {
public:
ProbabilityTable();
- int getProbability(const int level, const int elapsedTimeStepCount) const {
- return mTable[level][elapsedTimeStepCount];
+ int getProbability(const int tableId, const int level,
+ const int elapsedTimeStepCount) const {
+ return mTables[tableId][level][elapsedTimeStepCount];
}
private:
DISALLOW_COPY_AND_ASSIGN(ProbabilityTable);
- std::vector<std::vector<int> > mTable;
+ static const int PROBABILITY_TABLE_COUNT;
+ static const int WEAK_PROBABILITY_TABLE_ID;
+ static const int MODEST_PROBABILITY_TABLE_ID;
+ static const int STRONG_PROBABILITY_TABLE_ID;
+ static const int AGGRESSIVE_PROBABILITY_TABLE_ID;
+
+ static const int WEAK_MAX_PROBABILITY;
+ static const int MODEST_BASE_PROBABILITY;
+ static const int STRONG_BASE_PROBABILITY;
+ static const int AGGRESSIVE_BASE_PROBABILITY;
+
+ std::vector<std::vector<std::vector<int> > > mTables;
+
+ static int getBaseProbabilityForLevel(const int tableId, const int level);
};
- static const int MAX_COMPUTED_PROBABILITY;
+ static const int MULTIPLIER_TWO_IN_PROBABILITY_SCALE;
static const int DECAY_INTERVAL_SECONDS;
static const int MAX_LEVEL;
- static const int MAX_COUNT;
static const int MIN_VALID_LEVEL;
- static const int TIME_STEP_DURATION_IN_SECONDS;
static const int MAX_ELAPSED_TIME_STEP_COUNT;
static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
- static const int HALF_LIFE_TIME_IN_SECONDS;
+
+ static const float UNIGRAM_COUNT_HARD_LIMIT_WEIGHT;
+ static const float BIGRAM_COUNT_HARD_LIMIT_WEIGHT;
static const ProbabilityTable sProbabilityTable;
static int backoff(const int unigramProbability);
- static int getElapsedTimeStepCount(const int timestamp);
+ static int getElapsedTimeStepCount(const int timestamp, const int durationToLevelDown);
};
} // namespace latinime
#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */