aboutsummaryrefslogtreecommitdiffstats
path: root/native
diff options
context:
space:
mode:
author Keisuke Kuroyanagi <ksk@google.com> 2014-10-06 22:03:11 +0900
committer Keisuke Kuroyanagi <ksk@google.com> 2014-10-06 22:03:11 +0900
commit aae1a062eb98e7634fb4da996ec604bbefd4ddf5 (patch)
tree 26ef38496cd7e4658e442f8b7859404f9dbf75d4 /native
parent 36c4eaadfb7a9cb833c8cca8afb49b2aec347b1d (diff)
download latinime-aae1a062eb98e7634fb4da996ec604bbefd4ddf5.tar.gz
latinime-aae1a062eb98e7634fb4da996ec604bbefd4ddf5.tar.xz
latinime-aae1a062eb98e7634fb4da996ec604bbefd4ddf5.zip
Improve bigram probability computation for decaying dicts.
Without personalization:
Total words: 1079345, Success Num: 819749, Success Percentage: 75.949%
Bad Failures, with auto-correction (typed word == expected word, output word != expected word): 1754, Bad Failure Percentage: 0.163%
Failures, with auto-correction (F-C): 28463, F-C Percentage: 2.637%
Max Keystrokes: 6074285, Min Keystrokes: 4649326, Keystroke Saving Percentage: 23.459%

With current probability computing logic:
Total words: 1079382, Success Num: 838329, Success Percentage: 77.667%
Bad Failures, with auto-correction (typed word == expected word, output word != expected word): 1332, Bad Failure Percentage: 0.123%
Failures, with auto-correction (F-C): 28558, F-C Percentage: 2.646%
Max Keystrokes: 6074503, Min Keystrokes: 4474102, Keystroke Saving Percentage: 26.346%
Remove isof files.

With new probability computing logic:
Total words: 1079356, Success Num: 844954, Success Percentage: 78.283%
Bad Failures, with auto-correction (typed word == expected word, output word != expected word): 1306, Bad Failure Percentage: 0.121%
Failures, with auto-correction (F-C): 27214, F-C Percentage: 2.521%
Max Keystrokes: 6074477, Min Keystrokes: 4243021, Keystroke Saving Percentage: 30.150%
Remove isof files.

Bug: 16547409
Change-Id: I3d2a49c7aaa2c0f6835c52ef72d22466ee225789
Diffstat (limited to 'native')
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp | 8
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h | 31
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp | 80
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h | 2
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp | 55
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h | 5
6 files changed, 83 insertions, 98 deletions
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
index 6ed65d921..4c4dfc578 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
@@ -35,23 +35,15 @@ const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE
// count.
const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO";
const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration
-const char *const HeaderPolicy::FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY =
- "FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP";
const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
"FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
-const char *const HeaderPolicy::FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY =
- "FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS";
const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT";
const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT";
const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
-const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP = 2;
const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3;
-// 30 days
-const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS =
- 30 * 24 * 60 * 60;
const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000;
const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 10000;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
index daf40d4f9..bc8eaded3 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
@@ -53,15 +53,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)),
mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
&mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
- mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY,
- DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
- mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
- DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
@@ -86,15 +80,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0),
mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
&mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
- mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY,
- DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
- mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
- DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
@@ -113,12 +101,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mUnigramCount(headerPolicy->mUnigramCount), mBigramCount(headerPolicy->mBigramCount),
mExtendedRegionSize(headerPolicy->mExtendedRegionSize),
mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords),
- mForgettingCurveOccurrencesToLevelUp(
- headerPolicy->mForgettingCurveOccurrencesToLevelUp),
mForgettingCurveProbabilityValuesTableId(
headerPolicy->mForgettingCurveProbabilityValuesTableId),
- mForgettingCurveDurationToLevelDown(
- headerPolicy->mForgettingCurveDurationToLevelDown),
mMaxUnigramCount(headerPolicy->mMaxUnigramCount),
mMaxBigramCount(headerPolicy->mMaxBigramCount),
mCodePointTable(headerPolicy->mCodePointTable) {}
@@ -130,8 +114,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
- mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
- mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0),
+ mForgettingCurveProbabilityValuesTableId(0), mMaxUnigramCount(0), mMaxBigramCount(0),
mCodePointTable(nullptr) {}
~HeaderPolicy() {}
@@ -217,18 +200,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return &mAttributeMap;
}
- AK_FORCE_INLINE int getForgettingCurveOccurrencesToLevelUp() const {
- return mForgettingCurveOccurrencesToLevelUp;
- }
-
AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const {
return mForgettingCurveProbabilityValuesTableId;
}
- AK_FORCE_INLINE int getForgettingCurveDurationToLevelDown() const {
- return mForgettingCurveDurationToLevelDown;
- }
-
AK_FORCE_INLINE int getMaxUnigramCount() const {
return mMaxUnigramCount;
}
@@ -280,9 +255,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
static const char *const MAX_BIGRAM_COUNT_KEY;
static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
- static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP;
static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
- static const int DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS;
static const int DEFAULT_MAX_UNIGRAM_COUNT;
static const int DEFAULT_MAX_BIGRAM_COUNT;
@@ -300,9 +273,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
const int mBigramCount;
const int mExtendedRegionSize;
const bool mHasHistoricalInfoOfWords;
- const int mForgettingCurveOccurrencesToLevelUp;
const int mForgettingCurveProbabilityValuesTableId;
- const int mForgettingCurveDurationToLevelDown;
const int mMaxUnigramCount;
const int mMaxBigramCount;
const int *const mCodePointTable;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
index 11f7b305f..e0c3af8df 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
@@ -146,18 +146,15 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probabi
int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability,
const int bigramProbability) const {
- if (mHeaderPolicy->isDecayingDict()) {
- // Both probabilities are encoded. Decode them and get probability.
- return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability);
- } else {
- if (unigramProbability == NOT_A_PROBABILITY) {
- return NOT_A_PROBABILITY;
- } else if (bigramProbability == NOT_A_PROBABILITY) {
- return ProbabilityUtils::backoff(unigramProbability);
- } else {
- return bigramProbability;
- }
+ // In the v4 format, bigramProbability is a conditional probability.
+ const int bigramConditionalProbability = bigramProbability;
+ if (unigramProbability == NOT_A_PROBABILITY) {
+ return NOT_A_PROBABILITY;
}
+ if (bigramConditionalProbability == NOT_A_PROBABILITY) {
+ return ProbabilityUtils::backoff(unigramProbability);
+ }
+ return bigramConditionalProbability;
}
int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
@@ -170,37 +167,66 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI
if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) {
return NOT_A_PROBABILITY;
}
- if (!prevWordIds.empty()) {
- const int bigramsPosition = getBigramsPositionOfPtNode(
- getTerminalPtNodePosFromWordId(prevWordIds[0]));
- BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
- while (bigramsIt.hasNext()) {
- bigramsIt.next();
- if (bigramsIt.getBigramPos() == ptNodePos
- && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
- return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability());
- }
- }
+ if (prevWordIds.empty()) {
+ return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
+ }
+ if (prevWordIds[0] == NOT_A_WORD_ID) {
return NOT_A_PROBABILITY;
}
- return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
+ const PtNodeParams prevWordPtNodeParams =
+ mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]);
+ if (prevWordPtNodeParams.isDeleted()) {
+ return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
+ }
+ const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos(
+ prevWordPtNodeParams.getTerminalId());
+ BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
+ while (bigramsIt.hasNext()) {
+ bigramsIt.next();
+ if (bigramsIt.getBigramPos() == ptNodePos
+ && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
+ const int bigramConditionalProbability = getBigramConditionalProbability(
+ prevWordPtNodeParams.getProbability(), bigramsIt.getProbability());
+ return getProbability(ptNodeParams.getProbability(), bigramConditionalProbability);
+ }
+ }
+ return NOT_A_PROBABILITY;
}
void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
NgramListener *const listener) const {
- if (prevWordIds.empty()) {
+ if (prevWordIds.firstOrDefault(NOT_A_DICT_POS) == NOT_A_DICT_POS) {
+ return;
+ }
+ const PtNodeParams prevWordPtNodeParams =
+ mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]);
+ if (prevWordPtNodeParams.isDeleted()) {
return;
}
- const int bigramsPosition = getBigramsPositionOfPtNode(
- getTerminalPtNodePosFromWordId(prevWordIds[0]));
+ const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos(
+ prevWordPtNodeParams.getTerminalId());
BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
while (bigramsIt.hasNext()) {
bigramsIt.next();
- listener->onVisitEntry(bigramsIt.getProbability(),
+ const int bigramConditionalProbability = getBigramConditionalProbability(
+ prevWordPtNodeParams.getProbability(), bigramsIt.getProbability());
+ listener->onVisitEntry(bigramConditionalProbability,
getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
}
}
+int Ver4PatriciaTriePolicy::getBigramConditionalProbability(const int prevWordUnigramProbability,
+ const int bigramProbability) const {
+ if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
+ // Calculate conditional probability.
+ return std::min(MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability,
+ MAX_PROBABILITY);
+ } else {
+ // bigramProbability is a conditional probability.
+ return bigramProbability;
+ }
+}
+
BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator(
const int wordId) const {
const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId));
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h
index 995d7764f..b82563e61 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h
@@ -174,6 +174,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
int getTerminalPtNodePosFromWordId(const int wordId) const;
const WordAttributes getWordAttributes(const int probability,
const PtNodeParams &ptNodeParams) const;
+ int getBigramConditionalProbability(const int prevWordUnigramProbability,
+ const int bigramProbability) const;
};
} // namespace v402
} // namespace backward
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
index af4bc186a..e5ef2abf8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
@@ -29,10 +29,14 @@ namespace latinime {
const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8;
const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
-const int ForgettingCurveUtils::MAX_LEVEL = 3;
-const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 1;
-const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 15;
-const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 14;
+const int ForgettingCurveUtils::MAX_LEVEL = 15;
+const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 2;
+const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 31;
+const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 30;
+const int ForgettingCurveUtils::OCCURRENCES_TO_RAISE_THE_LEVEL = 1;
+// TODO: Evaluate whether this should be 7.5 days.
+// 15 days
+const int ForgettingCurveUtils::DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS = 15 * 24 * 60 * 60;
const float ForgettingCurveUtils::UNIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2;
const float ForgettingCurveUtils::BIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2;
@@ -54,19 +58,23 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
|| (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel()
&& originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) {
// Initial information.
+ int count = newHistoricalInfo->getCount();
+ if (count >= OCCURRENCES_TO_RAISE_THE_LEVEL) {
+ const int level = clampToValidLevelRange(newHistoricalInfo->getLevel() + 1);
+ return HistoricalInfo(timestamp, level, 0 /* count */);
+ }
const int level = clampToValidLevelRange(newHistoricalInfo->getLevel());
- const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy);
- return HistoricalInfo(timestamp, level, count);
+ return HistoricalInfo(timestamp, level, clampToValidCountRange(count, headerPolicy));
} else {
const int updatedCount = originalHistoricalInfo->getCount() + 1;
- if (updatedCount >= headerPolicy->getForgettingCurveOccurrencesToLevelUp()) {
+ if (updatedCount >= OCCURRENCES_TO_RAISE_THE_LEVEL) {
// The count exceeds the max value the level can be incremented.
if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) {
// The level is already max.
return HistoricalInfo(timestamp,
originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount());
} else {
- // Level up.
+ // Raise the level.
return HistoricalInfo(timestamp,
originalHistoricalInfo->getLevel() + 1, 0 /* count */);
}
@@ -79,31 +87,18 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
/* static */ int ForgettingCurveUtils::decodeProbability(
const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) {
const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimestamp(),
- headerPolicy->getForgettingCurveDurationToLevelDown());
+ DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS);
return sProbabilityTable.getProbability(
headerPolicy->getForgettingCurveProbabilityValuesTableId(),
clampToValidLevelRange(historicalInfo->getLevel()),
clampToValidTimeStepCountRange(elapsedTimeStepCount));
}
-/* static */ int ForgettingCurveUtils::getProbability(const int unigramProbability,
- const int bigramProbability) {
- if (unigramProbability == NOT_A_PROBABILITY) {
- return NOT_A_PROBABILITY;
- } else if (bigramProbability == NOT_A_PROBABILITY) {
- return std::min(backoff(unigramProbability), MAX_PROBABILITY);
- } else {
- // TODO: Investigate better way to handle bigram probability.
- return std::min(std::max(unigramProbability,
- bigramProbability + MULTIPLIER_TWO_IN_PROBABILITY_SCALE), MAX_PROBABILITY);
- }
-}
-
/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo,
const HeaderPolicy *const headerPolicy) {
return historicalInfo->getLevel() > 0
|| getElapsedTimeStepCount(historicalInfo->getTimestamp(),
- headerPolicy->getForgettingCurveDurationToLevelDown())
+ DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS)
< DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
}
@@ -113,14 +108,14 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
if (originalHistoricalInfo->getTimestamp() == NOT_A_TIMESTAMP) {
return HistoricalInfo();
}
- const int durationToLevelDownInSeconds = headerPolicy->getForgettingCurveDurationToLevelDown();
+ const int durationToLevelDownInSeconds = DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS;
const int elapsedTimeStep = getElapsedTimeStepCount(
originalHistoricalInfo->getTimestamp(), durationToLevelDownInSeconds);
if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) {
// No need to update historical info.
return *originalHistoricalInfo;
}
- // Level down.
+ // Lower the level.
const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ?
originalHistoricalInfo->getLevel() : maxLevelDownAmonut;
@@ -170,7 +165,7 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
/* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count,
const HeaderPolicy *const headerPolicy) {
- return std::min(std::max(count, 0), headerPolicy->getForgettingCurveOccurrencesToLevelUp() - 1);
+ return std::min(std::max(count, 0), OCCURRENCES_TO_RAISE_THE_LEVEL - 1);
}
/* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) {
@@ -187,9 +182,9 @@ const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID =
const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2;
const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3;
const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127;
-const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 32;
-const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 35;
-const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 40;
+const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 8;
+const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 9;
+const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 10;
ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() {
@@ -202,7 +197,7 @@ ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() {
const float endProbability = getBaseProbabilityForLevel(tableId, level - 1);
for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT;
++timeStepCount) {
- if (level == 0) {
+ if (level < MIN_VISIBLE_LEVEL) {
mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY;
continue;
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
index 10abb405a..ccbc4a98d 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
@@ -39,9 +39,6 @@ class ForgettingCurveUtils {
static int decodeProbability(const HistoricalInfo *const historicalInfo,
const HeaderPolicy *const headerPolicy);
- static int getProbability(const int encodedUnigramProbability,
- const int encodedBigramProbability);
-
static bool needsToKeep(const HistoricalInfo *const historicalInfo,
const HeaderPolicy *const headerPolicy);
@@ -101,6 +98,8 @@ class ForgettingCurveUtils {
static const int MIN_VISIBLE_LEVEL;
static const int MAX_ELAPSED_TIME_STEP_COUNT;
static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
+ static const int OCCURRENCES_TO_RAISE_THE_LEVEL;
+ static const int DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS;
static const float UNIGRAM_COUNT_HARD_LIMIT_WEIGHT;
static const float BIGRAM_COUNT_HARD_LIMIT_WEIGHT;