aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp12
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp2
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp111
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h31
-rw-r--r--tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java12
6 files changed, 77 insertions, 95 deletions
diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
index 9c3dc393f..8753c6eb0 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
@@ -43,7 +43,7 @@ void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const
}
*outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags);
*outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags);
- if (mIsDecayingDict && !ForgettingCurveUtils::isValidBigram(*outProbability)) {
+ if (mIsDecayingDict && !ForgettingCurveUtils::isValidEncodedProbability(*outProbability)) {
// This bigram is too weak to output.
*outBigramPos = NOT_A_DICT_POS;
} else {
@@ -261,8 +261,8 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
const int originalProbability = BigramListReadWriteUtils::getProbabilityFromFlags(
bigramFlags);
const int probabilityToWrite = mIsDecayingDict ?
- ForgettingCurveUtils::getUpdatedBigramProbabilityDelta(
- originalProbability, probability) : probability;
+ ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability,
+ probability) : probability;
const BigramListReadWriteUtils::BigramFlags updatedFlags =
BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags,
probabilityToWrite);
@@ -294,7 +294,7 @@ bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, con
int *const writingPos) {
// hasNext is false because we are adding a new bigram entry at the end of the bigram list.
const int probabilityToWrite = mIsDecayingDict ?
- ForgettingCurveUtils::getUpdatedBigramProbabilityDelta(NOT_A_PROBABILITY, probability) :
+ ForgettingCurveUtils::getUpdatedEncodedProbability(NOT_A_PROBABILITY, probability) :
probability;
return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos,
probabilityToWrite, false /* hasNext */, writingPos);
@@ -365,9 +365,9 @@ bool DynamicBigramListPolicy::updateProbabilityForDecay(
*outRemoved = false;
if (mIsDecayingDict) {
// Update bigram probability for decaying.
- const int newProbability = ForgettingCurveUtils::getBigramProbabilityDeltaToSave(
+ const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave(
BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags));
- if (ForgettingCurveUtils::isValidBigram(newProbability)) {
+ if (ForgettingCurveUtils::isValidEncodedProbability(newProbability)) {
// Write new probability.
const BigramListReadWriteUtils::BigramFlags updatedBigramFlags =
BigramListReadWriteUtils::setProbabilityInFlags(
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
index 26ea6ab68..324b53062 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
@@ -29,14 +29,14 @@ bool DynamicPatriciaTrieGcEventListeners
bool isUselessPtNode = !node->isTerminal();
if (node->isTerminal() && mIsDecayingDict) {
const int newProbability =
- ForgettingCurveUtils::getUnigramProbabilityToSave(node->getProbability());
+ ForgettingCurveUtils::getEncodedProbabilityToSave(node->getProbability());
int writingPos = node->getProbabilityFieldPos();
// Update probability.
if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(
mBuffer, newProbability, &writingPos)) {
return false;
}
- if (!ForgettingCurveUtils::isValidUnigram(newProbability)) {
+ if (!ForgettingCurveUtils::isValidEncodedProbability(newProbability)) {
isUselessPtNode = false;
}
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
index 9dc2abe20..70a9ee564 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
@@ -545,7 +545,7 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
int DynamicPatriciaTrieWritingHelper::getUpdatedProbability(const int originalProbability,
const int newProbability) {
if (mNeedsToDecay) {
- return ForgettingCurveUtils::getUpdatedUnigramProbability(originalProbability,
+ return ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability,
newProbability);
} else {
return newProbability;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
index c525a8ab5..62a19a5a6 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#include <stdlib.h>
+
#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
@@ -26,106 +28,91 @@ const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000;
const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127;
-const int ForgettingCurveUtils::MAX_UNIGRAM_PROBABILITY = 120;
-const int ForgettingCurveUtils::MIN_VALID_UNIGRAM_PROBABILITY = 24;
-const int ForgettingCurveUtils::UNIGRAM_PROBABILITY_STEP = 8;
-const int ForgettingCurveUtils::MAX_BIGRAM_PROBABILITY_DELTA = 15;
-const int ForgettingCurveUtils::MIN_VALID_BIGRAM_PROBABILITY_DELTA = 3;
-const int ForgettingCurveUtils::BIGRAM_PROBABILITY_DELTA_STEP = 1;
+const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15;
+const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3;
+const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1;
+// Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected
+// duration of the decay is approximately 66hours.
+const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f;
/* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability,
- const int encodedBigramProbabilityDelta) {
+ const int encodedBigramProbability) {
if (encodedUnigramProbability == NOT_A_PROBABILITY) {
return NOT_A_PROBABILITY;
- } else if (encodedBigramProbabilityDelta == NOT_A_PROBABILITY) {
- const int rawProbability = ProbabilityUtils::backoff(decodeUnigramProbability(
- encodedUnigramProbability));
- return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY);
+ } else if (encodedBigramProbability == NOT_A_PROBABILITY) {
+ return backoff(decodeUnigramProbability(encodedUnigramProbability));
} else {
- const int rawProbability = ProbabilityUtils::computeProbabilityForBigram(
- decodeUnigramProbability(encodedUnigramProbability),
- decodeBigramProbabilityDelta(encodedBigramProbabilityDelta));
- return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY);
+ const int unigramProbability = decodeUnigramProbability(encodedUnigramProbability);
+ const int bigramProbability = decodeBigramProbability(encodedBigramProbability);
+ return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY);
}
}
-/* static */ int ForgettingCurveUtils::getUpdatedUnigramProbability(
+// Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding
+// (i.e. unigram probability + bigram probability delta).
+/* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability(
const int originalEncodedProbability, const int newProbability) {
if (originalEncodedProbability == NOT_A_PROBABILITY) {
- // The unigram is not in this dictionary.
- if (newProbability == NOT_A_PROBABILITY) {
- // The unigram is not in other dictionaries.
- return 0;
- } else {
- return MIN_VALID_UNIGRAM_PROBABILITY;
- }
- } else {
- if (newProbability != NOT_A_PROBABILITY
- && originalEncodedProbability < MIN_VALID_UNIGRAM_PROBABILITY) {
- return MIN_VALID_UNIGRAM_PROBABILITY;
- }
- return min(originalEncodedProbability + UNIGRAM_PROBABILITY_STEP, MAX_UNIGRAM_PROBABILITY);
- }
-}
-
-/* static */ int ForgettingCurveUtils::getUnigramProbabilityToSave(const int encodedProbability) {
- return max(encodedProbability - UNIGRAM_PROBABILITY_STEP, 0);
-}
-
-/* static */ int ForgettingCurveUtils::getBigramProbabilityDeltaToSave(
- const int encodedProbabilityDelta) {
- return max(encodedProbabilityDelta - BIGRAM_PROBABILITY_DELTA_STEP, 0);
-}
-
-/* static */ int ForgettingCurveUtils::getUpdatedBigramProbabilityDelta(
- const int originalEncodedProbabilityDelta, const int newProbability) {
- if (originalEncodedProbabilityDelta == NOT_A_PROBABILITY) {
// The bigram relation is not in this dictionary.
if (newProbability == NOT_A_PROBABILITY) {
// The bigram target is not in other dictionaries.
return 0;
} else {
- return MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+ return MIN_VALID_ENCODED_PROBABILITY;
}
} else {
if (newProbability != NOT_A_PROBABILITY
- && originalEncodedProbabilityDelta < MIN_VALID_BIGRAM_PROBABILITY_DELTA) {
- return MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+ && originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) {
+ return MIN_VALID_ENCODED_PROBABILITY;
}
- return min(originalEncodedProbabilityDelta + BIGRAM_PROBABILITY_DELTA_STEP,
- MAX_BIGRAM_PROBABILITY_DELTA);
+ return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY);
}
}
-/* static */ int ForgettingCurveUtils::isValidUnigram(const int encodedUnigramProbability) {
- return encodedUnigramProbability >= MIN_VALID_UNIGRAM_PROBABILITY;
+/* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) {
+ return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY;
}
-/* static */ int ForgettingCurveUtils::isValidBigram(const int encodedBigramProbabilityDelta) {
- return encodedBigramProbabilityDelta >= MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability) {
+ const int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0);
+ // TODO: Implement the decay in more proper way.
+ const float currentRate = static_cast<float>(currentEncodedProbability)
+ / static_cast<float>(MAX_ENCODED_PROBABILITY);
+ const float thresholdToDecay = MIN_PROBABILITY_TO_DECAY
+ + (1.0f - MIN_PROBABILITY_TO_DECAY) * (1.0f - currentRate);
+ const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+ if (thresholdToDecay < randValue) {
+ return max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, 0);
+ } else {
+ return currentEncodedProbability;
+ }
}
/* static */ int ForgettingCurveUtils::decodeUnigramProbability(const int encodedProbability) {
- const int probability = encodedProbability - MIN_VALID_UNIGRAM_PROBABILITY;
+ const int probability = encodedProbability - MIN_VALID_ENCODED_PROBABILITY;
if (probability < 0) {
return NOT_A_PROBABILITY;
} else {
- return min(probability, MAX_UNIGRAM_PROBABILITY);
+ return min(probability, MAX_ENCODED_PROBABILITY) * 8;
}
}
-/* static */ int ForgettingCurveUtils::decodeBigramProbabilityDelta(
- const int encodedProbabilityDelta) {
- const int probabilityDelta = encodedProbabilityDelta - MIN_VALID_BIGRAM_PROBABILITY_DELTA;
- if (probabilityDelta < 0) {
+/* static */ int ForgettingCurveUtils::decodeBigramProbability(const int encodedProbability) {
+ const int probability = encodedProbability - MIN_VALID_ENCODED_PROBABILITY;
+ if (probability < 0) {
return NOT_A_PROBABILITY;
} else {
- return min(probabilityDelta, MAX_BIGRAM_PROBABILITY_DELTA);
+ return min(probability, MAX_ENCODED_PROBABILITY) * 8;
}
}
-/* static */ int ForgettingCurveUtils::getDecayedProbability(const int rawProbability) {
- return rawProbability;
+// See comments in ProbabilityUtils::backoff().
+/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) {
+ if (unigramProbability == NOT_A_PROBABILITY) {
+ return NOT_A_PROBABILITY;
+ } else {
+ return max(unigramProbability - 8, 0);
+ }
}
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
index 11ffb2e2f..281f76a9c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
@@ -24,7 +24,6 @@ namespace latinime {
// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is
// required to introduced to each terminal PtNode and bigram entry.
// TODO: Quit using bigram probability to indicate the delta.
-// TODO: Quit using bigram probability delta.
class ForgettingCurveUtils {
public:
static const int MAX_UNIGRAM_COUNT;
@@ -33,38 +32,30 @@ class ForgettingCurveUtils {
static const int MAX_BIGRAM_COUNT_AFTER_GC;
static int getProbability(const int encodedUnigramProbability,
- const int encodedBigramProbabilityDelta);
+ const int encodedBigramProbability);
- static int getUpdatedUnigramProbability(const int originalEncodedProbability,
+ static int getUpdatedEncodedProbability(const int originalEncodedProbability,
const int newProbability);
- static int getUpdatedBigramProbabilityDelta(const int originalEncodedProbabilityDelta,
- const int newProbability);
-
- static int isValidUnigram(const int encodedUnigramProbability);
-
- static int isValidBigram(const int encodedProbabilityDelta);
+ static int isValidEncodedProbability(const int encodedProbability);
- static int getUnigramProbabilityToSave(const int encodedProbability);
-
- static int getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta);
+ static int getEncodedProbabilityToSave(const int encodedProbability);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils);
static const int MAX_COMPUTED_PROBABILITY;
- static const int MAX_UNIGRAM_PROBABILITY;
- static const int MIN_VALID_UNIGRAM_PROBABILITY;
- static const int UNIGRAM_PROBABILITY_STEP;
- static const int MAX_BIGRAM_PROBABILITY_DELTA;
- static const int MIN_VALID_BIGRAM_PROBABILITY_DELTA;
- static const int BIGRAM_PROBABILITY_DELTA_STEP;
+ static const int MAX_ENCODED_PROBABILITY;
+ static const int MIN_VALID_ENCODED_PROBABILITY;
+ static const int ENCODED_PROBABILITY_STEP;
+
+ static const float MIN_PROBABILITY_TO_DECAY;
static int decodeUnigramProbability(const int encodedProbability);
- static int decodeBigramProbabilityDelta(const int encodedProbability);
+ static int decodeBigramProbability(const int encodedProbability);
- static int getDecayedProbability(const int rawProbability);
+ static int backoff(const int unigramProbability);
};
} // namespace latinime
#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */
diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java
index 8a439fc22..b2d31c21f 100644
--- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java
+++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java
@@ -50,14 +50,18 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
}
private void forcePassingShortTime(final BinaryDictionary binaryDictionary) {
- binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
- binaryDictionary.flushWithGC();
+ // Entries having low probability would be suppressed once in 2 GCs.
+ final int count = 2;
+ for (int i = 0; i < count; i++) {
+ binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
+ binaryDictionary.flushWithGC();
+ }
}
private void forcePassingLongTime(final BinaryDictionary binaryDictionary) {
// Currently, probabilities are decayed when GC is run. All entries that have never been
- // typed in 32 GCs are removed.
- final int count = 32;
+ // typed in 128 GCs would be removed.
+ final int count = 128;
for (int i = 0; i < count; i++) {
binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
binaryDictionary.flushWithGC();