aboutsummaryrefslogtreecommitdiffstats
path: root/native
diff options
context:
space:
mode:
authorKeisuke Kuroyanagi <ksk@google.com>2013-09-30 20:53:35 +0900
committerKeisuke Kuroyanagi <ksk@google.com>2013-09-30 20:53:35 +0900
commitfd02b2d6ee55d4aee7faab89a7a2b72764eafc47 (patch)
tree25362c4a967d08ff2297e8f5663f486914c8984d /native
parent42941dd2fd2a6b3a89cbb7ab733980465b4895a6 (diff)
downloadlatinime-fd02b2d6ee55d4aee7faab89a7a2b72764eafc47.tar.gz
latinime-fd02b2d6ee55d4aee7faab89a7a2b72764eafc47.tar.xz
latinime-fd02b2d6ee55d4aee7faab89a7a2b72764eafc47.zip
Implement simple dictionary decay.
Groundwork and implement simple decay. Increment probability when typed and decrement probability at GC. Bug: 6669677 Change-Id: Ib12caead0cbeef4ce7808fe8ac0b00ee331523fe
Diffstat (limited to 'native')
-rw-r--r--native/jni/Android.mk1
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp63
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h10
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp15
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h7
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp77
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h6
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp40
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h11
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp129
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h70
11 files changed, 392 insertions, 37 deletions
diff --git a/native/jni/Android.mk b/native/jni/Android.mk
index 0594ddff0..36afea54b 100644
--- a/native/jni/Android.mk
+++ b/native/jni/Android.mk
@@ -85,6 +85,7 @@ LATIN_IME_CORE_SRC_FILES := \
$(addprefix suggest/policyimpl/dictionary/utils/, \
buffer_with_extendable_buffer.cpp \
byte_array_utils.cpp \
+ decaying_utils.cpp \
dict_file_writing_utils.cpp \
format_utils.cpp) \
suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp \
diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
index e02f4cbf1..67a085de3 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
@@ -17,10 +17,10 @@
#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h"
#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
-#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
namespace latinime {
@@ -41,9 +41,14 @@ void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const
if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) {
originalBigramPos += mBuffer->getOriginalBufferSize();
}
- *outBigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos);
*outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags);
*outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags);
+ if (mIsDecayingDict && !DecayingUtils::isValidBigram(*outProbability)) {
+ // This bigram is too weak to output.
+ *outBigramPos = NOT_A_DICT_POS;
+ } else {
+ *outBigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos);
+ }
if (usesAdditionalBuffer) {
*bigramEntryPos += mBuffer->getOriginalBufferSize();
}
@@ -153,15 +158,21 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
const int bigramTargetNodePos =
followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos);
nodeReader.fetchNodeInfoInBufferFromPtNodePos(bigramTargetNodePos);
- // TODO: Update probability for supporting probability decaying.
if (nodeReader.isDeleted() || !nodeReader.isTerminal()
|| bigramTargetNodePos == NOT_A_DICT_POS) {
// The target is no longer valid terminal. Invalidate the current bigram entry.
if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags,
- NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos)) {
+ NOT_A_DICT_POS /* targetPtNodePos */, &bigramEntryPos)) {
return false;
}
- } else {
+ continue;
+ }
+ bool isRemoved = false;
+ if (!updateProbabilityForDecay(bigramFlags, bigramTargetNodePos, &bigramEntryPos,
+ &isRemoved)) {
+ return false;
+ }
+ if (!isRemoved) {
(*outValidBigramEntryCount) += 1;
}
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
@@ -247,8 +258,14 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) {
// Update this bigram entry.
*outAddedNewBigram = false;
+ const int originalProbability = BigramListReadWriteUtils::getProbabilityFromFlags(
+ bigramFlags);
+ const int probabilityToWrite = mIsDecayingDict ?
+ DecayingUtils::getUpdatedBigramProbabilityDelta(
+ originalProbability, probability) : probability;
const BigramListReadWriteUtils::BigramFlags updatedFlags =
- BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probability);
+ BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags,
+ probabilityToWrite);
return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags,
originalBigramPos, &entryPos);
}
@@ -276,8 +293,11 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, const int probability,
int *const writingPos) {
// hasNext is false because we are adding a new bigram entry at the end of the bigram list.
+ const int probabilityToWrite = mIsDecayingDict ?
+ DecayingUtils::getUpdatedBigramProbabilityDelta(NOT_A_PROBABILITY, probability) :
+ probability;
return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos,
- probability, false /* hasNext */, writingPos);
+ probabilityToWrite, false /* hasNext */, writingPos);
}
bool DynamicBigramListPolicy::removeBigram(const int bigramListPos, const int bigramTargetPos) {
@@ -339,4 +359,33 @@ int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos(
return currentPos;
}
+bool DynamicBigramListPolicy::updateProbabilityForDecay(
+ BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos,
+ int *const bigramEntryPos, bool *const outRemoved) const {
+ *outRemoved = false;
+ if (mIsDecayingDict) {
+ // Update bigram probability for decaying.
+ const int newProbability = DecayingUtils::getBigramProbabilityDeltaToSave(
+ BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags));
+ if (DecayingUtils::isValidBigram(newProbability)) {
+ // Write new probability.
+ const BigramListReadWriteUtils::BigramFlags updatedBigramFlags =
+ BigramListReadWriteUtils::setProbabilityInFlags(
+ bigramFlags, newProbability);
+ if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedBigramFlags,
+ targetPtNodePos, bigramEntryPos)) {
+ return false;
+ }
+ } else {
+ // Remove current bigram entry.
+ *outRemoved = true;
+ if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags,
+ NOT_A_DICT_POS /* targetPtNodePos */, bigramEntryPos)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h
index 3ebf69946..b358b4ed5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h
@@ -21,6 +21,7 @@
#include "defines.h"
#include "suggest/core/policy/dictionary_bigrams_structure_policy.h"
+#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
namespace latinime {
@@ -34,8 +35,9 @@ class DictionaryShortcutsStructurePolicy;
class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
public:
DynamicBigramListPolicy(BufferWithExtendableBuffer *const buffer,
- const DictionaryShortcutsStructurePolicy *const shortcutPolicy)
- : mBuffer(buffer), mShortcutPolicy(shortcutPolicy) {}
+ const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
+ const bool isDecayingDict)
+ : mBuffer(buffer), mShortcutPolicy(shortcutPolicy), mIsDecayingDict(isDecayingDict) {}
~DynamicBigramListPolicy() {}
@@ -74,9 +76,13 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
BufferWithExtendableBuffer *const mBuffer;
const DictionaryShortcutsStructurePolicy *const mShortcutPolicy;
+ const bool mIsDecayingDict;
// Follow bigram link and return the position of bigram target PtNode that is currently valid.
int followBigramLinkAndGetCurrentBigramPtNodePos(const int originalBigramPos) const;
+
+ bool updateProbabilityForDecay(BigramListReadWriteUtils::BigramFlags bigramFlags,
+ const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const;
};
} // namespace latinime
#endif // LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
index 5f755c302..081163a4d 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
@@ -16,6 +16,8 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h"
+#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
+
namespace latinime {
bool DynamicPatriciaTrieGcEventListeners
@@ -25,6 +27,19 @@ bool DynamicPatriciaTrieGcEventListeners
// PtNode is useless when the PtNode is not a terminal and doesn't have any not useless
// children.
bool isUselessPtNode = !node->isTerminal();
+ if (node->isTerminal() && mIsDecayingDict) {
+ const int newProbability =
+ DecayingUtils::getUnigramProbabilityToSave(node->getProbability());
+ int writingPos = node->getProbabilityFieldPos();
+ // Update probability.
+ if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(
+ mBuffer, newProbability, &writingPos)) {
+ return false;
+ }
+ if (!DecayingUtils::isValidUnigram(newProbability)) {
+ isUselessPtNode = false;
+ }
+ }
if (mChildrenValue > 0) {
isUselessPtNode = false;
} else if (node->isTerminal()) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
index 301998882..463715af5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
@@ -39,9 +39,9 @@ class DynamicPatriciaTrieGcEventListeners {
public:
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
DynamicPatriciaTrieWritingHelper *const writingHelper,
- BufferWithExtendableBuffer *const buffer)
- : mWritingHelper(writingHelper), mBuffer(buffer), mValueStack(),
- mChildrenValue(0), mValidUnigramCount(0) {}
+ BufferWithExtendableBuffer *const buffer, const bool isDecayingDict)
+ : mWritingHelper(writingHelper), mBuffer(buffer), mIsDecayingDict(isDecayingDict),
+ mValueStack(), mChildrenValue(0), mValidUnigramCount(0) {}
~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
@@ -74,6 +74,7 @@ class DynamicPatriciaTrieGcEventListeners {
DynamicPatriciaTrieWritingHelper *const mWritingHelper;
BufferWithExtendableBuffer *const mBuffer;
+ const int mIsDecayingDict;
std::vector<int> mValueStack;
int mChildrenValue;
int mValidUnigramCount;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
index 8c0890e2e..0d8c92768 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
@@ -18,6 +18,7 @@
#include <cstdio>
#include <cstring>
+#include <ctime>
#include "defines.h"
#include "suggest/core/dicnode/dic_node.h"
@@ -27,12 +28,17 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
+#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
namespace latinime {
const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
+const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024;
+const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
+ DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024;
+const int DynamicPatriciaTriePolicy::MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING = 2 * 60 * 60;
void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
DicNodeVector *const childDicNodes) const {
@@ -143,14 +149,17 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in
int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability,
const int bigramProbability) const {
- // TODO: check mHeaderPolicy.usesForgettingCurve();
- if (unigramProbability == NOT_A_PROBABILITY) {
- return NOT_A_PROBABILITY;
- } else if (bigramProbability == NOT_A_PROBABILITY) {
- return ProbabilityUtils::backoff(unigramProbability);
+ if (mHeaderPolicy.isDecayingDict()) {
+ return DecayingUtils::getProbability(unigramProbability, bigramProbability);
} else {
- return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
- bigramProbability);
+ if (unigramProbability == NOT_A_PROBABILITY) {
+ return NOT_A_PROBABILITY;
+ } else if (bigramProbability == NOT_A_PROBABILITY) {
+ return ProbabilityUtils::backoff(unigramProbability);
+ } else {
+ return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
+ bigramProbability);
+ }
}
}
@@ -199,11 +208,16 @@ bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int
AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary.");
return false;
}
+ if (mBufferWithExtendableBuffer.getTailPosition()
+ >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update.");
+ return false;
+ }
DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer,
getBigramsStructurePolicy(), getShortcutsStructurePolicy());
readingHelper.initWithPtNodeArrayPos(getRootPosition());
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
- &mBigramListPolicy, &mShortcutListPolicy);
+ &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
bool addedNewUnigram = false;
if (writingHelper.addUnigramWord(&readingHelper, word, length, probability,
&addedNewUnigram)) {
@@ -222,6 +236,11 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int
AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
return false;
}
+ if (mBufferWithExtendableBuffer.getTailPosition()
+ >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update.");
+ return false;
+ }
const int word0Pos = getTerminalNodePositionOfWord(word0, length0,
false /* forceLowerCaseSearch */);
if (word0Pos == NOT_A_DICT_POS) {
@@ -233,7 +252,7 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int
return false;
}
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
- &mBigramListPolicy, &mShortcutListPolicy);
+ &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
bool addedNewBigram = false;
if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) {
if (addedNewBigram) {
@@ -251,6 +270,11 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const
AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary.");
return false;
}
+ if (mBufferWithExtendableBuffer.getTailPosition()
+ >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update.");
+ return false;
+ }
const int word0Pos = getTerminalNodePositionOfWord(word0, length0,
false /* forceLowerCaseSearch */);
if (word0Pos == NOT_A_DICT_POS) {
@@ -262,7 +286,7 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const
return false;
}
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
- &mBigramListPolicy, &mShortcutListPolicy);
+ &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
if (writingHelper.removeBigramWords(word0Pos, word1Pos)) {
mBigramCount--;
return true;
@@ -277,7 +301,7 @@ void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
return;
}
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
- &mBigramListPolicy, &mShortcutListPolicy);
+ &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount);
}
@@ -287,7 +311,7 @@ void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
return;
}
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
- &mBigramListPolicy, &mShortcutListPolicy);
+ &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy);
}
@@ -296,8 +320,33 @@ bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
return false;
}
- // TODO: Implement more properly.
- return mBufferWithExtendableBuffer.isNearSizeLimit();
+ if (mBufferWithExtendableBuffer.isNearSizeLimit()) {
+ // Additional buffer size is near the limit.
+ return true;
+ } else if (mHeaderPolicy.getExtendedRegionSize()
+ + mBufferWithExtendableBuffer.getUsedAdditionalBufferSize()
+ > MAX_DICT_EXTENDED_REGION_SIZE) {
+ // Total extended region size exceeds the limit.
+ return true;
+ } else if (mBufferWithExtendableBuffer.getTailPosition()
+ >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
+ && mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() > 0) {
+ // Needs to reduce dictionary size.
+ return true;
+ } else if (mHeaderPolicy.isDecayingDict()) {
+ if (mUnigramCount >= DecayingUtils::MAX_UNIGRAM_COUNT) {
+ // Unigram count exceeds the limit.
+ return true;
+ } else if (mBigramCount >= DecayingUtils::MAX_BIGRAM_COUNT) {
+ // Bigram count exceeds the limit.
+ return true;
+ } else if (mindsBlockByGC && mHeaderPolicy.getLastUpdatedTime()
+ + MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING < time(0)) {
+ // Time to update probabilities for decaying.
+ return true;
+ }
+ }
+ return false;
}
void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult,
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
index bdb436c8e..d3150c6fc 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
@@ -37,7 +37,8 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(),
mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
mShortcutListPolicy(&mBufferWithExtendableBuffer),
- mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy),
+ mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy,
+ mHeaderPolicy.isDecayingDict()),
mUnigramCount(mHeaderPolicy.getUnigramCount()),
mBigramCount(mHeaderPolicy.getBigramCount()) {}
@@ -101,6 +102,9 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
static const char*const UNIGRAM_COUNT_QUERY;
static const char*const BIGRAM_COUNT_QUERY;
+ static const int MAX_DICT_EXTENDED_REGION_SIZE;
+ static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
+ static const int MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING;
const MmappedBuffer *const mBuffer;
const HeaderPolicy mHeaderPolicy;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
index 2a2e9bcbe..28124d251 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
@@ -25,6 +25,7 @@
#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h"
+#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
#include "utils/hash_map_compat.h"
@@ -57,7 +58,9 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
wordCodePoints[matchedCodePointCount + j])) {
*outAddedNewUnigram = true;
return reallocatePtNodeAndAddNewPtNodes(nodeReader,
- readingHelper->getMergedNodeCodePoints(), j, probability,
+ readingHelper->getMergedNodeCodePoints(), j,
+ getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */,
+ probability),
wordCodePoints + matchedCodePointCount,
codePointCount - matchedCodePointCount);
}
@@ -69,7 +72,8 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
}
if (!nodeReader->hasChildren()) {
*outAddedNewUnigram = true;
- return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, probability,
+ return createChildrenPtNodeArrayAndAChildPtNode(nodeReader,
+ getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability),
wordCodePoints + readingHelper->getTotalCodePointCount(),
codePointCount - readingHelper->getTotalCodePointCount());
}
@@ -86,7 +90,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
return createAndInsertNodeIntoPtNodeArray(parentPos,
wordCodePoints + readingHelper->getPrevTotalCodePointCount(),
codePointCount - readingHelper->getPrevTotalCodePointCount(),
- probability, &pos);
+ getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), &pos);
}
bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos,
@@ -351,9 +355,11 @@ bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability(
if (originalPtNode->isTerminal()) {
// Overwrites the probability.
*outAddedNewUnigram = false;
+ const int probabilityToWrite = getUpdatedProbability(originalPtNode->getProbability(),
+ probability);
int probabilityFieldPos = originalPtNode->getProbabilityFieldPos();
if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer,
- probability, &probabilityFieldPos)) {
+ probabilityToWrite, &probabilityFieldPos)) {
return false;
}
} else {
@@ -365,7 +371,8 @@ bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability(
}
if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, originalPtNode,
originalPtNode->getParentPos(), codePoints, originalPtNode->getCodePointCount(),
- probability, &movedPos)) {
+ getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability),
+ &movedPos)) {
return false;
}
}
@@ -481,11 +488,15 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
DynamicPatriciaTrieGcEventListeners
::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
- this, mBuffer);
+ this, mBuffer, mIsDecayingDict);
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
&traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
return false;
}
+ if (mIsDecayingDict && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+ .getValidUnigramCount() > DecayingUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
+ // TODO: Remove more unigrams.
+ }
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability
@@ -495,6 +506,11 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
return false;
}
+ if (mIsDecayingDict && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
+ > DecayingUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
+ // TODO: Remove more bigrams.
+ }
+
// Mapping from positions in mBuffer to positions in bufferToWrite.
DictPositionRelocationMap dictPositionRelocationMap;
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
@@ -508,7 +524,8 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
// Create policy instance for the GCed dictionary.
DynamicShortcutListPolicy newDictShortcutPolicy(bufferToWrite);
- DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy);
+ DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy,
+ mIsDecayingDict);
// Create reading helper for the GCed dictionary.
DynamicPatriciaTrieReadingHelper newDictReadingHelper(bufferToWrite, &newDictBigramPolicy,
&newDictShortcutPolicy);
@@ -525,4 +542,13 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
return true;
}
+int DynamicPatriciaTrieWritingHelper::getUpdatedProbability(const int originalProbability,
+ const int newProbability) {
+ if (mIsDecayingDict) {
+ return DecayingUtils::getUpdatedUnigramProbability(originalProbability, newProbability);
+ } else {
+ return newProbability;
+ }
+}
+
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h
index 827b6097f..ecee2cdbf 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h
@@ -47,10 +47,13 @@ class DynamicPatriciaTrieWritingHelper {
DISALLOW_COPY_AND_ASSIGN(DictPositionRelocationMap);
};
+ static const size_t MAX_DICTIONARY_SIZE;
+
DynamicPatriciaTrieWritingHelper(BufferWithExtendableBuffer *const buffer,
DynamicBigramListPolicy *const bigramPolicy,
- DynamicShortcutListPolicy *const shortcutPolicy)
- : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {}
+ DynamicShortcutListPolicy *const shortcutPolicy, const bool isDecayingDict)
+ : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy),
+ mIsDecayingDict(isDecayingDict) {}
~DynamicPatriciaTrieWritingHelper() {}
@@ -87,11 +90,11 @@ class DynamicPatriciaTrieWritingHelper {
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper);
static const int CHILDREN_POSITION_FIELD_SIZE;
- static const size_t MAX_DICTIONARY_SIZE;
BufferWithExtendableBuffer *const mBuffer;
DynamicBigramListPolicy *const mBigramPolicy;
DynamicShortcutListPolicy *const mShortcutPolicy;
+ const bool mIsDecayingDict;
bool markNodeAsMovedAndSetPosition(const DynamicPatriciaTrieNodeReader *const nodeToUpdate,
const int movedPos, const int bigramLinkedNodePos);
@@ -127,6 +130,8 @@ class DynamicPatriciaTrieWritingHelper {
bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite,
int *const outUnigramCount, int *const outBigramCount);
+
+ int getUpdatedProbability(const int originalProbability, const int newProbability);
};
} // namespace latinime
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp
new file mode 100644
index 000000000..942a74238
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
+
+#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
+
+namespace latinime {
+
+const int DecayingUtils::MAX_UNIGRAM_COUNT = 12000;
+const int DecayingUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000;
+const int DecayingUtils::MAX_BIGRAM_COUNT = 12000;
+const int DecayingUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
+
+const int DecayingUtils::MAX_COMPUTED_PROBABILITY = 127;
+const int DecayingUtils::MAX_UNIGRAM_PROBABILITY = 120;
+const int DecayingUtils::MIN_VALID_UNIGRAM_PROBABILITY = 24;
+const int DecayingUtils::UNIGRAM_PROBABILITY_STEP = 8;
+const int DecayingUtils::MAX_BIGRAM_PROBABILITY_DELTA = 15;
+const int DecayingUtils::MIN_VALID_BIGRAM_PROBABILITY_DELTA = 3;
+const int DecayingUtils::BIGRAM_PROBABILITY_DELTA_STEP = 1;
+
+/* static */ int DecayingUtils::getProbability(const int encodedUnigramProbability,
+ const int encodedBigramProbabilityDelta) {
+ if (encodedUnigramProbability == NOT_A_PROBABILITY) {
+ return NOT_A_PROBABILITY;
+ } else if (encodedBigramProbabilityDelta == NOT_A_PROBABILITY) {
+ const int rawProbability = ProbabilityUtils::backoff(decodeUnigramProbability(
+ encodedUnigramProbability));
+ return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY);
+ } else {
+ const int rawProbability = ProbabilityUtils::computeProbabilityForBigram(
+ decodeUnigramProbability(encodedUnigramProbability),
+ decodeBigramProbabilityDelta(encodedBigramProbabilityDelta));
+ return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY);
+ }
+}
+
+/* static */ int DecayingUtils::getUpdatedUnigramProbability(const int originalEncodedProbability,
+ const int newProbability) {
+ if (originalEncodedProbability == NOT_A_PROBABILITY) {
+ // The unigram is not in this dictionary.
+ if (newProbability == NOT_A_PROBABILITY) {
+ // The unigram is not in other dictionaries.
+ return 0;
+ } else {
+ return MIN_VALID_UNIGRAM_PROBABILITY;
+ }
+ } else {
+ if (newProbability != NOT_A_PROBABILITY
+ && originalEncodedProbability < MIN_VALID_UNIGRAM_PROBABILITY) {
+ return MIN_VALID_UNIGRAM_PROBABILITY;
+ }
+ return min(originalEncodedProbability + UNIGRAM_PROBABILITY_STEP, MAX_UNIGRAM_PROBABILITY);
+ }
+}
+
+/* static */ int DecayingUtils::getUnigramProbabilityToSave(const int encodedProbability) {
+ return max(encodedProbability - UNIGRAM_PROBABILITY_STEP, 0);
+}
+
+/* static */ int DecayingUtils::getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta) {
+ return max(encodedProbabilityDelta - BIGRAM_PROBABILITY_DELTA_STEP, 0);
+}
+
+/* static */ int DecayingUtils::getUpdatedBigramProbabilityDelta(
+ const int originalEncodedProbabilityDelta, const int newProbability) {
+ if (originalEncodedProbabilityDelta == NOT_A_PROBABILITY) {
+ // The bigram relation is not in this dictionary.
+ if (newProbability == NOT_A_PROBABILITY) {
+ // The bigram target is not in other dictionaries.
+ return 0;
+ } else {
+ return MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+ }
+ } else {
+ if (newProbability != NOT_A_PROBABILITY
+ && originalEncodedProbabilityDelta < MIN_VALID_BIGRAM_PROBABILITY_DELTA) {
+ return MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+ }
+ return min(originalEncodedProbabilityDelta + BIGRAM_PROBABILITY_DELTA_STEP,
+ MAX_BIGRAM_PROBABILITY_DELTA);
+ }
+}
+
+/* static */ int DecayingUtils::isValidUnigram(const int encodedUnigramProbability) {
+ return encodedUnigramProbability >= MIN_VALID_UNIGRAM_PROBABILITY;
+}
+
+/* static */ int DecayingUtils::isValidBigram(const int encodedBigramProbabilityDelta) {
+ return encodedBigramProbabilityDelta >= MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+}
+
+/* static */ int DecayingUtils::decodeUnigramProbability(const int encodedProbability) {
+ const int probability = encodedProbability - MIN_VALID_UNIGRAM_PROBABILITY;
+ if (probability < 0) {
+ return NOT_A_PROBABILITY;
+ } else {
+ return min(probability, MAX_UNIGRAM_PROBABILITY);
+ }
+}
+
+/* static */ int DecayingUtils::decodeBigramProbabilityDelta(const int encodedProbabilityDelta) {
+ const int probabilityDelta = encodedProbabilityDelta - MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+ if (probabilityDelta < 0) {
+ return NOT_A_PROBABILITY;
+ } else {
+ return min(probabilityDelta, MAX_BIGRAM_PROBABILITY_DELTA);
+ }
+}
+
+/* static */ int DecayingUtils::getDecayedProbability(const int rawProbability) {
+ return rawProbability;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h
new file mode 100644
index 000000000..1ca03918f
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DECAYING_UTILS_H
+#define LATINIME_DECAYING_UTILS_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is
+// required to introduced to each terminal PtNode and bigram entry.
+// TODO: Quit using bigram probability to indicate the delta.
+// TODO: Quit using bigram probability delta.
+class DecayingUtils {
+ public:
+ static const int MAX_UNIGRAM_COUNT;
+ static const int MAX_UNIGRAM_COUNT_AFTER_GC;
+ static const int MAX_BIGRAM_COUNT;
+ static const int MAX_BIGRAM_COUNT_AFTER_GC;
+
+ static int getProbability(const int encodedUnigramProbability,
+ const int encodedBigramProbabilityDelta);
+
+ static int getUpdatedUnigramProbability(const int originalEncodedProbability,
+ const int newProbability);
+
+ static int getUpdatedBigramProbabilityDelta(const int originalEncodedProbabilityDelta,
+ const int newProbability);
+
+ static int isValidUnigram(const int encodedUnigramProbability);
+
+ static int isValidBigram(const int encodedProbabilityDelta);
+
+ static int getUnigramProbabilityToSave(const int encodedProbability);
+
+ static int getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(DecayingUtils);
+
+ static const int MAX_COMPUTED_PROBABILITY;
+ static const int MAX_UNIGRAM_PROBABILITY;
+ static const int MIN_VALID_UNIGRAM_PROBABILITY;
+ static const int UNIGRAM_PROBABILITY_STEP;
+ static const int MAX_BIGRAM_PROBABILITY_DELTA;
+ static const int MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+ static const int BIGRAM_PROBABILITY_DELTA_STEP;
+
+ static int decodeUnigramProbability(const int encodedProbability);
+
+ static int decodeBigramProbabilityDelta(const int encodedProbability);
+
+ static int getDecayedProbability(const int rawProbability);
+};
+} // namespace latinime
+#endif /* LATINIME_DECAYING_UTILS_H */