aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni/src
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src')
-rw-r--r--native/jni/src/defines.h2
-rw-r--r--native/jni/src/suggest/core/dicnode/dic_node.h17
-rw-r--r--native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h25
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary.cpp45
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary.h7
-rw-r--r--native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h5
-rw-r--r--native/jni/src/suggest/core/policy/weighting.cpp13
-rw-r--r--native/jni/src/suggest/core/suggest.cpp2
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp75
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h19
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp44
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h52
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp124
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h18
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h2
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp147
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h32
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp10
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h2
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp118
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h78
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp93
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h32
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp17
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h12
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp13
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp129
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h70
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp108
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h50
-rw-r--r--native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp2
-rw-r--r--native/jni/src/suggest/policyimpl/typing/typing_weighting.h9
33 files changed, 1063 insertions, 313 deletions
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h
index 89dfa39b3..c2aa8ba0e 100644
--- a/native/jni/src/defines.h
+++ b/native/jni/src/defines.h
@@ -375,7 +375,7 @@ typedef enum {
CT_TERMINAL,
CT_TERMINAL_INSERTION,
// Create new word with space omission
- CT_NEW_WORD_SPACE_OMITTION,
+ CT_NEW_WORD_SPACE_OMISSION,
// Create new word with space substitution
CT_NEW_WORD_SPACE_SUBSTITUTION,
} CorrectionType;
diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h
index 41ef9d2b2..9099e8285 100644
--- a/native/jni/src/suggest/core/dicnode/dic_node.h
+++ b/native/jni/src/suggest/core/dicnode/dic_node.h
@@ -38,10 +38,10 @@
INTS_TO_CHARS(mDicNodeState.mDicNodeStatePrevWord.mPrevWord, \
mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength(), prevWordCharBuf, \
NELEMS(prevWordCharBuf)); \
- AKLOGI("#%8s, %5f, %5f, %5f, %5f, %s, %s, %d,,", header, \
+ AKLOGI("#%8s, %5f, %5f, %5f, %5f, %s, %s, %d, %5f,", header, \
getSpatialDistanceForScoring(), getLanguageDistanceForScoring(), \
getNormalizedCompoundDistance(), getRawLength(), prevWordCharBuf, charBuf, \
- getInputIndex(0)); \
+ getInputIndex(0), getNormalizedCompoundDistanceAfterFirstWord()); \
} while (0)
#else
#define LOGI_SHOW_ADD_COST_PROP
@@ -434,6 +434,13 @@ class DicNode {
return mDicNodeState.mDicNodeStateScoring.getLanguageDistance();
}
+ // For space-aware gestures, we store the normalized distance at the char index
+ // that ends the first word of the suggestion. We call this the distance after
+ // first word.
+ float getNormalizedCompoundDistanceAfterFirstWord() const {
+ return mDicNodeState.mDicNodeStateScoring.getNormalizedCompoundDistanceAfterFirstWord();
+ }
+
float getLanguageDistanceRatePerWordForScoring() const {
const float langDist = getLanguageDistanceForScoring();
const float totalWordCount =
@@ -565,6 +572,12 @@ class DicNode {
inputSize, getTotalInputIndex(), errorType);
}
+ // Saves the current normalized compound distance for space-aware gestures.
+ // See getNormalizedCompoundDistanceAfterFirstWord for details.
+ AK_FORCE_INLINE void saveNormalizedCompoundDistanceAfterFirstWordIfNoneYet() {
+ mDicNodeState.mDicNodeStateScoring.saveNormalizedCompoundDistanceAfterFirstWordIfNoneYet();
+ }
+
// Caveat: Must not be called outside Weighting
// This restriction is guaranteed by "friend"
AK_FORCE_INLINE void forwardInputIndex(const int pointerId, const int count,
diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h
index 4c884225a..3c85d0e9d 100644
--- a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h
+++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h
@@ -31,7 +31,8 @@ class DicNodeStateScoring {
mDigraphIndex(DigraphUtils::NOT_A_DIGRAPH_INDEX),
mEditCorrectionCount(0), mProximityCorrectionCount(0),
mNormalizedCompoundDistance(0.0f), mSpatialDistance(0.0f), mLanguageDistance(0.0f),
- mRawLength(0.0f), mExactMatch(true) {
+ mRawLength(0.0f), mExactMatch(true),
+ mNormalizedCompoundDistanceAfterFirstWord(MAX_VALUE_FOR_WEIGHTING) {
}
virtual ~DicNodeStateScoring() {}
@@ -45,6 +46,7 @@ class DicNodeStateScoring {
mRawLength = 0.0f;
mDoubleLetterLevel = NOT_A_DOUBLE_LETTER;
mDigraphIndex = DigraphUtils::NOT_A_DIGRAPH_INDEX;
+ mNormalizedCompoundDistanceAfterFirstWord = MAX_VALUE_FOR_WEIGHTING;
mExactMatch = true;
}
@@ -58,6 +60,8 @@ class DicNodeStateScoring {
mDoubleLetterLevel = scoring->mDoubleLetterLevel;
mDigraphIndex = scoring->mDigraphIndex;
mExactMatch = scoring->mExactMatch;
+ mNormalizedCompoundDistanceAfterFirstWord =
+ scoring->mNormalizedCompoundDistanceAfterFirstWord;
}
void addCost(const float spatialCost, const float languageCost, const bool doNormalization,
@@ -86,6 +90,17 @@ class DicNodeStateScoring {
}
}
+ // Saves the current normalized distance for space-aware gestures.
+ // See getNormalizedCompoundDistanceAfterFirstWord for details.
+ void saveNormalizedCompoundDistanceAfterFirstWordIfNoneYet() {
+ // We get called here after each word. We only want to store the distance after
+ // the first word, so if we already have a distance we skip saving -- hence "IfNoneYet"
+ // in the method name.
+ if (mNormalizedCompoundDistanceAfterFirstWord >= MAX_VALUE_FOR_WEIGHTING) {
+ mNormalizedCompoundDistanceAfterFirstWord = getNormalizedCompoundDistance();
+ }
+ }
+
void addRawLength(const float rawLength) {
mRawLength += rawLength;
}
@@ -102,6 +117,13 @@ class DicNodeStateScoring {
return mNormalizedCompoundDistance;
}
+ // For space-aware gestures, we store the normalized distance at the char index
+ // that ends the first word of the suggestion. We call this the distance after
+ // first word.
+ float getNormalizedCompoundDistanceAfterFirstWord() const {
+ return mNormalizedCompoundDistanceAfterFirstWord;
+ }
+
float getSpatialDistance() const {
return mSpatialDistance;
}
@@ -178,6 +200,7 @@ class DicNodeStateScoring {
float mLanguageDistance;
float mRawLength;
bool mExactMatch;
+ float mNormalizedCompoundDistanceAfterFirstWord;
AK_FORCE_INLINE void addDistance(float spatialDistance, float languageDistance,
bool doNormalization, int inputSize, int totalInputIndex) {
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp
index ec1b63a12..b1d01ed86 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.cpp
+++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp
@@ -33,6 +33,8 @@
namespace latinime {
+const int Dictionary::HEADER_ATTRIBUTE_BUFFER_SIZE = 32;
+
Dictionary::Dictionary(JNIEnv *env,
DictionaryStructureWithBufferPolicy *const dictionaryStructureWithBufferPolicy)
: mDictionaryStructureWithBufferPolicy(dictionaryStructureWithBufferPolicy),
@@ -121,32 +123,37 @@ void Dictionary::flushWithGC(const char *const filePath) {
mDictionaryStructureWithBufferPolicy->flushWithGC(filePath);
}
-bool Dictionary::needsToRunGC() {
- return mDictionaryStructureWithBufferPolicy->needsToRunGC();
+bool Dictionary::needsToRunGC(const bool mindsBlockByGC) {
+ return mDictionaryStructureWithBufferPolicy->needsToRunGC(mindsBlockByGC);
+}
+
+void Dictionary::getProperty(const char *const query, char *const outResult,
+ const int maxResultLength) const {
+ return mDictionaryStructureWithBufferPolicy->getProperty(query, outResult, maxResultLength);
}
void Dictionary::logDictionaryInfo(JNIEnv *const env) const {
- const int BUFFER_SIZE = 16;
- int dictionaryIdCodePointBuffer[BUFFER_SIZE];
- int versionStringCodePointBuffer[BUFFER_SIZE];
- int dateStringCodePointBuffer[BUFFER_SIZE];
+ int dictionaryIdCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+ int versionStringCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+ int dateStringCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
const DictionaryHeaderStructurePolicy *const headerPolicy =
getDictionaryStructurePolicy()->getHeaderStructurePolicy();
headerPolicy->readHeaderValueOrQuestionMark("dictionary", dictionaryIdCodePointBuffer,
- BUFFER_SIZE);
+ HEADER_ATTRIBUTE_BUFFER_SIZE);
headerPolicy->readHeaderValueOrQuestionMark("version", versionStringCodePointBuffer,
- BUFFER_SIZE);
- headerPolicy->readHeaderValueOrQuestionMark("date", dateStringCodePointBuffer, BUFFER_SIZE);
-
- char dictionaryIdCharBuffer[BUFFER_SIZE];
- char versionStringCharBuffer[BUFFER_SIZE];
- char dateStringCharBuffer[BUFFER_SIZE];
- intArrayToCharArray(dictionaryIdCodePointBuffer, BUFFER_SIZE,
- dictionaryIdCharBuffer, BUFFER_SIZE);
- intArrayToCharArray(versionStringCodePointBuffer, BUFFER_SIZE,
- versionStringCharBuffer, BUFFER_SIZE);
- intArrayToCharArray(dateStringCodePointBuffer, BUFFER_SIZE,
- dateStringCharBuffer, BUFFER_SIZE);
+ HEADER_ATTRIBUTE_BUFFER_SIZE);
+ headerPolicy->readHeaderValueOrQuestionMark("date", dateStringCodePointBuffer,
+ HEADER_ATTRIBUTE_BUFFER_SIZE);
+
+ char dictionaryIdCharBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+ char versionStringCharBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+ char dateStringCharBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+ intArrayToCharArray(dictionaryIdCodePointBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE,
+ dictionaryIdCharBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE);
+ intArrayToCharArray(versionStringCodePointBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE,
+ versionStringCharBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE);
+ intArrayToCharArray(dateStringCodePointBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE,
+ dateStringCharBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE);
LogUtils::logToJava(env,
"Dictionary info: dictionary = %s ; version = %s ; date = %s",
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h
index 974447468..d8a0f3e58 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.h
+++ b/native/jni/src/suggest/core/dictionary/dictionary.h
@@ -81,7 +81,10 @@ class Dictionary {
void flushWithGC(const char *const filePath);
- bool needsToRunGC();
+ bool needsToRunGC(const bool mindsBlockByGC);
+
+ void getProperty(const char *const query, char *const outResult,
+ const int maxResultLength) const;
const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const {
return mDictionaryStructureWithBufferPolicy;
@@ -92,6 +95,8 @@ class Dictionary {
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(Dictionary);
+ static const int HEADER_ATTRIBUTE_BUFFER_SIZE;
+
DictionaryStructureWithBufferPolicy *const mDictionaryStructureWithBufferPolicy;
const BigramDictionary *const mBigramDictionary;
const SuggestInterface *const mGestureSuggest;
diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
index b95488ebd..c7ffef0d5 100644
--- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
+++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
@@ -78,7 +78,10 @@ class DictionaryStructureWithBufferPolicy {
virtual void flushWithGC(const char *const filePath) = 0;
- virtual bool needsToRunGC() const = 0;
+ virtual bool needsToRunGC(const bool mindsBlockByGC) const = 0;
+
+ virtual void getProperty(const char *const query, char *const outResult,
+ const int maxResultLength) const = 0;
protected:
DictionaryStructureWithBufferPolicy() {}
diff --git a/native/jni/src/suggest/core/policy/weighting.cpp b/native/jni/src/suggest/core/policy/weighting.cpp
index f9b777df2..0c4016893 100644
--- a/native/jni/src/suggest/core/policy/weighting.cpp
+++ b/native/jni/src/suggest/core/policy/weighting.cpp
@@ -38,7 +38,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
case CT_SUBSTITUTION:
PROF_SUBSTITUTION(node->mProfiler);
return;
- case CT_NEW_WORD_SPACE_OMITTION:
+ case CT_NEW_WORD_SPACE_OMISSION:
PROF_NEW_WORD(node->mProfiler);
return;
case CT_MATCH:
@@ -93,6 +93,11 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
}
dicNode->addCost(spatialCost, languageCost, weighting->needsToNormalizeCompoundDistance(),
inputSize, errorType);
+ if (CT_NEW_WORD_SPACE_OMISSION == correctionType) {
+ // When we are on a terminal, we save the current distance for evaluating
+ // when to auto-commit partial suggestions.
+ dicNode->saveNormalizedCompoundDistanceAfterFirstWordIfNoneYet();
+ }
}
/* static */ float Weighting::getSpatialCost(const Weighting *const weighting,
@@ -108,7 +113,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
case CT_SUBSTITUTION:
// only used for typing
return weighting->getSubstitutionCost();
- case CT_NEW_WORD_SPACE_OMITTION:
+ case CT_NEW_WORD_SPACE_OMISSION:
return weighting->getNewWordSpatialCost(traverseSession, dicNode, inputStateG);
case CT_MATCH:
return weighting->getMatchedCost(traverseSession, dicNode, inputStateG);
@@ -138,7 +143,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
return 0.0f;
case CT_SUBSTITUTION:
return 0.0f;
- case CT_NEW_WORD_SPACE_OMITTION:
+ case CT_NEW_WORD_SPACE_OMISSION:
return weighting->getNewWordBigramLanguageCost(
traverseSession, parentDicNode, multiBigramMap);
case CT_MATCH:
@@ -173,7 +178,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
return 0; /* 0 because CT_MATCH will be called */
case CT_SUBSTITUTION:
return 0; /* 0 because CT_MATCH will be called */
- case CT_NEW_WORD_SPACE_OMITTION:
+ case CT_NEW_WORD_SPACE_OMISSION:
return 0;
case CT_MATCH:
return 1;
diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp
index b1340e12f..e20bc497a 100644
--- a/native/jni/src/suggest/core/suggest.cpp
+++ b/native/jni/src/suggest/core/suggest.cpp
@@ -574,7 +574,7 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode
DicNodeUtils::initAsRootWithPreviousWord(
traverseSession->getDictionaryStructurePolicy(), dicNode, &newDicNode);
const CorrectionType correctionType = spaceSubstitution ?
- CT_NEW_WORD_SPACE_SUBSTITUTION : CT_NEW_WORD_SPACE_OMITTION;
+ CT_NEW_WORD_SPACE_SUBSTITUTION : CT_NEW_WORD_SPACE_OMISSION;
Weighting::addCostAndForwardInputIndex(WEIGHTING, correctionType, traverseSession, dicNode,
&newDicNode, traverseSession->getMultiBigramMap());
if (newDicNode.getCompoundDistance() < static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
index bc2f5ee58..67a085de3 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp
@@ -17,10 +17,10 @@
#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h"
#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
-#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
namespace latinime {
@@ -41,9 +41,14 @@ void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const
if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) {
originalBigramPos += mBuffer->getOriginalBufferSize();
}
- *outBigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos);
*outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags);
*outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags);
+ if (mIsDecayingDict && !DecayingUtils::isValidBigram(*outProbability)) {
+ // This bigram is too weak to output.
+ *outBigramPos = NOT_A_DICT_POS;
+ } else {
+ *outBigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos);
+ }
if (usesAdditionalBuffer) {
*bigramEntryPos += mBuffer->getOriginalBufferSize();
}
@@ -119,7 +124,7 @@ bool DynamicBigramListPolicy::copyAllBigrams(BufferWithExtendableBuffer *const b
// Finding useless bigram entries and remove them. Bigram entry is useless when the target PtNode
// has been deleted or is not a valid terminal.
bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
- int *const bigramListPos) {
+ int *const bigramListPos, int *const outValidBigramEntryCount) {
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
if (usesAdditionalBuffer) {
*bigramListPos -= mBuffer->getOriginalBufferSize();
@@ -153,14 +158,22 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
const int bigramTargetNodePos =
followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos);
nodeReader.fetchNodeInfoInBufferFromPtNodePos(bigramTargetNodePos);
- // TODO: Update probability for supporting probability decaying.
if (nodeReader.isDeleted() || !nodeReader.isTerminal()
|| bigramTargetNodePos == NOT_A_DICT_POS) {
// The target is no longer valid terminal. Invalidate the current bigram entry.
if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags,
- NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos)) {
+ NOT_A_DICT_POS /* targetPtNodePos */, &bigramEntryPos)) {
return false;
}
+ continue;
+ }
+ bool isRemoved = false;
+ if (!updateProbabilityForDecay(bigramFlags, bigramTargetNodePos, &bigramEntryPos,
+ &isRemoved)) {
+ return false;
+ }
+ if (!isRemoved) {
+ (*outValidBigramEntryCount) += 1;
}
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
return true;
@@ -169,7 +182,7 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
// Updates bigram target PtNode positions in the list after the placing step in GC.
bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos,
const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
- ptNodePositionRelocationMap) {
+ ptNodePositionRelocationMap, int *const outBigramEntryCount) {
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
if (usesAdditionalBuffer) {
*bigramListPos -= mBuffer->getOriginalBufferSize();
@@ -211,11 +224,12 @@ bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bi
return false;
}
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
+ (*outBigramEntryCount) = bigramEntryCount;
return true;
}
bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos,
- const int probability, int *const bigramListPos) {
+ const int probability, int *const bigramListPos, bool *const outAddedNewBigram) {
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
if (usesAdditionalBuffer) {
*bigramListPos -= mBuffer->getOriginalBufferSize();
@@ -243,8 +257,15 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
}
if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) {
// Update this bigram entry.
+ *outAddedNewBigram = false;
+ const int originalProbability = BigramListReadWriteUtils::getProbabilityFromFlags(
+ bigramFlags);
+ const int probabilityToWrite = mIsDecayingDict ?
+ DecayingUtils::getUpdatedBigramProbabilityDelta(
+ originalProbability, probability) : probability;
const BigramListReadWriteUtils::BigramFlags updatedFlags =
- BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probability);
+ BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags,
+ probabilityToWrite);
return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags,
originalBigramPos, &entryPos);
}
@@ -254,12 +275,14 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
// The current last entry is found.
// First, update the flags of the last entry.
if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) {
+ *outAddedNewBigram = false;
return false;
}
if (usesAdditionalBuffer) {
*bigramListPos += mBuffer->getOriginalBufferSize();
}
// Then, add a new entry after the last entry.
+ *outAddedNewBigram = true;
return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos);
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
// We return directly from the while loop.
@@ -270,8 +293,11 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, const int probability,
int *const writingPos) {
// hasNext is false because we are adding a new bigram entry at the end of the bigram list.
+ const int probabilityToWrite = mIsDecayingDict ?
+ DecayingUtils::getUpdatedBigramProbabilityDelta(NOT_A_PROBABILITY, probability) :
+ probability;
return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos,
- probability, false /* hasNext */, writingPos);
+ probabilityToWrite, false /* hasNext */, writingPos);
}
bool DynamicBigramListPolicy::removeBigram(const int bigramListPos, const int bigramTargetPos) {
@@ -325,7 +351,7 @@ int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos(
nodeReader.fetchNodeInfoInBufferFromPtNodePos(currentPos);
bigramLinkCount++;
if (bigramLinkCount > CONTINUING_BIGRAM_LINK_COUNT_LIMIT) {
- AKLOGE("Bigram link is invalid. start position: %d", bigramPos);
+ AKLOGE("Bigram link is invalid. start position: %d", originalBigramPos);
ASSERT(false);
return NOT_A_DICT_POS;
}
@@ -333,4 +359,33 @@ int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos(
return currentPos;
}
+bool DynamicBigramListPolicy::updateProbabilityForDecay(
+ BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos,
+ int *const bigramEntryPos, bool *const outRemoved) const {
+ *outRemoved = false;
+ if (mIsDecayingDict) {
+ // Update bigram probability for decaying.
+ const int newProbability = DecayingUtils::getBigramProbabilityDeltaToSave(
+ BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags));
+ if (DecayingUtils::isValidBigram(newProbability)) {
+ // Write new probability.
+ const BigramListReadWriteUtils::BigramFlags updatedBigramFlags =
+ BigramListReadWriteUtils::setProbabilityInFlags(
+ bigramFlags, newProbability);
+ if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedBigramFlags,
+ targetPtNodePos, bigramEntryPos)) {
+ return false;
+ }
+ } else {
+ // Remove current bigram entry.
+ *outRemoved = true;
+ if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags,
+ NOT_A_DICT_POS /* targetPtNodePos */, bigramEntryPos)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h
index 8ea318a41..b358b4ed5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h
@@ -21,6 +21,7 @@
#include "defines.h"
#include "suggest/core/policy/dictionary_bigrams_structure_policy.h"
+#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
namespace latinime {
@@ -34,8 +35,9 @@ class DictionaryShortcutsStructurePolicy;
class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
public:
DynamicBigramListPolicy(BufferWithExtendableBuffer *const buffer,
- const DictionaryShortcutsStructurePolicy *const shortcutPolicy)
- : mBuffer(buffer), mShortcutPolicy(shortcutPolicy) {}
+ const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
+ const bool isDecayingDict)
+ : mBuffer(buffer), mShortcutPolicy(shortcutPolicy), mIsDecayingDict(isDecayingDict) {}
~DynamicBigramListPolicy() {}
@@ -50,19 +52,20 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
bool copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, int *const fromPos,
int *const toPos, int *const outBigramsCount) const;
- bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos);
+ bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos,
+ int *const outBigramEntryCount);
bool updateAllBigramTargetPtNodePositions(int *const bigramListPos,
const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
- ptNodePositionRelocationMap);
+ ptNodePositionRelocationMap, int *const outValidBigramEntryCount);
bool addNewBigramEntryToBigramList(const int bigramTargetPos, const int probability,
- int *const bigramListPos);
+ int *const bigramListPos, bool *const outAddedNewBigram);
bool writeNewBigramEntry(const int bigramTargetPos, const int probability,
int *const writingPos);
- // Return if targetBigramPos is found or not.
+ // Return whether or not targetBigramPos is found.
bool removeBigram(const int bigramListPos, const int bigramTargetPos);
private:
@@ -73,9 +76,13 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
BufferWithExtendableBuffer *const mBuffer;
const DictionaryShortcutsStructurePolicy *const mShortcutPolicy;
+ const bool mIsDecayingDict;
// Follow bigram link and return the position of bigram target PtNode that is currently valid.
int followBigramLinkAndGetCurrentBigramPtNodePos(const int originalBigramPos) const;
+
+ bool updateProbabilityForDecay(BigramListReadWriteUtils::BigramFlags bigramFlags,
+ const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const;
};
} // namespace latinime
#endif // LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
index c60e45819..081163a4d 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp
@@ -16,6 +16,8 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h"
+#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
+
namespace latinime {
bool DynamicPatriciaTrieGcEventListeners
@@ -25,6 +27,19 @@ bool DynamicPatriciaTrieGcEventListeners
// PtNode is useless when the PtNode is not a terminal and doesn't have any not useless
// children.
bool isUselessPtNode = !node->isTerminal();
+ if (node->isTerminal() && mIsDecayingDict) {
+ const int newProbability =
+ DecayingUtils::getUnigramProbabilityToSave(node->getProbability());
+ int writingPos = node->getProbabilityFieldPos();
+ // Update probability.
+ if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(
+ mBuffer, newProbability, &writingPos)) {
+ return false;
+ }
+ if (!DecayingUtils::isValidUnigram(newProbability)) {
+ isUselessPtNode = false;
+ }
+ }
if (mChildrenValue > 0) {
isUselessPtNode = false;
} else if (node->isTerminal()) {
@@ -41,7 +56,27 @@ bool DynamicPatriciaTrieGcEventListeners
return false;
}
} else {
- valueStack.back() += 1;
+ mValueStack.back() += 1;
+ if (node->isTerminal()) {
+ mValidUnigramCount += 1;
+ }
+ }
+ return true;
+}
+
+bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability
+ ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
+ const int *const nodeCodePoints) {
+ if (!node->isDeleted()) {
+ int pos = node->getBigramsPos();
+ if (pos != NOT_A_DICT_POS) {
+ int bigramEntryCount = 0;
+ if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos,
+ &bigramEntryCount)) {
+ return false;
+ }
+ mValidBigramEntryCount += bigramEntryCount;
+ }
}
return true;
}
@@ -137,10 +172,15 @@ bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionField
// Updates bigram target PtNode positions in the bigram list.
int bigramsPos = node->getBigramsPos();
if (bigramsPos != NOT_A_DICT_POS) {
+ int bigramEntryCount;
if (!mBigramPolicy->updateAllBigramTargetPtNodePositions(&bigramsPos,
- &mDictPositionRelocationMap->mPtNodePositionRelocationMap)) {
+ &mDictPositionRelocationMap->mPtNodePositionRelocationMap, &bigramEntryCount)) {
return false;
}
+ mBigramCount += bigramEntryCount;
+ }
+ if (node->isTerminal()) {
+ mUnigramCount++;
}
return true;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
index 4256f22fb..463715af5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h
@@ -39,23 +39,23 @@ class DynamicPatriciaTrieGcEventListeners {
public:
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
DynamicPatriciaTrieWritingHelper *const writingHelper,
- BufferWithExtendableBuffer *const buffer)
- : mWritingHelper(writingHelper), mBuffer(buffer), valueStack(),
- mChildrenValue(0) {}
+ BufferWithExtendableBuffer *const buffer, const bool isDecayingDict)
+ : mWritingHelper(writingHelper), mBuffer(buffer), mIsDecayingDict(isDecayingDict),
+ mValueStack(), mChildrenValue(0), mValidUnigramCount(0) {}
~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
bool onAscend() {
- if (valueStack.empty()) {
+ if (mValueStack.empty()) {
return false;
}
- mChildrenValue = valueStack.back();
- valueStack.pop_back();
+ mChildrenValue = mValueStack.back();
+ mValueStack.pop_back();
return true;
}
bool onDescend(const int ptNodeArrayPos) {
- valueStack.push_back(0);
+ mValueStack.push_back(0);
return true;
}
@@ -64,14 +64,20 @@ class DynamicPatriciaTrieGcEventListeners {
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
const int *const nodeCodePoints);
+ int getValidUnigramCount() const {
+ return mValidUnigramCount;
+ }
+
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
DynamicPatriciaTrieWritingHelper *const mWritingHelper;
BufferWithExtendableBuffer *const mBuffer;
- std::vector<int> valueStack;
+ const int mIsDecayingDict;
+ std::vector<int> mValueStack;
int mChildrenValue;
+ int mValidUnigramCount;
};
// Updates all bigram entries that are held by valid PtNodes. This removes useless bigram
@@ -80,7 +86,7 @@ class DynamicPatriciaTrieGcEventListeners {
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
public:
TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy)
- : mBigramPolicy(bigramPolicy) {}
+ : mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {}
bool onAscend() { return true; }
@@ -89,22 +95,17 @@ class DynamicPatriciaTrieGcEventListeners {
bool onReadingPtNodeArrayTail() { return true; }
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
- const int *const nodeCodePoints) {
- if (!node->isDeleted()) {
- int pos = node->getBigramsPos();
- if (pos != NOT_A_DICT_POS) {
- if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos)) {
- return false;
- }
- }
- }
- return true;
+ const int *const nodeCodePoints);
+
+ int getValidBigramEntryCount() const {
+ return mValidBigramEntryCount;
}
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability);
DynamicBigramListPolicy *const mBigramPolicy;
+ int mValidBigramEntryCount;
};
class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
@@ -150,7 +151,8 @@ class DynamicPatriciaTrieGcEventListeners {
dictPositionRelocationMap)
: mWritingHelper(writingHelper), mBigramPolicy(bigramPolicy),
mBufferToWrite(bufferToWrite),
- mDictPositionRelocationMap(dictPositionRelocationMap) {};
+ mDictPositionRelocationMap(dictPositionRelocationMap), mUnigramCount(0),
+ mBigramCount(0) {};
bool onAscend() { return true; }
@@ -161,6 +163,14 @@ class DynamicPatriciaTrieGcEventListeners {
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
const int *const nodeCodePoints);
+ int getUnigramCount() const {
+ return mUnigramCount;
+ }
+
+ int getBigramCount() const {
+ return mBigramCount;
+ }
+
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields);
@@ -169,6 +179,8 @@ class DynamicPatriciaTrieGcEventListeners {
BufferWithExtendableBuffer *const mBufferToWrite;
const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const
mDictPositionRelocationMap;
+ int mUnigramCount;
+ int mBigramCount;
};
private:
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
index 42397c19e..0d8c92768 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp
@@ -16,6 +16,10 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h"
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+
#include "defines.h"
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h"
@@ -24,10 +28,18 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
+#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
namespace latinime {
+const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
+const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
+const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024;
+const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
+ DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024;
+const int DynamicPatriciaTriePolicy::MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING = 2 * 60 * 60;
+
void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
DicNodeVector *const childDicNodes) const {
if (!dicNode->hasChildren()) {
@@ -137,14 +149,17 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in
int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability,
const int bigramProbability) const {
- // TODO: check mHeaderPolicy.usesForgettingCurve();
- if (unigramProbability == NOT_A_PROBABILITY) {
- return NOT_A_PROBABILITY;
- } else if (bigramProbability == NOT_A_PROBABILITY) {
- return ProbabilityUtils::backoff(unigramProbability);
+ if (mHeaderPolicy.isDecayingDict()) {
+ return DecayingUtils::getProbability(unigramProbability, bigramProbability);
} else {
- return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
- bigramProbability);
+ if (unigramProbability == NOT_A_PROBABILITY) {
+ return NOT_A_PROBABILITY;
+ } else if (bigramProbability == NOT_A_PROBABILITY) {
+ return ProbabilityUtils::backoff(unigramProbability);
+ } else {
+ return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
+ bigramProbability);
+ }
}
}
@@ -193,12 +208,26 @@ bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int
AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary.");
return false;
}
+ if (mBufferWithExtendableBuffer.getTailPosition()
+ >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update.");
+ return false;
+ }
DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer,
getBigramsStructurePolicy(), getShortcutsStructurePolicy());
readingHelper.initWithPtNodeArrayPos(getRootPosition());
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
- &mBigramListPolicy, &mShortcutListPolicy);
- return writingHelper.addUnigramWord(&readingHelper, word, length, probability);
+ &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
+ bool addedNewUnigram = false;
+ if (writingHelper.addUnigramWord(&readingHelper, word, length, probability,
+ &addedNewUnigram)) {
+ if (addedNewUnigram) {
+ mUnigramCount++;
+ }
+ return true;
+ } else {
+ return false;
+ }
}
bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
@@ -207,6 +236,11 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int
AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
return false;
}
+ if (mBufferWithExtendableBuffer.getTailPosition()
+ >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update.");
+ return false;
+ }
const int word0Pos = getTerminalNodePositionOfWord(word0, length0,
false /* forceLowerCaseSearch */);
if (word0Pos == NOT_A_DICT_POS) {
@@ -218,8 +252,16 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int
return false;
}
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
- &mBigramListPolicy, &mShortcutListPolicy);
- return writingHelper.addBigramWords(word0Pos, word1Pos, probability);
+ &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
+ bool addedNewBigram = false;
+ if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) {
+ if (addedNewBigram) {
+ mBigramCount++;
+ }
+ return true;
+ } else {
+ return false;
+ }
}
bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
@@ -228,6 +270,11 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const
AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary.");
return false;
}
+ if (mBufferWithExtendableBuffer.getTailPosition()
+ >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update.");
+ return false;
+ }
const int word0Pos = getTerminalNodePositionOfWord(word0, length0,
false /* forceLowerCaseSearch */);
if (word0Pos == NOT_A_DICT_POS) {
@@ -239,8 +286,13 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const
return false;
}
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
- &mBigramListPolicy, &mShortcutListPolicy);
- return writingHelper.removeBigramWords(word0Pos, word1Pos);
+ &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
+ if (writingHelper.removeBigramWords(word0Pos, word1Pos)) {
+ mBigramCount--;
+ return true;
+ } else {
+ return false;
+ }
}
void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
@@ -249,8 +301,8 @@ void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
return;
}
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
- &mBigramListPolicy, &mShortcutListPolicy);
- writingHelper.writeToDictFile(filePath, &mHeaderPolicy);
+ &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
+ writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount);
}
void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
@@ -259,17 +311,51 @@ void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
return;
}
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
- &mBigramListPolicy, &mShortcutListPolicy);
+ &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy);
}
-bool DynamicPatriciaTriePolicy::needsToRunGC() const {
+bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
if (!mBuffer->isUpdatable()) {
AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
return false;
}
- // TODO: Implement more properly.
- return mBufferWithExtendableBuffer.isNearSizeLimit();
+ if (mBufferWithExtendableBuffer.isNearSizeLimit()) {
+ // Additional buffer size is near the limit.
+ return true;
+ } else if (mHeaderPolicy.getExtendedRegionSize()
+ + mBufferWithExtendableBuffer.getUsedAdditionalBufferSize()
+ > MAX_DICT_EXTENDED_REGION_SIZE) {
+ // Total extended region size exceeds the limit.
+ return true;
+ } else if (mBufferWithExtendableBuffer.getTailPosition()
+ >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
+ && mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() > 0) {
+ // Needs to reduce dictionary size.
+ return true;
+ } else if (mHeaderPolicy.isDecayingDict()) {
+ if (mUnigramCount >= DecayingUtils::MAX_UNIGRAM_COUNT) {
+ // Unigram count exceeds the limit.
+ return true;
+ } else if (mBigramCount >= DecayingUtils::MAX_BIGRAM_COUNT) {
+ // Bigram count exceeds the limit.
+ return true;
+ } else if (mindsBlockByGC && mHeaderPolicy.getLastUpdatedTime()
+ + MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING < time(0)) {
+ // Time to update probabilities for decaying.
+ return true;
+ }
+ }
+ return false;
+}
+
+void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult,
+ const int maxResultLength) const {
+ if (strncmp(query, UNIGRAM_COUNT_QUERY, maxResultLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d", mUnigramCount);
+ } else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d", mBigramCount);
+ }
}
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
index 06d8095d8..d3150c6fc 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h
@@ -37,7 +37,10 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(),
mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
mShortcutListPolicy(&mBufferWithExtendableBuffer),
- mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy) {}
+ mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy,
+ mHeaderPolicy.isDecayingDict()),
+ mUnigramCount(mHeaderPolicy.getUnigramCount()),
+ mBigramCount(mHeaderPolicy.getBigramCount()) {}
~DynamicPatriciaTriePolicy() {
delete mBuffer;
@@ -89,16 +92,27 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
void flushWithGC(const char *const filePath);
- bool needsToRunGC() const;
+ bool needsToRunGC(const bool mindsBlockByGC) const;
+
+ void getProperty(const char *const query, char *const outResult,
+ const int maxResultLength) const;
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy);
+ static const char*const UNIGRAM_COUNT_QUERY;
+ static const char*const BIGRAM_COUNT_QUERY;
+ static const int MAX_DICT_EXTENDED_REGION_SIZE;
+ static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
+ static const int MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING;
+
const MmappedBuffer *const mBuffer;
const HeaderPolicy mHeaderPolicy;
BufferWithExtendableBuffer mBufferWithExtendableBuffer;
DynamicShortcutListPolicy mShortcutListPolicy;
DynamicBigramListPolicy mBigramListPolicy;
+ int mUnigramCount;
+ int mBigramCount;
};
} // namespace latinime
#endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h
index 154590fbd..512a4d818 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h
@@ -264,7 +264,7 @@ class DynamicPatriciaTrieReadingHelper {
AK_FORCE_INLINE void pushReadingStateToStack() {
if (mReadingStateStack.size() > MAX_READING_STATE_STACK_SIZE) {
- AKLOGI("Reading state stack overflow. Max size: %d", MAX_READING_STATE_STACK_SIZE);
+ AKLOGI("Reading state stack overflow. Max size: %zd", MAX_READING_STATE_STACK_SIZE);
ASSERT(false);
mIsError = true;
mReadingState.mPos = NOT_A_DICT_POS;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
index a51ae5e1d..28124d251 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp
@@ -16,9 +16,6 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
-#include <cstdio>
-#include <cstring>
-
#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
@@ -28,19 +25,20 @@
#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h"
+#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
+#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
#include "utils/hash_map_compat.h"
namespace latinime {
const int DynamicPatriciaTrieWritingHelper::CHILDREN_POSITION_FIELD_SIZE = 3;
-const char *const DynamicPatriciaTrieWritingHelper::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE =
- ".tmp";
// TODO: Make MAX_DICTIONARY_SIZE 8MB.
const size_t DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE = 2 * 1024 * 1024;
bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
DynamicPatriciaTrieReadingHelper *const readingHelper,
- const int *const wordCodePoints, const int codePointCount, const int probability) {
+ const int *const wordCodePoints, const int codePointCount, const int probability,
+ bool *const outAddedNewUnigram) {
int parentPos = NOT_A_DICT_POS;
while (!readingHelper->isEnd()) {
const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
@@ -58,8 +56,11 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
const int nextIndex = matchedCodePointCount + j;
if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(j,
wordCodePoints[matchedCodePointCount + j])) {
+ *outAddedNewUnigram = true;
return reallocatePtNodeAndAddNewPtNodes(nodeReader,
- readingHelper->getMergedNodeCodePoints(), j, probability,
+ readingHelper->getMergedNodeCodePoints(), j,
+ getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */,
+ probability),
wordCodePoints + matchedCodePointCount,
codePointCount - matchedCodePointCount);
}
@@ -67,10 +68,12 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
// All characters are matched.
if (codePointCount == readingHelper->getTotalCodePointCount()) {
return setPtNodeProbability(nodeReader, probability,
- readingHelper->getMergedNodeCodePoints());
+ readingHelper->getMergedNodeCodePoints(), outAddedNewUnigram);
}
if (!nodeReader->hasChildren()) {
- return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, probability,
+ *outAddedNewUnigram = true;
+ return createChildrenPtNodeArrayAndAChildPtNode(nodeReader,
+ getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability),
wordCodePoints + readingHelper->getTotalCodePointCount(),
codePointCount - readingHelper->getTotalCodePointCount());
}
@@ -83,14 +86,15 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
return false;
}
int pos = readingHelper->getPosOfLastForwardLinkField();
+ *outAddedNewUnigram = true;
return createAndInsertNodeIntoPtNodeArray(parentPos,
wordCodePoints + readingHelper->getPrevTotalCodePointCount(),
codePointCount - readingHelper->getPrevTotalCodePointCount(),
- probability, &pos);
+ getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), &pos);
}
bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos,
- const int probability) {
+ const int probability, bool *const outAddedNewBigram) {
int mMergedNodeCodePoints[MAX_WORD_LENGTH];
DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy);
nodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH,
@@ -111,9 +115,11 @@ bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const
if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) {
// Insert a new bigram entry into the existing bigram list.
int bigramListPos = nodeReader.getBigramsPos();
- return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos);
+ return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos,
+ outAddedNewBigram);
} else {
// The PtNode doesn't have a bigram list.
+ *outAddedNewBigram = true;
// First, Write a bigram entry at the tail position of the PtNode.
if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) {
return false;
@@ -142,26 +148,32 @@ bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, con
}
void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName,
- const HeaderPolicy *const headerPolicy) {
+ const HeaderPolicy *const headerPolicy, const int unigramCount, const int bigramCount) {
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
- if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */)) {
+ const int extendedRegionSize = headerPolicy->getExtendedRegionSize() +
+ mBuffer->getUsedAdditionalBufferSize();
+ if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */,
+ unigramCount, bigramCount, extendedRegionSize)) {
return;
}
- flushAllToFile(fileName, &headerBuffer, mBuffer);
+ DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer);
}
void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
const char *const fileName, const HeaderPolicy *const headerPolicy) {
- BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
- if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */)) {
- return;
- }
BufferWithExtendableBuffer newDictBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */,
MAX_DICTIONARY_SIZE);
- if (!runGC(rootPtNodeArrayPos, &newDictBuffer)) {
+ int unigramCount = 0;
+ int bigramCount = 0;
+ if (!runGC(rootPtNodeArrayPos, &newDictBuffer, &unigramCount, &bigramCount)) {
+ return;
+ }
+ BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
+ if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
+ unigramCount, bigramCount, 0 /* extendedRegionSize */)) {
return;
}
- flushAllToFile(fileName, &headerBuffer, &newDictBuffer);
+ DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer);
}
bool DynamicPatriciaTrieWritingHelper::markNodeAsDeleted(
@@ -339,23 +351,28 @@ bool DynamicPatriciaTrieWritingHelper::createAndInsertNodeIntoPtNodeArray(const
bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability(
const DynamicPatriciaTrieNodeReader *const originalPtNode, const int probability,
- const int *const codePoints) {
+ const int *const codePoints, bool *const outAddedNewUnigram) {
if (originalPtNode->isTerminal()) {
// Overwrites the probability.
+ *outAddedNewUnigram = false;
+ const int probabilityToWrite = getUpdatedProbability(originalPtNode->getProbability(),
+ probability);
int probabilityFieldPos = originalPtNode->getProbabilityFieldPos();
if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer,
- probability, &probabilityFieldPos)) {
+ probabilityToWrite, &probabilityFieldPos)) {
return false;
}
} else {
// Make the node terminal and write the probability.
+ *outAddedNewUnigram = true;
int movedPos = mBuffer->getTailPosition();
if (!markNodeAsMovedAndSetPosition(originalPtNode, movedPos, movedPos)) {
return false;
}
if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, originalPtNode,
originalPtNode->getParentPos(), codePoints, originalPtNode->getCodePointCount(),
- probability, &movedPos)) {
+ getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability),
+ &movedPos)) {
return false;
}
}
@@ -463,72 +480,23 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes(
return true;
}
-// TODO: Create a struct which contains header, body and etc... and use here as an argument.
-void DynamicPatriciaTrieWritingHelper::flushAllToFile(const char *const fileName,
- BufferWithExtendableBuffer *const dictHeader,
- BufferWithExtendableBuffer *const dictBody) const {
- const int tmpFileNameBufSize = strlen(fileName)
- + strlen(TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE) + 1 /* terminator */;
- // Name of a temporary file used for writing that is a connected string of original name and
- // TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE.
- char tmpFileName[tmpFileNameBufSize];
- snprintf(tmpFileName, tmpFileNameBufSize, "%s%s", fileName,
- TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE);
- FILE *const file = fopen(tmpFileName, "wb");
- if (!file) {
- AKLOGI("Dictionary file %s cannnot be opened.", tmpFileName);
- ASSERT(false);
- return;
- }
- // Write the dictionary header.
- if (!writeBufferToFilePointer(file, dictHeader)) {
- remove(tmpFileName);
- AKLOGI("Dictionary header cannnot be written. size: %d", dictHeader->getTailPosition());
- ASSERT(false);
- return;
- }
- // Write the dictionary body.
- if (!writeBufferToFilePointer(file, dictBody)) {
- remove(tmpFileName);
- AKLOGI("Dictionary body cannnot be written. size: %d", dictBody->getTailPosition());
- ASSERT(false);
- return;
- }
- fclose(file);
- rename(tmpFileName, fileName);
-}
-
-// This closes file pointer when an error is caused and returns whether the writing was succeeded
-// or not.
-bool DynamicPatriciaTrieWritingHelper::writeBufferToFilePointer(FILE *const file,
- const BufferWithExtendableBuffer *const buffer) const {
- const int originalBufSize = buffer->getOriginalBufferSize();
- if (originalBufSize > 0 && fwrite(buffer->getBuffer(false /* usesAdditionalBuffer */),
- originalBufSize, 1, file) < 1) {
- fclose(file);
- return false;
- }
- const int additionalBufSize = buffer->getTailPosition() - buffer->getOriginalBufferSize();
- if (additionalBufSize > 0 && fwrite(buffer->getBuffer(true /* usesAdditionalBuffer */),
- additionalBufSize, 1, file) < 1) {
- fclose(file);
- return false;
- }
- return true;
-}
-
bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
- BufferWithExtendableBuffer *const bufferToWrite) {
+ BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount,
+ int *const outBigramCount) {
DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy);
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
DynamicPatriciaTrieGcEventListeners
::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
- this, mBuffer);
+ this, mBuffer, mIsDecayingDict);
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
&traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
return false;
}
+ if (mIsDecayingDict && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+ .getValidUnigramCount() > DecayingUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
+ // TODO: Remove more unigrams.
+ }
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability
@@ -538,6 +506,11 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
return false;
}
+ if (mIsDecayingDict && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
+ > DecayingUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
+ // TODO: Remove more bigrams.
+ }
+
// Mapping from positions in mBuffer to positions in bufferToWrite.
DictPositionRelocationMap dictPositionRelocationMap;
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
@@ -551,7 +524,8 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
// Create policy instance for the GCed dictionary.
DynamicShortcutListPolicy newDictShortcutPolicy(bufferToWrite);
- DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy);
+ DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy,
+ mIsDecayingDict);
// Create reading helper for the GCed dictionary.
DynamicPatriciaTrieReadingHelper newDictReadingHelper(bufferToWrite, &newDictBigramPolicy,
&newDictShortcutPolicy);
@@ -563,7 +537,18 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
&traversePolicyToUpdateAllPositionFields)) {
return false;
}
+ *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
+ *outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount();
return true;
}
+int DynamicPatriciaTrieWritingHelper::getUpdatedProbability(const int originalProbability,
+ const int newProbability) {
+ if (mIsDecayingDict) {
+ return DecayingUtils::getUpdatedUnigramProbability(originalProbability, newProbability);
+ } else {
+ return newProbability;
+ }
+}
+
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h
index 028fa6075..ecee2cdbf 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h
@@ -17,7 +17,6 @@
#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H
#define LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H
-#include <cstdio>
#include <stdint.h>
#include "defines.h"
@@ -48,24 +47,30 @@ class DynamicPatriciaTrieWritingHelper {
DISALLOW_COPY_AND_ASSIGN(DictPositionRelocationMap);
};
+ static const size_t MAX_DICTIONARY_SIZE;
+
DynamicPatriciaTrieWritingHelper(BufferWithExtendableBuffer *const buffer,
DynamicBigramListPolicy *const bigramPolicy,
- DynamicShortcutListPolicy *const shortcutPolicy)
- : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {}
+ DynamicShortcutListPolicy *const shortcutPolicy, const bool isDecayingDict)
+ : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy),
+ mIsDecayingDict(isDecayingDict) {}
~DynamicPatriciaTrieWritingHelper() {}
// Add a word to the dictionary. If the word already exists, update the probability.
bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper,
- const int *const wordCodePoints, const int codePointCount, const int probability);
+ const int *const wordCodePoints, const int codePointCount, const int probability,
+ bool *const outAddedNewUnigram);
// Add a bigram relation from word0Pos to word1Pos.
- bool addBigramWords(const int word0Pos, const int word1Pos, const int probability);
+ bool addBigramWords(const int word0Pos, const int word1Pos, const int probability,
+ bool *const outAddedNewBigram);
// Remove a bigram relation from word0Pos to word1Pos.
bool removeBigramWords(const int word0Pos, const int word1Pos);
- void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy);
+ void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy,
+ const int unigramCount, const int bigramCount);
void writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName,
const HeaderPolicy *const headerPolicy);
@@ -85,12 +90,11 @@ class DynamicPatriciaTrieWritingHelper {
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper);
static const int CHILDREN_POSITION_FIELD_SIZE;
- static const char *const TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE;
- static const size_t MAX_DICTIONARY_SIZE;
BufferWithExtendableBuffer *const mBuffer;
DynamicBigramListPolicy *const mBigramPolicy;
DynamicShortcutListPolicy *const mShortcutPolicy;
+ const bool mIsDecayingDict;
bool markNodeAsMovedAndSetPosition(const DynamicPatriciaTrieNodeReader *const nodeToUpdate,
const int movedPos, const int bigramLinkedNodePos);
@@ -109,7 +113,7 @@ class DynamicPatriciaTrieWritingHelper {
const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos);
bool setPtNodeProbability(const DynamicPatriciaTrieNodeReader *const originalNode,
- const int probability, const int *const codePoints);
+ const int probability, const int *const codePoints, bool *const outAddedNewUnigram);
bool createChildrenPtNodeArrayAndAChildPtNode(
const DynamicPatriciaTrieNodeReader *const parentNode, const int probability,
@@ -124,14 +128,10 @@ class DynamicPatriciaTrieWritingHelper {
const int probabilityOfNewPtNode, const int *const newNodeCodePoints,
const int newNodeCodePointCount);
- void flushAllToFile(const char *const fileName,
- BufferWithExtendableBuffer *const dictHeader,
- BufferWithExtendableBuffer *const dictBody) const;
-
- bool writeBufferToFilePointer(FILE *const file,
- const BufferWithExtendableBuffer *const buffer) const;
+ bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite,
+ int *const outUnigramCount, int *const outBigramCount);
- bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite);
+ int getUpdatedProbability(const int originalProbability, const int newProbability);
};
} // namespace latinime
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp
index 5a3983776..30ff10cd6 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp
@@ -36,6 +36,16 @@ const int DynamicPatriciaTrieWritingUtils::DICT_OFFSET_NEGATIVE_FLAG = 0x800000;
const int DynamicPatriciaTrieWritingUtils::PROBABILITY_FIELD_SIZE = 1;
const int DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE = 1;
+/* static */ bool DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(
+ BufferWithExtendableBuffer *const buffer, const int rootPos) {
+ int writingPos = rootPos;
+ if (!writePtNodeArraySizeAndAdvancePosition(buffer, 0 /* arraySize */, &writingPos)) {
+ return false;
+ }
+ return writeForwardLinkPositionAndAdvancePosition(buffer, NOT_A_DICT_POS /* forwardLinkPos */,
+ &writingPos);
+}
+
/* static */ bool DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(
BufferWithExtendableBuffer *const buffer, const int forwardLinkPos,
int *const forwardLinkFieldPos) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h
index a37e9fb3d..af76bc6b5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h
@@ -30,6 +30,8 @@ class DynamicPatriciaTrieWritingUtils {
public:
static const int NODE_FLAG_FIELD_SIZE;
+ static bool writeEmptyDictionary(BufferWithExtendableBuffer *const buffer, const int rootPos);
+
static bool writeForwardLinkPositionAndAdvancePosition(
BufferWithExtendableBuffer *const buffer, const int forwardLinkPos,
int *const forwardLinkFieldPos);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
index 47ace23a1..9ce9994dd 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
@@ -16,16 +16,17 @@
#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include <cstddef>
-#include <cstdio>
-#include <ctime>
-
namespace latinime {
+// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader.
const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
-const char *const HeaderPolicy::USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE";
+// TODO: Change attribute string to "IS_DECAYING_DICT".
+const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE";
const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date";
-const float HeaderPolicy::DEFAULT_MULTIPLE_WORD_COST_MULTIPLIER = 1.0f;
+const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT";
+const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT";
+const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
+const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
// Used for logging. Question mark is used to indicate that the key is not found.
@@ -37,7 +38,7 @@ void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *out
return;
}
std::vector<int> keyCodePointVector;
- insertCharactersIntoVector(key, &keyCodePointVector);
+ HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector);
HeaderReadWriteUtils::AttributeMap::const_iterator it = mAttributeMap.find(keyCodePointVector);
if (it == mAttributeMap.end()) {
// The key was not found.
@@ -53,51 +54,17 @@ void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *out
}
float HeaderPolicy::readMultipleWordCostMultiplier() const {
- int attributeValue = 0;
- if (getAttributeValueAsInt(MULTIPLE_WORDS_DEMOTION_RATE_KEY, &attributeValue)) {
- if (attributeValue <= 0) {
- return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
- }
- return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(attributeValue);
- } else {
- return DEFAULT_MULTIPLE_WORD_COST_MULTIPLIER;
- }
-}
-
-bool HeaderPolicy::readUsesForgettingCurveFlag() const {
- int attributeValue = 0;
- if (getAttributeValueAsInt(USES_FORGETTING_CURVE_KEY, &attributeValue)) {
- return attributeValue != 0;
- } else {
- return false;
- }
-}
-
-// Returns S_INT_MIN when the key is not found or the value is invalid.
-int HeaderPolicy::readLastUpdatedTime() const {
- int attributeValue = 0;
- if (getAttributeValueAsInt(LAST_UPDATED_TIME_KEY, &attributeValue)) {
- return attributeValue;
- } else {
- return S_INT_MIN;
- }
-}
-
-// Returns whether the key is found or not and stores the found value into outValue.
-bool HeaderPolicy::getAttributeValueAsInt(const char *const key, int *const outValue) const {
- std::vector<int> keyVector;
- insertCharactersIntoVector(key, &keyVector);
- HeaderReadWriteUtils::AttributeMap::const_iterator it = mAttributeMap.find(keyVector);
- if (it == mAttributeMap.end()) {
- // The key was not found.
- return false;
+ const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
+ if (demotionRate <= 0) {
+ return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
}
- *outValue = parseIntAttributeValue(&(it->second));
- return true;
+ return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
}
bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
- const bool updatesLastUpdatedTime) const {
+ const bool updatesLastUpdatedTime, const int unigramCount, const int bigramCount,
+ const int extendedRegionSize) const {
int writingPos = 0;
if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion,
&writingPos)) {
@@ -113,26 +80,19 @@ bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferT
&writingPos)) {
return false;
}
+ HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap);
+ HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, UNIGRAM_COUNT_KEY, unigramCount);
+ HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, BIGRAM_COUNT_KEY, bigramCount);
+ HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, EXTENDED_REGION_SIZE_KEY,
+ extendedRegionSize);
if (updatesLastUpdatedTime) {
// Set current time as a last updated time.
- HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap);
- std::vector<int> updatedTimekey;
- insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &updatedTimekey);
- const time_t currentTime = time(NULL);
- std::vector<int> updatedTimeValue;
- char charBuf[LARGEST_INT_DIGIT_COUNT + 1];
- snprintf(charBuf, LARGEST_INT_DIGIT_COUNT + 1, "%ld", currentTime);
- insertCharactersIntoVector(charBuf, &updatedTimeValue);
- attributeMapTowrite[updatedTimekey] = updatedTimeValue;
- if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite,
- &writingPos)) {
- return false;
- }
- } else {
- if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &mAttributeMap,
- &writingPos)) {
- return false;
- }
+ HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, LAST_UPDATED_TIME_KEY,
+ time(0));
+ }
+ if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite,
+ &writingPos)) {
+ return false;
}
// Writes an actual header size.
if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos,
@@ -149,30 +109,4 @@ bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferT
return attributeMap;
}
-/* static */ int HeaderPolicy::parseIntAttributeValue(
- const std::vector<int> *const attributeValue) {
- int value = 0;
- bool isNegative = false;
- for (size_t i = 0; i < attributeValue->size(); ++i) {
- if (i == 0 && attributeValue->at(i) == '-') {
- isNegative = true;
- } else {
- if (!isdigit(attributeValue->at(i))) {
- // If not a number, return S_INT_MIN
- return S_INT_MIN;
- }
- value *= 10;
- value += attributeValue->at(i) - '0';
- }
- }
- return isNegative ? -value : value;
-}
-
-/* static */ void HeaderPolicy::insertCharactersIntoVector(const char *const characters,
- std::vector<int> *const vector) {
- for (int i = 0; characters[i]; ++i) {
- vector->push_back(characters[i]);
- }
-}
-
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
index 6b396f3f2..4261667fa 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
@@ -17,7 +17,7 @@
#ifndef LATINIME_HEADER_POLICY_H
#define LATINIME_HEADER_POLICY_H
-#include <cctype>
+#include <ctime>
#include <stdint.h>
#include "defines.h"
@@ -29,15 +29,36 @@ namespace latinime {
class HeaderPolicy : public DictionaryHeaderStructurePolicy {
public:
- explicit HeaderPolicy(const uint8_t *const dictBuf, const int dictSize)
- : mDictBuf(dictBuf),
- mDictFormatVersion(FormatUtils::detectFormatVersion(dictBuf, dictSize)),
+ // Reads information from existing dictionary buffer.
+ HeaderPolicy(const uint8_t *const dictBuf, const int dictSize)
+ : mDictFormatVersion(FormatUtils::detectFormatVersion(dictBuf, dictSize)),
mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)),
mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
- mAttributeMap(createAttributeMapAndReadAllAttributes(mDictBuf)),
+ mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
- mUsesForgettingCurve(readUsesForgettingCurveFlag()),
- mLastUpdatedTime(readLastUpdatedTime()) {}
+ mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
+ IS_DECAYING_DICT_KEY, false /* defaultValue */)),
+ mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
+ mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ UNIGRAM_COUNT_KEY, 0 /* defaultValue */)),
+ mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ BIGRAM_COUNT_KEY, 0 /* defaultValue */)),
+ mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)) {}
+
+ // Constructs header information using an attribute map.
+ HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
+ const HeaderReadWriteUtils::AttributeMap *const attributeMap)
+ : mDictFormatVersion(dictFormatVersion),
+ mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
+ attributeMap)), mSize(0), mAttributeMap(*attributeMap),
+ mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
+ mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
+ IS_DECAYING_DICT_KEY, false /* defaultValue */)),
+ mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
+ mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0) {}
~HeaderPolicy() {}
@@ -61,53 +82,60 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return mMultiWordCostMultiplier;
}
- AK_FORCE_INLINE bool usesForgettingCurve() const {
- return mUsesForgettingCurve;
+ AK_FORCE_INLINE bool isDecayingDict() const {
+ return mIsDecayingDict;
}
AK_FORCE_INLINE int getLastUpdatedTime() const {
return mLastUpdatedTime;
}
+ AK_FORCE_INLINE int getUnigramCount() const {
+ return mUnigramCount;
+ }
+
+ AK_FORCE_INLINE int getBigramCount() const {
+ return mBigramCount;
+ }
+
+ AK_FORCE_INLINE int getExtendedRegionSize() const {
+ return mExtendedRegionSize;
+ }
+
void readHeaderValueOrQuestionMark(const char *const key,
int *outValue, int outValueSize) const;
bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
- const bool updatesLastUpdatedTime) const;
+ const bool updatesLastUpdatedTime, const int unigramCount,
+ const int bigramCount, const int extendedRegionSize) const;
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy);
static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
- static const char *const USES_FORGETTING_CURVE_KEY;
+ static const char *const IS_DECAYING_DICT_KEY;
static const char *const LAST_UPDATED_TIME_KEY;
- static const float DEFAULT_MULTIPLE_WORD_COST_MULTIPLIER;
+ static const char *const UNIGRAM_COUNT_KEY;
+ static const char *const BIGRAM_COUNT_KEY;
+ static const char *const EXTENDED_REGION_SIZE_KEY;
+ static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
- const uint8_t *const mDictBuf;
const FormatUtils::FORMAT_VERSION mDictFormatVersion;
const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
const int mSize;
HeaderReadWriteUtils::AttributeMap mAttributeMap;
const float mMultiWordCostMultiplier;
- const bool mUsesForgettingCurve;
+ const bool mIsDecayingDict;
const int mLastUpdatedTime;
+ const int mUnigramCount;
+ const int mBigramCount;
+ const int mExtendedRegionSize;
float readMultipleWordCostMultiplier() const;
- bool readUsesForgettingCurveFlag() const;
-
- int readLastUpdatedTime() const;
-
- bool getAttributeValueAsInt(const char *const key, int *const outValue) const;
-
static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes(
const uint8_t *const dictBuf);
-
- static int parseIntAttributeValue(const std::vector<int> *const attributeValue);
-
- static void insertCharactersIntoVector(
- const char *const characters, std::vector<int> *const vector);
};
} // namespace latinime
#endif /* LATINIME_HEADER_POLICY_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
index 80fe88671..2694ce8d5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
@@ -16,6 +16,8 @@
#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h"
+#include <cctype>
+#include <cstdio>
#include <vector>
#include "defines.h"
@@ -43,6 +45,13 @@ const HeaderReadWriteUtils::DictionaryFlags
const HeaderReadWriteUtils::DictionaryFlags
HeaderReadWriteUtils::FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
+// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader.
+const char *const HeaderReadWriteUtils::SUPPORTS_DYNAMIC_UPDATE_KEY = "SUPPORTS_DYNAMIC_UPDATE";
+const char *const HeaderReadWriteUtils::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY =
+ "REQUIRES_GERMAN_UMLAUT_PROCESSING";
+const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY =
+ "REQUIRES_FRENCH_LIGATURE_PROCESSING";
+
/* static */ int HeaderReadWriteUtils::getHeaderSize(const uint8_t *const dictBuf) {
// See the format of the header in the comment in
// BinaryDictionaryFormatUtils::detectFormatVersion()
@@ -56,6 +65,22 @@ const HeaderReadWriteUtils::DictionaryFlags
HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE);
}
+/* static */ HeaderReadWriteUtils::DictionaryFlags
+ HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
+ const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
+ const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap,
+ REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false /* defaultValue */);
+ const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap,
+ REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, false /* defaultValue */);
+ const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap,
+ SUPPORTS_DYNAMIC_UPDATE_KEY, false /* defaultValue */);
+ DictionaryFlags dictflags = NO_FLAGS;
+ dictflags |= requiresGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0;
+ dictflags |= requiresFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0;
+ dictflags |= supportsDynamicUpdate ? SUPPORTS_DYNAMIC_UPDATE_FLAG : 0;
+ return dictflags;
+}
+
/* static */ void HeaderReadWriteUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf,
AttributeMap *const headerAttributes) {
const int headerSize = getHeaderSize(dictBuf);
@@ -128,4 +153,72 @@ const HeaderReadWriteUtils::DictionaryFlags
return true;
}
+/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes,
+ const char *const key, const bool value) {
+ setIntAttribute(headerAttributes, key, value ? 1 : 0);
+}
+
+/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes,
+ const char *const key, const int value) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(key, &keyVector);
+ setIntAttributeInner(headerAttributes, &keyVector, value);
+}
+
+/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes,
+ const AttributeMap::key_type *const key, const int value) {
+ AttributeMap::mapped_type valueVector;
+ char charBuf[LARGEST_INT_DIGIT_COUNT + 1];
+ snprintf(charBuf, LARGEST_INT_DIGIT_COUNT + 1, "%d", value);
+ insertCharactersIntoVector(charBuf, &valueVector);
+ (*headerAttributes)[*key] = valueVector;
+}
+
+/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue(
+ const AttributeMap *const headerAttributes, const char *const key,
+ const bool defaultValue) {
+ const int intDefaultValue = defaultValue ? 1 : 0;
+ const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue);
+ return intValue != 0;
+}
+
+/* static */ int HeaderReadWriteUtils::readIntAttributeValue(
+ const AttributeMap *const headerAttributes, const char *const key,
+ const int defaultValue) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(key, &keyVector);
+ return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue);
+}
+
+/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner(
+ const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
+ const int defaultValue) {
+ AttributeMap::const_iterator it = headerAttributes->find(*key);
+ if (it != headerAttributes->end()) {
+ int value = 0;
+ bool isNegative = false;
+ for (size_t i = 0; i < it->second.size(); ++i) {
+ if (i == 0 && it->second.at(i) == '-') {
+ isNegative = true;
+ } else {
+ if (!isdigit(it->second.at(i))) {
+ // If not a number.
+ return defaultValue;
+ }
+ value *= 10;
+ value += it->second.at(i) - '0';
+ }
+ }
+ return isNegative ? -value : value;
+ }
+ return defaultValue;
+}
+
+/* static */ void HeaderReadWriteUtils::insertCharactersIntoVector(const char *const characters,
+ std::vector<int> *const vector) {
+ for (int i = 0; characters[i]; ++i) {
+ vector->push_back(characters[i]);
+ }
+}
+
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h
index 6cce73375..225968323 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h
@@ -54,6 +54,9 @@ class HeaderReadWriteUtils {
+ HEADER_SIZE_FIELD_SIZE;
}
+ static DictionaryFlags createAndGetDictionaryFlagsUsingAttributeMap(
+ const HeaderReadWriteUtils::AttributeMap *const attributeMap);
+
static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
AttributeMap *const headerAttributes);
@@ -69,6 +72,24 @@ class HeaderReadWriteUtils {
static bool writeHeaderAttributes(BufferWithExtendableBuffer *const buffer,
const AttributeMap *const headerAttributes, int *const writingPos);
+ /**
+ * Methods for header attributes.
+ */
+ static void setBoolAttribute(AttributeMap *const headerAttributes,
+ const char *const key, const bool value);
+
+ static void setIntAttribute(AttributeMap *const headerAttributes,
+ const char *const key, const int value);
+
+ static bool readBoolAttributeValue(const AttributeMap *const headerAttributes,
+ const char *const key, const bool defaultValue);
+
+ static int readIntAttributeValue(const AttributeMap *const headerAttributes,
+ const char *const key, const int defaultValue);
+
+ static void insertCharactersIntoVector(const char *const characters,
+ AttributeMap::key_type *const key);
+
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils);
@@ -87,7 +108,16 @@ class HeaderReadWriteUtils {
static const DictionaryFlags GERMAN_UMLAUT_PROCESSING_FLAG;
static const DictionaryFlags SUPPORTS_DYNAMIC_UPDATE_FLAG;
static const DictionaryFlags FRENCH_LIGATURE_PROCESSING_FLAG;
- static const DictionaryFlags CONTAINS_BIGRAMS_FLAG;
+
+ static const char *const SUPPORTS_DYNAMIC_UPDATE_KEY;
+ static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
+ static const char *const REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY;
+
+ static void setIntAttributeInner(AttributeMap *const headerAttributes,
+ const AttributeMap::key_type *const key, const int value);
+
+ static int readIntAttributeValueInner(const AttributeMap *const headerAttributes,
+ const AttributeMap::key_type *const key, const int defaultValue);
};
}
#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp
index 5269795a4..8a84bd261 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp
@@ -31,9 +31,21 @@ void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
return;
}
int nextPos = dicNode->getChildrenPos();
+ if (nextPos < 0 || nextPos >= mDictBufferSize) {
+ AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d",
+ nextPos, mDictBufferSize);
+ ASSERT(false);
+ return;
+ }
const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
mDictRoot, &nextPos);
for (int i = 0; i < childCount; i++) {
+ if (nextPos < 0 || nextPos >= mDictBufferSize) {
+ AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %d, childCount: %d / %d",
+ nextPos, mDictBufferSize, i, childCount);
+ ASSERT(false);
+ return;
+ }
nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes);
}
}
@@ -404,6 +416,11 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
getBigramsStructurePolicy()->skipAllBigrams(&pos);
}
+ if (mergedNodeCodePointCount <= 0) {
+ AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount);
+ ASSERT(false);
+ return pos;
+ }
childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
PatriciaTrieReadingUtils::isTerminal(flags),
PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h
index 19155f938..8d88c68e8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h
@@ -36,6 +36,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
PatriciaTriePolicy(const MmappedBuffer *const buffer)
: mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer(), buffer->getBufferSize()),
mDictRoot(mBuffer->getBuffer() + mHeaderPolicy.getSize()),
+ mDictBufferSize(mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot) {}
~PatriciaTriePolicy() {
@@ -106,18 +107,27 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
}
- bool needsToRunGC() const {
+ bool needsToRunGC(const bool mindsBlockByGC) const {
// This method should not be called for non-updatable dictionary.
AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
return false;
}
+ void getProperty(const char *const query, char *const outResult,
+ const int maxResultLength) const {
+ // getProperty is not supported for this class.
+ if (maxResultLength > 0) {
+ outResult[0] = '\0';
+ }
+ }
+
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy);
const MmappedBuffer *const mBuffer;
const HeaderPolicy mHeaderPolicy;
const uint8_t *const mDictRoot;
+ const int mDictBufferSize;
const BigramListPolicy mBigramListPolicy;
const ShortcutListPolicy mShortcutListPolicy;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp
index 1316b425f..7df55815f 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp
@@ -71,8 +71,17 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer,
pos);
} else {
- if (maxLength > 0) {
- outBuffer[0] = getCodePointAndAdvancePosition(buffer, pos);
+ const int codePoint = getCodePointAndAdvancePosition(buffer, pos);
+ if (codePoint == NOT_A_CODE_POINT) {
+ // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
+ // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
+ // when the PtNode has a single code point.
+ length = 0;
+ AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x",
+ *pos - 1, codePoint, buffer[*pos - 1]);
+ ASSERT(false);
+ } else if (maxLength > 0) {
+ outBuffer[0] = codePoint;
length = 1;
}
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h
index 17d2e39c2..9dc34823c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h
@@ -42,6 +42,10 @@ class BufferWithExtendableBuffer {
return mOriginalBufferSize + mUsedAdditionalBufferSize;
}
+ AK_FORCE_INLINE int getUsedAdditionalBufferSize() const {
+ return mUsedAdditionalBufferSize;
+ }
+
/**
* For reading.
*/
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp
new file mode 100644
index 000000000..942a74238
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/utils/decaying_utils.h"
+
+#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
+
+namespace latinime {
+
+const int DecayingUtils::MAX_UNIGRAM_COUNT = 12000;
+const int DecayingUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000;
+const int DecayingUtils::MAX_BIGRAM_COUNT = 12000;
+const int DecayingUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
+
+const int DecayingUtils::MAX_COMPUTED_PROBABILITY = 127;
+const int DecayingUtils::MAX_UNIGRAM_PROBABILITY = 120;
+const int DecayingUtils::MIN_VALID_UNIGRAM_PROBABILITY = 24;
+const int DecayingUtils::UNIGRAM_PROBABILITY_STEP = 8;
+const int DecayingUtils::MAX_BIGRAM_PROBABILITY_DELTA = 15;
+const int DecayingUtils::MIN_VALID_BIGRAM_PROBABILITY_DELTA = 3;
+const int DecayingUtils::BIGRAM_PROBABILITY_DELTA_STEP = 1;
+
+/* static */ int DecayingUtils::getProbability(const int encodedUnigramProbability,
+ const int encodedBigramProbabilityDelta) {
+ if (encodedUnigramProbability == NOT_A_PROBABILITY) {
+ return NOT_A_PROBABILITY;
+ } else if (encodedBigramProbabilityDelta == NOT_A_PROBABILITY) {
+ const int rawProbability = ProbabilityUtils::backoff(decodeUnigramProbability(
+ encodedUnigramProbability));
+ return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY);
+ } else {
+ const int rawProbability = ProbabilityUtils::computeProbabilityForBigram(
+ decodeUnigramProbability(encodedUnigramProbability),
+ decodeBigramProbabilityDelta(encodedBigramProbabilityDelta));
+ return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY);
+ }
+}
+
+/* static */ int DecayingUtils::getUpdatedUnigramProbability(const int originalEncodedProbability,
+ const int newProbability) {
+ if (originalEncodedProbability == NOT_A_PROBABILITY) {
+ // The unigram is not in this dictionary.
+ if (newProbability == NOT_A_PROBABILITY) {
+ // The unigram is not in other dictionaries.
+ return 0;
+ } else {
+ return MIN_VALID_UNIGRAM_PROBABILITY;
+ }
+ } else {
+ if (newProbability != NOT_A_PROBABILITY
+ && originalEncodedProbability < MIN_VALID_UNIGRAM_PROBABILITY) {
+ return MIN_VALID_UNIGRAM_PROBABILITY;
+ }
+ return min(originalEncodedProbability + UNIGRAM_PROBABILITY_STEP, MAX_UNIGRAM_PROBABILITY);
+ }
+}
+
+/* static */ int DecayingUtils::getUnigramProbabilityToSave(const int encodedProbability) {
+ return max(encodedProbability - UNIGRAM_PROBABILITY_STEP, 0);
+}
+
+/* static */ int DecayingUtils::getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta) {
+ return max(encodedProbabilityDelta - BIGRAM_PROBABILITY_DELTA_STEP, 0);
+}
+
+/* static */ int DecayingUtils::getUpdatedBigramProbabilityDelta(
+ const int originalEncodedProbabilityDelta, const int newProbability) {
+ if (originalEncodedProbabilityDelta == NOT_A_PROBABILITY) {
+ // The bigram relation is not in this dictionary.
+ if (newProbability == NOT_A_PROBABILITY) {
+ // The bigram target is not in other dictionaries.
+ return 0;
+ } else {
+ return MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+ }
+ } else {
+ if (newProbability != NOT_A_PROBABILITY
+ && originalEncodedProbabilityDelta < MIN_VALID_BIGRAM_PROBABILITY_DELTA) {
+ return MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+ }
+ return min(originalEncodedProbabilityDelta + BIGRAM_PROBABILITY_DELTA_STEP,
+ MAX_BIGRAM_PROBABILITY_DELTA);
+ }
+}
+
+/* static */ int DecayingUtils::isValidUnigram(const int encodedUnigramProbability) {
+ return encodedUnigramProbability >= MIN_VALID_UNIGRAM_PROBABILITY;
+}
+
+/* static */ int DecayingUtils::isValidBigram(const int encodedBigramProbabilityDelta) {
+ return encodedBigramProbabilityDelta >= MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+}
+
+/* static */ int DecayingUtils::decodeUnigramProbability(const int encodedProbability) {
+ const int probability = encodedProbability - MIN_VALID_UNIGRAM_PROBABILITY;
+ if (probability < 0) {
+ return NOT_A_PROBABILITY;
+ } else {
+ return min(probability, MAX_UNIGRAM_PROBABILITY);
+ }
+}
+
+/* static */ int DecayingUtils::decodeBigramProbabilityDelta(const int encodedProbabilityDelta) {
+ const int probabilityDelta = encodedProbabilityDelta - MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+ if (probabilityDelta < 0) {
+ return NOT_A_PROBABILITY;
+ } else {
+ return min(probabilityDelta, MAX_BIGRAM_PROBABILITY_DELTA);
+ }
+}
+
+/* static */ int DecayingUtils::getDecayedProbability(const int rawProbability) {
+ return rawProbability;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h
new file mode 100644
index 000000000..1ca03918f
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DECAYING_UTILS_H
+#define LATINIME_DECAYING_UTILS_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is
+// required to introduced to each terminal PtNode and bigram entry.
+// TODO: Quit using bigram probability to indicate the delta.
+// TODO: Quit using bigram probability delta.
+class DecayingUtils {
+ public:
+ static const int MAX_UNIGRAM_COUNT;
+ static const int MAX_UNIGRAM_COUNT_AFTER_GC;
+ static const int MAX_BIGRAM_COUNT;
+ static const int MAX_BIGRAM_COUNT_AFTER_GC;
+
+ static int getProbability(const int encodedUnigramProbability,
+ const int encodedBigramProbabilityDelta);
+
+ static int getUpdatedUnigramProbability(const int originalEncodedProbability,
+ const int newProbability);
+
+ static int getUpdatedBigramProbabilityDelta(const int originalEncodedProbabilityDelta,
+ const int newProbability);
+
+ static int isValidUnigram(const int encodedUnigramProbability);
+
+ static int isValidBigram(const int encodedProbabilityDelta);
+
+ static int getUnigramProbabilityToSave(const int encodedProbability);
+
+ static int getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(DecayingUtils);
+
+ static const int MAX_COMPUTED_PROBABILITY;
+ static const int MAX_UNIGRAM_PROBABILITY;
+ static const int MIN_VALID_UNIGRAM_PROBABILITY;
+ static const int UNIGRAM_PROBABILITY_STEP;
+ static const int MAX_BIGRAM_PROBABILITY_DELTA;
+ static const int MIN_VALID_BIGRAM_PROBABILITY_DELTA;
+ static const int BIGRAM_PROBABILITY_DELTA_STEP;
+
+ static int decodeUnigramProbability(const int encodedProbability);
+
+ static int decodeBigramProbabilityDelta(const int encodedProbability);
+
+ static int getDecayedProbability(const int rawProbability);
+};
+} // namespace latinime
+#endif /* LATINIME_DECAYING_UTILS_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
new file mode 100644
index 000000000..f22e94c6a
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include "suggest/policyimpl/dictionary/header/header_policy.h"
+#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h"
+#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "suggest/policyimpl/dictionary/utils/format_utils.h"
+
+namespace latinime {
+
+const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = ".tmp";
+
+/* static */ bool DictFileWritingUtils::createEmptyDictFile(const char *const filePath,
+ const int dictVersion, const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
+ switch (dictVersion) {
+ case 3:
+ return createEmptyV3DictFile(filePath, attributeMap);
+ default:
+ // Only version 3 dictionary is supported for now.
+ return false;
+ }
+}
+
+/* static */ bool DictFileWritingUtils::createEmptyV3DictFile(const char *const filePath,
+ const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
+ BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
+ HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap);
+ headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
+ 0 /* unigramCount */, 0 /* bigramCount */, 0 /* extendedRegionSize */);
+ BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
+ if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) {
+ return false;
+ }
+ return flushAllHeaderAndBodyToFile(filePath, &headerBuffer, &bodyBuffer);
+}
+
+/* static */ bool DictFileWritingUtils::flushAllHeaderAndBodyToFile(const char *const filePath,
+ BufferWithExtendableBuffer *const dictHeader, BufferWithExtendableBuffer *const dictBody) {
+ const int tmpFileNameBufSize = strlen(filePath)
+ + strlen(TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE) + 1 /* terminator */;
+ // Name of a temporary file used for writing that is a connected string of original name and
+ // TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE.
+ char tmpFileName[tmpFileNameBufSize];
+ snprintf(tmpFileName, tmpFileNameBufSize, "%s%s", filePath,
+ TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE);
+ FILE *const file = fopen(tmpFileName, "wb");
+ if (!file) {
+ AKLOGE("Dictionary file %s cannnot be opened.", tmpFileName);
+ ASSERT(false);
+ return false;
+ }
+ // Write the dictionary header.
+ if (!writeBufferToFile(file, dictHeader)) {
+ remove(tmpFileName);
+ AKLOGE("Dictionary header cannnot be written. size: %d", dictHeader->getTailPosition());
+ ASSERT(false);
+ return false;
+ }
+ // Write the dictionary body.
+ if (!writeBufferToFile(file, dictBody)) {
+ remove(tmpFileName);
+ AKLOGE("Dictionary body cannnot be written. size: %d", dictBody->getTailPosition());
+ ASSERT(false);
+ return false;
+ }
+ fclose(file);
+ rename(tmpFileName, filePath);
+ return true;
+}
+
+// This closes file pointer when an error is caused and returns whether the writing was succeeded
+// or not.
+/* static */ bool DictFileWritingUtils::writeBufferToFile(FILE *const file,
+ const BufferWithExtendableBuffer *const buffer) {
+ const int originalBufSize = buffer->getOriginalBufferSize();
+ if (originalBufSize > 0 && fwrite(buffer->getBuffer(false /* usesAdditionalBuffer */),
+ originalBufSize, 1, file) < 1) {
+ fclose(file);
+ return false;
+ }
+ const int additionalBufSize = buffer->getUsedAdditionalBufferSize();
+ if (additionalBufSize > 0 && fwrite(buffer->getBuffer(true /* usesAdditionalBuffer */),
+ additionalBufSize, 1, file) < 1) {
+ fclose(file);
+ return false;
+ }
+ return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h
new file mode 100644
index 000000000..bd4ac66fd
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICT_FILE_WRITING_UTILS_H
+#define LATINIME_DICT_FILE_WRITING_UTILS_H
+
+#include <cstdio>
+
+#include "defines.h"
+#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+class DictFileWritingUtils {
+ public:
+ static bool createEmptyDictFile(const char *const filePath, const int dictVersion,
+ const HeaderReadWriteUtils::AttributeMap *const attributeMap);
+
+ static bool flushAllHeaderAndBodyToFile(const char *const filePath,
+ BufferWithExtendableBuffer *const dictHeader,
+ BufferWithExtendableBuffer *const dictBody);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(DictFileWritingUtils);
+
+ static const char *const TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE;
+
+ static bool createEmptyV3DictFile(const char *const filePath,
+ const HeaderReadWriteUtils::AttributeMap *const attributeMap);
+
+ static bool writeBufferToFile(FILE *const file,
+ const BufferWithExtendableBuffer *const buffer);
+};
+} // namespace latinime
+#endif /* LATINIME_DICT_FILE_WRITING_UTILS_H */
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp b/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp
index 408b12ae9..5b6b5e874 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp
+++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp
@@ -47,7 +47,7 @@ ErrorType TypingWeighting::getErrorType(const CorrectionType correctionType,
case CT_TERMINAL_INSERTION:
case CT_TRANSPOSITION:
return ET_EDIT_CORRECTION;
- case CT_NEW_WORD_SPACE_OMITTION:
+ case CT_NEW_WORD_SPACE_OMISSION:
case CT_NEW_WORD_SPACE_SUBSTITUTION:
return ET_NEW_WORD;
case CT_TERMINAL:
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h
index b6aa85896..9f0a331e3 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h
+++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h
@@ -74,7 +74,8 @@ class TypingWeighting : public Weighting {
// Note: min() required since length can be MAX_POINT_TO_KEY_LENGTH for characters not on
// the keyboard (like accented letters)
const float normalizedSquaredLength = traverseSession->getProximityInfoState(0)
- ->getPointToKeyLength(pointIndex, dicNode->getNodeCodePoint());
+ ->getPointToKeyLength(pointIndex,
+ CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint()));
const float normalizedDistance = TouchPositionCorrectionUtils::getSweetSpotFactor(
traverseSession->isTouchPositionCorrectionEnabled(), normalizedSquaredLength);
const float weightedDistance = ScoringParams::DISTANCE_WEIGHT_LENGTH * normalizedDistance;
@@ -113,10 +114,10 @@ class TypingWeighting : public Weighting {
const int16_t parentPointIndex = parentDicNode->getInputIndex(0);
const int prevCodePoint = parentDicNode->getNodeCodePoint();
const float distance1 = traverseSession->getProximityInfoState(0)->getPointToKeyLength(
- parentPointIndex + 1, prevCodePoint);
+ parentPointIndex + 1, CharUtils::toBaseLowerCase(prevCodePoint));
const int codePoint = dicNode->getNodeCodePoint();
const float distance2 = traverseSession->getProximityInfoState(0)->getPointToKeyLength(
- parentPointIndex, codePoint);
+ parentPointIndex, CharUtils::toBaseLowerCase(codePoint));
const float distance = distance1 + distance2;
const float weightedLengthDistance =
distance * ScoringParams::DISTANCE_WEIGHT_LENGTH;
@@ -133,7 +134,7 @@ class TypingWeighting : public Weighting {
const bool existsAdjacentProximityChars = traverseSession->getProximityInfoState(0)
->existsAdjacentProximityChars(insertedPointIndex);
const float dist = traverseSession->getProximityInfoState(0)->getPointToKeyLength(
- insertedPointIndex + 1, dicNode->getNodeCodePoint());
+ insertedPointIndex + 1, CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint()));
const float weightedDistance = dist * ScoringParams::DISTANCE_WEIGHT_LENGTH;
const bool singleChar = dicNode->getNodeCodePointCount() == 1;
float cost = (singleChar ? ScoringParams::INSERTION_COST_FIRST_CHAR : 0.0f);