aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni')
-rw-r--r--native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp60
-rw-r--r--native/jni/src/suggest/core/dicnode/dic_node.h4
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary.cpp26
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary.h10
-rw-r--r--native/jni/src/suggest/core/dictionary/property/bigram_property.h1
-rw-r--r--native/jni/src/suggest/core/dictionary/property/unigram_property.h21
-rw-r--r--native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h2
-rw-r--r--native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h9
-rw-r--r--native/jni/src/suggest/core/session/dic_traverse_session.cpp2
-rw-r--r--native/jni/src/suggest/core/session/prev_words_info.h91
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h6
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp1
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp19
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h8
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp12
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp2
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h7
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h14
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp79
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h6
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp3
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h5
-rw-r--r--native/jni/src/utils/char_utils.h14
25 files changed, 277 insertions, 133 deletions
diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
index 28aaf2d1a..e41fe1d43 100644
--- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
+++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
@@ -178,10 +178,10 @@ static void latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jclass clazz,
jlong proximityInfo, jlong dicTraverseSession, jintArray xCoordinatesArray,
jintArray yCoordinatesArray, jintArray timesArray, jintArray pointerIdsArray,
jintArray inputCodePointsArray, jint inputSize, jintArray suggestOptions,
- jintArray prevWordCodePointsForBigrams, jintArray outSuggestionCount,
- jintArray outCodePointsArray, jintArray outScoresArray, jintArray outSpaceIndicesArray,
- jintArray outTypesArray, jintArray outAutoCommitFirstWordConfidenceArray,
- jfloatArray inOutLanguageWeight) {
+ jintArray prevWordCodePointsForBigrams, jboolean isBeginningOfSentence,
+ jintArray outSuggestionCount, jintArray outCodePointsArray, jintArray outScoresArray,
+ jintArray outSpaceIndicesArray, jintArray outTypesArray,
+ jintArray outAutoCommitFirstWordConfidenceArray, jfloatArray inOutLanguageWeight) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
// Assign 0 to outSuggestionCount here in case of returning earlier in this method.
JniDataUtils::putIntToArray(env, outSuggestionCount, 0 /* index */, 0);
@@ -274,7 +274,7 @@ static jint latinime_BinaryDictionary_getProbability(JNIEnv *env, jclass clazz,
}
static jint latinime_BinaryDictionary_getBigramProbability(JNIEnv *env, jclass clazz,
- jlong dict, jintArray word0, jintArray word1) {
+ jlong dict, jintArray word0, jboolean isBeginningOfSentence, jintArray word1) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
if (!dictionary) return JNI_FALSE;
const jsize word0Length = env->GetArrayLength(word0);
@@ -283,7 +283,7 @@ static jint latinime_BinaryDictionary_getBigramProbability(JNIEnv *env, jclass c
int word1CodePoints[word1Length];
env->GetIntArrayRegion(word0, 0, word0Length, word0CodePoints);
env->GetIntArrayRegion(word1, 0, word1Length, word1CodePoints);
- const PrevWordsInfo prevWordsInfo(word0CodePoints, word0Length, false /* isStartOfSentence */);
+ const PrevWordsInfo prevWordsInfo(word0CodePoints, word0Length, isBeginningOfSentence);
return dictionary->getBigramProbability(&prevWordsInfo, word1CodePoints, word1Length);
}
@@ -326,7 +326,8 @@ static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
static void latinime_BinaryDictionary_addUnigramWord(JNIEnv *env, jclass clazz, jlong dict,
jintArray word, jint probability, jintArray shortcutTarget, jint shortcutProbability,
- jboolean isNotAWord, jboolean isBlacklisted, jint timestamp) {
+ jboolean isBeginningOfSentence, jboolean isNotAWord, jboolean isBlacklisted,
+ jint timestamp) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
if (!dictionary) {
return;
@@ -341,13 +342,14 @@ static void latinime_BinaryDictionary_addUnigramWord(JNIEnv *env, jclass clazz,
shortcuts.emplace_back(&shortcutTargetCodePoints, shortcutProbability);
}
// Use 1 for count to indicate the word has inputted.
- const UnigramProperty unigramProperty(isNotAWord, isBlacklisted,
- probability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
- dictionary->addUnigramWord(codePoints, codePointCount, &unigramProperty);
+ const UnigramProperty unigramProperty(isBeginningOfSentence, isNotAWord,
+ isBlacklisted, probability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
+ dictionary->addUnigramEntry(codePoints, codePointCount, &unigramProperty);
}
static void latinime_BinaryDictionary_addBigramWords(JNIEnv *env, jclass clazz, jlong dict,
- jintArray word0, jintArray word1, jint probability, jint timestamp) {
+ jintArray word0, jboolean isBeginningOfSentence, jintArray word1, jint probability,
+ jint timestamp) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
if (!dictionary) {
return;
@@ -363,11 +365,12 @@ static void latinime_BinaryDictionary_addBigramWords(JNIEnv *env, jclass clazz,
// Use 1 for count to indicate the bigram has inputted.
const BigramProperty bigramProperty(&bigramTargetCodePoints, probability,
timestamp, 0 /* level */, 1 /* count */);
- dictionary->addBigramWords(word0CodePoints, word0Length, &bigramProperty);
+ const PrevWordsInfo prevWordsInfo(word0CodePoints, word0Length, isBeginningOfSentence);
+ dictionary->addNgramEntry(&prevWordsInfo, &bigramProperty);
}
static void latinime_BinaryDictionary_removeBigramWords(JNIEnv *env, jclass clazz, jlong dict,
- jintArray word0, jintArray word1) {
+ jintArray word0, jboolean isBeginningOfSentence, jintArray word1) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
if (!dictionary) {
return;
@@ -378,8 +381,8 @@ static void latinime_BinaryDictionary_removeBigramWords(JNIEnv *env, jclass claz
jsize word1Length = env->GetArrayLength(word1);
int word1CodePoints[word1Length];
env->GetIntArrayRegion(word1, 0, word1Length, word1CodePoints);
- dictionary->removeBigramWords(word0CodePoints, word0Length, word1CodePoints,
- word1Length);
+ const PrevWordsInfo prevWordsInfo(word0CodePoints, word0Length, isBeginningOfSentence);
+ dictionary->removeNgramEntry(&prevWordsInfo, word1CodePoints, word1Length);
}
// Returns how many language model params are processed.
@@ -447,9 +450,10 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
shortcuts.emplace_back(&shortcutTargetCodePoints, shortcutProbability);
}
// Use 1 for count to indicate the word has inputted.
- const UnigramProperty unigramProperty(isNotAWord, isBlacklisted,
- unigramProbability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
- dictionary->addUnigramWord(word1CodePoints, word1Length, &unigramProperty);
+ const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
+ isBlacklisted, unigramProbability, timestamp, 0 /* level */, 1 /* count */,
+ &shortcuts);
+ dictionary->addUnigramEntry(word1CodePoints, word1Length, &unigramProperty);
if (word0) {
jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
const std::vector<int> bigramTargetCodePoints(
@@ -457,7 +461,9 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
// Use 1 for count to indicate the bigram has inputted.
const BigramProperty bigramProperty(&bigramTargetCodePoints, bigramProbability,
timestamp, 0 /* level */, 1 /* count */);
- dictionary->addBigramWords(word0CodePoints, word0Length, &bigramProperty);
+ const PrevWordsInfo prevWordsInfo(word0CodePoints, word0Length,
+ false /* isBeginningOfSentence */);
+ dictionary->addNgramEntry(&prevWordsInfo, &bigramProperty);
}
if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
return i + 1;
@@ -541,7 +547,7 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
return false;
}
}
- if (!dictionaryStructureWithBufferPolicy->addUnigramWord(wordCodePoints, wordLength,
+ if (!dictionaryStructureWithBufferPolicy->addUnigramEntry(wordCodePoints, wordLength,
wordProperty.getUnigramProperty())) {
LogUtils::logToJava(env, "Cannot add unigram to the new dict.");
return false;
@@ -561,8 +567,10 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
return false;
}
}
+ const PrevWordsInfo prevWordsInfo(wordCodePoints, wordLength,
+ false /* isStartOfSentence */);
for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) {
- if (!dictionaryStructureWithBufferPolicy->addBigramWords(wordCodePoints, wordLength,
+ if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&prevWordsInfo,
&bigramProperty)) {
LogUtils::logToJava(env, "Cannot add bigram to the new dict.");
return false;
@@ -617,7 +625,7 @@ static const JNINativeMethod sMethods[] = {
},
{
const_cast<char *>("getSuggestionsNative"),
- const_cast<char *>("(JJJ[I[I[I[I[II[I[I[I[I[I[I[I[I[F)V"),
+ const_cast<char *>("(JJJ[I[I[I[I[II[I[IZ[I[I[I[I[I[I[F)V"),
reinterpret_cast<void *>(latinime_BinaryDictionary_getSuggestions)
},
{
@@ -627,7 +635,7 @@ static const JNINativeMethod sMethods[] = {
},
{
const_cast<char *>("getBigramProbabilityNative"),
- const_cast<char *>("(J[I[I)I"),
+ const_cast<char *>("(J[IZ[I)I"),
reinterpret_cast<void *>(latinime_BinaryDictionary_getBigramProbability)
},
{
@@ -643,17 +651,17 @@ static const JNINativeMethod sMethods[] = {
},
{
const_cast<char *>("addUnigramWordNative"),
- const_cast<char *>("(J[II[IIZZI)V"),
+ const_cast<char *>("(J[II[IIZZZI)V"),
reinterpret_cast<void *>(latinime_BinaryDictionary_addUnigramWord)
},
{
const_cast<char *>("addBigramWordsNative"),
- const_cast<char *>("(J[I[III)V"),
+ const_cast<char *>("(J[IZ[III)V"),
reinterpret_cast<void *>(latinime_BinaryDictionary_addBigramWords)
},
{
const_cast<char *>("removeBigramWordsNative"),
- const_cast<char *>("(J[I[I)V"),
+ const_cast<char *>("(J[IZ[I)V"),
reinterpret_cast<void *>(latinime_BinaryDictionary_removeBigramWords)
},
{
diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h
index e69d2c46b..ef03d2b6d 100644
--- a/native/jni/src/suggest/core/dicnode/dic_node.h
+++ b/native/jni/src/suggest/core/dicnode/dic_node.h
@@ -203,12 +203,12 @@ class DicNode {
return mDicNodeState.mDicNodeStateInput.getInputIndex(0) < inputSize - 1;
}
- // Used to get n-gram probability in DicNodeUtils
+ // Used to get n-gram probability in DicNodeUtils.
int getPtNodePos() const {
return mDicNodeProperties.getPtNodePos();
}
- // Used to get n-gram probability in DicNodeUtils
+ // Used to get n-gram probability in DicNodeUtils. n is 1-indexed.
int getNthPrevWordTerminalPtNodePos(const int n) const {
if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
return NOT_A_DICT_POS;
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp
index c860d82af..bcf7d5905 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.cpp
+++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp
@@ -74,28 +74,34 @@ int Dictionary::getProbability(const int *word, int length) const {
return getDictionaryStructurePolicy()->getUnigramProbabilityOfPtNode(pos);
}
-int Dictionary::getBigramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word1,
- int length1) const {
+int Dictionary::getBigramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word,
+ int length) const {
TimeKeeper::setCurrentTime();
- return mBigramDictionary.getBigramProbability(prevWordsInfo, word1, length1);
+ return mBigramDictionary.getBigramProbability(prevWordsInfo, word, length);
}
-void Dictionary::addUnigramWord(const int *const word, const int length,
+void Dictionary::addUnigramEntry(const int *const word, const int length,
const UnigramProperty *const unigramProperty) {
+ if (unigramProperty->representsBeginningOfSentence()
+ && !mDictionaryStructureWithBufferPolicy->getHeaderStructurePolicy()
+ ->supportsBeginningOfSentence()) {
+ AKLOGE("The dictionary doesn't support Beginning-of-Sentence.");
+ return;
+ }
TimeKeeper::setCurrentTime();
- mDictionaryStructureWithBufferPolicy->addUnigramWord(word, length, unigramProperty);
+ mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty);
}
-void Dictionary::addBigramWords(const int *const word0, const int length0,
+void Dictionary::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
const BigramProperty *const bigramProperty) {
TimeKeeper::setCurrentTime();
- mDictionaryStructureWithBufferPolicy->addBigramWords(word0, length0, bigramProperty);
+ mDictionaryStructureWithBufferPolicy->addNgramEntry(prevWordsInfo, bigramProperty);
}
-void Dictionary::removeBigramWords(const int *const word0, const int length0,
- const int *const word1, const int length1) {
+void Dictionary::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
+ const int *const word, const int length) {
TimeKeeper::setCurrentTime();
- mDictionaryStructureWithBufferPolicy->removeBigramWords(word0, length0, word1, length1);
+ mDictionaryStructureWithBufferPolicy->removeNgramEntry(prevWordsInfo, word, length);
}
void Dictionary::flush(const char *const filePath) {
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h
index b63c61fbb..817d9f7fc 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.h
+++ b/native/jni/src/suggest/core/dictionary/dictionary.h
@@ -73,16 +73,16 @@ class Dictionary {
int getProbability(const int *word, int length) const;
int getBigramProbability(const PrevWordsInfo *const prevWordsInfo,
- const int *word1, int length1) const;
+ const int *word, int length) const;
- void addUnigramWord(const int *const codePoints, const int codePointCount,
+ void addUnigramEntry(const int *const codePoints, const int codePointCount,
const UnigramProperty *const unigramProperty);
- void addBigramWords(const int *const word0, const int length0,
+ void addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
const BigramProperty *const bigramProperty);
- void removeBigramWords(const int *const word0, const int length0, const int *const word1,
- const int length1);
+ void removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word,
+ const int length);
void flush(const char *const filePath);
diff --git a/native/jni/src/suggest/core/dictionary/property/bigram_property.h b/native/jni/src/suggest/core/dictionary/property/bigram_property.h
index 8d3429b5b..343af143c 100644
--- a/native/jni/src/suggest/core/dictionary/property/bigram_property.h
+++ b/native/jni/src/suggest/core/dictionary/property/bigram_property.h
@@ -23,6 +23,7 @@
namespace latinime {
+// TODO: Change to NgramProperty.
class BigramProperty {
public:
BigramProperty(const std::vector<int> *const targetCodePoints,
diff --git a/native/jni/src/suggest/core/dictionary/property/unigram_property.h b/native/jni/src/suggest/core/dictionary/property/unigram_property.h
index d2551057b..902eb000f 100644
--- a/native/jni/src/suggest/core/dictionary/property/unigram_property.h
+++ b/native/jni/src/suggest/core/dictionary/property/unigram_property.h
@@ -48,15 +48,21 @@ class UnigramProperty {
};
UnigramProperty()
- : mIsNotAWord(false), mIsBlacklisted(false), mProbability(NOT_A_PROBABILITY),
- mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0), mShortcuts() {}
-
- UnigramProperty(const bool isNotAWord, const bool isBlacklisted, const int probability,
- const int timestamp, const int level, const int count,
- const std::vector<ShortcutProperty> *const shortcuts)
- : mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
+ : mRepresentsBeginningOfSentence(false), mIsNotAWord(false), mIsBlacklisted(false),
+ mProbability(NOT_A_PROBABILITY), mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0),
+ mShortcuts() {}
+
+ UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+ const bool isBlacklisted, const int probability, const int timestamp, const int level,
+ const int count, const std::vector<ShortcutProperty> *const shortcuts)
+ : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+ mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
mTimestamp(timestamp), mLevel(level), mCount(count), mShortcuts(*shortcuts) {}
+ bool representsBeginningOfSentence() const {
+ return mRepresentsBeginningOfSentence;
+ }
+
bool isNotAWord() const {
return mIsNotAWord;
}
@@ -94,6 +100,7 @@ class UnigramProperty {
DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty);
// TODO: Make members const.
+ bool mRepresentsBeginningOfSentence;
bool mIsNotAWord;
bool mIsBlacklisted;
int mProbability;
diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h
index 845e629e6..a61227626 100644
--- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h
+++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h
@@ -51,6 +51,8 @@ class DictionaryHeaderStructurePolicy {
virtual const std::vector<int> *getLocale() const = 0;
+ virtual bool supportsBeginningOfSentence() const = 0;
+
protected:
DictionaryHeaderStructurePolicy() {}
diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
index ce5a49f83..3fd815f98 100644
--- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
+++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
@@ -29,6 +29,7 @@ class DicNodeVector;
class DictionaryBigramsStructurePolicy;
class DictionaryHeaderStructurePolicy;
class DictionaryShortcutsStructurePolicy;
+class PrevWordsInfo;
class UnigramProperty;
/*
@@ -69,16 +70,16 @@ class DictionaryStructureWithBufferPolicy {
virtual const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const = 0;
// Returns whether the update was success or not.
- virtual bool addUnigramWord(const int *const word, const int length,
+ virtual bool addUnigramEntry(const int *const word, const int length,
const UnigramProperty *const unigramProperty) = 0;
// Returns whether the update was success or not.
- virtual bool addBigramWords(const int *const word0, const int length0,
+ virtual bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
const BigramProperty *const bigramProperty) = 0;
// Returns whether the update was success or not.
- virtual bool removeBigramWords(const int *const word0, const int length0,
- const int *const word1, const int length1) = 0;
+ virtual bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
+ const int *const word, const int length) = 0;
virtual void flush(const char *const filePath) = 0;
diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.cpp b/native/jni/src/suggest/core/session/dic_traverse_session.cpp
index dc2b66a2c..f1e411f38 100644
--- a/native/jni/src/suggest/core/session/dic_traverse_session.cpp
+++ b/native/jni/src/suggest/core/session/dic_traverse_session.cpp
@@ -36,7 +36,7 @@ void DicTraverseSession::init(const Dictionary *const dictionary,
->getMultiWordCostMultiplier();
mSuggestOptions = suggestOptions;
prevWordsInfo->getPrevWordsTerminalPtNodePos(
- getDictionaryStructurePolicy(), mPrevWordsPtNodePos);
+ getDictionaryStructurePolicy(), mPrevWordsPtNodePos, true /* tryLowerCaseSearch */);
}
void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo,
diff --git a/native/jni/src/suggest/core/session/prev_words_info.h b/native/jni/src/suggest/core/session/prev_words_info.h
index 70a99ef38..a58000abb 100644
--- a/native/jni/src/suggest/core/session/prev_words_info.h
+++ b/native/jni/src/suggest/core/session/prev_words_info.h
@@ -20,11 +20,11 @@
#include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
+#include "utils/char_utils.h"
namespace latinime {
// TODO: Support n-gram.
-// TODO: Support beginning of sentence.
// This class does not take ownership of any code point buffers.
class PrevWordsInfo {
public:
@@ -41,29 +41,48 @@ class PrevWordsInfo {
mIsBeginningOfSentence[0] = isBeginningOfSentence;
}
+ bool isValid() const {
+ for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
+ if (mPrevWordCodePointCount[i] > MAX_WORD_LENGTH) {
+ return false;
+ }
+ }
+ return true;
+ }
+
void getPrevWordsTerminalPtNodePos(
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
- int *const outPrevWordsTerminalPtNodePos) const {
+ int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const {
for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy,
mPrevWordCodePoints[i], mPrevWordCodePointCount[i],
- mIsBeginningOfSentence[i]);
+ mIsBeginningOfSentence[i], tryLowerCaseSearch);
}
}
BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction(
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const {
- int pos = getBigramListPositionForWord(dictStructurePolicy, mPrevWordCodePoints[0],
- mPrevWordCodePointCount[0], false /* forceLowerCaseSearch */);
- // getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the
- // dictionary or has no bigrams
- if (NOT_A_DICT_POS == pos) {
- // If no bigrams for this exact word, search again in lower case.
- pos = getBigramListPositionForWord(dictStructurePolicy, mPrevWordCodePoints[0],
- mPrevWordCodePointCount[0], true /* forceLowerCaseSearch */);
+ const int bigramListPos = getBigramListPositionForWordWithTryingLowerCaseSearch(
+ dictStructurePolicy, mPrevWordCodePoints[0], mPrevWordCodePointCount[0],
+ mIsBeginningOfSentence[0]);
+ return BinaryDictionaryBigramsIterator(dictStructurePolicy->getBigramsStructurePolicy(),
+ bigramListPos);
+ }
+
+ // n is 1-indexed.
+ const int *getNthPrevWordCodePoints(const int n) const {
+ if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
+ return nullptr;
+ }
+ return mPrevWordCodePoints[n - 1];
+ }
+
+ // n is 1-indexed.
+ int getNthPrevWordCodePointCount(const int n) const {
+ if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
+ return 0;
}
- return BinaryDictionaryBigramsIterator(
- dictStructurePolicy->getBigramsStructurePolicy(), pos);
+ return mPrevWordCodePointCount[n - 1];
}
private:
@@ -72,19 +91,57 @@ class PrevWordsInfo {
static int getTerminalPtNodePosOfWord(
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
const int *const wordCodePoints, const int wordCodePointCount,
- const bool isBeginningOfSentence) {
+ const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
if (!dictStructurePolicy || !wordCodePoints) {
return NOT_A_DICT_POS;
}
+ int codePoints[MAX_WORD_LENGTH];
+ int codePointCount = wordCodePointCount;
+ memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
+ if (isBeginningOfSentence) {
+ codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
+ codePointCount, MAX_WORD_LENGTH);
+ if (codePointCount <= 0) {
+ return NOT_A_DICT_POS;
+ }
+ }
const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
- wordCodePoints, wordCodePointCount, false /* forceLowerCaseSearch */);
- if (wordPtNodePos != NOT_A_DICT_POS) {
+ codePoints, codePointCount, false /* forceLowerCaseSearch */);
+ if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) {
+ // Return the position when when the word was found or doesn't try lower case
+ // search.
return wordPtNodePos;
}
// Check bigrams for lower-cased previous word if original was not found. Useful for
// auto-capitalized words like "The [current_word]".
return dictStructurePolicy->getTerminalPtNodePositionOfWord(
- wordCodePoints, wordCodePointCount, true /* forceLowerCaseSearch */);
+ codePoints, codePointCount, true /* forceLowerCaseSearch */);
+ }
+
+ static int getBigramListPositionForWordWithTryingLowerCaseSearch(
+ const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
+ const int *const wordCodePoints, const int wordCodePointCount,
+ const bool isBeginningOfSentence) {
+ int codePoints[MAX_WORD_LENGTH];
+ int codePointCount = wordCodePointCount;
+ memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
+ if (isBeginningOfSentence) {
+ codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
+ codePointCount, MAX_WORD_LENGTH);
+ if (codePointCount <= 0) {
+ return NOT_A_DICT_POS;
+ }
+ }
+ int pos = getBigramListPositionForWord(dictStructurePolicy, codePoints,
+ codePointCount, false /* forceLowerCaseSearch */);
+ // getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the
+ // dictionary or has no bigrams
+ if (NOT_A_DICT_POS == pos) {
+ // If no bigrams for this exact word, search again in lower case.
+ pos = getBigramListPositionForWord(dictStructurePolicy, codePoints,
+ codePointCount, true /* forceLowerCaseSearch */);
+ }
+ return pos;
}
static int getBigramListPositionForWord(
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
index 479d15164..75f4fef90 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
@@ -139,6 +139,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
switch (mDictFormatVersion) {
case FormatUtils::VERSION_2:
return FormatUtils::VERSION_2;
+ case FormatUtils::VERSION_401:
+ return FormatUtils::VERSION_401;
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
case FormatUtils::VERSION_4:
@@ -246,6 +248,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return &mLocale;
}
+ bool supportsBeginningOfSentence() const {
+ return mDictFormatVersion > FormatUtils::VERSION_401;
+ }
+
private:
DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
index a8f8f284b..b13ad1879 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
@@ -98,6 +98,7 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
case FormatUtils::VERSION_2:
// Version 2 dictionary writing is not supported.
return false;
+ case FormatUtils::VERSION_401:
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
case FormatUtils::VERSION_4:
case FormatUtils::VERSION_4_DEV:
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp
index dde1af299..557a0b4c8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp
@@ -31,6 +31,7 @@
#include "suggest/core/dictionary/property/bigram_property.h"
#include "suggest/core/dictionary/property/unigram_property.h"
#include "suggest/core/dictionary/property/word_property.h"
+#include "suggest/core/session/prev_words_info.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
#include "suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
@@ -163,10 +164,10 @@ int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) cons
ptNodeParams.getTerminalId());
}
-bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int length,
+bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length,
const UnigramProperty *const unigramProperty) {
if (!mBuffers->isUpdatable()) {
- AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary.");
+ AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
return false;
}
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
@@ -218,10 +219,12 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len
}
}
-bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
+bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
const BigramProperty *const bigramProperty) {
+ const int length0 = prevWordsInfo->getNthPrevWordCodePointCount(1);
+ const int *word0 = prevWordsInfo->getNthPrevWordCodePoints(1);
if (!mBuffers->isUpdatable()) {
- AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
+ AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
return false;
}
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
@@ -257,8 +260,10 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le
}
}
-bool Ver4PatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
+bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
const int *const word1, const int length1) {
+ const int length0 = prevWordsInfo->getNthPrevWordCodePointCount(1);
+ const int *word0 = prevWordsInfo->getNthPrevWordCodePoints(1);
if (!mBuffers->isUpdatable()) {
AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
return false;
@@ -427,8 +432,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
shortcuts.emplace_back(&target, shortcutProbability);
}
}
- const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(),
- ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
+ const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
+ ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
historicalInfo->getCount(), &shortcuts);
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h
index 2f8ad539c..95813881d 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h
@@ -108,14 +108,14 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
return &mShortcutPolicy;
}
- bool addUnigramWord(const int *const word, const int length,
+ bool addUnigramEntry(const int *const word, const int length,
const UnigramProperty *const unigramProperty);
- bool addBigramWords(const int *const word0, const int length0,
+ bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
const BigramProperty *const bigramProperty);
- bool removeBigramWords(const int *const word0, const int length0, const int *const word1,
- const int length1);
+ bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word,
+ const int length);
void flush(const char *const filePath);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
index 59f1f29e9..93e330a2a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
@@ -57,13 +57,14 @@ namespace latinime {
const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) {
FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion);
switch (dictFormatVersion) {
- case FormatUtils::VERSION_4: {
+ case FormatUtils::VERSION_401: {
return newPolicyForOnMemoryV4Dict<backward::v401::Ver4DictConstants,
backward::v401::Ver4DictBuffers,
backward::v401::Ver4DictBuffers::Ver4DictBuffersPtr,
backward::v401::Ver4PatriciaTriePolicy>(
dictFormatVersion, locale, attributeMap);
}
+ case FormatUtils::VERSION_4:
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
case FormatUtils::VERSION_4_DEV: {
return newPolicyForOnMemoryV4Dict<Ver4DictConstants, Ver4DictBuffers,
@@ -115,13 +116,14 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
case FormatUtils::VERSION_2:
AKLOGE("Given path is a directory but the format is version 2. path: %s", path);
break;
- case FormatUtils::VERSION_4: {
+ case FormatUtils::VERSION_401: {
return newPolicyForV4Dict<backward::v401::Ver4DictConstants,
backward::v401::Ver4DictBuffers,
backward::v401::Ver4DictBuffers::Ver4DictBuffersPtr,
backward::v401::Ver4PatriciaTriePolicy>(
headerFilePath, formatVersion, std::move(mmappedBuffer));
}
+ case FormatUtils::VERSION_4:
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
case FormatUtils::VERSION_4_DEV: {
return newPolicyForV4Dict<Ver4DictConstants, Ver4DictBuffers,
@@ -145,7 +147,8 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
char dictPath[dictDirPathBufSize];
if (!FileUtils::getFilePathWithoutSuffix(headerFilePath,
DictConstants::HEADER_FILE_EXTENSION, dictDirPathBufSize, dictPath)) {
- AKLOGE("Dictionary file name is not valid as a ver4 dictionary. path: %s", path);
+ AKLOGE("Dictionary file name is not valid as a ver4 dictionary. header path: %s",
+ headerFilePath);
ASSERT(false);
return nullptr;
}
@@ -153,7 +156,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
DictBuffers::openVer4DictBuffers(dictPath, std::move(mmappedBuffer), formatVersion);
if (!dictBuffers || !dictBuffers->isValid()) {
AKLOGE("DICT: The dictionary doesn't satisfy ver4 format requirements. path: %s",
- path);
+ dictPath);
ASSERT(false);
return nullptr;
}
@@ -176,6 +179,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
case FormatUtils::VERSION_2:
return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
new PatriciaTriePolicy(std::move(mmappedBuffer)));
+ case FormatUtils::VERSION_401:
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
case FormatUtils::VERSION_4:
case FormatUtils::VERSION_4_DEV:
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp
index 028e9ecbf..1f00fc6ab 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp
@@ -56,7 +56,7 @@ bool DynamicPtGcEventListeners
}
} else {
mValueStack.back() += 1;
- if (ptNodeParams->isTerminal()) {
+ if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) {
mValidUnigramCount += 1;
}
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
index 5704c2e90..b2e60a837 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
@@ -160,7 +160,12 @@ class PtNodeParams {
}
AK_FORCE_INLINE bool representsNonWordInfo() const {
- return getCodePointCount() > 0 && CharUtils::isInUnicodeSpace(getCodePoints()[0])
+ return getCodePointCount() > 0 && !CharUtils::isInUnicodeSpace(getCodePoints()[0])
+ && isNotAWord();
+ }
+
+ AK_FORCE_INLINE int representsBeginningOfSentence() const {
+ return getCodePointCount() > 0 && getCodePoints()[0] == CODE_POINT_BEGINNING_OF_SENTENCE
&& isNotAWord();
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
index 30dcfba37..a6a470c4e 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
@@ -383,8 +383,8 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin
shortcuts.emplace_back(&shortcutTarget, shortcutProbability);
}
}
- const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(),
- ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
+ const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
+ ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts);
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
index 54d1e0f6d..6240d46aa 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
@@ -81,24 +81,24 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
return &mShortcutListPolicy;
}
- bool addUnigramWord(const int *const word, const int length,
+ bool addUnigramEntry(const int *const word, const int length,
const UnigramProperty *const unigramProperty) {
// This method should not be called for non-updatable dictionary.
- AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary.");
+ AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
return false;
}
- bool addBigramWords(const int *const word0, const int length0,
+ bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
const BigramProperty *const bigramProperty) {
// This method should not be called for non-updatable dictionary.
- AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
+ AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
return false;
}
- bool removeBigramWords(const int *const word0, const int length0, const int *const word1,
- const int length1) {
+ bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word,
+ const int length) {
// This method should not be called for non-updatable dictionary.
- AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary.");
+ AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
return false;
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
index 7da9e3072..02478700a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
@@ -23,6 +23,7 @@
#include "suggest/core/dictionary/property/bigram_property.h"
#include "suggest/core/dictionary/property/unigram_property.h"
#include "suggest/core/dictionary/property/word_property.h"
+#include "suggest/core/session/prev_words_info.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
@@ -60,7 +61,7 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
}
readingHelper.readNextSiblingNode(ptNodeParams);
- if (!ptNodeParams.representsNonWordInfo()) {
+ if (ptNodeParams.representsNonWordInfo()) {
// Skip PtNodes that represent non-word information.
continue;
}
@@ -155,10 +156,10 @@ int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) cons
ptNodeParams.getTerminalId());
}
-bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int length,
+bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length,
const UnigramProperty *const unigramProperty) {
if (!mBuffers->isUpdatable()) {
- AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary.");
+ AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
return false;
}
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
@@ -180,9 +181,19 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len
DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
readingHelper.initWithPtNodeArrayPos(getRootPosition());
bool addedNewUnigram = false;
- if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length,
+ int codePointsToAdd[MAX_WORD_LENGTH];
+ int codePointCountToAdd = length;
+ memmove(codePointsToAdd, word, sizeof(int) * length);
+ if (unigramProperty->representsBeginningOfSentence()) {
+ codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
+ codePointCountToAdd, MAX_WORD_LENGTH);
+ }
+ if (codePointCountToAdd <= 0) {
+ return false;
+ }
+ if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd,
unigramProperty, &addedNewUnigram)) {
- if (addedNewUnigram) {
+ if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
mUnigramCount++;
}
if (unigramProperty->getShortcuts().size() > 0) {
@@ -210,10 +221,10 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len
}
}
-bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
+bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
const BigramProperty *const bigramProperty) {
if (!mBuffers->isUpdatable()) {
- AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
+ AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
return false;
}
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
@@ -221,15 +232,20 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le
mDictBuffer->getTailPosition());
return false;
}
- if (length0 > MAX_WORD_LENGTH
- || bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
- AKLOGE("Either src word or target word is too long to insert the bigram to the dictionary. "
- "length0: %d, length1: %d", length0, bigramProperty->getTargetCodePoints()->size());
+ if (!prevWordsInfo->isValid()) {
+ AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary.");
return false;
}
- const int word0Pos = getTerminalPtNodePositionOfWord(word0, length0,
- false /* forceLowerCaseSearch */);
- if (word0Pos == NOT_A_DICT_POS) {
+ if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
+ AKLOGE("The word is too long to insert the ngram to the dictionary. "
+ "length: %d", bigramProperty->getTargetCodePoints()->size());
+ return false;
+ }
+ int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
+ false /* tryLowerCaseSearch */);
+ // TODO: Support N-gram.
+ if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
return false;
}
const int word1Pos = getTerminalPtNodePositionOfWord(
@@ -239,7 +255,8 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le
return false;
}
bool addedNewBigram = false;
- if (mUpdatingHelper.addBigramWords(word0Pos, word1Pos, bigramProperty, &addedNewBigram)) {
+ if (mUpdatingHelper.addBigramWords(prevWordsPtNodePos[0], word1Pos, bigramProperty,
+ &addedNewBigram)) {
if (addedNewBigram) {
mBigramCount++;
}
@@ -249,10 +266,10 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le
}
}
-bool Ver4PatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
- const int *const word1, const int length1) {
+bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
+ const int *const word, const int length) {
if (!mBuffers->isUpdatable()) {
- AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
+ AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
return false;
}
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
@@ -260,22 +277,26 @@ bool Ver4PatriciaTriePolicy::removeBigramWords(const int *const word0, const int
mDictBuffer->getTailPosition());
return false;
}
- if (length0 > MAX_WORD_LENGTH || length1 > MAX_WORD_LENGTH) {
- AKLOGE("Either src word or target word is too long to remove the bigram to from the "
- "dictionary. length0: %d, length1: %d", length0, length1);
+ if (!prevWordsInfo->isValid()) {
+ AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary.");
return false;
}
- const int word0Pos = getTerminalPtNodePositionOfWord(word0, length0,
- false /* forceLowerCaseSearch */);
- if (word0Pos == NOT_A_DICT_POS) {
+ if (length > MAX_WORD_LENGTH) {
+ AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length);
+ }
+ int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
+ false /* tryLowerCaseSerch */);
+ // TODO: Support N-gram.
+ if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
return false;
}
- const int word1Pos = getTerminalPtNodePositionOfWord(word1, length1,
+ const int wordPos = getTerminalPtNodePositionOfWord(word, length,
false /* forceLowerCaseSearch */);
- if (word1Pos == NOT_A_DICT_POS) {
+ if (wordPos == NOT_A_DICT_POS) {
return false;
}
- if (mUpdatingHelper.removeBigramWords(word0Pos, word1Pos)) {
+ if (mUpdatingHelper.removeBigramWords(prevWordsPtNodePos[0], wordPos)) {
mBigramCount--;
return true;
} else {
@@ -419,8 +440,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
shortcuts.emplace_back(&target, shortcutProbability);
}
}
- const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(),
- ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
+ const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
+ ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
historicalInfo->getCount(), &shortcuts);
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
index b78576484..008f2e423 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
@@ -90,13 +90,13 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
return &mShortcutPolicy;
}
- bool addUnigramWord(const int *const word, const int length,
+ bool addUnigramEntry(const int *const word, const int length,
const UnigramProperty *const unigramProperty);
- bool addBigramWords(const int *const word0, const int length0,
+ bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
const BigramProperty *const bigramProperty);
- bool removeBigramWords(const int *const word0, const int length0, const int *const word1,
+ bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word1,
const int length1);
void flush(const char *const filePath);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
index 105363db5..a04551a44 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
@@ -41,11 +41,12 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE =
TimeKeeper::setCurrentTime();
const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion);
switch (formatVersion) {
- case FormatUtils::VERSION_4:
+ case FormatUtils::VERSION_401:
return createEmptyV4DictFile<backward::v401::Ver4DictConstants,
backward::v401::Ver4DictBuffers,
backward::v401::Ver4DictBuffers::Ver4DictBuffersPtr>(
filePath, localeAsCodePointVector, attributeMap, formatVersion);
+ case FormatUtils::VERSION_4:
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
case FormatUtils::VERSION_4_DEV:
return createEmptyV4DictFile<Ver4DictConstants, Ver4DictBuffers,
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp
index ba405b07e..18f558094 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp
@@ -29,6 +29,8 @@ const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
switch (formatVersion) {
case VERSION_2:
return VERSION_2;
+ case VERSION_401:
+ return VERSION_401;
case VERSION_4_ONLY_FOR_TESTING:
return VERSION_4_ONLY_FOR_TESTING;
case VERSION_4:
@@ -60,6 +62,8 @@ const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
// same so we use them for both here.
if (ByteArrayUtils::readUint16(dict, 4) == VERSION_2) {
return VERSION_2;
+ } else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_401) {
+ return VERSION_401;
} else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_4_ONLY_FOR_TESTING) {
return VERSION_4_ONLY_FOR_TESTING;
} else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_4) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h
index c47f30ca4..b05cb2fc8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h
@@ -32,8 +32,9 @@ class FormatUtils {
// These MUST have the same values as the relevant constants in FormatSpec.java.
VERSION_2 = 2,
VERSION_4_ONLY_FOR_TESTING = 399,
- VERSION_4 = 401,
- VERSION_4_DEV = 402,
+ VERSION_401 = 401,
+ VERSION_4 = 402,
+ VERSION_4_DEV = 403,
UNKNOWN_VERSION = -1
};
diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h
index 634c45b04..f28ed5682 100644
--- a/native/jni/src/utils/char_utils.h
+++ b/native/jni/src/utils/char_utils.h
@@ -18,6 +18,7 @@
#define LATINIME_CHAR_UTILS_H
#include <cctype>
+#include <cstring>
#include <vector>
#include "defines.h"
@@ -93,6 +94,19 @@ class CharUtils {
static unsigned short latin_tolower(const unsigned short c);
static const std::vector<int> EMPTY_STRING;
+ // Returns updated code point count. Returns 0 when the code points cannot be marked as a
+ // Beginning-of-Sentence.
+ static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
+ const int codePointCount, const int maxCodePoint) {
+ if (codePointCount >= maxCodePoint) {
+ // the code points cannot be marked as a Beginning-of-Sentence.
+ return 0;
+ }
+ memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount);
+ codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE;
+ return codePointCount + 1;
+ }
+
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);