diff options
Diffstat (limited to 'native/jni/src')
26 files changed, 211 insertions, 75 deletions
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index fe3167a61..898b44f44 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -80,32 +80,38 @@ int Dictionary::getBigramProbability(const PrevWordsInfo *const prevWordsInfo, c return mBigramDictionary.getBigramProbability(prevWordsInfo, word, length); } -void Dictionary::addUnigramEntry(const int *const word, const int length, +bool Dictionary::addUnigramEntry(const int *const word, const int length, const UnigramProperty *const unigramProperty) { + if (unigramProperty->representsBeginningOfSentence() + && !mDictionaryStructureWithBufferPolicy->getHeaderStructurePolicy() + ->supportsBeginningOfSentence()) { + AKLOGE("The dictionary doesn't support Beginning-of-Sentence."); + return false; + } TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty); + return mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty); } -void Dictionary::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, +bool Dictionary::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, const BigramProperty *const bigramProperty) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->addNgramEntry(prevWordsInfo, bigramProperty); + return mDictionaryStructureWithBufferPolicy->addNgramEntry(prevWordsInfo, bigramProperty); } -void Dictionary::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, +bool Dictionary::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, const int length) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->removeNgramEntry(prevWordsInfo, word, length); + return mDictionaryStructureWithBufferPolicy->removeNgramEntry(prevWordsInfo, word, length); } -void Dictionary::flush(const char *const filePath) { +bool Dictionary::flush(const char *const filePath) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->flush(filePath); + return mDictionaryStructureWithBufferPolicy->flush(filePath); } -void Dictionary::flushWithGC(const char *const filePath) { +bool Dictionary::flushWithGC(const char *const filePath) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->flushWithGC(filePath); + return mDictionaryStructureWithBufferPolicy->flushWithGC(filePath); } bool Dictionary::needsToRunGC(const bool mindsBlockByGC) { diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index 817d9f7fc..f6d406fbd 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -57,6 +57,7 @@ class Dictionary { static const int KIND_MASK_FLAGS = 0xFFFFFF00; // Mask to get the flags static const int KIND_FLAG_POSSIBLY_OFFENSIVE = 0x80000000; static const int KIND_FLAG_EXACT_MATCH = 0x40000000; + static const int KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = 0x20000000; Dictionary(JNIEnv *env, DictionaryStructureWithBufferPolicy::StructurePolicyPtr dictionaryStructureWithBufferPolicy); @@ -75,18 +76,18 @@ class Dictionary { int getBigramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word, int length) const; - void addUnigramEntry(const int *const codePoints, const int codePointCount, + bool addUnigramEntry(const int *const codePoints, const int codePointCount, const UnigramProperty *const unigramProperty); - void addNgramEntry(const PrevWordsInfo *const prevWordsInfo, + bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, const BigramProperty *const bigramProperty); - void removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, + bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, const int length); - void flush(const char *const filePath); + bool flush(const char *const filePath); - void flushWithGC(const char *const filePath); + bool flushWithGC(const char *const filePath); bool needsToRunGC(const bool mindsBlockByGC); diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp index 0635fef7e..b6bf7a98c 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp @@ -31,4 +31,8 @@ const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x80; const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH = NOT_AN_ERROR | MATCH_WITH_CASE_ERROR | MATCH_WITH_ACCENT_ERROR | MATCH_WITH_DIGRAPH; +const ErrorTypeUtils::ErrorType + ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = + ERRORS_TREATED_AS_AN_EXACT_MATCH | INTENTIONAL_OMISSION; + } // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.h b/native/jni/src/suggest/core/dictionary/error_type_utils.h index 0e8e5b635..e3e76b238 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.h +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.h @@ -51,6 +51,11 @@ class ErrorTypeUtils { return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0; } + static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) { + return (containedErrorTypes + & ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0; + } + static bool isEditCorrectionError(const ErrorType errorType) { return (errorType & EDIT_CORRECTION) != 0; } @@ -67,6 +72,7 @@ class ErrorTypeUtils { DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils); static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH; + static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION; }; } // namespace latinime #endif // LATINIME_ERROR_TYPE_UTILS_H diff --git a/native/jni/src/suggest/core/dictionary/property/unigram_property.h b/native/jni/src/suggest/core/dictionary/property/unigram_property.h index d2551057b..902eb000f 100644 --- a/native/jni/src/suggest/core/dictionary/property/unigram_property.h +++ b/native/jni/src/suggest/core/dictionary/property/unigram_property.h @@ -48,15 +48,21 @@ class UnigramProperty { }; UnigramProperty() - : mIsNotAWord(false), mIsBlacklisted(false), mProbability(NOT_A_PROBABILITY), - mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0), mShortcuts() {} - - UnigramProperty(const bool isNotAWord, const bool isBlacklisted, const int probability, - const int timestamp, const int level, const int count, - const std::vector<ShortcutProperty> *const shortcuts) - : mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability), + : mRepresentsBeginningOfSentence(false), mIsNotAWord(false), mIsBlacklisted(false), + mProbability(NOT_A_PROBABILITY), mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0), + mShortcuts() {} + + UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord, + const bool isBlacklisted, const int probability, const int timestamp, const int level, + const int count, const std::vector<ShortcutProperty> *const shortcuts) + : mRepresentsBeginningOfSentence(representsBeginningOfSentence), + mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability), mTimestamp(timestamp), mLevel(level), mCount(count), mShortcuts(*shortcuts) {} + bool representsBeginningOfSentence() const { + return mRepresentsBeginningOfSentence; + } + bool isNotAWord() const { return mIsNotAWord; } @@ -94,6 +100,7 @@ class UnigramProperty { DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty); // TODO: Make members const. + bool mRepresentsBeginningOfSentence; bool mIsNotAWord; bool mIsBlacklisted; int mProbability; diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h index 845e629e6..a61227626 100644 --- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h @@ -51,6 +51,8 @@ class DictionaryHeaderStructurePolicy { virtual const std::vector<int> *getLocale() const = 0; + virtual bool supportsBeginningOfSentence() const = 0; + protected: DictionaryHeaderStructurePolicy() {} diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h index 3fd815f98..cda89406c 100644 --- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h @@ -81,9 +81,11 @@ class DictionaryStructureWithBufferPolicy { virtual bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, const int length) = 0; - virtual void flush(const char *const filePath) = 0; + // Returns whether the flush was success or not. + virtual bool flush(const char *const filePath) = 0; - virtual void flushWithGC(const char *const filePath) = 0; + // Returns whether the GC and flush were success or not. + virtual bool flushWithGC(const char *const filePath) = 0; virtual bool needsToRunGC(const bool mindsBlockByGC) const = 0; diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp index a307cb45d..23908255b 100644 --- a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp +++ b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp @@ -89,6 +89,9 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0; const bool isExactMatch = ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); + const bool isExactMatchWithIntentionalOmission = + ErrorTypeUtils::isExactMatchWithIntentionalOmission( + terminalDicNode->getContainedErrorTypes()); const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); // Heuristic: We exclude probability=0 first-char-uppercase words from exact match. // (e.g. "AMD" and "and") @@ -96,7 +99,9 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; && !(isPossiblyOffensiveWord && isFirstCharUppercase); const int outputTypeFlags = (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) - | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0); + | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) + | (isExactMatchWithIntentionalOmission ? + Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0); // Entries that are blacklisted or do not represent a word should not be output. const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord(); diff --git a/native/jni/src/suggest/core/session/prev_words_info.h b/native/jni/src/suggest/core/session/prev_words_info.h index e4de1f4cc..56c53c1c2 100644 --- a/native/jni/src/suggest/core/session/prev_words_info.h +++ b/native/jni/src/suggest/core/session/prev_words_info.h @@ -20,11 +20,11 @@ #include "defines.h" #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "utils/char_utils.h" namespace latinime { // TODO: Support n-gram. -// TODO: Support beginning of sentence. // This class does not take ownership of any code point buffers. class PrevWordsInfo { public: @@ -52,8 +52,7 @@ class PrevWordsInfo { void getPrevWordsTerminalPtNodePos( const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, - int *const outPrevWordsTerminalPtNodePos, - const bool tryLowerCaseSearch) const { + int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const { for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy, mPrevWordCodePoints[i], mPrevWordCodePointCount[i], @@ -63,17 +62,11 @@ class PrevWordsInfo { BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction( const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const { - int pos = getBigramListPositionForWord(dictStructurePolicy, mPrevWordCodePoints[0], - mPrevWordCodePointCount[0], false /* forceLowerCaseSearch */); - // getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the - // dictionary or has no bigrams - if (NOT_A_DICT_POS == pos) { - // If no bigrams for this exact word, search again in lower case. - pos = getBigramListPositionForWord(dictStructurePolicy, mPrevWordCodePoints[0], - mPrevWordCodePointCount[0], true /* forceLowerCaseSearch */); - } - return BinaryDictionaryBigramsIterator( - dictStructurePolicy->getBigramsStructurePolicy(), pos); + const int bigramListPos = getBigramListPositionForWordWithTryingLowerCaseSearch( + dictStructurePolicy, mPrevWordCodePoints[0], mPrevWordCodePointCount[0], + mIsBeginningOfSentence[0]); + return BinaryDictionaryBigramsIterator(dictStructurePolicy->getBigramsStructurePolicy(), + bigramListPos); } // n is 1-indexed. @@ -99,11 +92,21 @@ class PrevWordsInfo { const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, const int *const wordCodePoints, const int wordCodePointCount, const bool isBeginningOfSentence, const bool tryLowerCaseSearch) { - if (!dictStructurePolicy || !wordCodePoints) { + if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { return NOT_A_DICT_POS; } + int codePoints[MAX_WORD_LENGTH]; + int codePointCount = wordCodePointCount; + memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); + if (isBeginningOfSentence) { + codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, + codePointCount, MAX_WORD_LENGTH); + if (codePointCount <= 0) { + return NOT_A_DICT_POS; + } + } const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord( - wordCodePoints, wordCodePointCount, false /* forceLowerCaseSearch */); + codePoints, codePointCount, false /* forceLowerCaseSearch */); if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) { // Return the position when when the word was found or doesn't try lower case // search. @@ -112,7 +115,36 @@ class PrevWordsInfo { // Check bigrams for lower-cased previous word if original was not found. Useful for // auto-capitalized words like "The [current_word]". return dictStructurePolicy->getTerminalPtNodePositionOfWord( - wordCodePoints, wordCodePointCount, true /* forceLowerCaseSearch */); + codePoints, codePointCount, true /* forceLowerCaseSearch */); + } + + static int getBigramListPositionForWordWithTryingLowerCaseSearch( + const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, + const int *const wordCodePoints, const int wordCodePointCount, + const bool isBeginningOfSentence) { + if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { + return NOT_A_DICT_POS; + } + int codePoints[MAX_WORD_LENGTH]; + int codePointCount = wordCodePointCount; + memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); + if (isBeginningOfSentence) { + codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, + codePointCount, MAX_WORD_LENGTH); + if (codePointCount <= 0) { + return NOT_A_DICT_POS; + } + } + int pos = getBigramListPositionForWord(dictStructurePolicy, codePoints, + codePointCount, false /* forceLowerCaseSearch */); + // getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the + // dictionary or has no bigrams + if (NOT_A_DICT_POS == pos) { + // If no bigrams for this exact word, search again in lower case. + pos = getBigramListPositionForWord(dictStructurePolicy, codePoints, + codePointCount, true /* forceLowerCaseSearch */); + } + return pos; } static int getBigramListPositionForWord( diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 479d15164..75f4fef90 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -139,6 +139,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { switch (mDictFormatVersion) { case FormatUtils::VERSION_2: return FormatUtils::VERSION_2; + case FormatUtils::VERSION_401: + return FormatUtils::VERSION_401; case FormatUtils::VERSION_4_ONLY_FOR_TESTING: return FormatUtils::VERSION_4_ONLY_FOR_TESTING; case FormatUtils::VERSION_4: @@ -246,6 +248,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return &mLocale; } + bool supportsBeginningOfSentence() const { + return mDictFormatVersion > FormatUtils::VERSION_401; + } + private: DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp index a8f8f284b..b13ad1879 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp @@ -98,6 +98,7 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; case FormatUtils::VERSION_2: // Version 2 dictionary writing is not supported. return false; + case FormatUtils::VERSION_401: case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: case FormatUtils::VERSION_4_DEV: diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp index 97e1120a3..0f60a898d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp @@ -296,26 +296,30 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWor } } -void Ver4PatriciaTriePolicy::flush(const char *const filePath) { +bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); - return; + return false; } if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { AKLOGE("Cannot flush the dictionary to file."); mIsCorrupted = true; + return false; } + return true; } -void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { +bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); - return; + return false; } if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { AKLOGE("Cannot flush the dictionary to file with GC."); mIsCorrupted = true; + return false; } + return true; } bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { @@ -432,8 +436,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code shortcuts.emplace_back(&target, shortcutProbability); } } - const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(), - ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), + const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, + ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), historicalInfo->getTimeStamp(), historicalInfo->getLevel(), historicalInfo->getCount(), &shortcuts); return WordProperty(&codePointVector, &unigramProperty, &bigrams); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h index 95813881d..b064aaf33 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h @@ -117,9 +117,9 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, const int length); - void flush(const char *const filePath); + bool flush(const char *const filePath); - void flushWithGC(const char *const filePath); + bool flushWithGC(const char *const filePath); bool needsToRunGC(const bool mindsBlockByGC) const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp index f93d2894c..93e330a2a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp @@ -57,13 +57,14 @@ namespace latinime { const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion); switch (dictFormatVersion) { - case FormatUtils::VERSION_4: { + case FormatUtils::VERSION_401: { return newPolicyForOnMemoryV4Dict<backward::v401::Ver4DictConstants, backward::v401::Ver4DictBuffers, backward::v401::Ver4DictBuffers::Ver4DictBuffersPtr, backward::v401::Ver4PatriciaTriePolicy>( dictFormatVersion, locale, attributeMap); } + case FormatUtils::VERSION_4: case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4_DEV: { return newPolicyForOnMemoryV4Dict<Ver4DictConstants, Ver4DictBuffers, @@ -115,13 +116,14 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str case FormatUtils::VERSION_2: AKLOGE("Given path is a directory but the format is version 2. path: %s", path); break; - case FormatUtils::VERSION_4: { + case FormatUtils::VERSION_401: { return newPolicyForV4Dict<backward::v401::Ver4DictConstants, backward::v401::Ver4DictBuffers, backward::v401::Ver4DictBuffers::Ver4DictBuffersPtr, backward::v401::Ver4PatriciaTriePolicy>( headerFilePath, formatVersion, std::move(mmappedBuffer)); } + case FormatUtils::VERSION_4: case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4_DEV: { return newPolicyForV4Dict<Ver4DictConstants, Ver4DictBuffers, @@ -177,6 +179,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str case FormatUtils::VERSION_2: return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( new PatriciaTriePolicy(std::move(mmappedBuffer))); + case FormatUtils::VERSION_401: case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: case FormatUtils::VERSION_4_DEV: diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp index 028e9ecbf..1f00fc6ab 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp @@ -56,7 +56,7 @@ bool DynamicPtGcEventListeners } } else { mValueStack.back() += 1; - if (ptNodeParams->isTerminal()) { + if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) { mValidUnigramCount += 1; } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h index 5704c2e90..b2e60a837 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h @@ -160,7 +160,12 @@ class PtNodeParams { } AK_FORCE_INLINE bool representsNonWordInfo() const { - return getCodePointCount() > 0 && CharUtils::isInUnicodeSpace(getCodePoints()[0]) + return getCodePointCount() > 0 && !CharUtils::isInUnicodeSpace(getCodePoints()[0]) + && isNotAWord(); + } + + AK_FORCE_INLINE int representsBeginningOfSentence() const { + return getCodePointCount() > 0 && getCodePoints()[0] == CODE_POINT_BEGINNING_OF_SENTENCE && isNotAWord(); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp index 30dcfba37..a6a470c4e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp @@ -383,8 +383,8 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin shortcuts.emplace_back(&shortcutTarget, shortcutProbability); } } - const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(), - ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), + const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), + ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); return WordProperty(&codePointVector, &unigramProperty, &bigrams); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h index 6240d46aa..88bbfd966 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h @@ -102,14 +102,16 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { return false; } - void flush(const char *const filePath) { + bool flush(const char *const filePath) { // This method should not be called for non-updatable dictionary. AKLOGI("Warning: flush() is called for non-updatable dictionary."); + return false; } - void flushWithGC(const char *const filePath) { + bool flushWithGC(const char *const filePath) { // This method should not be called for non-updatable dictionary. AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return false; } bool needsToRunGC(const bool mindsBlockByGC) const { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp index d53922763..e1ceaee49 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp @@ -23,9 +23,11 @@ namespace latinime { const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( int *const bigramEntryPos) const { const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer(); - if (*bigramEntryPos < 0 || *bigramEntryPos >= bigramListBuffer->getTailPosition()) { - AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bufSize: %d", - *bigramEntryPos, bigramListBuffer->getTailPosition()); + const int bigramEntryTailPos = (*bigramEntryPos) + getBigramEntrySize(); + if (*bigramEntryPos < 0 || bigramEntryTailPos > bigramListBuffer->getTailPosition()) { + AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bigramEntryTailPos: %d, " + "bufSize: %d", *bigramEntryPos, bigramEntryTailPos, + bigramListBuffer->getTailPosition()); ASSERT(false); return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY, Ver4DictConstants::NOT_A_TERMINAL_ID); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h index b8bdb63a8..52447a336 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h @@ -99,6 +99,20 @@ class BigramDictContent : public SparseTableDictContent { return hasNext ? Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0; } + int getBigramEntrySize() const { + if (mHasHistoricalInfo) { + return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE + + Ver4DictConstants::TIME_STAMP_FIELD_SIZE + + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE + + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; + } else { + return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE + + Ver4DictConstants::PROBABILITY_SIZE + + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; + } + } + bool runGCBigramList(const int bigramListPos, const BigramDictContent *const sourceBigramDictContent, const int toPos, const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index 439e90e44..09c7b7d85 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -61,7 +61,7 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; } readingHelper.readNextSiblingNode(ptNodeParams); - if (!ptNodeParams.representsNonWordInfo()) { + if (ptNodeParams.representsNonWordInfo()) { // Skip PtNodes that represent non-word information. continue; } @@ -181,9 +181,19 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); readingHelper.initWithPtNodeArrayPos(getRootPosition()); bool addedNewUnigram = false; - if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length, + int codePointsToAdd[MAX_WORD_LENGTH]; + int codePointCountToAdd = length; + memmove(codePointsToAdd, word, sizeof(int) * length); + if (unigramProperty->representsBeginningOfSentence()) { + codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, + codePointCountToAdd, MAX_WORD_LENGTH); + } + if (codePointCountToAdd <= 0) { + return false; + } + if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd, unigramProperty, &addedNewUnigram)) { - if (addedNewUnigram) { + if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { mUnigramCount++; } if (unigramProperty->getShortcuts().size() > 0) { @@ -294,26 +304,30 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWor } } -void Ver4PatriciaTriePolicy::flush(const char *const filePath) { +bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); - return; + return false; } if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { AKLOGE("Cannot flush the dictionary to file."); mIsCorrupted = true; + return false; } + return true; } -void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { +bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); - return; + return false; } if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { AKLOGE("Cannot flush the dictionary to file with GC."); mIsCorrupted = true; + return false; } + return true; } bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { @@ -430,8 +444,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code shortcuts.emplace_back(&target, shortcutProbability); } } - const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(), - ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), + const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), + ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), historicalInfo->getTimeStamp(), historicalInfo->getLevel(), historicalInfo->getCount(), &shortcuts); return WordProperty(&codePointVector, &unigramProperty, &bigrams); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h index 008f2e423..d198c97fd 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -99,9 +99,9 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word1, const int length1); - void flush(const char *const filePath); + bool flush(const char *const filePath); - void flushWithGC(const char *const filePath); + bool flushWithGC(const char *const filePath); bool needsToRunGC(const bool mindsBlockByGC) const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp index 105363db5..a04551a44 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp @@ -41,11 +41,12 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = TimeKeeper::setCurrentTime(); const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion); switch (formatVersion) { - case FormatUtils::VERSION_4: + case FormatUtils::VERSION_401: return createEmptyV4DictFile<backward::v401::Ver4DictConstants, backward::v401::Ver4DictBuffers, backward::v401::Ver4DictBuffers::Ver4DictBuffersPtr>( filePath, localeAsCodePointVector, attributeMap, formatVersion); + case FormatUtils::VERSION_4: case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4_DEV: return createEmptyV4DictFile<Ver4DictConstants, Ver4DictBuffers, diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp index ba405b07e..18f558094 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp @@ -29,6 +29,8 @@ const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; switch (formatVersion) { case VERSION_2: return VERSION_2; + case VERSION_401: + return VERSION_401; case VERSION_4_ONLY_FOR_TESTING: return VERSION_4_ONLY_FOR_TESTING; case VERSION_4: @@ -60,6 +62,8 @@ const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; // same so we use them for both here. if (ByteArrayUtils::readUint16(dict, 4) == VERSION_2) { return VERSION_2; + } else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_401) { + return VERSION_401; } else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_4_ONLY_FOR_TESTING) { return VERSION_4_ONLY_FOR_TESTING; } else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_4) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h index c47f30ca4..b05cb2fc8 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h @@ -32,8 +32,9 @@ class FormatUtils { // These MUST have the same values as the relevant constants in FormatSpec.java. VERSION_2 = 2, VERSION_4_ONLY_FOR_TESTING = 399, - VERSION_4 = 401, - VERSION_4_DEV = 402, + VERSION_401 = 401, + VERSION_4 = 402, + VERSION_4_DEV = 403, UNKNOWN_VERSION = -1 }; diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h index 634c45b04..f28ed5682 100644 --- a/native/jni/src/utils/char_utils.h +++ b/native/jni/src/utils/char_utils.h @@ -18,6 +18,7 @@ #define LATINIME_CHAR_UTILS_H #include <cctype> +#include <cstring> #include <vector> #include "defines.h" @@ -93,6 +94,19 @@ class CharUtils { static unsigned short latin_tolower(const unsigned short c); static const std::vector<int> EMPTY_STRING; + // Returns updated code point count. Returns 0 when the code points cannot be marked as a + // Beginning-of-Sentence. + static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints, + const int codePointCount, const int maxCodePoint) { + if (codePointCount >= maxCodePoint) { + // the code points cannot be marked as a Beginning-of-Sentence. + return 0; + } + memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount); + codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE; + return codePointCount + 1; + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); |