diff options
Diffstat (limited to 'native/jni/src')
32 files changed, 364 insertions, 79 deletions
diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h index ef03d2b6d..92f39ea25 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node.h +++ b/native/jni/src/suggest/core/dicnode/dic_node.h @@ -125,7 +125,7 @@ class DicNode { PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); } - void initAsPassingChild(DicNode *parentDicNode) { + void initAsPassingChild(const DicNode *parentDicNode) { mIsCachedForNextSuggestion = parentDicNode->mIsCachedForNextSuggestion; const int codePoint = parentDicNode->mDicNodeState.mDicNodeStateOutput.getCurrentWordCodePointAt( diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index bf2a0000d..4445f4aaf 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -48,7 +48,7 @@ namespace latinime { /////////////////////////////////// // Traverse node expansion utils // /////////////////////////////////// -/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, +/* static */ void DicNodeUtils::getAllChildDicNodes(const DicNode *dicNode, const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, DicNodeVector *const childDicNodes) { if (dicNode->isTotalInputSizeExceedingLimit()) { diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.h b/native/jni/src/suggest/core/dicnode/dic_node_utils.h index 0d60e5796..00e80c604 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.h @@ -35,7 +35,7 @@ class DicNodeUtils { const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, const DicNode *const prevWordLastDicNode, DicNode *const newRootDicNode); static void initByCopy(const DicNode *const srcDicNode, DicNode *const destDicNode); - static void getAllChildDicNodes(DicNode *dicNode, + static void getAllChildDicNodes(const 
DicNode *dicNode, const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, DicNodeVector *childDicNodes); static float getBigramNodeImprobability( diff --git a/native/jni/src/suggest/core/dicnode/dic_node_vector.h b/native/jni/src/suggest/core/dicnode/dic_node_vector.h index cb28e57d8..54cde1988 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_vector.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_vector.h @@ -52,7 +52,7 @@ class DicNodeVector { return static_cast<int>(mDicNodes.size()); } - void pushPassingChild(DicNode *dicNode) { + void pushPassingChild(const DicNode *dicNode) { ASSERT(!mLock); mDicNodes.emplace_back(); mDicNodes.back().initAsPassingChild(dicNode); diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index fe3167a61..f88388c75 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -19,6 +19,7 @@ #include "suggest/core/dictionary/dictionary.h" #include "defines.h" +#include "suggest/core/dictionary/dictionary_utils.h" #include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/core/result/suggestion_results.h" #include "suggest/core/session/dic_traverse_session.h" @@ -74,38 +75,50 @@ int Dictionary::getProbability(const int *word, int length) const { return getDictionaryStructurePolicy()->getUnigramProbabilityOfPtNode(pos); } +int Dictionary::getMaxProbabilityOfExactMatches(const int *word, int length) const { + TimeKeeper::setCurrentTime(); + return DictionaryUtils::getMaxProbabilityOfExactMatches( + mDictionaryStructureWithBufferPolicy.get(), word, length); +} + int Dictionary::getBigramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word, int length) const { TimeKeeper::setCurrentTime(); return mBigramDictionary.getBigramProbability(prevWordsInfo, word, length); } -void Dictionary::addUnigramEntry(const int *const word, const 
int length, +bool Dictionary::addUnigramEntry(const int *const word, const int length, const UnigramProperty *const unigramProperty) { + if (unigramProperty->representsBeginningOfSentence() + && !mDictionaryStructureWithBufferPolicy->getHeaderStructurePolicy() + ->supportsBeginningOfSentence()) { + AKLOGE("The dictionary doesn't support Beginning-of-Sentence."); + return false; + } TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty); + return mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty); } -void Dictionary::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, +bool Dictionary::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, const BigramProperty *const bigramProperty) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->addNgramEntry(prevWordsInfo, bigramProperty); + return mDictionaryStructureWithBufferPolicy->addNgramEntry(prevWordsInfo, bigramProperty); } -void Dictionary::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, +bool Dictionary::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, const int length) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->removeNgramEntry(prevWordsInfo, word, length); + return mDictionaryStructureWithBufferPolicy->removeNgramEntry(prevWordsInfo, word, length); } -void Dictionary::flush(const char *const filePath) { +bool Dictionary::flush(const char *const filePath) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->flush(filePath); + return mDictionaryStructureWithBufferPolicy->flush(filePath); } -void Dictionary::flushWithGC(const char *const filePath) { +bool Dictionary::flushWithGC(const char *const filePath) { TimeKeeper::setCurrentTime(); - mDictionaryStructureWithBufferPolicy->flushWithGC(filePath); + return mDictionaryStructureWithBufferPolicy->flushWithGC(filePath); } bool Dictionary::needsToRunGC(const 
bool mindsBlockByGC) { diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index 817d9f7fc..10010b21c 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -57,6 +57,7 @@ class Dictionary { static const int KIND_MASK_FLAGS = 0xFFFFFF00; // Mask to get the flags static const int KIND_FLAG_POSSIBLY_OFFENSIVE = 0x80000000; static const int KIND_FLAG_EXACT_MATCH = 0x40000000; + static const int KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = 0x20000000; Dictionary(JNIEnv *env, DictionaryStructureWithBufferPolicy::StructurePolicyPtr dictionaryStructureWithBufferPolicy); @@ -72,21 +73,23 @@ class Dictionary { int getProbability(const int *word, int length) const; + int getMaxProbabilityOfExactMatches(const int *word, int length) const; + int getBigramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word, int length) const; - void addUnigramEntry(const int *const codePoints, const int codePointCount, + bool addUnigramEntry(const int *const codePoints, const int codePointCount, const UnigramProperty *const unigramProperty); - void addNgramEntry(const PrevWordsInfo *const prevWordsInfo, + bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, const BigramProperty *const bigramProperty); - void removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, + bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, const int length); - void flush(const char *const filePath); + bool flush(const char *const filePath); - void flushWithGC(const char *const filePath); + bool flushWithGC(const char *const filePath); bool needsToRunGC(const bool mindsBlockByGC); diff --git a/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp b/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp new file mode 100644 index 000000000..b94966cbe --- /dev/null +++ 
b/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/dictionary_utils.h" + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_priority_queue.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/dictionary/dictionary.h" +#include "suggest/core/dictionary/digraph_utils.h" +#include "suggest/core/session/prev_words_info.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" + +namespace latinime { + +/* static */ int DictionaryUtils::getMaxProbabilityOfExactMatches( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const int *const codePoints, const int codePointCount) { + std::vector<DicNode> current; + std::vector<DicNode> next; + + // No prev words information. + PrevWordsInfo emptyPrevWordsInfo; + int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + emptyPrevWordsInfo.getPrevWordsTerminalPtNodePos(dictionaryStructurePolicy, + prevWordsPtNodePos, false /* tryLowerCaseSearch */); + current.emplace_back(); + DicNodeUtils::initAsRoot(dictionaryStructurePolicy, prevWordsPtNodePos, &current.front()); + for (int i = 0; i < codePointCount; ++i) { + // The base-lower input is used to ignore case errors and accent errors. 
+ const int codePoint = CharUtils::toBaseLowerCase(codePoints[i]); + for (const DicNode &dicNode : current) { + if (dicNode.isInDigraph() && dicNode.getNodeCodePoint() == codePoint) { + next.emplace_back(dicNode); + next.back().advanceDigraphIndex(); + continue; + } + processChildDicNodes(dictionaryStructurePolicy, codePoint, &dicNode, &next); + } + current.clear(); + current.swap(next); + } + + int maxProbability = NOT_A_PROBABILITY; + for (const DicNode &dicNode : current) { + if (!dicNode.isTerminalDicNode()) { + continue; + } + // dicNode can contain case errors, accent errors, intentional omissions or digraphs. + maxProbability = std::max(maxProbability, dicNode.getProbability()); + } + return maxProbability; +} + +/* static */ void DictionaryUtils::processChildDicNodes( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const int inputCodePoint, const DicNode *const parentDicNode, + std::vector<DicNode> *const outDicNodes) { + DicNodeVector childDicNodes; + DicNodeUtils::getAllChildDicNodes(parentDicNode, dictionaryStructurePolicy, &childDicNodes); + for (int childIndex = 0; childIndex < childDicNodes.getSizeAndLock(); ++childIndex) { + DicNode *const childDicNode = childDicNodes[childIndex]; + const int codePoint = CharUtils::toBaseLowerCase(childDicNode->getNodeCodePoint()); + if (inputCodePoint == codePoint) { + outDicNodes->emplace_back(*childDicNode); + } + if (childDicNode->canBeIntentionalOmission()) { + processChildDicNodes(dictionaryStructurePolicy, inputCodePoint, childDicNode, + outDicNodes); + } + if (DigraphUtils::hasDigraphForCodePoint( + dictionaryStructurePolicy->getHeaderStructurePolicy(), + childDicNode->getNodeCodePoint())) { + childDicNode->advanceDigraphIndex(); + if (childDicNode->getNodeCodePoint() == codePoint) { + childDicNode->advanceDigraphIndex(); + outDicNodes->emplace_back(*childDicNode); + } + } + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/dictionary_utils.h 
b/native/jni/src/suggest/core/dictionary/dictionary_utils.h new file mode 100644 index 000000000..358ebf674 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/dictionary_utils.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_UTILS_H +#define LATINIME_DICTIONARY_UTILS_H + +#include <vector> + +#include "defines.h" + +namespace latinime { + +class DictionaryStructureWithBufferPolicy; +class DicNode; + +class DictionaryUtils { + public: + static int getMaxProbabilityOfExactMatches( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const int *const codePoints, const int codePointCount); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryUtils); + + static void processChildDicNodes( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const int inputCodePoint, const DicNode *const parentDicNode, + std::vector<DicNode> *const outDicNodes); +}; +} // namespace latinime +#endif // LATINIME_DICTIONARY_UTILS_H diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp index 0635fef7e..b6bf7a98c 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp @@ -31,4 +31,8 @@ const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x80; 
const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH = NOT_AN_ERROR | MATCH_WITH_CASE_ERROR | MATCH_WITH_ACCENT_ERROR | MATCH_WITH_DIGRAPH; +const ErrorTypeUtils::ErrorType + ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = + ERRORS_TREATED_AS_AN_EXACT_MATCH | INTENTIONAL_OMISSION; + } // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.h b/native/jni/src/suggest/core/dictionary/error_type_utils.h index 0e8e5b635..e3e76b238 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.h +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.h @@ -51,6 +51,11 @@ class ErrorTypeUtils { return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0; } + static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) { + return (containedErrorTypes + & ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0; + } + static bool isEditCorrectionError(const ErrorType errorType) { return (errorType & EDIT_CORRECTION) != 0; } @@ -67,6 +72,7 @@ class ErrorTypeUtils { DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils); static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH; + static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION; }; } // namespace latinime #endif // LATINIME_ERROR_TYPE_UTILS_H diff --git a/native/jni/src/suggest/core/dictionary/property/unigram_property.h b/native/jni/src/suggest/core/dictionary/property/unigram_property.h index d2551057b..902eb000f 100644 --- a/native/jni/src/suggest/core/dictionary/property/unigram_property.h +++ b/native/jni/src/suggest/core/dictionary/property/unigram_property.h @@ -48,15 +48,21 @@ class UnigramProperty { }; UnigramProperty() - : mIsNotAWord(false), mIsBlacklisted(false), mProbability(NOT_A_PROBABILITY), - mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0), mShortcuts() {} - - UnigramProperty(const bool isNotAWord, const bool isBlacklisted, const int 
probability, - const int timestamp, const int level, const int count, - const std::vector<ShortcutProperty> *const shortcuts) - : mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability), + : mRepresentsBeginningOfSentence(false), mIsNotAWord(false), mIsBlacklisted(false), + mProbability(NOT_A_PROBABILITY), mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0), + mShortcuts() {} + + UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord, + const bool isBlacklisted, const int probability, const int timestamp, const int level, + const int count, const std::vector<ShortcutProperty> *const shortcuts) + : mRepresentsBeginningOfSentence(representsBeginningOfSentence), + mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability), mTimestamp(timestamp), mLevel(level), mCount(count), mShortcuts(*shortcuts) {} + bool representsBeginningOfSentence() const { + return mRepresentsBeginningOfSentence; + } + bool isNotAWord() const { return mIsNotAWord; } @@ -94,6 +100,7 @@ class UnigramProperty { DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty); // TODO: Make members const. 
+ bool mRepresentsBeginningOfSentence; bool mIsNotAWord; bool mIsBlacklisted; int mProbability; diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h index 845e629e6..a61227626 100644 --- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h @@ -51,6 +51,8 @@ class DictionaryHeaderStructurePolicy { virtual const std::vector<int> *getLocale() const = 0; + virtual bool supportsBeginningOfSentence() const = 0; + protected: DictionaryHeaderStructurePolicy() {} diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h index 3fd815f98..cda89406c 100644 --- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h @@ -81,9 +81,11 @@ class DictionaryStructureWithBufferPolicy { virtual bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, const int length) = 0; - virtual void flush(const char *const filePath) = 0; + // Returns whether the flush was success or not. + virtual bool flush(const char *const filePath) = 0; - virtual void flushWithGC(const char *const filePath) = 0; + // Returns whether the GC and flush were success or not. 
+ virtual bool flushWithGC(const char *const filePath) = 0; virtual bool needsToRunGC(const bool mindsBlockByGC) const = 0; diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp index a307cb45d..23908255b 100644 --- a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp +++ b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp @@ -89,6 +89,9 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0; const bool isExactMatch = ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); + const bool isExactMatchWithIntentionalOmission = + ErrorTypeUtils::isExactMatchWithIntentionalOmission( + terminalDicNode->getContainedErrorTypes()); const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); // Heuristic: We exclude probability=0 first-char-uppercase words from exact match. // (e.g. "AMD" and "and") @@ -96,7 +99,9 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; && !(isPossiblyOffensiveWord && isFirstCharUppercase); const int outputTypeFlags = (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) - | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0); + | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) + | (isExactMatchWithIntentionalOmission ? + Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0); // Entries that are blacklisted or do not represent a word should not be output. 
const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord(); diff --git a/native/jni/src/suggest/core/session/prev_words_info.h b/native/jni/src/suggest/core/session/prev_words_info.h index e4de1f4cc..56c53c1c2 100644 --- a/native/jni/src/suggest/core/session/prev_words_info.h +++ b/native/jni/src/suggest/core/session/prev_words_info.h @@ -20,11 +20,11 @@ #include "defines.h" #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "utils/char_utils.h" namespace latinime { // TODO: Support n-gram. -// TODO: Support beginning of sentence. // This class does not take ownership of any code point buffers. class PrevWordsInfo { public: @@ -52,8 +52,7 @@ class PrevWordsInfo { void getPrevWordsTerminalPtNodePos( const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, - int *const outPrevWordsTerminalPtNodePos, - const bool tryLowerCaseSearch) const { + int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const { for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy, mPrevWordCodePoints[i], mPrevWordCodePointCount[i], @@ -63,17 +62,11 @@ class PrevWordsInfo { BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction( const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const { - int pos = getBigramListPositionForWord(dictStructurePolicy, mPrevWordCodePoints[0], - mPrevWordCodePointCount[0], false /* forceLowerCaseSearch */); - // getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the - // dictionary or has no bigrams - if (NOT_A_DICT_POS == pos) { - // If no bigrams for this exact word, search again in lower case. 
- pos = getBigramListPositionForWord(dictStructurePolicy, mPrevWordCodePoints[0], - mPrevWordCodePointCount[0], true /* forceLowerCaseSearch */); - } - return BinaryDictionaryBigramsIterator( - dictStructurePolicy->getBigramsStructurePolicy(), pos); + const int bigramListPos = getBigramListPositionForWordWithTryingLowerCaseSearch( + dictStructurePolicy, mPrevWordCodePoints[0], mPrevWordCodePointCount[0], + mIsBeginningOfSentence[0]); + return BinaryDictionaryBigramsIterator(dictStructurePolicy->getBigramsStructurePolicy(), + bigramListPos); } // n is 1-indexed. @@ -99,11 +92,21 @@ class PrevWordsInfo { const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, const int *const wordCodePoints, const int wordCodePointCount, const bool isBeginningOfSentence, const bool tryLowerCaseSearch) { - if (!dictStructurePolicy || !wordCodePoints) { + if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { return NOT_A_DICT_POS; } + int codePoints[MAX_WORD_LENGTH]; + int codePointCount = wordCodePointCount; + memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); + if (isBeginningOfSentence) { + codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, + codePointCount, MAX_WORD_LENGTH); + if (codePointCount <= 0) { + return NOT_A_DICT_POS; + } + } const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord( - wordCodePoints, wordCodePointCount, false /* forceLowerCaseSearch */); + codePoints, codePointCount, false /* forceLowerCaseSearch */); if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) { // Return the position when when the word was found or doesn't try lower case // search. @@ -112,7 +115,36 @@ class PrevWordsInfo { // Check bigrams for lower-cased previous word if original was not found. Useful for // auto-capitalized words like "The [current_word]". 
return dictStructurePolicy->getTerminalPtNodePositionOfWord( - wordCodePoints, wordCodePointCount, true /* forceLowerCaseSearch */); + codePoints, codePointCount, true /* forceLowerCaseSearch */); + } + + static int getBigramListPositionForWordWithTryingLowerCaseSearch( + const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, + const int *const wordCodePoints, const int wordCodePointCount, + const bool isBeginningOfSentence) { + if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { + return NOT_A_DICT_POS; + } + int codePoints[MAX_WORD_LENGTH]; + int codePointCount = wordCodePointCount; + memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); + if (isBeginningOfSentence) { + codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, + codePointCount, MAX_WORD_LENGTH); + if (codePointCount <= 0) { + return NOT_A_DICT_POS; + } + } + int pos = getBigramListPositionForWord(dictStructurePolicy, codePoints, + codePointCount, false /* forceLowerCaseSearch */); + // getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the + // dictionary or has no bigrams + if (NOT_A_DICT_POS == pos) { + // If no bigrams for this exact word, search again in lower case. 
+ pos = getBigramListPositionForWord(dictStructurePolicy, codePoints, + codePointCount, true /* forceLowerCaseSearch */); + } + return pos; } static int getBigramListPositionForWord( diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 479d15164..75f4fef90 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -139,6 +139,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { switch (mDictFormatVersion) { case FormatUtils::VERSION_2: return FormatUtils::VERSION_2; + case FormatUtils::VERSION_401: + return FormatUtils::VERSION_401; case FormatUtils::VERSION_4_ONLY_FOR_TESTING: return FormatUtils::VERSION_4_ONLY_FOR_TESTING; case FormatUtils::VERSION_4: @@ -246,6 +248,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return &mLocale; } + bool supportsBeginningOfSentence() const { + return mDictFormatVersion > FormatUtils::VERSION_401; + } + private: DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp index a8f8f284b..b13ad1879 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp @@ -98,6 +98,7 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; case FormatUtils::VERSION_2: // Version 2 dictionary writing is not supported. 
return false; + case FormatUtils::VERSION_401: case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: case FormatUtils::VERSION_4_DEV: diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp index 97e1120a3..0f60a898d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.cpp @@ -296,26 +296,30 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWor } } -void Ver4PatriciaTriePolicy::flush(const char *const filePath) { +bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); - return; + return false; } if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { AKLOGE("Cannot flush the dictionary to file."); mIsCorrupted = true; + return false; } + return true; } -void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { +bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); - return; + return false; } if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { AKLOGE("Cannot flush the dictionary to file with GC."); mIsCorrupted = true; + return false; } + return true; } bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { @@ -432,8 +436,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code shortcuts.emplace_back(&target, shortcutProbability); } } - const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(), - ptNodeParams.isBlacklisted(), 
ptNodeParams.getProbability(), + const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, + ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), historicalInfo->getTimeStamp(), historicalInfo->getLevel(), historicalInfo->getCount(), &shortcuts); return WordProperty(&codePointVector, &unigramProperty, &bigrams); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h index 95813881d..b064aaf33 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_policy.h @@ -117,9 +117,9 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, const int length); - void flush(const char *const filePath); + bool flush(const char *const filePath); - void flushWithGC(const char *const filePath); + bool flushWithGC(const char *const filePath); bool needsToRunGC(const bool mindsBlockByGC) const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp index f93d2894c..93e330a2a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp @@ -57,13 +57,14 @@ namespace latinime { const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion); switch (dictFormatVersion) { - case 
FormatUtils::VERSION_4: { + case FormatUtils::VERSION_401: { return newPolicyForOnMemoryV4Dict<backward::v401::Ver4DictConstants, backward::v401::Ver4DictBuffers, backward::v401::Ver4DictBuffers::Ver4DictBuffersPtr, backward::v401::Ver4PatriciaTriePolicy>( dictFormatVersion, locale, attributeMap); } + case FormatUtils::VERSION_4: case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4_DEV: { return newPolicyForOnMemoryV4Dict<Ver4DictConstants, Ver4DictBuffers, @@ -115,13 +116,14 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str case FormatUtils::VERSION_2: AKLOGE("Given path is a directory but the format is version 2. path: %s", path); break; - case FormatUtils::VERSION_4: { + case FormatUtils::VERSION_401: { return newPolicyForV4Dict<backward::v401::Ver4DictConstants, backward::v401::Ver4DictBuffers, backward::v401::Ver4DictBuffers::Ver4DictBuffersPtr, backward::v401::Ver4PatriciaTriePolicy>( headerFilePath, formatVersion, std::move(mmappedBuffer)); } + case FormatUtils::VERSION_4: case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4_DEV: { return newPolicyForV4Dict<Ver4DictConstants, Ver4DictBuffers, @@ -177,6 +179,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str case FormatUtils::VERSION_2: return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( new PatriciaTriePolicy(std::move(mmappedBuffer))); + case FormatUtils::VERSION_401: case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4: case FormatUtils::VERSION_4_DEV: diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp index 028e9ecbf..1f00fc6ab 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp +++ 
b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp @@ -56,7 +56,7 @@ bool DynamicPtGcEventListeners } } else { mValueStack.back() += 1; - if (ptNodeParams->isTerminal()) { + if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) { mValidUnigramCount += 1; } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h index 5704c2e90..b2e60a837 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h @@ -160,7 +160,12 @@ class PtNodeParams { } AK_FORCE_INLINE bool representsNonWordInfo() const { - return getCodePointCount() > 0 && CharUtils::isInUnicodeSpace(getCodePoints()[0]) + return getCodePointCount() > 0 && !CharUtils::isInUnicodeSpace(getCodePoints()[0]) + && isNotAWord(); + } + + AK_FORCE_INLINE int representsBeginningOfSentence() const { + return getCodePointCount() > 0 && getCodePoints()[0] == CODE_POINT_BEGINNING_OF_SENTENCE && isNotAWord(); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp index 30dcfba37..a6a470c4e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp @@ -383,8 +383,8 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin shortcuts.emplace_back(&shortcutTarget, shortcutProbability); } } - const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(), - ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), + const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), + ptNodeParams.isNotAWord(), 
ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); return WordProperty(&codePointVector, &unigramProperty, &bigrams); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h index 6240d46aa..88bbfd966 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h @@ -102,14 +102,16 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { return false; } - void flush(const char *const filePath) { + bool flush(const char *const filePath) { // This method should not be called for non-updatable dictionary. AKLOGI("Warning: flush() is called for non-updatable dictionary."); + return false; } - void flushWithGC(const char *const filePath) { + bool flushWithGC(const char *const filePath) { // This method should not be called for non-updatable dictionary. 
AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return false; } bool needsToRunGC(const bool mindsBlockByGC) const { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp index d53922763..e1ceaee49 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp @@ -23,9 +23,11 @@ namespace latinime { const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( int *const bigramEntryPos) const { const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer(); - if (*bigramEntryPos < 0 || *bigramEntryPos >= bigramListBuffer->getTailPosition()) { - AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bufSize: %d", - *bigramEntryPos, bigramListBuffer->getTailPosition()); + const int bigramEntryTailPos = (*bigramEntryPos) + getBigramEntrySize(); + if (*bigramEntryPos < 0 || bigramEntryTailPos > bigramListBuffer->getTailPosition()) { + AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bigramEntryTailPos: %d, " + "bufSize: %d", *bigramEntryPos, bigramEntryTailPos, + bigramListBuffer->getTailPosition()); ASSERT(false); return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY, Ver4DictConstants::NOT_A_TERMINAL_ID); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h index b8bdb63a8..52447a336 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h @@ -99,6 +99,20 @@ class BigramDictContent : public SparseTableDictContent { return hasNext ? 
Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0; } + int getBigramEntrySize() const { + if (mHasHistoricalInfo) { + return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE + + Ver4DictConstants::TIME_STAMP_FIELD_SIZE + + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE + + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; + } else { + return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE + + Ver4DictConstants::PROBABILITY_SIZE + + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; + } + } + bool runGCBigramList(const int bigramListPos, const BigramDictContent *const sourceBigramDictContent, const int toPos, const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index 439e90e44..09c7b7d85 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -61,7 +61,7 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; } readingHelper.readNextSiblingNode(ptNodeParams); - if (!ptNodeParams.representsNonWordInfo()) { + if (ptNodeParams.representsNonWordInfo()) { // Skip PtNodes that represent non-word information. 
continue; } @@ -181,9 +181,19 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); readingHelper.initWithPtNodeArrayPos(getRootPosition()); bool addedNewUnigram = false; - if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length, + int codePointsToAdd[MAX_WORD_LENGTH]; + int codePointCountToAdd = length; + memmove(codePointsToAdd, word, sizeof(int) * length); + if (unigramProperty->representsBeginningOfSentence()) { + codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, + codePointCountToAdd, MAX_WORD_LENGTH); + } + if (codePointCountToAdd <= 0) { + return false; + } + if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd, unigramProperty, &addedNewUnigram)) { - if (addedNewUnigram) { + if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { mUnigramCount++; } if (unigramProperty->getShortcuts().size() > 0) { @@ -294,26 +304,30 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWor } } -void Ver4PatriciaTriePolicy::flush(const char *const filePath) { +bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: flush() is called for non-updatable dictionary. 
filePath: %s", filePath); - return; + return false; } if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { AKLOGE("Cannot flush the dictionary to file."); mIsCorrupted = true; + return false; } + return true; } -void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { +bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); - return; + return false; } if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { AKLOGE("Cannot flush the dictionary to file with GC."); mIsCorrupted = true; + return false; } + return true; } bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { @@ -430,8 +444,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code shortcuts.emplace_back(&target, shortcutProbability); } } - const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(), - ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), + const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), + ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), historicalInfo->getTimeStamp(), historicalInfo->getLevel(), historicalInfo->getCount(), &shortcuts); return WordProperty(&codePointVector, &unigramProperty, &bigrams); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h index 008f2e423..d198c97fd 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -99,9 +99,9 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word1, 
const int length1); - void flush(const char *const filePath); + bool flush(const char *const filePath); - void flushWithGC(const char *const filePath); + bool flushWithGC(const char *const filePath); bool needsToRunGC(const bool mindsBlockByGC) const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp index 105363db5..a04551a44 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp @@ -41,11 +41,12 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = TimeKeeper::setCurrentTime(); const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion); switch (formatVersion) { - case FormatUtils::VERSION_4: + case FormatUtils::VERSION_401: return createEmptyV4DictFile<backward::v401::Ver4DictConstants, backward::v401::Ver4DictBuffers, backward::v401::Ver4DictBuffers::Ver4DictBuffersPtr>( filePath, localeAsCodePointVector, attributeMap, formatVersion); + case FormatUtils::VERSION_4: case FormatUtils::VERSION_4_ONLY_FOR_TESTING: case FormatUtils::VERSION_4_DEV: return createEmptyV4DictFile<Ver4DictConstants, Ver4DictBuffers, diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp index ba405b07e..18f558094 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp @@ -29,6 +29,8 @@ const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; switch (formatVersion) { case VERSION_2: return VERSION_2; + case VERSION_401: + return VERSION_401; case VERSION_4_ONLY_FOR_TESTING: return VERSION_4_ONLY_FOR_TESTING; case VERSION_4: @@ -60,6 +62,8 @@ const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; // 
same so we use them for both here. if (ByteArrayUtils::readUint16(dict, 4) == VERSION_2) { return VERSION_2; + } else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_401) { + return VERSION_401; } else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_4_ONLY_FOR_TESTING) { return VERSION_4_ONLY_FOR_TESTING; } else if (ByteArrayUtils::readUint16(dict, 4) == VERSION_4) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h index c47f30ca4..b05cb2fc8 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h @@ -32,8 +32,9 @@ class FormatUtils { // These MUST have the same values as the relevant constants in FormatSpec.java. VERSION_2 = 2, VERSION_4_ONLY_FOR_TESTING = 399, - VERSION_4 = 401, - VERSION_4_DEV = 402, + VERSION_401 = 401, + VERSION_4 = 402, + VERSION_4_DEV = 403, UNKNOWN_VERSION = -1 }; diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h index 634c45b04..f28ed5682 100644 --- a/native/jni/src/utils/char_utils.h +++ b/native/jni/src/utils/char_utils.h @@ -18,6 +18,7 @@ #define LATINIME_CHAR_UTILS_H #include <cctype> +#include <cstring> #include <vector> #include "defines.h" @@ -93,6 +94,19 @@ class CharUtils { static unsigned short latin_tolower(const unsigned short c); static const std::vector<int> EMPTY_STRING; + // Returns updated code point count. Returns 0 when the code points cannot be marked as a + // Beginning-of-Sentence. + static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints, + const int codePointCount, const int maxCodePoint) { + if (codePointCount >= maxCodePoint) { + // the code points cannot be marked as a Beginning-of-Sentence. 
+ return 0; + } + memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount); + codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE; + return codePointCount + 1; + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); |