diff options
Diffstat (limited to 'native/jni/src')
13 files changed, 111 insertions, 36 deletions
diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp index b6bf7a98c..1e2494e92 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp @@ -19,17 +19,18 @@ namespace latinime { const ErrorTypeUtils::ErrorType ErrorTypeUtils::NOT_AN_ERROR = 0x0; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_CASE_ERROR = 0x1; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR = 0x2; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_DIGRAPH = 0x4; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::INTENTIONAL_OMISSION = 0x8; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::EDIT_CORRECTION = 0x10; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::PROXIMITY_CORRECTION = 0x20; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::COMPLETION = 0x40; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x80; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_WRONG_CASE = 0x1; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT = 0x2; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_WRONG_ACCENT = 0x4; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_DIGRAPH = 0x8; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::INTENTIONAL_OMISSION = 0x10; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::EDIT_CORRECTION = 0x20; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::PROXIMITY_CORRECTION = 0x40; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::COMPLETION = 0x80; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x100; const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH = - NOT_AN_ERROR | MATCH_WITH_CASE_ERROR | MATCH_WITH_ACCENT_ERROR | MATCH_WITH_DIGRAPH; + NOT_AN_ERROR | MATCH_WITH_WRONG_CASE | MATCH_WITH_MISSING_ACCENT | MATCH_WITH_DIGRAPH; const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.h b/native/jni/src/suggest/core/dictionary/error_type_utils.h index e3e76b238..fd1d5fcff 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.h +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.h @@ -30,8 +30,9 @@ class ErrorTypeUtils { typedef uint32_t ErrorType; static const ErrorType NOT_AN_ERROR; - static const ErrorType MATCH_WITH_CASE_ERROR; - static const ErrorType MATCH_WITH_ACCENT_ERROR; + static const ErrorType MATCH_WITH_WRONG_CASE; + static const ErrorType MATCH_WITH_MISSING_ACCENT; + static const ErrorType MATCH_WITH_WRONG_ACCENT; static const ErrorType MATCH_WITH_DIGRAPH; // Treat error as an intentional omission when the CorrectionType is omission and the node can // be intentional omission. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp index f7179f68d..97a8bcc98 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp @@ -425,6 +425,18 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, return true; } +bool Ver4PatriciaTrieNodeWriter::suppressUnigramEntry(const PtNodeParams *const ptNodeParams) { + if (!mHeaderPolicy->hasHistoricalInfoOfWords()) { + // Require historical info to suppress unigram entry. + return false; + } + const HistoricalInfo suppressedHistorycalInfo(0 /* timestamp */, 0 /* level */, 0 /* count */); + const ProbabilityEntry probabilityEntryToWrite = + ProbabilityEntry().createEntryWithUpdatedHistoricalInfo(&suppressedHistorycalInfo); + return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( + ptNodeParams->getTerminalId(), &probabilityEntryToWrite); +} + } // namespace v402 } // namespace backward } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h index d49d9a666..9d8a55bff 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h @@ -111,6 +111,11 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { bool updatePtNodeHasBigramsAndShortcutTargetsFlags(const PtNodeParams *const ptNodeParams); + // Suppress unigram not to use the word for generating suggestions. So, this method can be used + // only for dictionaries with historical info. Also, suppressed entries are included in unigram + // count. They will be removed from the dictionary during GC. + bool suppressUnigramEntry(const PtNodeParams *const ptNodeParams); + private: DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp index 1296b8acd..9c6452e40 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp @@ -210,7 +210,7 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le } for (const auto &shortcut : unigramProperty->getShortcuts()) { if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { - AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %d", + AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", shortcut.getTargetCodePoints()->size()); return false; } @@ -245,7 +245,7 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le if (!mUpdatingHelper.addShortcutTarget(wordPos, shortcut.getTargetCodePoints()->data(), shortcut.getTargetCodePoints()->size(), shortcut.getProbability())) { - AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, " + AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), shortcut.getProbability()); return false; @@ -258,6 +258,20 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le } } +bool Ver4PatriciaTriePolicy::removeUnigramEntry(const int *const word, const int length) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); + return false; + } + const int ptNodePos = getTerminalPtNodePositionOfWord(word, length, + false /* forceLowerCaseSearch */); + if (ptNodePos == NOT_A_DICT_POS) { + return false; + } + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + return mNodeWriter.suppressUnigramEntry(&ptNodeParams); +} + bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, const BigramProperty *const bigramProperty) { if (!mBuffers->isUpdatable()) { @@ -275,7 +289,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI } if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { AKLOGE("The word is too long to insert the ngram to the dictionary. " - "length: %d", bigramProperty->getTargetCodePoints()->size()); + "length: %zd", bigramProperty->getTargetCodePoints()->size()); return false; } int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h index 9e989b268..d77499636 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h @@ -108,10 +108,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { bool addUnigramEntry(const int *const word, const int length, const UnigramProperty *const unigramProperty); - bool removeUnigramEntry(const int *const word, const int length) { - // Removing unigram entry is not supported. - return false; - } + bool removeUnigramEntry(const int *const word, const int length); bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, const BigramProperty *const bigramProperty); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp index f3bc4a0cb..bbcea2ee0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp @@ -45,12 +45,25 @@ ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry( } bool LanguageModelDictContent::setNgramProbabilityEntry(const WordIdArrayView prevWordIds, - const int terminalId, const ProbabilityEntry *const probabilityEntry) { + const int wordId, const ProbabilityEntry *const probabilityEntry) { + if (wordId == Ver4DictConstants::NOT_A_TERMINAL_ID) { + return false; + } const int bitmapEntryIndex = createAndGetBitmapEntryIndex(prevWordIds); if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { return false; } - return mTrieMap.put(terminalId, probabilityEntry->encode(mHasHistoricalInfo), bitmapEntryIndex); + return mTrieMap.put(wordId, probabilityEntry->encode(mHasHistoricalInfo), bitmapEntryIndex); +} + +bool LanguageModelDictContent::removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, + const int wordId) { + const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); + if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { + // Cannot find bitmap entry for the probability entry. The entry doesn't exist. + return false; + } + return mTrieMap.remove(wordId, bitmapEntryIndex); } bool LanguageModelDictContent::runGCInner( diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h index 104ee2520..bd07f2f62 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h @@ -61,12 +61,18 @@ class LanguageModelDictContent { return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry); } + bool removeProbabilityEntry(const int wordId) { + return removeNgramProbabilityEntry(WordIdArrayView(), wordId); + } + ProbabilityEntry getNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId) const; bool setNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId, const ProbabilityEntry *const probabilityEntry); + bool removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId); + private: DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContent); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp index 2c848cb29..62e008b94 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -255,6 +255,14 @@ bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId) { + // TODO: Support n-gram. + LanguageModelDictContent *const languageModelDictContent = + mBuffers->getMutableLanguageModelDictContent(); + if (!languageModelDictContent->removeNgramProbabilityEntry(prevWordIds.limit(1 /* maxSize */), + wordId)) { + // TODO: Uncomment. + // return false; + } // TODO: Remove. return mBigramPolicy->removeEntry(prevWordIds[0], wordId); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index 723808399..04e3018da 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -200,7 +200,7 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le } for (const auto &shortcut : unigramProperty->getShortcuts()) { if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { - AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %d", + AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", shortcut.getTargetCodePoints()->size()); return false; } @@ -235,7 +235,7 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le if (!mUpdatingHelper.addShortcutTarget(wordPos, shortcut.getTargetCodePoints()->data(), shortcut.getTargetCodePoints()->size(), shortcut.getProbability())) { - AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, " + AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), shortcut.getProbability()); return false; @@ -263,6 +263,11 @@ bool Ver4PatriciaTriePolicy::removeUnigramEntry(const int *const word, const int AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos); return false; } + if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry( + ptNodeParams.getTerminalId())) { + // TODO: Uncomment. + // return false; + } if (!ptNodeParams.representsNonWordInfo()) { mUnigramCount--; } @@ -286,7 +291,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI } if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { AKLOGE("The word is too long to insert the ngram to the dictionary. " - "length: %d", bigramProperty->getTargetCodePoints()->size()); + "length: %zd", bigramProperty->getTargetCodePoints()->size()); return false; } int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp index e630aba9a..944a59c52 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp @@ -420,6 +420,10 @@ bool TrieMap::addNewEntryByExpandingTable(const uint32_t key, const uint64_t val bool TrieMap::removeInner(const Entry &bitmapEntry) { const int tableSize = popCount(bitmapEntry.getBitmap()); + if (tableSize <= 0) { + // The table is empty. No need to remove any entries. + return true; + } for (int i = 0; i < tableSize; ++i) { const int entryIndex = bitmapEntry.getTableIndex() + i; const Entry entry = readEntry(entryIndex); @@ -444,7 +448,7 @@ bool TrieMap::removeInner(const Entry &bitmapEntry) { } } } - return freeTable(bitmapEntry.getTableIndex(), tableSize); + return true; } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h index 04cb6603a..52c4251f0 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h @@ -51,10 +51,10 @@ class TypingScoring : public Scoring { } if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) { score += ScoringParams::EXACT_MATCH_PROMOTION; - if ((ErrorTypeUtils::MATCH_WITH_CASE_ERROR & containedErrorTypes) != 0) { + if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) { score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH; } - if ((ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR & containedErrorTypes) != 0) { + if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) { score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; } if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) { diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp b/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp index 54f65c786..1d590c353 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp +++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp @@ -36,25 +36,34 @@ ErrorTypeUtils::ErrorType TypingWeighting::getErrorType(const CorrectionType cor // Compare the node code point with original primary code point on the keyboard. const ProximityInfoState *const pInfoState = traverseSession->getProximityInfoState(0); - const int primaryOriginalCodePoint = pInfoState->getPrimaryOriginalCodePointAt( + const int primaryCodePoint = pInfoState->getPrimaryCodePointAt( dicNode->getInputIndex(0)); const int nodeCodePoint = dicNode->getNodeCodePoint(); - if (primaryOriginalCodePoint == nodeCodePoint) { + // TODO: Check whether the input code point is on the keyboard. + if (primaryCodePoint == nodeCodePoint) { // Node code point is same as original code point on the keyboard. return ErrorTypeUtils::NOT_AN_ERROR; - } else if (CharUtils::toLowerCase(primaryOriginalCodePoint) == + } else if (CharUtils::toLowerCase(primaryCodePoint) == CharUtils::toLowerCase(nodeCodePoint)) { // Only cases of the code points are different. - return ErrorTypeUtils::MATCH_WITH_CASE_ERROR; - } else if (CharUtils::toBaseCodePoint(primaryOriginalCodePoint) == - CharUtils::toBaseCodePoint(nodeCodePoint)) { + return ErrorTypeUtils::MATCH_WITH_WRONG_CASE; + } else if (primaryCodePoint == CharUtils::toBaseCodePoint(nodeCodePoint)) { // Node code point is a variant of original code point. - return ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR; - } else { + return ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT; + } else if (CharUtils::toBaseCodePoint(primaryCodePoint) + == CharUtils::toBaseCodePoint(nodeCodePoint)) { + // Base code points are the same but the code point is intentionally input. + return ErrorTypeUtils::MATCH_WITH_WRONG_ACCENT; + } else if (CharUtils::toLowerCase(primaryCodePoint) + == CharUtils::toBaseLowerCase(nodeCodePoint)) { // Node code point is a variant of original code point and the cases are also // different. - return ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR - | ErrorTypeUtils::MATCH_WITH_CASE_ERROR; + return ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT + | ErrorTypeUtils::MATCH_WITH_WRONG_CASE; + } else { + // Base code points are the same and the cases are different. + return ErrorTypeUtils::MATCH_WITH_WRONG_ACCENT + | ErrorTypeUtils::MATCH_WITH_WRONG_CASE; } } break; |