diff options
Diffstat (limited to 'native/jni/src')
24 files changed, 106 insertions, 106 deletions
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index 0bcde2294..228260216 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -145,10 +145,11 @@ const WordProperty Dictionary::getWordProperty(const int *const codePoints, codePoints, codePointCount); } -int Dictionary::getNextWordAndNextToken(const int token, int *const outCodePoints) { +int Dictionary::getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount) { TimeKeeper::setCurrentTime(); return mDictionaryStructureWithBufferPolicy->getNextWordAndNextToken( - token, outCodePoints); + token, outCodePoints, outCodePointCount); } void Dictionary::logDictionaryInfo(JNIEnv *const env) const { diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index 542ba7291..247ee2421 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -103,7 +103,8 @@ class Dictionary { // Method to iterate all words in the dictionary. // The returned token has to be used to get the next word. If token is 0, this method newly // starts iterating the dictionary. - int getNextWordAndNextToken(const int token, int *const outCodePoints); + int getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount); const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const { return mDictionaryStructureWithBufferPolicy.get(); diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.cpp b/native/jni/src/suggest/core/dictionary/property/word_property.cpp index 6f5f808f8..5bdd5606b 100644 --- a/native/jni/src/suggest/core/dictionary/property/word_property.cpp +++ b/native/jni/src/suggest/core/dictionary/property/word_property.cpp @@ -28,7 +28,8 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(), false /* needsNullTermination */); jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isBlacklisted(), - !mBigrams.empty(), mUnigramProperty.hasShortcuts()}; + !mBigrams.empty(), mUnigramProperty.hasShortcuts(), + mUnigramProperty.representsBeginningOfSentence()}; env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags); int probabilityInfo[] = {mUnigramProperty.getProbability(), mUnigramProperty.getTimestamp(), mUnigramProperty.getLevel(), mUnigramProperty.getCount()}; diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h index e2771f97c..b72601109 100644 --- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h @@ -104,7 +104,8 @@ class DictionaryStructureWithBufferPolicy { // Method to iterate all words in the dictionary. // The returned token has to be used to get the next word. If token is 0, this method newly // starts iterating the dictionary. - virtual int getNextWordAndNextToken(const int token, int *const outCodePoints) = 0; + virtual int getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount) = 0; virtual bool isCorrupted() const = 0; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h index 4032a67fa..1999a51a6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h @@ -58,7 +58,7 @@ class Ver4PatriciaTrieNodeReader : public PtNodeReader { ~Ver4PatriciaTrieNodeReader() {} - virtual const PtNodeParams fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) const { + virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const { return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos, NOT_A_DICT_POS /* siblingNodePos */); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp index 805820b05..9780ae048 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp @@ -135,7 +135,7 @@ int Ver4PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) c if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_PROBABILITY; } - const PtNodeParams ptNodeParams(mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos)); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { return NOT_A_PROBABILITY; } @@ -146,7 +146,7 @@ int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) con if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; } - const PtNodeParams ptNodeParams(mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos)); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); if (ptNodeParams.isDeleted()) { return NOT_A_DICT_POS; } @@ -158,7 +158,7 @@ int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) cons if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; } - const PtNodeParams ptNodeParams(mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos)); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); if (ptNodeParams.isDeleted()) { return NOT_A_DICT_POS; } @@ -410,7 +410,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code AKLOGE("getWordProperty is called for invalid word."); return WordProperty(); } - const PtNodeParams ptNodeParams = mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); std::vector<int> codePointVector(ptNodeParams.getCodePoints(), ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); const ProbabilityEntry probabilityEntry = @@ -478,10 +478,9 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code return WordProperty(&codePointVector, &unigramProperty, &bigrams); } -int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { - // TODO: Return code point count like other methods. - // Null termination. - outCodePoints[0] = 0; +int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount) { + *outCodePointCount = 0; if (token == 0) { mTerminalPtNodePositionsForIteratingWords.clear(); DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( @@ -498,13 +497,8 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const } const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; int unigramProbability = NOT_A_PROBABILITY; - const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( + *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); - if (codePointCount < MAX_WORD_LENGTH) { - // Null termination. outCodePoints have to be null terminated or contain MAX_WORD_LENGTH - // code points. - outCodePoints[codePointCount] = 0; - } const int nextToken = token + 1; if (nextToken >= terminalPtNodePositionsVectorSize) { // All words have been iterated. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h index 2e948ac4a..16b1bd2c1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h @@ -134,7 +134,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { const WordProperty getWordProperty(const int *const codePoints, const int codePointCount) const; - int getNextWordAndNextToken(const int token, int *const outCodePoints); + int getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount); bool isCorrupted() const { return mIsCorrupted; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp index 99eed0f67..3fb4caa08 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp @@ -224,7 +224,7 @@ bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( const int ptNodePos = priorityQueue.top().getDictPos(); priorityQueue.pop(); const PtNodeParams ptNodeParams = - ptNodeReader->fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); if (ptNodeParams.representsNonWordInfo()) { continue; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp index 1f00fc6ab..db1a802d0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp @@ -65,7 +65,7 @@ bool DynamicPtGcEventListeners bool DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { - if (!ptNodeParams->isDeleted() && ptNodeParams->hasBigrams()) { + if (!ptNodeParams->isDeleted()) { int bigramEntryCount = 0; if (!mPtNodeWriter->updateAllBigramEntriesAndDeleteUselessEntries(ptNodeParams, &bigramEntryCount)) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h index cc7b5ff70..2e05bf397 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h @@ -126,7 +126,7 @@ class DynamicPtReadingHelper { if (isEnd()) { return PtNodeParams(); } - return mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(mReadingState.mPos); + return mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(mReadingState.mPos); } AK_FORCE_INLINE bool isValidTerminalNode(const PtNodeParams &ptNodeParams) const { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp index 9e575858a..f31c914d2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp @@ -87,9 +87,9 @@ bool DynamicPtUpdatingHelper::addUnigramWord( bool DynamicPtUpdatingHelper::addBigramWords(const int word0Pos, const int word1Pos, const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) { const PtNodeParams sourcePtNodeParams( - mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word0Pos)); + mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(word0Pos)); const PtNodeParams targetPtNodeParams( - mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word1Pos)); + mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(word1Pos)); return mPtNodeWriter->addNewBigramEntry(&sourcePtNodeParams, &targetPtNodeParams, bigramProperty, outAddedNewBigram); } @@ -97,16 +97,16 @@ bool DynamicPtUpdatingHelper::addBigramWords(const int word0Pos, const int word1 // Remove a bigram relation from word0Pos to word1Pos. bool DynamicPtUpdatingHelper::removeBigramWords(const int word0Pos, const int word1Pos) { const PtNodeParams sourcePtNodeParams( - mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word0Pos)); + mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(word0Pos)); const PtNodeParams targetPtNodeParams( - mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word1Pos)); + mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(word1Pos)); return mPtNodeWriter->removeBigramEntry(&sourcePtNodeParams, &targetPtNodeParams); } bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos, const int *const targetCodePoints, const int targetCodePointCount, const int shortcutProbability) { - const PtNodeParams ptNodeParams(mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(wordPos)); + const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos)); return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints, targetCodePointCount, shortcutProbability); } @@ -125,7 +125,7 @@ bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int paren bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { - if (originalPtNodeParams->isTerminal()) { + if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) { // Overwrites the probability. *outAddedNewUnigram = false; return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty); @@ -260,7 +260,7 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( } // Load node info. Information of the 1st part will be fetched. const PtNodeParams ptNodeParams( - mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos)); + mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos)); // Update children position. return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos); } @@ -270,8 +270,8 @@ const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams( const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, const int parentPos, const int codePointCount, const int *const codePoints, const int probability) const { const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( - isBlacklisted, isNotAWord, isTerminal, originalPtNodeParams->hasShortcutTargets(), - originalPtNodeParams->hasBigrams(), codePointCount > 1 /* hasMultipleChars */, + isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */, + false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE); return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints, probability); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h index c6b2a8bed..31299a707 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h @@ -27,7 +27,8 @@ namespace latinime { class PtNodeReader { public: virtual ~PtNodeReader() {} - virtual const PtNodeParams fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) const = 0; + virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos( + const int ptNodePos) const = 0; protected: PtNodeReader() {}; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp index a6a470c4e..5c62b9caf 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp @@ -282,7 +282,8 @@ int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_PROBABILITY; } - const PtNodeParams ptNodeParams = mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + const PtNodeParams ptNodeParams = + mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); if (ptNodeParams.isNotAWord() || ptNodeParams.isBlacklisted()) { // If this is not a word, or if it's a blacklisted entry, it should behave as // having no probability outside of the suggestion process (where it should be used @@ -296,14 +297,14 @@ int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; } - return mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos).getShortcutPos(); + return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos(); } int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; } - return mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos).getBigramsPos(); + return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getBigramsPos(); } int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode, @@ -339,7 +340,8 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin AKLOGE("getWordProperty was called for invalid word."); return WordProperty(); } - const PtNodeParams ptNodeParams = mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + const PtNodeParams ptNodeParams = + mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); std::vector<int> codePointVector(ptNodeParams.getCodePoints(), ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); // Fetch bigram information. @@ -389,7 +391,9 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin return WordProperty(&codePointVector, &unigramProperty, &bigrams); } -int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { +int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount) { + *outCodePointCount = 0; if (token == 0) { // Start iterating the dictionary. mTerminalPtNodePositionsForIteratingWords.clear(); @@ -407,8 +411,8 @@ int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outC } const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; int unigramProbability = NOT_A_PROBABILITY; - getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH, - outCodePoints, &unigramProbability); + *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, + MAX_WORD_LENGTH, outCodePoints, &unigramProbability); const int nextToken = token + 1; if (nextToken >= terminalPtNodePositionsVectorSize) { // All words have been iterated. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h index dce94363a..ec8407408 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h @@ -137,7 +137,8 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { const WordProperty getWordProperty(const int *const codePoints, const int codePointCount) const; - int getNextWordAndNextToken(const int token, int *const outCodePoints); + int getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount); bool isCorrupted() const { return mIsCorrupted; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp index 0c8de0df2..c1e938710 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp @@ -20,7 +20,7 @@ namespace latinime { -const PtNodeParams Ver2ParticiaTrieNodeReader::fetchNodeInfoInBufferFromPtNodePos( +const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNodePos( const int ptNodePos) const { if (ptNodePos < 0 || ptNodePos >= mDictSize) { // Reading invalid position because of bug or broken dictionary. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h index 86fc89c5e..f0725b66d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h @@ -36,7 +36,7 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader { : mDictBuffer(dictBuffer), mDictSize(dictSize), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy) {} - virtual const PtNodeParams fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) const; + virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const; private: DISALLOW_IMPLICIT_CONSTRUCTORS(Ver2ParticiaTrieNodeReader); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h index f24307e7b..22ed4a6c0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h @@ -41,7 +41,7 @@ class Ver4PatriciaTrieNodeReader : public PtNodeReader { ~Ver4PatriciaTrieNodeReader() {} - virtual const PtNodeParams fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) const { + virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const { return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos, NOT_A_DICT_POS /* siblingNodePos */); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp index f89d3d7a0..3d8da9173 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -231,14 +231,6 @@ bool Ver4PatriciaTrieNodeWriter::addNewBigramEntry( sourcePtNodeParams->getTerminalId(), targetPtNodeParam->getTerminalId()); return false; } - if (!sourcePtNodeParams->hasBigrams()) { - // Update has bigrams flag. - return updatePtNodeFlags(sourcePtNodeParams->getHeadPos(), - sourcePtNodeParams->isBlacklisted(), sourcePtNodeParams->isNotAWord(), - sourcePtNodeParams->isTerminal(), sourcePtNodeParams->hasShortcutTargets(), - true /* hasBigrams */, - sourcePtNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); - } return true; } @@ -303,28 +295,9 @@ bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptN AKLOGE("Cannot add new shortuct entry. terminalId: %d", ptNodeParams->getTerminalId()); return false; } - if (!ptNodeParams->hasShortcutTargets()) { - // Update has shortcut targets flag. - return updatePtNodeFlags(ptNodeParams->getHeadPos(), - ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(), - ptNodeParams->isTerminal(), true /* hasShortcutTargets */, - ptNodeParams->hasBigrams(), - ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); - } return true; } -bool Ver4PatriciaTrieNodeWriter::updatePtNodeHasBigramsAndShortcutTargetsFlags( - const PtNodeParams *const ptNodeParams) { - const bool hasBigrams = mBuffers->getBigramDictContent()->getBigramListHeadPos( - ptNodeParams->getTerminalId()) != NOT_A_DICT_POS; - const bool hasShortcutTargets = mBuffers->getShortcutDictContent()->getShortcutListHeadPos( - ptNodeParams->getTerminalId()) != NOT_A_DICT_POS; - return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isBlacklisted(), - ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), hasShortcutTargets, - hasBigrams, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); -} - bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( const PtNodeParams *const ptNodeParams, int *const outTerminalId, int *const ptNodeWritingPos) { @@ -377,8 +350,7 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( return false; } return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(), - isTerminal, ptNodeParams->hasShortcutTargets(), ptNodeParams->hasBigrams(), - ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); + isTerminal, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); } const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( @@ -402,11 +374,11 @@ const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord, const bool isTerminal, - const bool hasShortcutTargets, const bool hasBigrams, const bool hasMultipleChars) { + const bool hasMultipleChars) { // Create node flags and write them. PatriciaTrieReadingUtils::NodeFlags nodeFlags = PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal, - hasShortcutTargets, hasBigrams, hasMultipleChars, + false /* hasShortcutTargets */, false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE); if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) { AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h index e90bc44c0..162dc9b1d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h @@ -93,8 +93,6 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { const int *const targetCodePoints, const int targetCodePointCount, const int shortcutProbability); - bool updatePtNodeHasBigramsAndShortcutTargetsFlags(const PtNodeParams *const ptNodeParams); - private: DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter); @@ -110,8 +108,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { const UnigramProperty *const unigramProperty) const; bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord, - const bool isTerminal, const bool hasShortcutTargets, const bool hasBigrams, - const bool hasMultipleChars); + const bool isTerminal, const bool hasMultipleChars); static const int CHILDREN_POSITION_FIELD_SIZE; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index aec3b8ea3..46107d92a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -125,7 +125,7 @@ int Ver4PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) c if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_PROBABILITY; } - const PtNodeParams ptNodeParams(mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos)); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { return NOT_A_PROBABILITY; } @@ -136,7 +136,7 @@ int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) con if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; } - const PtNodeParams ptNodeParams(mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos)); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); if (ptNodeParams.isDeleted()) { return NOT_A_DICT_POS; } @@ -148,7 +148,7 @@ int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) cons if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; } - const PtNodeParams ptNodeParams(mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos)); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); if (ptNodeParams.isDeleted()) { return NOT_A_DICT_POS; } @@ -222,8 +222,24 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le } bool Ver4PatriciaTriePolicy::removeUnigramEntry(const int *const word, const int length) { - // TODO: Implement. - return false; + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); + return false; + } + const int ptNodePos = getTerminalPtNodePositionOfWord(word, length, + false /* forceLowerCaseSearch */); + if (ptNodePos == NOT_A_DICT_POS) { + return false; + } + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) { + AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos); + return false; + } + if (!ptNodeParams.representsNonWordInfo()) { + mUnigramCount--; + } + return true; } bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, @@ -405,7 +421,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code AKLOGE("getWordProperty is called for invalid word."); return WordProperty(); } - const PtNodeParams ptNodeParams = mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); std::vector<int> codePointVector(ptNodeParams.getCodePoints(), ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); const ProbabilityEntry probabilityEntry = @@ -473,10 +489,9 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code return WordProperty(&codePointVector, &unigramProperty, &bigrams); } -int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { - // TODO: Return code point count like other methods. - // Null termination. - outCodePoints[0] = 0; +int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount) { + *outCodePointCount = 0; if (token == 0) { mTerminalPtNodePositionsForIteratingWords.clear(); DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( @@ -493,13 +508,8 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const } const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; int unigramProbability = NOT_A_PROBABILITY; - const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( + *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); - if (codePointCount < MAX_WORD_LENGTH) { - // Null termination. outCodePoints have to be null terminated or contain MAX_WORD_LENGTH - // code points. - outCodePoints[codePointCount] = 0; - } const int nextToken = token + 1; if (nextToken >= terminalPtNodePositionsVectorSize) { // All words have been iterated. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h index 0a20965f3..5d66a2cce 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -113,7 +113,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { const WordProperty getWordProperty(const int *const codePoints, const int codePointCount) const; - int getNextWordAndNextToken(const int token, int *const outCodePoints); + int getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount); bool isCorrupted() const { return mIsCorrupted; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp index e868ddf6f..40fdfa068 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp @@ -215,7 +215,7 @@ bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( const int ptNodePos = priorityQueue.top().getDictPos(); priorityQueue.pop(); const PtNodeParams ptNodeParams = - ptNodeReader->fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); if (ptNodeParams.representsNonWordInfo()) { continue; } @@ -286,8 +286,9 @@ bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTermi } if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second); + return false; } - return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams); + return true; } } // namespace latinime diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h index f28ed5682..63786502b 100644 --- a/native/jni/src/utils/char_utils.h +++ b/native/jni/src/utils/char_utils.h @@ -98,6 +98,10 @@ class CharUtils { // Beginning-of-Sentence. static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints, const int codePointCount, const int maxCodePoint) { + if (codePointCount > 0 && codePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) { + // Marker has already been attached. + return codePointCount; + } if (codePointCount >= maxCodePoint) { // the code points cannot be marked as a Beginning-of-Sentence. return 0; diff --git a/native/jni/src/utils/jni_data_utils.h b/native/jni/src/utils/jni_data_utils.h index 67a66fdfe..3514aeeb0 100644 --- a/native/jni/src/utils/jni_data_utils.h +++ b/native/jni/src/utils/jni_data_utils.h @@ -69,18 +69,23 @@ class JniDataUtils { static void outputCodePoints(JNIEnv *env, jintArray intArrayToOutputCodePoints, const int start, const int maxLength, const int *const codePoints, const int codePointCount, const bool needsNullTermination) { - const int outputCodePointCount = std::min(maxLength, codePointCount); - int outputCodePonts[outputCodePointCount]; - for (int i = 0; i < outputCodePointCount; ++i) { + const int codePointBufSize = std::min(maxLength, codePointCount); + int outputCodePonts[codePointBufSize]; + int outputCodePointCount = 0; + for (int i = 0; i < codePointBufSize; ++i) { const int codePoint = codePoints[i]; + int codePointToOutput = codePoint; if (!CharUtils::isInUnicodeSpace(codePoint)) { - outputCodePonts[i] = CODE_POINT_REPLACEMENT_CHARACTER; + if (codePoint == CODE_POINT_BEGINNING_OF_SENTENCE) { + // Just skip Beginning-of-Sentence marker. + continue; + } + codePointToOutput = CODE_POINT_REPLACEMENT_CHARACTER; } else if (codePoint >= 0x01 && codePoint <= 0x1F) { // Control code. - outputCodePonts[i] = CODE_POINT_REPLACEMENT_CHARACTER; - } else { - outputCodePonts[i] = codePoint; + codePointToOutput = CODE_POINT_REPLACEMENT_CHARACTER; } + outputCodePonts[outputCodePointCount++] = codePointToOutput; } env->SetIntArrayRegion(intArrayToOutputCodePoints, start, outputCodePointCount, outputCodePonts); @@ -90,6 +95,11 @@ class JniDataUtils { } } + static void putBooleanToArray(JNIEnv *env, jbooleanArray array, const int index, + const jboolean value) { + env->SetBooleanArrayRegion(array, index, 1 /* len */, &value); + } + static void putIntToArray(JNIEnv *env, jintArray array, const int index, const int value) { env->SetIntArrayRegion(array, index, 1 /* len */, &value); } |