diff options
Diffstat (limited to 'native/jni/src')
4 files changed, 115 insertions, 26 deletions
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h index faaf44162..e4847fcf9 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h @@ -30,18 +30,19 @@ namespace latinime { class PtNodeParams { public: // Invalid PtNode. - PtNodeParams() : mHeadPos(NOT_A_DICT_POS), mFlags(0), mParentPos(NOT_A_DICT_POS), - mCodePointCount(0), mCodePoints(), mTerminalIdFieldPos(NOT_A_DICT_POS), - mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), mProbabilityFieldPos(NOT_A_DICT_POS), - mProbability(NOT_A_PROBABILITY), mChildrenPosFieldPos(NOT_A_DICT_POS), - mChildrenPos(NOT_A_DICT_POS), mBigramLinkedNodePos(NOT_A_DICT_POS), - mShortcutPos(NOT_A_DICT_POS), mBigramPos(NOT_A_DICT_POS), - mSiblingPos(NOT_A_DICT_POS) {} + PtNodeParams() : mHeadPos(NOT_A_DICT_POS), mFlags(0), mHasMovedFlag(false), + mParentPos(NOT_A_DICT_POS), mCodePointCount(0), mCodePoints(), + mTerminalIdFieldPos(NOT_A_DICT_POS), mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), + mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), + mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS), + mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS), + mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) {} PtNodeParams(const PtNodeParams& ptNodeParams) : mHeadPos(ptNodeParams.mHeadPos), mFlags(ptNodeParams.mFlags), - mParentPos(ptNodeParams.mParentPos), mCodePointCount(ptNodeParams.mCodePointCount), - mCodePoints(), mTerminalIdFieldPos(ptNodeParams.mTerminalIdFieldPos), + mHasMovedFlag(ptNodeParams.mHasMovedFlag), mParentPos(ptNodeParams.mParentPos), + mCodePointCount(ptNodeParams.mCodePointCount), mCodePoints(), + mTerminalIdFieldPos(ptNodeParams.mTerminalIdFieldPos), mTerminalId(ptNodeParams.mTerminalId), mProbabilityFieldPos(ptNodeParams.mProbabilityFieldPos), mProbability(ptNodeParams.mProbability), @@ -58,7 +59,7 @@ class PtNodeParams { const int codePointCount, const int *const codePoints, const int probability, const int childrenPos, const int shortcutPos, const int bigramPos, const int siblingPos) - : mHeadPos(headPos), mFlags(flags), mParentPos(NOT_A_DICT_POS), + : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(false), mParentPos(NOT_A_DICT_POS), mCodePointCount(codePointCount), mCodePoints(), mTerminalIdFieldPos(NOT_A_DICT_POS), mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability), @@ -73,7 +74,7 @@ class PtNodeParams { const int parentPos, const int codePointCount, const int *const codePoints, const int terminalIdFieldPos, const int terminalId, const int probability, const int childrenPosFieldPos, const int childrenPos, const int siblingPos) - : mHeadPos(headPos), mFlags(flags), mParentPos(parentPos), + : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos), mCodePointCount(codePointCount), mCodePoints(), mTerminalIdFieldPos(terminalIdFieldPos), mTerminalId(terminalId), mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability), @@ -87,8 +88,8 @@ class PtNodeParams { PtNodeParams(const PtNodeParams *const ptNodeParams, const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos, const int codePointCount, const int *const codePoints, const int probability) - : mHeadPos(ptNodeParams->getHeadPos()), mFlags(flags), mParentPos(parentPos), - mCodePointCount(codePointCount), mCodePoints(), + : mHeadPos(ptNodeParams->getHeadPos()), mFlags(flags), mHasMovedFlag(true), + mParentPos(parentPos), mCodePointCount(codePointCount), mCodePoints(), mTerminalIdFieldPos(ptNodeParams->getTerminalIdFieldPos()), mTerminalId(ptNodeParams->getTerminalId()), mProbabilityFieldPos(ptNodeParams->getProbabilityFieldPos()), @@ -104,7 +105,7 @@ class PtNodeParams { PtNodeParams(const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos, const int codePointCount, const int *const codePoints, const int probability) - : mHeadPos(NOT_A_DICT_POS), mFlags(flags), mParentPos(parentPos), + : mHeadPos(NOT_A_DICT_POS), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos), mCodePointCount(codePointCount), mCodePoints(), mTerminalIdFieldPos(NOT_A_DICT_POS), mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), @@ -126,11 +127,11 @@ class PtNodeParams { // Flags AK_FORCE_INLINE bool isDeleted() const { - return DynamicPtReadingUtils::isDeleted(mFlags); + return mHasMovedFlag && DynamicPtReadingUtils::isDeleted(mFlags); } AK_FORCE_INLINE bool willBecomeNonTerminal() const { - return DynamicPtReadingUtils::willBecomeNonTerminal(mFlags); + return mHasMovedFlag && DynamicPtReadingUtils::willBecomeNonTerminal(mFlags); } AK_FORCE_INLINE bool hasChildren() const { @@ -224,6 +225,7 @@ class PtNodeParams { const int mHeadPos; const PatriciaTrieReadingUtils::NodeFlags mFlags; + const bool mHasMovedFlag; const int mParentPos; const uint8_t mCodePointCount; int mCodePoints[MAX_WORD_LENGTH]; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp index 8172e70b6..fa5993090 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp @@ -20,6 +20,7 @@ #include "defines.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" #include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h" @@ -303,4 +304,92 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod return siblingPos; } +const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoints, + const int codePointCount) const { + const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount, + false /* forceLowerCaseSearch */); + if (ptNodePos == NOT_A_DICT_POS) { + AKLOGE("getWordProperty was called for invalid word."); + return WordProperty(); + } + const PtNodeParams ptNodeParams = mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + std::vector<int> codePointVector(ptNodeParams.getCodePoints(), + ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); + // Fetch bigram information. + std::vector<WordProperty::BigramProperty> bigrams; + const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); + int bigramWord1CodePoints[MAX_WORD_LENGTH]; + BinaryDictionaryBigramsIterator bigramsIt(getBigramsStructurePolicy(), bigramListPos); + while (bigramsIt.hasNext()) { + // Fetch the next bigram information and forward the iterator. + bigramsIt.next(); + // Skip the entry if the entry has been deleted. This never happens for ver2 dicts. + if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) { + int word1Probability = NOT_A_PROBABILITY; + int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( + bigramsIt.getBigramPos(), MAX_WORD_LENGTH, bigramWord1CodePoints, + &word1Probability); + std::vector<int> word1(bigramWord1CodePoints, + bigramWord1CodePoints + word1CodePointCount); + bigrams.push_back(WordProperty::BigramProperty(&word1, bigramsIt.getProbability(), + NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */)); + } + } + // Fetch shortcut information. + std::vector<WordProperty::ShortcutProperty> shortcuts; + int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); + if (shortcutPos != NOT_A_DICT_POS) { + int shortcutTargetCodePoints[MAX_WORD_LENGTH]; + ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mDictRoot, &shortcutPos); + bool hasNext = true; + while (hasNext) { + const ShortcutListReadingUtils::ShortcutFlags shortcutFlags = + ShortcutListReadingUtils::getFlagsAndForwardPointer(mDictRoot, &shortcutPos); + hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags); + const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget( + mDictRoot, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos); + std::vector<int> shortcutTarget(shortcutTargetCodePoints, + shortcutTargetCodePoints + shortcutTargetLength); + const int shortcutProbability = + ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags); + shortcuts.push_back( + WordProperty::ShortcutProperty(&shortcutTarget, shortcutProbability)); + } + } + return WordProperty(&codePointVector, ptNodeParams.isNotAWord(), + ptNodeParams.isBlacklisted(), ptNodeParams.hasBigrams(), + ptNodeParams.hasShortcutTargets(), ptNodeParams.getProbability(), + NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, + &bigrams, &shortcuts); +} + +int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) { + if (token == 0) { + // Start iterating the dictionary. + mTerminalPtNodePositionsForIteratingWords.clear(); + DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( + &mTerminalPtNodePositionsForIteratingWords); + DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); + } + const int terminalPtNodePositionsVectorSize = + static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); + if (token < 0 || token >= terminalPtNodePositionsVectorSize) { + AKLOGE("Given token %d is invalid.", token); + return 0; + } + const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; + int unigramProbability = NOT_A_PROBABILITY; + getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH, + outCodePoints, &unigramProbability); + const int nextToken = token + 1; + if (nextToken >= terminalPtNodePositionsVectorSize) { + // All words have been iterated. + mTerminalPtNodePositionsForIteratingWords.clear(); + return 0; + } + return nextToken; +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h index 1ce7f85d4..8fbca2612 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h @@ -18,6 +18,7 @@ #define LATINIME_PATRICIA_TRIE_POLICY_H #include <stdint.h> +#include <vector> #include "defines.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" @@ -44,7 +45,8 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { - mHeaderPolicy.getSize()), mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot), mPtNodeReader(mDictRoot, mDictBufferSize, &mBigramListPolicy, &mShortcutListPolicy), - mPtNodeArrayReader(mDictRoot, mDictBufferSize) {} + mPtNodeArrayReader(mDictRoot, mDictBufferSize), + mTerminalPtNodePositionsForIteratingWords() {} AK_FORCE_INLINE int getRootPosition() const { return 0; @@ -128,15 +130,9 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { } const WordProperty getWordProperty(const int *const codePoints, - const int codePointCount) const { - // getWordProperty is not supported. - return WordProperty(); - } + const int codePointCount) const; - int getNextWordAndNextToken(const int token, int *const outCodePoints) { - // getNextWordAndNextToken is not supported. - return 0; - } + int getNextWordAndNextToken(const int token, int *const outCodePoints); private: DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy); @@ -149,6 +145,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { const ShortcutListPolicy mShortcutListPolicy; const Ver2ParticiaTrieNodeReader mPtNodeReader; const Ver2PtNodeArrayReader mPtNodeArrayReader; + std::vector<int> mTerminalPtNodePositionsForIteratingWords; int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos, DicNodeVector *const childDicNodes) const; diff --git a/native/jni/src/utils/char_utils.cpp b/native/jni/src/utils/char_utils.cpp index d41fc8924..adc474b4c 100644 --- a/native/jni/src/utils/char_utils.cpp +++ b/native/jni/src/utils/char_utils.cpp @@ -1118,7 +1118,8 @@ static int compare_pair_capital(const void *a, const void *b) { /* U+0118 */ 0x0045, 0x0065, 0x0045, 0x0065, 0x0047, 0x0067, 0x0047, 0x0067, /* U+0120 */ 0x0047, 0x0067, 0x0047, 0x0067, 0x0048, 0x0068, 0x0126, 0x0127, /* U+0128 */ 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069, - /* U+0130 */ 0x0049, 0x0131, 0x0049, 0x0069, 0x004A, 0x006A, 0x004B, 0x006B, + // U+0131: Manually changed from 0131 to 0049 + /* U+0130 */ 0x0049, 0x0049, 0x0049, 0x0069, 0x004A, 0x006A, 0x004B, 0x006B, /* U+0138 */ 0x0138, 0x004C, 0x006C, 0x004C, 0x006C, 0x004C, 0x006C, 0x004C, /* U+0140 */ 0x006C, 0x004C, 0x006C, 0x004E, 0x006E, 0x004E, 0x006E, 0x004E, // U+0141: Manually changed from 0141 to 004C |