diff options
Diffstat (limited to 'native/jni/src')
13 files changed, 138 insertions, 153 deletions
diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h index 017df34fd..973da67e4 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node.h +++ b/native/jni/src/suggest/core/dicnode/dic_node.h @@ -112,32 +112,23 @@ class DicNode { mIsUsed = true; mIsCachedForNextSuggestion = false; mDicNodeProperties.init( - NOT_A_DICT_POS, 0 /* flags */, rootGroupPos, NOT_A_DICT_POS /* attributesPos */, + NOT_A_DICT_POS, rootGroupPos, NOT_A_DICT_POS /* attributesPos */, NOT_A_CODE_POINT /* nodeCodePoint */, NOT_A_PROBABILITY /* probability */, - false /* isTerminal */, true /* hasChildren */, 0 /* depth */, - 0 /* terminalDepth */); + false /* isTerminal */, true /* hasChildren */, + false /* isBlacklistedOrNotAWord */, 0 /* depth */, 0 /* terminalDepth */); mDicNodeState.init(prevWordNodePos); PROF_NODE_RESET(mProfiler); } - void initAsPassingChild(DicNode *parentNode) { - mIsUsed = true; - mIsCachedForNextSuggestion = parentNode->mIsCachedForNextSuggestion; - const int c = parentNode->getNodeTypedCodePoint(); - mDicNodeProperties.init(&parentNode->mDicNodeProperties, c); - mDicNodeState.init(&parentNode->mDicNodeState); - PROF_NODE_COPY(&parentNode->mProfiler, mProfiler); - } - // Init for root with previous word void initAsRootWithPreviousWord(DicNode *dicNode, const int rootGroupPos) { mIsUsed = true; mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; mDicNodeProperties.init( - NOT_A_DICT_POS, 0 /* flags */, rootGroupPos, NOT_A_DICT_POS /* attributesPos */, + NOT_A_DICT_POS, rootGroupPos, NOT_A_DICT_POS /* attributesPos */, NOT_A_CODE_POINT /* nodeCodePoint */, NOT_A_PROBABILITY /* probability */, - false /* isTerminal */, true /* hasChildren */, 0 /* depth */, - 0 /* terminalDepth */); + false /* isTerminal */, true /* hasChildren */, + false /* isBlacklistedOrNotAWord */, 0 /* depth */, 0 /* terminalDepth */); // TODO: Move to dicNodeState? mDicNodeState.mDicNodeStateOutput.init(); // reset for next word mDicNodeState.mDicNodeStateInput.init( @@ -157,18 +148,27 @@ class DicNode { PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); } - // TODO: minimize arguments by looking binary_format - void initAsChild(DicNode *dicNode, const int pos, const uint8_t flags, const int childrenPos, + void initAsPassingChild(DicNode *parentNode) { + mIsUsed = true; + mIsCachedForNextSuggestion = parentNode->mIsCachedForNextSuggestion; + const int c = parentNode->getNodeTypedCodePoint(); + mDicNodeProperties.init(&parentNode->mDicNodeProperties, c); + mDicNodeState.init(&parentNode->mDicNodeState); + PROF_NODE_COPY(&parentNode->mProfiler, mProfiler); + } + + void initAsChild(DicNode *dicNode, const int pos, const int childrenPos, const int attributesPos, const int probability, const bool isTerminal, - const bool hasChildren, const uint16_t mergedNodeCodePointCount, - const int *const mergedNodeCodePoints) { + const bool hasChildren, const bool isBlacklistedOrNotAWord, + const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) { mIsUsed = true; uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1); mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; const uint16_t newLeavingDepth = static_cast<uint16_t>( dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount); - mDicNodeProperties.init(pos, flags, childrenPos, attributesPos, mergedNodeCodePoints[0], - probability, isTerminal, hasChildren, newDepth, newLeavingDepth); + mDicNodeProperties.init(pos, childrenPos, attributesPos, mergedNodeCodePoints[0], + probability, isTerminal, hasChildren, isBlacklistedOrNotAWord, newDepth, + newLeavingDepth); mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount, mergedNodeCodePoints); PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); @@ -216,7 +216,7 @@ class DicNode { } bool isImpossibleBigramWord() const { - if (mDicNodeProperties.hasBlacklistedOrNotAWordFlag()) { + if (isBlacklistedOrNotAWord()) { return true; } const int prevWordLen = mDicNodeState.mDicNodeStatePrevWord.getPrevWordLength() @@ -463,8 +463,8 @@ class DicNode { return mDicNodeState.mDicNodeStateScoring.isExactMatch(); } - uint8_t getFlags() const { - return mDicNodeProperties.getFlags(); + bool isBlacklistedOrNotAWord() const { + return mDicNodeProperties.isBlacklistedOrNotAWord(); } int getAttributesPos() const { @@ -504,6 +504,12 @@ class DicNode { if (!right->isUsed()) { return false; } + // Promote exact matches to prevent them from being pruned. + const bool leftExactMatch = isExactMatch(); + const bool rightExactMatch = right->isExactMatch(); + if (leftExactMatch != rightExactMatch) { + return leftExactMatch; + } const float diff = right->getNormalizedCompoundDistance() - getNormalizedCompoundDistance(); static const float MIN_DIFF = 0.000001f; diff --git a/native/jni/src/suggest/core/dicnode/dic_node_properties.h b/native/jni/src/suggest/core/dicnode/dic_node_properties.h index 7e8aa4979..d98000d83 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_properties.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_properties.h @@ -20,7 +20,6 @@ #include <stdint.h> #include "defines.h" -#include "suggest/core/dictionary/binary_format.h" namespace latinime { @@ -32,24 +31,25 @@ namespace latinime { class DicNodeProperties { public: AK_FORCE_INLINE DicNodeProperties() - : mPos(0), mFlags(0), mChildrenPos(0), mAttributesPos(0), mProbability(0), - mNodeCodePoint(0), mDepth(0), mLeavingDepth(0), mIsTerminal(false), - mHasChildren(false) {} + : mPos(0), mChildrenPos(0), mAttributesPos(0), mProbability(0), + mNodeCodePoint(0), mIsTerminal(false), mHasChildren(false), + mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0) {} virtual ~DicNodeProperties() {} // Should be called only once per DicNode is initialized. - void init(const int pos, const uint8_t flags, const int childrenPos, const int attributesPos, + void init(const int pos, const int childrenPos, const int attributesPos, const int nodeCodePoint, const int probability, const bool isTerminal, - const bool hasChildren, const uint16_t depth, const uint16_t leavingDepth) { + const bool hasChildren, const bool isBlacklistedOrNotAWord, + const uint16_t depth, const uint16_t leavingDepth) { mPos = pos; - mFlags = flags; mChildrenPos = childrenPos; mAttributesPos = attributesPos; mNodeCodePoint = nodeCodePoint; mProbability = probability; mIsTerminal = isTerminal; mHasChildren = hasChildren; + mIsBlacklistedOrNotAWord = isBlacklistedOrNotAWord; mDepth = depth; mLeavingDepth = leavingDepth; } @@ -57,13 +57,13 @@ class DicNodeProperties { // Init for copy void init(const DicNodeProperties *const nodeProp) { mPos = nodeProp->mPos; - mFlags = nodeProp->mFlags; mChildrenPos = nodeProp->mChildrenPos; mAttributesPos = nodeProp->mAttributesPos; mNodeCodePoint = nodeProp->mNodeCodePoint; mProbability = nodeProp->mProbability; mIsTerminal = nodeProp->mIsTerminal; mHasChildren = nodeProp->mHasChildren; + mIsBlacklistedOrNotAWord = nodeProp->mIsBlacklistedOrNotAWord; mDepth = nodeProp->mDepth; mLeavingDepth = nodeProp->mLeavingDepth; } @@ -71,13 +71,13 @@ class DicNodeProperties { // Init as passing child void init(const DicNodeProperties *const nodeProp, const int codePoint) { mPos = nodeProp->mPos; - mFlags = nodeProp->mFlags; mChildrenPos = nodeProp->mChildrenPos; mAttributesPos = nodeProp->mAttributesPos; mNodeCodePoint = codePoint; // Overwrite the node char of a passing child mProbability = nodeProp->mProbability; mIsTerminal = nodeProp->mIsTerminal; mHasChildren = nodeProp->mHasChildren; + mIsBlacklistedOrNotAWord = nodeProp->mIsBlacklistedOrNotAWord; mDepth = nodeProp->mDepth + 1; // Increment the depth of a passing child mLeavingDepth = nodeProp->mLeavingDepth; } @@ -86,10 +86,6 @@ class DicNodeProperties { return mPos; } - uint8_t getFlags() const { - return mFlags; - } - int getChildrenPos() const { return mChildrenPos; } @@ -123,8 +119,8 @@ class DicNodeProperties { return mHasChildren || mDepth != mLeavingDepth; } - bool hasBlacklistedOrNotAWordFlag() const { - return BinaryFormat::hasBlacklistedOrNotAWordFlag(mFlags); + bool isBlacklistedOrNotAWord() const { + return mIsBlacklistedOrNotAWord; } private: @@ -132,15 +128,15 @@ class DicNodeProperties { // Use a default copy constructor and an assign operator because shallow copies are ok // for this class int mPos; - uint8_t mFlags; int mChildrenPos; int mAttributesPos; int mProbability; int mNodeCodePoint; - uint16_t mDepth; - uint16_t mLeavingDepth; bool mIsTerminal; bool mHasChildren; + bool mIsBlacklistedOrNotAWord; + uint16_t mDepth; + uint16_t mLeavingDepth; }; } // namespace latinime #endif // LATINIME_DIC_NODE_PROPERTIES_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index c7c8d2a19..6c7f6667a 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -78,6 +78,7 @@ namespace latinime { const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags)); const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); const bool hasShortcuts = (0 != (BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS & flags)); + const bool isBlacklistedOrNotAWord = BinaryFormat::hasBlacklistedOrNotAWordFlag(flags); int codePoint = BinaryFormat::getCodePointAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &pos); @@ -111,8 +112,9 @@ namespace latinime { if (childrenFilter->isFilteredOut(mergedNodeCodePoints[0])) { return siblingPos; } - childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, - probability, isTerminal, hasChildren, mergedNodeCodePointCount, mergedNodeCodePoints); + childDicNodes->pushLeavingChild(dicNode, nextPos, childrenPos, attributesPos, + probability, isTerminal, hasChildren, isBlacklistedOrNotAWord, + mergedNodeCodePointCount, mergedNodeCodePoints); return siblingPos; } diff --git a/native/jni/src/suggest/core/dicnode/dic_node_vector.h b/native/jni/src/suggest/core/dicnode/dic_node_vector.h index 9641cc19c..5ac4eeaf4 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_vector.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_vector.h @@ -62,14 +62,15 @@ class DicNodeVector { mDicNodes.back().initAsPassingChild(dicNode); } - void pushLeavingChild(DicNode *dicNode, const int pos, const uint8_t flags, - const int childrenPos, const int attributesPos, const int probability, - const bool isTerminal, const bool hasChildren, const uint16_t mergedNodeCodePointCount, - const int *const mergedNodeCodePoints) { + void pushLeavingChild(DicNode *dicNode, const int pos, const int childrenPos, + const int attributesPos, const int probability, const bool isTerminal, + const bool hasChildren, const bool isBlacklistedOrNotAWord, + const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) { ASSERT(!mLock); mDicNodes.push_back(mEmptyNode); - mDicNodes.back().initAsChild(dicNode, pos, flags, childrenPos, attributesPos, probability, - isTerminal, hasChildren, mergedNodeCodePointCount, mergedNodeCodePoints); + mDicNodes.back().initAsChild(dicNode, pos, childrenPos, attributesPos, probability, + isTerminal, hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount, + mergedNodeCodePoints); } DicNode *operator[](const int id) { diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.cpp b/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.cpp index f48386bba..5d14a0554 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.cpp @@ -31,9 +31,9 @@ const int BinaryDictionaryFormatUtils::DICTIONARY_MINIMUM_SIZE = 4; // The versions of Latin IME that only handle format version 1 only test for the magic // number, so we had to change it so that version 2 files would be rejected by older // implementations. On this occasion, we made the magic number 32 bits long. -const uint32_t BinaryDictionaryFormatUtils::FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; +const uint32_t BinaryDictionaryFormatUtils::HEADER_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; // Magic number (4 bytes), version (2 bytes), options (2 bytes), header size (4 bytes) = 12 -const int BinaryDictionaryFormatUtils::FORMAT_VERSION_2_MINIMUM_SIZE = 12; +const int BinaryDictionaryFormatUtils::HEADER_VERSION_2_MINIMUM_SIZE = 12; /* static */ BinaryDictionaryFormatUtils::FORMAT_VERSION BinaryDictionaryFormatUtils::detectFormatVersion(const uint8_t *const dict, @@ -46,25 +46,28 @@ const int BinaryDictionaryFormatUtils::FORMAT_VERSION_2_MINIMUM_SIZE = 12; } const uint32_t magicNumber = ByteArrayUtils::readUint32(dict, 0); switch (magicNumber) { - case FORMAT_VERSION_2_MAGIC_NUMBER: - // Version 2 dictionaries are at least 12 bytes long. - // If this dictionary has the version 2 magic number but is less than 12 bytes long, - // then it's an unknown format and we need to avoid confidently reading the next bytes. - if (dictSize < FORMAT_VERSION_2_MINIMUM_SIZE) { + case HEADER_VERSION_2_MAGIC_NUMBER: + // Version 2 header are at least 12 bytes long. + // If this header has the version 2 magic number but is less than 12 bytes long, + // then it's an unknown format and we need to avoid confidently reading the next bytes. + if (dictSize < HEADER_VERSION_2_MINIMUM_SIZE) { + return UNKNOWN_VERSION; + } + // Version 2 header is as follows: + // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE + // Version number (2 bytes) + // Options (2 bytes) + // Header size (4 bytes) : integer, big endian + if (ByteArrayUtils::readUint16(dict, 4) == 2) { + return VERSION_2; + } else if (ByteArrayUtils::readUint16(dict, 4) == 3) { + // TODO: Support version 3 dictionary. + return UNKNOWN_VERSION; + } else { + return UNKNOWN_VERSION; + } + default: return UNKNOWN_VERSION; - } - // Format 2 header is as follows: - // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE - // Version number (2 bytes) 0x00 0x02 - // Options (2 bytes) - // Header size (4 bytes) : integer, big endian - if (ByteArrayUtils::readUint16(dict, 4) == 2) { - return VERSION_2; - } else { - return UNKNOWN_VERSION; - } - default: - return UNKNOWN_VERSION; } } diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.h index 80067b255..830684c70 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.h +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.h @@ -33,9 +33,9 @@ namespace latinime { */ class BinaryDictionaryFormatUtils { public: - // TODO: Support version 3 format. enum FORMAT_VERSION { - VERSION_2 = 1, + VERSION_2, + VERSION_3, UNKNOWN_VERSION }; @@ -45,8 +45,8 @@ class BinaryDictionaryFormatUtils { DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryFormatUtils); static const int DICTIONARY_MINIMUM_SIZE; - static const uint32_t FORMAT_VERSION_2_MAGIC_NUMBER; - static const int FORMAT_VERSION_2_MINIMUM_SIZE; + static const uint32_t HEADER_VERSION_2_MAGIC_NUMBER; + static const int HEADER_VERSION_2_MINIMUM_SIZE; }; } // namespace latinime #endif /* LATINIME_BINARY_DICTIONARY_FORMAT_UTILS_H */ diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.cpp b/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.cpp index c4c4bedde..a57b0f859 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.cpp @@ -26,10 +26,10 @@ namespace latinime { const int BinaryDictionaryHeaderReadingUtils::MAX_OPTION_KEY_LENGTH = 256; -const int BinaryDictionaryHeaderReadingUtils::VERSION_2_MAGIC_NUMBER_SIZE = 4; -const int BinaryDictionaryHeaderReadingUtils::VERSION_2_DICTIONARY_VERSION_SIZE = 2; -const int BinaryDictionaryHeaderReadingUtils::VERSION_2_DICTIONARY_FLAG_SIZE = 2; -const int BinaryDictionaryHeaderReadingUtils::VERSION_2_DICTIONARY_HEADER_SIZE_SIZE = 4; +const int BinaryDictionaryHeaderReadingUtils::VERSION_2_HEADER_MAGIC_NUMBER_SIZE = 4; +const int BinaryDictionaryHeaderReadingUtils::VERSION_2_HEADER_DICTIONARY_VERSION_SIZE = 2; +const int BinaryDictionaryHeaderReadingUtils::VERSION_2_HEADER_FLAG_SIZE = 2; +const int BinaryDictionaryHeaderReadingUtils::VERSION_2_HEADER_SIZE_FIELD_SIZE = 4; const BinaryDictionaryHeaderReadingUtils::DictionaryFlags BinaryDictionaryHeaderReadingUtils::NO_FLAGS = 0; @@ -45,13 +45,13 @@ const BinaryDictionaryHeaderReadingUtils::DictionaryFlags /* static */ int BinaryDictionaryHeaderReadingUtils::getHeaderSize( const BinaryDictionaryInfo *const binaryDictionaryInfo) { - switch (binaryDictionaryInfo->getFormat()) { - case BinaryDictionaryFormatUtils::VERSION_2: + switch (getHeaderVersion(binaryDictionaryInfo->getFormat())) { + case HEADER_VERSION_2: // See the format of the header in the comment in // BinaryDictionaryFormatUtils::detectFormatVersion() return ByteArrayUtils::readUint32(binaryDictionaryInfo->getDictBuf(), - VERSION_2_MAGIC_NUMBER_SIZE + VERSION_2_DICTIONARY_VERSION_SIZE - + VERSION_2_DICTIONARY_FLAG_SIZE); + VERSION_2_HEADER_MAGIC_NUMBER_SIZE + VERSION_2_HEADER_DICTIONARY_VERSION_SIZE + + VERSION_2_HEADER_FLAG_SIZE); default: return S_INT_MAX; } @@ -60,10 +60,10 @@ const BinaryDictionaryHeaderReadingUtils::DictionaryFlags /* static */ BinaryDictionaryHeaderReadingUtils::DictionaryFlags BinaryDictionaryHeaderReadingUtils::getFlags( const BinaryDictionaryInfo *const binaryDictionaryInfo) { - switch (binaryDictionaryInfo->getFormat()) { - case BinaryDictionaryFormatUtils::VERSION_2: + switch (getHeaderVersion(binaryDictionaryInfo->getFormat())) { + case HEADER_VERSION_2: return ByteArrayUtils::readUint16(binaryDictionaryInfo->getDictBuf(), - VERSION_2_MAGIC_NUMBER_SIZE + VERSION_2_DICTIONARY_VERSION_SIZE); + VERSION_2_HEADER_MAGIC_NUMBER_SIZE + VERSION_2_HEADER_DICTIONARY_VERSION_SIZE); default: return NO_FLAGS; } @@ -73,11 +73,15 @@ const BinaryDictionaryHeaderReadingUtils::DictionaryFlags /* static */ bool BinaryDictionaryHeaderReadingUtils::readHeaderValue( const BinaryDictionaryInfo *const binaryDictionaryInfo, const char *const key, int *outValue, const int outValueSize) { - if (outValueSize <= 0 || !hasHeaderAttributes(binaryDictionaryInfo->getFormat())) { + if (outValueSize <= 0) { return false; } const int headerSize = getHeaderSize(binaryDictionaryInfo); int pos = getHeaderOptionsPosition(binaryDictionaryInfo->getFormat()); + if (pos == NOT_A_DICT_POS) { + // The header doesn't have header options. + return false; + } while (pos < headerSize) { if(ByteArrayUtils::compareStringInBufferWithCharArray( binaryDictionaryInfo->getDictBuf(), key, headerSize - pos, &pos) == 0) { diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.h index 94b9e124d..61748227e 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.h +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.h @@ -48,27 +48,15 @@ class BinaryDictionaryHeaderReadingUtils { return (flags & FRENCH_LIGATURE_PROCESSING_FLAG) != 0; } - static AK_FORCE_INLINE bool hasHeaderAttributes( - const BinaryDictionaryFormatUtils::FORMAT_VERSION format) { - // Only format 2 and above have header attributes as {key,value} string pairs. - switch (format) { - case BinaryDictionaryFormatUtils::VERSION_2: - return true; - break; - default: - return false; - } - } - static AK_FORCE_INLINE int getHeaderOptionsPosition( - const BinaryDictionaryFormatUtils::FORMAT_VERSION format) { - switch (format) { - case BinaryDictionaryFormatUtils::VERSION_2: - return VERSION_2_MAGIC_NUMBER_SIZE + VERSION_2_DICTIONARY_VERSION_SIZE - + VERSION_2_DICTIONARY_FLAG_SIZE + VERSION_2_DICTIONARY_HEADER_SIZE_SIZE; + const BinaryDictionaryFormatUtils::FORMAT_VERSION dictionaryFormat) { + switch (getHeaderVersion(dictionaryFormat)) { + case HEADER_VERSION_2: + return VERSION_2_HEADER_MAGIC_NUMBER_SIZE + VERSION_2_HEADER_DICTIONARY_VERSION_SIZE + + VERSION_2_HEADER_FLAG_SIZE + VERSION_2_HEADER_SIZE_FIELD_SIZE; break; default: - return 0; + return NOT_A_DICT_POS; } } @@ -82,10 +70,15 @@ class BinaryDictionaryHeaderReadingUtils { private: DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryHeaderReadingUtils); - static const int VERSION_2_MAGIC_NUMBER_SIZE; - static const int VERSION_2_DICTIONARY_VERSION_SIZE; - static const int VERSION_2_DICTIONARY_FLAG_SIZE; - static const int VERSION_2_DICTIONARY_HEADER_SIZE_SIZE; + enum HEADER_VERSION { + HEADER_VERSION_2, + UNKNOWN_HEADER_VERSION + }; + + static const int VERSION_2_HEADER_MAGIC_NUMBER_SIZE; + static const int VERSION_2_HEADER_DICTIONARY_VERSION_SIZE; + static const int VERSION_2_HEADER_FLAG_SIZE; + static const int VERSION_2_HEADER_SIZE_FIELD_SIZE; static const DictionaryFlags NO_FLAGS; // Flags for special processing @@ -95,6 +88,18 @@ class BinaryDictionaryHeaderReadingUtils { static const DictionaryFlags SUPPORTS_DYNAMIC_UPDATE_FLAG; static const DictionaryFlags FRENCH_LIGATURE_PROCESSING_FLAG; static const DictionaryFlags CONTAINS_BIGRAMS_FLAG; + + static HEADER_VERSION getHeaderVersion( + const BinaryDictionaryFormatUtils::FORMAT_VERSION formatVersion) { + switch(formatVersion) { + case BinaryDictionaryFormatUtils::VERSION_2: + // Fall through + case BinaryDictionaryFormatUtils::VERSION_3: + return HEADER_VERSION_2; + default: + return UNKNOWN_HEADER_VERSION; + } + } }; } #endif /* LATINIME_DICTIONARY_HEADER_READING_UTILS_H */ diff --git a/native/jni/src/suggest/core/dictionary/terminal_attributes.h b/native/jni/src/suggest/core/dictionary/terminal_attributes.h index a8520b1f1..0da6504eb 100644 --- a/native/jni/src/suggest/core/dictionary/terminal_attributes.h +++ b/native/jni/src/suggest/core/dictionary/terminal_attributes.h @@ -21,7 +21,6 @@ #include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.h" -#include "suggest/core/dictionary/binary_format.h" namespace latinime { @@ -71,13 +70,12 @@ class TerminalAttributes { }; TerminalAttributes(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const uint8_t nodeFlags, const int shortcutPos) - : mBinaryDictionaryInfo(binaryDictionaryInfo), - mNodeFlags(nodeFlags), mShortcutListSizePos(shortcutPos) {} + const int shortcutPos) + : mBinaryDictionaryInfo(binaryDictionaryInfo), mShortcutListSizePos(shortcutPos) {} inline ShortcutIterator getShortcutIterator() const { int shortcutPos = mShortcutListSizePos; - const bool hasShortcutList = 0 != (mNodeFlags & BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS); + const bool hasShortcutList = shortcutPos != NOT_A_DICT_POS; if (hasShortcutList) { BinaryDictionaryTerminalAttributesReadingUtils::getShortcutListSizeAndForwardPointer( mBinaryDictionaryInfo, &shortcutPos); @@ -86,14 +84,9 @@ class TerminalAttributes { return ShortcutIterator(mBinaryDictionaryInfo, shortcutPos, hasShortcutList); } - bool isBlacklistedOrNotAWord() const { - return BinaryFormat::hasBlacklistedOrNotAWordFlag(mNodeFlags); - } - private: DISALLOW_IMPLICIT_CONSTRUCTORS(TerminalAttributes); const BinaryDictionaryInfo *const mBinaryDictionaryInfo; - const uint8_t mNodeFlags; const int mShortcutListSizePos; }; } // namespace latinime diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp index 9a0f10cd5..c6da6f003 100644 --- a/native/jni/src/suggest/core/suggest.cpp +++ b/native/jni/src/suggest/core/suggest.cpp @@ -36,7 +36,6 @@ namespace latinime { const int Suggest::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; const int Suggest::MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE = 2; const float Suggest::AUTOCORRECT_CLASSIFICATION_THRESHOLD = 0.33f; -const int Suggest::FINAL_SCORE_PENALTY_FOR_NOT_BEST_EXACT_MATCHED_WORD = 1; /** * Returns a set of suggestions for the given input touch points. The commitPoint argument indicates @@ -149,8 +148,6 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen &doubleLetterTerminalIndex, &doubleLetterLevel); int maxScore = S_INT_MIN; - int bestExactMatchedNodeTerminalIndex = -1; - int bestExactMatchedNodeOutputWordIndex = -1; // Force autocorrection for obvious long multi-word suggestions when the top suggestion is // a long multiple words suggestion. // TODO: Implement a smarter auto-commit method for handling multi-word suggestions. @@ -173,8 +170,6 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel); const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight) + doubleLetterCost; - const TerminalAttributes terminalAttributes(traverseSession->getBinaryDictionaryInfo(), - terminalDicNode->getFlags(), terminalDicNode->getAttributesPos()); const bool isPossiblyOffensiveWord = terminalDicNode->getProbability() <= 0; const bool isExactMatch = terminalDicNode->isExactMatch(); const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); @@ -187,14 +182,15 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen | (isSafeExactMatch ? Dictionary::KIND_FLAG_EXACT_MATCH : 0); // Entries that are blacklisted or do not represent a word should not be output. - const bool isValidWord = !terminalAttributes.isBlacklistedOrNotAWord(); + const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord(); // Increase output score of top typing suggestion to ensure autocorrection. // TODO: Better integration with java side autocorrection logic. const int finalScore = SCORING->calculateFinalScore( compoundDistance, traverseSession->getInputSize(), - (forceCommitMultiWords && terminalDicNode->hasMultipleWords()) - || (isValidWord && SCORING->doesAutoCorrectValidWord())); + terminalDicNode->isExactMatch() + || (forceCommitMultiWords && terminalDicNode->hasMultipleWords()) + || (isValidWord && SCORING->doesAutoCorrectValidWord())); maxScore = max(maxScore, finalScore); // TODO: Implement a smarter auto-commit method for handling multi-word suggestions. @@ -207,25 +203,6 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen if (isValidWord) { outputTypes[outputWordIndex] = Dictionary::KIND_CORRECTION | outputTypeFlags; frequencies[outputWordIndex] = finalScore; - if (isSafeExactMatch) { - // Demote exact matches that are not the highest probable node among all exact - // matches. - const bool isBestTerminal = bestExactMatchedNodeTerminalIndex < 0 - || terminals[bestExactMatchedNodeTerminalIndex].getProbability() - < terminalDicNode->getProbability(); - const int outputWordIndexToBeDemoted = isBestTerminal ? - bestExactMatchedNodeOutputWordIndex : outputWordIndex; - if (outputWordIndexToBeDemoted >= 0) { - frequencies[outputWordIndexToBeDemoted] -= - FINAL_SCORE_PENALTY_FOR_NOT_BEST_EXACT_MATCHED_WORD; - } - if (isBestTerminal) { - // Updates the best exact matched node index. - bestExactMatchedNodeTerminalIndex = terminalIndex; - // Updates the best exact matched output word index. - bestExactMatchedNodeOutputWordIndex = outputWordIndex; - } - } // Populate the outputChars array with the suggested word. const int startIndex = outputWordIndex * MAX_WORD_LENGTH; terminalDicNode->outputResult(&outputCodePoints[startIndex]); @@ -233,6 +210,8 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen } if (!terminalDicNode->hasMultipleWords()) { + const TerminalAttributes terminalAttributes(traverseSession->getBinaryDictionaryInfo(), + terminalDicNode->getAttributesPos()); // Shortcut is not supported for multiple words suggestions. // TODO: Check shortcuts during traversal for multiple words suggestions. const bool sameAsTyped = TRAVERSAL->sameAsTyped(traverseSession, terminalDicNode); diff --git a/native/jni/src/suggest/core/suggest.h b/native/jni/src/suggest/core/suggest.h index 752bde9ac..875cbe4e0 100644 --- a/native/jni/src/suggest/core/suggest.h +++ b/native/jni/src/suggest/core/suggest.h @@ -82,8 +82,6 @@ class Suggest : public SuggestInterface { // Threshold for autocorrection classifier static const float AUTOCORRECT_CLASSIFICATION_THRESHOLD; - // Final score penalty to exact match words that are not the most probable exact match. - static const int FINAL_SCORE_PENALTY_FOR_NOT_BEST_EXACT_MATCHED_WORD; const Traversal *const TRAVERSAL; const Scoring *const SCORING; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_policy_factory.h b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_policy_factory.h index c0e24fa4e..70dad67e8 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_policy_factory.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_policy_factory.h @@ -32,6 +32,9 @@ class DictionaryStructurePolicyFactory { switch (dictionaryFormat) { case BinaryDictionaryFormatUtils::VERSION_2: return PatriciaTriePolicy::getInstance(); + case BinaryDictionaryFormatUtils::VERSION_3: + // TODO: support version 3 dictionaries. + return 0; default: ASSERT(false); return 0; diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h index e098f353e..830aa80de 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h @@ -169,12 +169,7 @@ class TypingWeighting : public Weighting { float getTerminalLanguageCost(const DicTraverseSession *const traverseSession, const DicNode *const dicNode, const float dicNodeLanguageImprobability) const { - // We promote exact matches here to prevent them from being pruned. The final score of - // exact match nodes might be demoted later in Suggest::outputSuggestions if there are - // multiple exact matches. - const float languageImprobability = (dicNode->isExactMatch()) ? - 0.0f : dicNodeLanguageImprobability; - return languageImprobability * ScoringParams::DISTANCE_WEIGHT_LANGUAGE; + return dicNodeLanguageImprobability * ScoringParams::DISTANCE_WEIGHT_LANGUAGE; } AK_FORCE_INLINE bool needsToNormalizeCompoundDistance() const { |