diff options
Diffstat (limited to 'native/src/unigram_dictionary.cpp')
-rw-r--r-- | native/src/unigram_dictionary.cpp | 132 |
1 files changed, 28 insertions, 104 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 9bdd916f0..afa8bc545 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -20,7 +20,6 @@ #define LOG_TAG "LatinIME: unigram_dictionary.cpp" -#include "basechars.h" #include "char_utils.h" #include "dictionary.h" #include "unigram_dictionary.h" @@ -351,18 +350,6 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) return false; } -static inline unsigned short toBaseLowerCase(unsigned short c) { - if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { - c = BASE_CHARS[c]; - } - if (c >='A' && c <= 'Z') { - c |= 32; - } else if (c > 127) { - c = latin_tolower(c); - } - return c; -} - static const char QUOTE = '\''; static const char SPACE = ' '; @@ -556,7 +543,7 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq); if (excessivePos >= 0) { multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); - if (!existsAdjacentProximityChars(inputIndex, mInputLength)) { + if (!mProximityInfo->existsAdjacentProximityChars(inputIndex)) { // If an excessive character is not adjacent to the left char or the right char, // we will demote this word. multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq); @@ -592,75 +579,11 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth) { - const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0]; + const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(inputIndex); // Skip the ' or other letter and continue deeper return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth; } -inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex, - const int inputLength) const { - if (inputIndex < 0 || inputIndex >= inputLength) return false; - const int currentChar = *getInputCharsAt(inputIndex); - const int leftIndex = inputIndex - 1; - if (leftIndex >= 0) { - const int *leftChars = getInputCharsAt(leftIndex); - int i = 0; - while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) { - if (leftChars[i++] == currentChar) return true; - } - } - const int rightIndex = inputIndex + 1; - if (rightIndex < inputLength) { - const int *rightChars = getInputCharsAt(rightIndex); - int i = 0; - while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) { - if (rightChars[i++] == currentChar) return true; - } - } - return false; -} - -// In the following function, c is the current character of the dictionary word -// currently examined. -// currentChars is an array containing the keys close to the character the -// user actually typed at the same position. We want to see if c is in it: if so, -// then the word contains at that position a character close to what the user -// typed. -// What the user typed is actually the first character of the array. -// Notice : accented characters do not have a proximity list, so they are alone -// in their list. The non-accented version of the character should be considered -// "close", but not the other keys close to the non-accented version. -inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId( - const int *currentChars, const unsigned short c, const int skipPos, - const int excessivePos, const int transposedPos) { - const unsigned short baseLowerC = toBaseLowerCase(c); - - // The first char in the array is what user typed. If it matches right away, - // that means the user typed that same char for this pos. - if (currentChars[0] == baseLowerC || currentChars[0] == c) - return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR; - - // If one of those is true, we should not check for close characters at all. - if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) - return UNRELATED_CHAR; - - // If the non-accented, lowercased version of that first character matches c, - // then we have a non-accented version of the accented character the user - // typed. Treat it as a close char. - if (toBaseLowerCase(currentChars[0]) == baseLowerC) - return NEAR_PROXIMITY_CHAR; - - // Not an exact nor an accent-alike match: search the list of close keys - int j = 1; - while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) { - const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c); - if (matched) return NEAR_PROXIMITY_CHAR; - ++j; - } - - // Was not included, signal this as an unrelated character. - return UNRELATED_CHAR; -} inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth, const uint8_t* const root, const uint8_t flags, const int pos, @@ -826,15 +749,14 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) { const int inputIndex = startInputIndex + depth; - const int *currentChars = getInputCharsAt(inputIndex); unsigned short c; *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos, &c, newChildPosition, newTerminal, newFreq); - const unsigned int inputC = currentChars[0]; + const unsigned int inputC = mProximityInfo->getPrimaryCharAt(inputIndex); if (DEBUG_DICT) { assert(inputC <= U_SHORT_MAX); } - const unsigned short baseLowerC = toBaseLowerCase(c); + const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); const bool matched = (inputC == baseLowerC || inputC == c); const bool hasChild = *newChildPosition != 0; if (matched) { @@ -952,20 +874,20 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in *newDiffs = diffs; *newInputIndex = inputIndex; } else { - const int *currentChars = getInputCharsAt(inputIndex); + int inputIndexForProximity = inputIndex; if (transposedPos >= 0) { - if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS; - if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS; + if (inputIndex == transposedPos) ++inputIndexForProximity; + if (inputIndex == (transposedPos + 1)) --inputIndexForProximity; } - int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos, - transposedPos); - if (UNRELATED_CHAR == matchedProximityCharId) return false; + ProximityInfo::ProximityType matchedProximityCharId = mProximityInfo->getMatchedProximityId( + inputIndexForProximity, c, skipPos, excessivePos, transposedPos); + if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) return false; mWord[depth] = c; // If inputIndex is greater than mInputLength, that means there is no // proximity chars. So, we don't need to check proximity. - if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { + if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight); } bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 @@ -978,7 +900,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in // Start traversing all nodes after the index exceeds the user typed length *newTraverseAllNodes = isSameAsUserTypedLength; *newMatchRate = matchWeight; - *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); + *newDiffs = diffs + + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); *newInputIndex = inputIndex + 1; } // Optimization: Prune out words that are too long compared to how much was typed. @@ -1007,7 +930,7 @@ inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, uint16_t inWord[inputLength]; for (int i = 0; i < inputLength; ++i) { - inWord[i] = *getInputCharsAt(startInputIndex + i); + inWord[i] = (uint16_t)mProximityInfo->getPrimaryCharAt(startInputIndex + i); } return getMostFrequentWordLikeInner(inWord, inputLength, word); } @@ -1031,8 +954,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags)); int pos = startPos; int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - int32_t baseChar = toBaseLowerCase(character); - const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]); + int32_t baseChar = Dictionary::toBaseLowerCase(character); + const uint16_t wChar = Dictionary::toBaseLowerCase(inWord[startInputIndex]); if (baseChar != wChar) { *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos; @@ -1044,8 +967,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, if (hasMultipleChars) { character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); while (NOT_A_CHARACTER != character) { - baseChar = toBaseLowerCase(character); - if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) { + baseChar = Dictionary::toBaseLowerCase(character); + if (Dictionary::toBaseLowerCase(inWord[++inputIndex]) != baseChar) { *outPos = BinaryFormat::skipOtherCharacters(root, pos); *outInputIndex = startInputIndex; return false; @@ -1290,7 +1213,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags); // This has to be done for each virtual char (this forwards the "inputIndex" which - // is the index in the user-inputted chars, as read by getInputCharsAt. + // is the index in the user-inputted chars, as read by proximity chars. if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex; if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) { mWord[depth] = c; @@ -1314,16 +1237,16 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in return false; } } else { - const int *currentChars = getInputCharsAt(inputIndex); + int inputIndexForProximity = inputIndex; if (transposedPos >= 0) { - if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS; - if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS; + if (inputIndex == transposedPos) ++inputIndexForProximity; + if (inputIndex == (transposedPos + 1)) --inputIndexForProximity; } - const int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, - excessivePos, transposedPos); - if (UNRELATED_CHAR == matchedProximityCharId) { + int matchedProximityCharId = mProximityInfo->getMatchedProximityId( + inputIndexForProximity, c, skipPos, excessivePos, transposedPos); + if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { // We found that this is an unrelated character, so we should give up traversing // this node and its children entirely. // However we may not be on the last virtual node yet so we skip the remaining @@ -1342,7 +1265,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in mWord[depth] = c; // If inputIndex is greater than mInputLength, that means there is no // proximity chars. So, we don't need to check proximity. - if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { + if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight); } const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 @@ -1366,7 +1289,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in } // Start traversing all nodes after the index exceeds the user typed length traverseAllNodes = isSameAsUserTypedLength; - diffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); + diffs = diffs + + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); // Finally, we are ready to go to the next character, the next "virtual node". // We should advance the input index. // We do this in this branch of the 'if traverseAllNodes' because we are still matching |