diff options
Diffstat (limited to 'native/src/unigram_dictionary.cpp')
-rw-r--r-- | native/src/unigram_dictionary.cpp | 48 |
1 files changed, 37 insertions, 11 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 3d5683ed9..f36eabb3f 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -347,6 +347,10 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons } } +static const int TWO_31ST_DIV_255 = ((1 << 31) - 1) / 255; +static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { + return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); +} inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos, const int excessivePos, const int transposedPos, const int freq, const bool sameLength) { @@ -369,7 +373,7 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); } if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) { - finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER; + finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); } } if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER; @@ -428,24 +432,46 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex return false; } + +// In the following function, c is the current character of the dictionary word +// currently examined. +// currentChars is an array containing the keys close to the character the +// user actually typed at the same position. We want to see if c is in it: if so, +// then the word contains at that position a character close to what the user +// typed. +// What the user typed is actually the first character of the array. +// Notice : accented characters do not have a proximity list, so they are alone +// in their list. The non-accented version of the character should be considered +// "close", but not the other keys close to the non-accented version. inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId( const int *currentChars, const unsigned short c, const int skipPos, const int excessivePos, const int transposedPos) { const unsigned short lowerC = toLowerCase(c); - int j = 0; + + // The first char in the array is what user typed. If it matches right away, + // that means the user typed that same char for this pos. + if (currentChars[0] == lowerC || currentChars[0] == c) + return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR; + + // If one of those is true, we should not check for close characters at all. + if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) + return UNRELATED_CHAR; + + // If the non-accented, lowercased version of that first character matches c, + // then we have a non-accented version of the accented character the user + // typed. Treat it as a close char. + if (toLowerCase(currentChars[0]) == lowerC) + return NEAR_PROXIMITY_CHAR; + + // Not an exact nor an accent-alike match: search the list of close keys + int j = 1; while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) { const bool matched = (currentChars[j] == lowerC || currentChars[j] == c); - // If skipPos is defined, not to search proximity collections. - // First char is what user typed. - if (matched) { - if (j > 0) return NEAR_PROXIMITY_CHAR; - return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR; - } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) { - // Not to check proximity characters - return UNRELATED_CHAR; - } + if (matched) return NEAR_PROXIMITY_CHAR; ++j; } + + // Was not included, signal this as an unrelated character. return UNRELATED_CHAR; } |