diff options
author | 2011-02-18 17:50:58 +0900 | |
---|---|---|
committer | 2011-02-22 15:27:06 +0900 | |
commit | a5d58497018f465080f08fbbfed35de883bc8be3 (patch) | |
tree | bbddfeca6083af2cda1bc069de6abefcbfec53d1 /native/src/unigram_dictionary.cpp | |
parent | 050c0462dc2ada5a5afecec5b6745693c5066b85 (diff) | |
download | latinime-a5d58497018f465080f08fbbfed35de883bc8be3.tar.gz latinime-a5d58497018f465080f08fbbfed35de883bc8be3.tar.xz latinime-a5d58497018f465080f08fbbfed35de883bc8be3.zip |
Force autocorrection of matching words with different accents.
When entering a word without accents the user expects the system to
add accents automatically if there is no other matching word. This
patch ensures the accented version is promoted accordingly and
autocorrection really takes place.
Issue: 3400015
Change-Id: I8cd3db5bf131ec6844b26abecc1ecbd1d6269df4
Diffstat (limited to 'native/src/unigram_dictionary.cpp')
-rw-r--r-- | native/src/unigram_dictionary.cpp | 48 |
1 files changed, 37 insertions, 11 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 3d5683ed9..f36eabb3f 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -347,6 +347,10 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons } } +static const int TWO_31ST_DIV_255 = ((1 << 31) - 1) / 255; +static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { + return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); +} inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos, const int excessivePos, const int transposedPos, const int freq, const bool sameLength) { @@ -369,7 +373,7 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); } if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) { - finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER; + finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); } } if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER; @@ -428,24 +432,46 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex return false; } + +// In the following function, c is the current character of the dictionary word +// currently examined. +// currentChars is an array containing the keys close to the character the +// user actually typed at the same position. We want to see if c is in it: if so, +// then the word contains at that position a character close to what the user +// typed. +// What the user typed is actually the first character of the array. +// Notice : accented characters do not have a proximity list, so they are alone +// in their list. The non-accented version of the character should be considered +// "close", but not the other keys close to the non-accented version. inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId( const int *currentChars, const unsigned short c, const int skipPos, const int excessivePos, const int transposedPos) { const unsigned short lowerC = toLowerCase(c); - int j = 0; + + // The first char in the array is what user typed. If it matches right away, + // that means the user typed that same char for this pos. + if (currentChars[0] == lowerC || currentChars[0] == c) + return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR; + + // If one of those is true, we should not check for close characters at all. + if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) + return UNRELATED_CHAR; + + // If the non-accented, lowercased version of that first character matches c, + // then we have a non-accented version of the accented character the user + // typed. Treat it as a close char. + if (toLowerCase(currentChars[0]) == lowerC) + return NEAR_PROXIMITY_CHAR; + + // Not an exact nor an accent-alike match: search the list of close keys + int j = 1; while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) { const bool matched = (currentChars[j] == lowerC || currentChars[j] == c); - // If skipPos is defined, not to search proximity collections. - // First char is what user typed. - if (matched) { - if (j > 0) return NEAR_PROXIMITY_CHAR; - return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR; - } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) { - // Not to check proximity characters - return UNRELATED_CHAR; - } + if (matched) return NEAR_PROXIMITY_CHAR; ++j; } + + // Was not included, signal this as an unrelated character. return UNRELATED_CHAR; } |