Force autocorrection of matching words with different accents.

When entering a word without accents, the user expects the system to add the accents automatically if there is no other matching word. This patch ensures the accented version is promoted accordingly, so that autocorrection actually takes place. Issue: 3400015 Change-Id: I8cd3db5bf131ec6844b26abecc1ecbd1d6269df4
author: Jean Chalard <jchalard@google.com> 2011-02-18 17:50:58 +0900
committer: Jean Chalard <jchalard@google.com> 2011-02-22 15:27:06 +0900
commit: a5d58497018f465080f08fbbfed35de883bc8be3 (patch)
tree: bbddfeca6083af2cda1bc069de6abefcbfec53d1 /native/src/unigram_dictionary.cpp
parent: 050c0462dc2ada5a5afecec5b6745693c5066b85 (diff)
download: latinime-a5d58497018f465080f08fbbfed35de883bc8be3.tar.gz
latinime-a5d58497018f465080f08fbbfed35de883bc8be3.tar.xz
latinime-a5d58497018f465080f08fbbfed35de883bc8be3.zip
1 files changed, 37 insertions, 11 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 3d5683ed9..f36eabb3f 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -347,6 +347,10 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
     }
 }
 
+static const int TWO_31ST_DIV_255 = ((1 << 31) - 1) / 255;
+static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
+    return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
+}
 inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
         const int snr, const int skipPos, const int excessivePos, const int transposedPos,
         const int freq, const bool sameLength) {
@@ -369,7 +373,7 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
             multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
         }
         if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
-            finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER;
+            finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
         }
     }
     if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
@@ -428,24 +432,46 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
     return false;
 }
 
+
+// In the following function, c is the character of the dictionary word
+// currently being examined.
+// currentChars is an array containing the keys close to the character the
+// user actually typed at the same position. We want to see if c is in it: if so,
+// then the word contains at that position a character close to what the user
+// typed.
+// What the user typed is actually the first character of the array.
+// Note: accented characters do not have a proximity list, so they are alone
+// in their list. The non-accented version of the character should be considered
+// "close", but not the other keys close to the non-accented version.
 inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
         const int *currentChars, const unsigned short c, const int skipPos,
         const int excessivePos, const int transposedPos) {
     const unsigned short lowerC = toLowerCase(c);
-    int j = 0;
+
+    // The first char in the array is what the user typed. If it matches right away,
+    // that means the user typed that same char for this pos.
+    if (currentChars[0] == lowerC || currentChars[0] == c)
+        return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
+
+    // If any of these positions is non-negative, we should not check for close characters at all.
+    if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
+        return UNRELATED_CHAR;
+
+    // If the non-accented, lowercased version of that first character matches c,
+    // then we have a non-accented version of the accented character the user
+    // typed. Treat it as a close char.
+    if (toLowerCase(currentChars[0]) == lowerC)
+        return NEAR_PROXIMITY_CHAR;
+
+    // Neither an exact nor an accent-alike match: search the list of close keys.
+    int j = 1;
     while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
         const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
-        // If skipPos is defined, not to search proximity collections.
-        // First char is what user  typed.
-        if (matched) {
-            if (j > 0) return NEAR_PROXIMITY_CHAR;
-            return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
-        } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
-            // Not to check proximity characters
-            return UNRELATED_CHAR;
-        }
+        if (matched) return NEAR_PROXIMITY_CHAR;
         ++j;
     }
+
+    // Was not included, signal this as an unrelated character.
     return UNRELATED_CHAR;
 }
author	Jean Chalard <jchalard@google.com>	2011-02-18 17:50:58 +0900
committer	Jean Chalard <jchalard@google.com>	2011-02-22 15:27:06 +0900
commit	a5d58497018f465080f08fbbfed35de883bc8be3 (patch)
tree	bbddfeca6083af2cda1bc069de6abefcbfec53d1 /native/src/unigram_dictionary.cpp
parent	050c0462dc2ada5a5afecec5b6745693c5066b85 (diff)
download	latinime-a5d58497018f465080f08fbbfed35de883bc8be3.tar.gz latinime-a5d58497018f465080f08fbbfed35de883bc8be3.tar.xz latinime-a5d58497018f465080f08fbbfed35de883bc8be3.zip