3 files changed, 98 insertions, 14 deletions
diff --git a/native/src/debug.h b/native/src/debug.h
new file mode 100644
index 000000000..e5572e1a5
--- /dev/null
+++ b/native/src/debug.h
@@ -0,0 +1,58 @@
+/*
+**
+** Copyright 2011, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifndef LATINIME_DEBUG_H
+#define LATINIME_DEBUG_H
+
+#include "defines.h"
+
+static inline unsigned char* convertToUnibyteString(unsigned short* input, unsigned char* output,
+        const unsigned int length) {
+    int i = 0;
+    for (; i <= length && input[i] != 0; ++i)
+        output[i] = input[i] & 0xFF;
+    output[i] = 0;
+    return output;
+}
+static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned short* input,
+        unsigned char* output, const unsigned int length, unsigned char c) {
+    int i = 0;
+    for (; i <= length && input[i] != 0; ++i)
+        output[i] = input[i] & 0xFF;
+    output[i-1] = c;
+    output[i] = 0;
+    return output;
+}
+static inline void LOGI_S16(unsigned short* string, const unsigned int length) {
+    unsigned char tmp_buffer[length];
+    convertToUnibyteString(string, tmp_buffer, length);
+    LOGI(">> %s", tmp_buffer);
+    // The log facility is throwing out log that comes too fast. The following
+    // is a dirty way of slowing down processing so that we can see all log.
+    // TODO : refactor this in a blocking log or something.
+    // usleep(10);
+}
+static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int length,
+        unsigned char c) {
+    unsigned char tmp_buffer[length+1];
+    convertToUnibyteStringAndReplaceLastChar(string, tmp_buffer, length, c);
+    LOGI(">> %s", tmp_buffer);
+    // Likewise
+    // usleep(10);
+}
+
+#endif // LATINIME_DEBUG_H
diff --git a/native/src/defines.h b/native/src/defines.h
index 7fa7e35e0..918028afe 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -100,6 +100,9 @@ static void prof_out(void) {
 #ifndef U_SHORT_MAX
 #define U_SHORT_MAX 1 << 16
 #endif
+#ifndef S_INT_MAX
+#define S_INT_MAX ((1 << 31) - 1)
+#endif
 
 // Define this to use mmap() for dictionary loading.  Undefine to use malloc() instead of mmap().
 // We measured and compared performance of both, and found mmap() is fairly good in terms of
@@ -137,9 +140,6 @@ static void prof_out(void) {
 #define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120
 
-// This is used as a bare multiplier (not subject to /100)
-#define FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER 2
-
 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
 // This is only used for the size of array. Not to be used in c functions.
 #define MAX_WORD_LENGTH_INTERNAL 48
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 3d5683ed9..f36eabb3f 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -347,6 +347,10 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
     }
 }
 
+static const int TWO_31ST_DIV_255 = ((1 << 31) - 1) / 255;
+static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
+    return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
+}
 inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
         const int snr, const int skipPos, const int excessivePos, const int transposedPos,
         const int freq, const bool sameLength) {
@@ -369,7 +373,7 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
             multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
         }
         if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
-            finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER;
+            finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
         }
     }
     if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
@@ -428,24 +432,46 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
     return false;
 }
 
+
+// In the following function, c is the current character of the dictionary word
+// currently examined.
+// currentChars is an array containing the keys close to the character the
+// user actually typed at the same position. We want to see if c is in it: if so,
+// then the word contains at that position a character close to what the user
+// typed.
+// What the user typed is actually the first character of the array.
+// Notice : accented characters do not have a proximity list, so they are alone
+// in their list. The non-accented version of the character should be considered
+// "close", but not the other keys close to the non-accented version.
 inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
         const int *currentChars, const unsigned short c, const int skipPos,
         const int excessivePos, const int transposedPos) {
     const unsigned short lowerC = toLowerCase(c);
-    int j = 0;
+
+    // The first char in the array is what user typed. If it matches right away,
+    // that means the user typed that same char for this pos.
+    if (currentChars[0] == lowerC || currentChars[0] == c)
+        return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
+
+    // If one of those is true, we should not check for close characters at all.
+    if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
+        return UNRELATED_CHAR;
+
+    // If the non-accented, lowercased version of that first character matches c,
+    // then we have a non-accented version of the accented character the user
+    // typed. Treat it as a close char.
+    if (toLowerCase(currentChars[0]) == lowerC)
+        return NEAR_PROXIMITY_CHAR;
+
+    // Not an exact nor an accent-alike match: search the list of close keys
+    int j = 1;
     while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
         const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
-        // If skipPos is defined, not to search proximity collections.
-        // First char is what user  typed.
-        if (matched) {
-            if (j > 0) return NEAR_PROXIMITY_CHAR;
-            return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
-        } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
-            // Not to check proximity characters
-            return UNRELATED_CHAR;
-        }
+        if (matched) return NEAR_PROXIMITY_CHAR;
         ++j;
     }
+
+    // Was not included, signal this as an unrelated character.
     return UNRELATED_CHAR;
 }