about | summary | refs | log | tree | commit | diff | stats
path: root/native/src/unigram_dictionary.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'native/src/unigram_dictionary.cpp')
-rw-r--r-- native/src/unigram_dictionary.cpp 48
1 files changed, 37 insertions, 11 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 3d5683ed9..f36eabb3f 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -347,6 +347,10 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
}
}
+// (2^31 - 1) / 255: threshold below which a value is multiplied by 255 rather than capped.
+// Written as 0x7FFFFFFF rather than ((1 << 31) - 1): with a 32-bit int, 1 << 31
+// shifts into the sign bit and the following "- 1" overflows a signed int.
+static const int TWO_31ST_DIV_255 = 0x7FFFFFFF / 255;
+
+// Returns 255 * num when num is below TWO_31ST_DIV_255, and S_INT_MAX otherwise,
+// so the multiplication itself never exceeds 2^31 - 1.
+// calculateFinalFreq applies it to same-length candidates that have no skipped,
+// excessive or transposed position.
+static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
+ return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
+}
inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
const int snr, const int skipPos, const int excessivePos, const int transposedPos,
const int freq, const bool sameLength) {
@@ -369,7 +373,7 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
}
if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
- finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER;
+ finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
}
}
if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
@@ -428,24 +432,46 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
return false;
}
+
+// In the following function, c is the current character of the dictionary word
+// currently examined.
+// currentChars is an array containing the keys close to the character the
+// user actually typed at the same position. We want to see if c is in it: if so,
+// then the word contains at that position a character close to what the user
+// typed.
+// What the user typed is actually the first character of the array.
+// Note: accented characters do not have a proximity list, so they are alone
+// in their list. The non-accented version of the character should be considered
+// "close", but not the other keys close to the non-accented version.
+//
+// Returns:
+//  - SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR when currentChars[0] equals c or
+//    toLowerCase(c);
+//  - UNRELATED_CHAR when that is not the case and any of skipPos, excessivePos
+//    or transposedPos is >= 0 (no proximity search is done then);
+//  - NEAR_PROXIMITY_CHAR when toLowerCase(currentChars[0]) equals toLowerCase(c),
+//    or when c or toLowerCase(c) is found among currentChars[1..];
+//  - UNRELATED_CHAR otherwise.
inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
const int *currentChars, const unsigned short c, const int skipPos,
const int excessivePos, const int transposedPos) {
const unsigned short lowerC = toLowerCase(c);
- int j = 0;
+
+ // The first char in the array is what the user typed. If it matches right away,
+ // that means the user typed that same char for this position.
+ if (currentChars[0] == lowerC || currentChars[0] == c)
+ return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
+
+ // If a skipped, excessive or transposed position is set, we should not check
+ // for close characters at all.
+ if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
+ return UNRELATED_CHAR;
+
+ // If the non-accented, lowercased version of that first character matches c,
+ // then we have a non-accented version of the accented character the user
+ // typed. Treat it as a close char.
+ if (toLowerCase(currentChars[0]) == lowerC)
+ return NEAR_PROXIMITY_CHAR;
+
+ // Neither an exact nor an accent-alike match: search the list of close keys,
+ // starting after the typed character. The list ends at the first entry <= 0
+ // or after MAX_PROXIMITY_CHARS entries.
+ int j = 1;
- while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
+ // Test the bound first so that currentChars[MAX_PROXIMITY_CHARS] is never read.
+ while (j < MAX_PROXIMITY_CHARS && currentChars[j] > 0) {
const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
- // If skipPos is defined, not to search proximity collections.
- // First char is what user typed.
- if (matched) {
- if (j > 0) return NEAR_PROXIMITY_CHAR;
- return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
- } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
- // Not to check proximity characters
- return UNRELATED_CHAR;
- }
+ if (matched) return NEAR_PROXIMITY_CHAR;
++j;
}
+
+ // Was not included, signal this as an unrelated character.
return UNRELATED_CHAR;
}