aboutsummaryrefslogtreecommitdiffstats
path: root/native/src
diff options
context:
space:
mode:
Diffstat (limited to 'native/src')
-rw-r--r--native/src/debug.h58
-rw-r--r--native/src/defines.h6
-rw-r--r--native/src/unigram_dictionary.cpp48
3 files changed, 98 insertions, 14 deletions
diff --git a/native/src/debug.h b/native/src/debug.h
new file mode 100644
index 000000000..e5572e1a5
--- /dev/null
+++ b/native/src/debug.h
@@ -0,0 +1,58 @@
+/*
+**
+** Copyright 2011, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+** http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifndef LATINIME_DEBUG_H
+#define LATINIME_DEBUG_H
+
+#include "defines.h"
+
+static inline unsigned char* convertToUnibyteString(unsigned short* input, unsigned char* output,
+ const unsigned int length) {
+ int i = 0;
+ for (; i <= length && input[i] != 0; ++i)
+ output[i] = input[i] & 0xFF;
+ output[i] = 0;
+ return output;
+}
+static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned short* input,
+ unsigned char* output, const unsigned int length, unsigned char c) {
+ int i = 0;
+ for (; i <= length && input[i] != 0; ++i)
+ output[i] = input[i] & 0xFF;
+ output[i-1] = c;
+ output[i] = 0;
+ return output;
+}
+static inline void LOGI_S16(unsigned short* string, const unsigned int length) {
+ unsigned char tmp_buffer[length];
+ convertToUnibyteString(string, tmp_buffer, length);
+ LOGI(">> %s", tmp_buffer);
+ // The log facility is throwing out log that comes too fast. The following
+ // is a dirty way of slowing down processing so that we can see all log.
+ // TODO : refactor this in a blocking log or something.
+ // usleep(10);
+}
+static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int length,
+ unsigned char c) {
+ unsigned char tmp_buffer[length+1];
+ convertToUnibyteStringAndReplaceLastChar(string, tmp_buffer, length, c);
+ LOGI(">> %s", tmp_buffer);
+ // Likewise
+ // usleep(10);
+}
+
+#endif // LATINIME_DEBUG_H
diff --git a/native/src/defines.h b/native/src/defines.h
index 7fa7e35e0..918028afe 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -100,6 +100,9 @@ static void prof_out(void) {
#ifndef U_SHORT_MAX
#define U_SHORT_MAX 1 << 16
#endif
+#ifndef S_INT_MAX
+#define S_INT_MAX ((1 << 31) - 1)
+#endif
// Define this to use mmap() for dictionary loading. Undefine to use malloc() instead of mmap().
// We measured and compared performance of both, and found mmap() is fairly good in terms of
@@ -137,9 +140,6 @@ static void prof_out(void) {
#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
#define FULL_MATCHED_WORDS_PROMOTION_RATE 120
-// This is used as a bare multiplier (not subject to /100)
-#define FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER 2
-
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions.
#define MAX_WORD_LENGTH_INTERNAL 48
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 3d5683ed9..f36eabb3f 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -347,6 +347,10 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
}
}
+static const int TWO_31ST_DIV_255 = ((1 << 31) - 1) / 255;
+static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
+ return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
+}
inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
const int snr, const int skipPos, const int excessivePos, const int transposedPos,
const int freq, const bool sameLength) {
@@ -369,7 +373,7 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
}
if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
- finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER;
+ finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
}
}
if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
@@ -428,24 +432,46 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
return false;
}
+
+// In the following function, c is the current character of the dictionary word
+// currently examined.
+// currentChars is an array containing the keys close to the character the
+// user actually typed at the same position. We want to see if c is in it: if so,
+// then the word contains at that position a character close to what the user
+// typed.
+// What the user typed is actually the first character of the array.
+// Notice : accented characters do not have a proximity list, so they are alone
+// in their list. The non-accented version of the character should be considered
+// "close", but not the other keys close to the non-accented version.
inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
const int *currentChars, const unsigned short c, const int skipPos,
const int excessivePos, const int transposedPos) {
const unsigned short lowerC = toLowerCase(c);
- int j = 0;
+
+ // The first char in the array is what user typed. If it matches right away,
+ // that means the user typed that same char for this pos.
+ if (currentChars[0] == lowerC || currentChars[0] == c)
+ return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
+
+ // If one of those is true, we should not check for close characters at all.
+ if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
+ return UNRELATED_CHAR;
+
+ // If the non-accented, lowercased version of that first character matches c,
+ // then we have a non-accented version of the accented character the user
+ // typed. Treat it as a close char.
+ if (toLowerCase(currentChars[0]) == lowerC)
+ return NEAR_PROXIMITY_CHAR;
+
+ // Not an exact nor an accent-alike match: search the list of close keys
+ int j = 1;
while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
- // If skipPos is defined, not to search proximity collections.
- // First char is what user typed.
- if (matched) {
- if (j > 0) return NEAR_PROXIMITY_CHAR;
- return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
- } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
- // Not to check proximity characters
- return UNRELATED_CHAR;
- }
+ if (matched) return NEAR_PROXIMITY_CHAR;
++j;
}
+
+ // Was not included, signal this as an unrelated character.
return UNRELATED_CHAR;
}