1 files changed, 53 insertions, 73 deletions
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h
index e0edff584..89dfa39b3 100644
--- a/native/jni/src/defines.h
+++ b/native/jni/src/defines.h
@@ -35,46 +35,74 @@
 // Must be equal to ProximityInfo.MAX_PROXIMITY_CHARS_SIZE in Java
 #define MAX_PROXIMITY_CHARS_SIZE 16
 #define ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE 2
+#define NELEMS(x) (sizeof(x) / sizeof((x)[0]))
 
-#if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)
-#include <android/log.h>
-#ifndef LOG_TAG
-#define LOG_TAG "LatinIME: "
-#endif // LOG_TAG
-#define AKLOGE(fmt, ...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, fmt, ##__VA_ARGS__)
-#define AKLOGI(fmt, ...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, fmt, ##__VA_ARGS__)
-
-#define DUMP_RESULT(words, frequencies) do { dumpResult(words, frequencies); } while (0)
-#define DUMP_WORD(word, length) do { dumpWord(word, length); } while (0)
-#define INTS_TO_CHARS(input, length, output) do { \
-        intArrayToCharArray(input, length, output); } while (0)
-
-// TODO: Support full UTF-8 conversion
-AK_FORCE_INLINE static int intArrayToCharArray(const int *source, const int sourceSize,
-        char *dest) {
+AK_FORCE_INLINE static int intArrayToCharArray(const int *const source, const int sourceSize,
+        char *dest, const int destSize) {
+    // We want to always terminate with a 0 char, so stop one short of the length to make
+    // sure there is room.
+    const int destLimit = destSize - 1;
     int si = 0;
     int di = 0;
-    while (si < sourceSize && di < MAX_WORD_LENGTH - 1 && 0 != source[si]) {
+    while (si < sourceSize && di < destLimit && 0 != source[si]) {
         const int codePoint = source[si++];
-        if (codePoint < 0x7F) {
+        if (codePoint < 0x7F) { // One byte
             dest[di++] = codePoint;
-        } else if (codePoint < 0x7FF) {
+        } else if (codePoint < 0x7FF) { // Two bytes
+            if (di + 1 >= destLimit) break;
             dest[di++] = 0xC0 + (codePoint >> 6);
             dest[di++] = 0x80 + (codePoint & 0x3F);
-        } else if (codePoint < 0xFFFF) {
+        } else if (codePoint < 0xFFFF) { // Three bytes
+            if (di + 2 >= destLimit) break;
             dest[di++] = 0xE0 + (codePoint >> 12);
-            dest[di++] = 0x80 + ((codePoint & 0xFC0) >> 6);
+            dest[di++] = 0x80 + ((codePoint >> 6) & 0x3F);
             dest[di++] = 0x80 + (codePoint & 0x3F);
+        } else if (codePoint <= 0x1FFFFF) { // Four bytes
+            if (di + 3 >= destLimit) break;
+            dest[di++] = 0xF0 + (codePoint >> 18);
+            dest[di++] = 0x80 + ((codePoint >> 12) & 0x3F);
+            dest[di++] = 0x80 + ((codePoint >> 6) & 0x3F);
+            dest[di++] = 0x80 + (codePoint & 0x3F);
+        } else if (codePoint <= 0x3FFFFFF) { // Five bytes
+            if (di + 4 >= destLimit) break;
+            dest[di++] = 0xF8 + (codePoint >> 24);
+            dest[di++] = 0x80 + ((codePoint >> 18) & 0x3F);
+            dest[di++] = 0x80 + ((codePoint >> 12) & 0x3F);
+            dest[di++] = 0x80 + ((codePoint >> 6) & 0x3F);
+            dest[di++] = codePoint & 0x3F;
+        } else if (codePoint <= 0x7FFFFFFF) { // Six bytes
+            if (di + 5 >= destLimit) break;
+            dest[di++] = 0xFC + (codePoint >> 30);
+            dest[di++] = 0x80 + ((codePoint >> 24) & 0x3F);
+            dest[di++] = 0x80 + ((codePoint >> 18) & 0x3F);
+            dest[di++] = 0x80 + ((codePoint >> 12) & 0x3F);
+            dest[di++] = 0x80 + ((codePoint >> 6) & 0x3F);
+            dest[di++] = codePoint & 0x3F;
+        } else {
+            // Not a code point... skip.
         }
     }
     dest[di] = 0;
     return di;
 }
 
+#if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)
+#include <android/log.h>
+#ifndef LOG_TAG
+#define LOG_TAG "LatinIME: "
+#endif // LOG_TAG
+#define AKLOGE(fmt, ...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, fmt, ##__VA_ARGS__)
+#define AKLOGI(fmt, ...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, fmt, ##__VA_ARGS__)
+
+#define DUMP_RESULT(words, frequencies) do { dumpResult(words, frequencies); } while (0)
+#define DUMP_WORD(word, length) do { dumpWord(word, length); } while (0)
+#define INTS_TO_CHARS(input, length, output, outlength) do { \
+        intArrayToCharArray(input, length, output, outlength); } while (0)
+
 static inline void dumpWordInfo(const int *word, const int length, const int rank,
         const int probability) {
     static char charBuf[50];
-    const int N = intArrayToCharArray(word, length, charBuf);
+    const int N = intArrayToCharArray(word, length, charBuf, NELEMS(charBuf));
     if (N > 1) {
         AKLOGI("%2d [ %s ] (%d)", rank, charBuf, probability);
     }
@@ -90,7 +118,7 @@ static inline void dumpResult(const int *outWords, const int *frequencies) {
 
 static AK_FORCE_INLINE void dumpWord(const int *word, const int length) {
     static char charBuf[50];
-    const int N = intArrayToCharArray(word, length, charBuf);
+    const int N = intArrayToCharArray(word, length, charBuf, NELEMS(charBuf));
     if (N > 1) {
         AKLOGI("[ %s ]", charBuf);
     }
@@ -264,37 +292,24 @@ static inline void prof_out(void) {
 // of the binary dictionary where a {key,value} string pair scheme is used.
 #define LARGEST_INT_DIGIT_COUNT 11
 
-#define NOT_VALID_WORD (-99)
 #define NOT_A_CODE_POINT (-1)
 #define NOT_A_DISTANCE (-1)
 #define NOT_A_COORDINATE (-1)
-#define MATCH_CHAR_WITHOUT_DISTANCE_INFO (-2)
-#define PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO (-3)
-#define ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO (-4)
 #define NOT_AN_INDEX (-1)
 #define NOT_A_PROBABILITY (-1)
+#define NOT_A_DICT_POS (S_INT_MIN)
 
 #define KEYCODE_SPACE ' '
 #define KEYCODE_SINGLE_QUOTE '\''
 #define KEYCODE_HYPHEN_MINUS '-'
 
-#define CALIBRATE_SCORE_BY_TOUCH_COORDINATES true
-#define SUGGEST_MULTIPLE_WORDS true
 #define SUGGEST_INTERFACE_OUTPUT_SCALE 1000000.0f
-
-#define ZERO_DISTANCE_PROMOTION_RATE 110.0f
-#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
-#define HALF_SCORE_SQUARED_RADIUS 32.0f
 #define MAX_PROBABILITY 255
 #define MAX_BIGRAM_ENCODED_PROBABILITY 15
-#define MULTIPLE_WORDS_DEMOTION_RATE 80
 
 // Assuming locale strings such as en_US, sr-Latn etc.
 #define MAX_LOCALE_STRING_LENGTH 10
 
-/* heuristic... This should be changed if we change the unit of the probability. */
-#define SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ (MAX_PROBABILITY * 58 / 100)
-
 // Max value for length, distance and probability which are used in weighting
 // TODO: Remove
 #define MAX_VALUE_FOR_WEIGHTING 10000000
@@ -306,45 +321,9 @@ static inline void prof_out(void) {
 #define MAX_POINTER_COUNT 1
 #define MAX_POINTER_COUNT_G 2
 
-// Queue IDs and size for DicNodesCache
-#define DIC_NODES_CACHE_INITIAL_QUEUE_ID_ACTIVE 0
-#define DIC_NODES_CACHE_INITIAL_QUEUE_ID_NEXT_ACTIVE 1
-#define DIC_NODES_CACHE_INITIAL_QUEUE_ID_TERMINAL 2
-#define DIC_NODES_CACHE_INITIAL_QUEUE_ID_CACHE_FOR_CONTINUOUS_SUGGESTION 3
-#define DIC_NODES_CACHE_PRIORITY_QUEUES_SIZE 4
-
-// Size, in bytes, of the bloom filter index for bigrams
-// 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k,
-// where k is the number of hash functions, n the number of bigrams, and m the number of
-// bits we can test.
-// At the moment 100 is the maximum number of bigrams for a word with the current
-// dictionaries, so n = 100. 1024 buckets give us m = 1024.
-// With 1 hash function, our false positive rate is about 9.3%, which should be enough for
-// our uses since we are only using this to increase average performance. For the record,
-// k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
-// and m = 4096 gives 2.4%.
-#define BIGRAM_FILTER_BYTE_SIZE 128
-// Must be smaller than BIGRAM_FILTER_BYTE_SIZE * 8, and preferably prime. 1021 is the largest
-// prime under 128 * 8.
-#define BIGRAM_FILTER_MODULO 1021
-#if BIGRAM_FILTER_BYTE_SIZE * 8 < BIGRAM_FILTER_MODULO
-#error "BIGRAM_FILTER_MODULO is larger than BIGRAM_FILTER_BYTE_SIZE"
-#endif
-
-// Max number of bigram maps (previous word contexts) to be cached. Increasing this number could
-// improve bigram lookup speed for multi-word suggestions, but at the cost of more memory usage.
-// Also, there are diminishing returns since the most frequently used bigrams are typically near
-// the beginning of the input and are thus the first ones to be cached. Note that these bigrams
-// are reset for each new composing word.
-#define MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP 25
-// Most common previous word contexts currently have 100 bigrams
-#define DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP 100
-
 template<typename T> AK_FORCE_INLINE const T &min(const T &a, const T &b) { return a < b ? a : b; }
 template<typename T> AK_FORCE_INLINE const T &max(const T &a, const T &b) { return a > b ? a : b; }
 
-#define NELEMS(x) (sizeof(x) / sizeof((x)[0]))
-
 // DEBUG
 #define INPUTLENGTH_FOR_DEBUG (-1)
 #define MIN_OUTPUT_INDEX_FOR_DEBUG (-1)
@@ -394,6 +373,7 @@ typedef enum {
     CT_TRANSPOSITION,
     CT_COMPLETION,
     CT_TERMINAL,
+    CT_TERMINAL_INSERTION,
     // Create new word with space omission
     CT_NEW_WORD_SPACE_OMITTION,
     // Create new word with space substitution