aboutsummaryrefslogtreecommitdiffstats
path: root/native
diff options
context:
space:
mode:
Diffstat (limited to 'native')
-rw-r--r--native/Android.mk65
-rw-r--r--native/jni/Android.mk138
-rw-r--r--native/jni/Application.mk1
-rw-r--r--native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp30
-rw-r--r--native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp121
-rw-r--r--native/jni/jni_common.cpp16
-rw-r--r--native/jni/jni_common.h8
-rw-r--r--native/jni/src/additional_proximity_chars.cpp41
-rw-r--r--native/jni/src/additional_proximity_chars.h94
-rw-r--r--native/jni/src/basechars.cpp (renamed from native/src/basechars.h)12
-rw-r--r--native/jni/src/bigram_dictionary.cpp (renamed from native/src/bigram_dictionary.cpp)30
-rw-r--r--native/jni/src/bigram_dictionary.h (renamed from native/src/bigram_dictionary.h)12
-rw-r--r--native/jni/src/binary_format.h (renamed from native/src/binary_format.h)69
-rw-r--r--native/jni/src/char_utils.cpp (renamed from native/src/char_utils.cpp)0
-rw-r--r--native/jni/src/char_utils.h65
-rw-r--r--native/jni/src/correction.cpp (renamed from native/src/correction.cpp)650
-rw-r--r--native/jni/src/correction.h (renamed from native/src/correction.h)107
-rw-r--r--native/jni/src/correction_state.h (renamed from native/src/correction_state.h)4
-rw-r--r--native/jni/src/debug.h (renamed from native/src/debug.h)6
-rw-r--r--native/jni/src/defines.h (renamed from native/src/defines.h)116
-rw-r--r--native/jni/src/dictionary.cpp (renamed from native/src/dictionary.cpp)25
-rw-r--r--native/jni/src/dictionary.h89
-rw-r--r--native/jni/src/proximity_info.cpp (renamed from native/src/proximity_info.cpp)224
-rw-r--r--native/jni/src/proximity_info.h (renamed from native/src/proximity_info.h)35
-rw-r--r--native/jni/src/terminal_attributes.h78
-rw-r--r--native/jni/src/unigram_dictionary.cpp894
-rw-r--r--native/jni/src/unigram_dictionary.h (renamed from native/src/unigram_dictionary.h)113
-rw-r--r--native/jni/src/words_priority_queue.h195
-rw-r--r--native/jni/src/words_priority_queue_pool.h90
-rw-r--r--native/src/char_utils.h26
-rw-r--r--native/src/dictionary.h174
-rw-r--r--native/src/unigram_dictionary.cpp729
32 files changed, 2781 insertions, 1476 deletions
diff --git a/native/Android.mk b/native/Android.mk
index 24c0037c9..5053e7d64 100644
--- a/native/Android.mk
+++ b/native/Android.mk
@@ -1,64 +1 @@
-LOCAL_PATH := $(call my-dir)
-include $(CLEAR_VARS)
-
-LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
-
-LOCAL_CFLAGS += -Werror -Wall
-
-# To suppress compiler warnings for unused variables/functions used for debug features etc.
-LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function
-
-LOCAL_SRC_FILES := \
- jni/com_android_inputmethod_keyboard_ProximityInfo.cpp \
- jni/com_android_inputmethod_latin_BinaryDictionary.cpp \
- jni/jni_common.cpp \
- src/bigram_dictionary.cpp \
- src/char_utils.cpp \
- src/correction.cpp \
- src/dictionary.cpp \
- src/proximity_info.cpp \
- src/unigram_dictionary.cpp
-
-#FLAG_DBG := true
-#FLAG_DO_PROFILE := true
-
-TARGETING_UNBUNDLED_FROYO := true
-
-ifeq ($(TARGET_ARCH), x86)
- TARGETING_UNBUNDLED_FROYO := false
-endif
-
-ifeq ($(TARGET_ARCH),mips)
- TARGETING_UNBUNDLED_FROYO := false
-endif
-
-ifeq ($(FLAG_DBG), true)
- TARGETING_UNBUNDLED_FROYO := false
-endif
-
-ifeq ($(FLAG_DO_PROFILE), true)
- TARGETING_UNBUNDLED_FROYO := false
-endif
-
-ifeq ($(TARGETING_UNBUNDLED_FROYO), true)
- LOCAL_NDK_VERSION := 4
- LOCAL_SDK_VERSION := 8
-endif
-
-LOCAL_MODULE := libjni_latinime
-
-LOCAL_MODULE_TAGS := optional
-
-ifeq ($(FLAG_DO_PROFILE), true)
- $(warning Making profiling version of native library)
- LOCAL_CFLAGS += -DFLAG_DO_PROFILE
- LOCAL_SHARED_LIBRARIES := libcutils libutils
-else # FLAG_DO_PROFILE
-ifeq ($(FLAG_DBG), true)
- $(warning Making debug version of native library)
- LOCAL_CFLAGS += -DFLAG_DBG
- LOCAL_SHARED_LIBRARIES := libcutils libutils
-endif # FLAG_DBG
-endif # FLAG_DO_PROFILE
-
-include $(BUILD_SHARED_LIBRARY)
+include $(call all-subdir-makefiles)
diff --git a/native/jni/Android.mk b/native/jni/Android.mk
new file mode 100644
index 000000000..5e0d3518d
--- /dev/null
+++ b/native/jni/Android.mk
@@ -0,0 +1,138 @@
+# Copyright (C) 2011 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+LOCAL_PATH := $(call my-dir)
+
+############ some local flags
+# If you change any of those flags, you need to rebuild both libjni_latinime_static
+# and the shared library.
+#FLAG_DBG := true
+#FLAG_DO_PROFILE := true
+
+TARGETING_UNBUNDLED_FROYO := true
+
+ifeq ($(TARGET_ARCH), x86)
+ TARGETING_UNBUNDLED_FROYO := false
+endif
+
+ifeq ($(TARGET_ARCH), mips)
+ TARGETING_UNBUNDLED_FROYO := false
+endif
+
+ifeq ($(FLAG_DBG), true)
+ TARGETING_UNBUNDLED_FROYO := false
+endif
+
+ifeq ($(FLAG_DO_PROFILE), true)
+ TARGETING_UNBUNDLED_FROYO := false
+endif
+
+######################################
+include $(CLEAR_VARS)
+
+LATIN_IME_SRC_DIR := src
+
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_SRC_DIR)
+
+LOCAL_CFLAGS += -Werror -Wall
+
+# To suppress compiler warnings for unused variables/functions used for debug features etc.
+LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function
+
+LATIN_IME_JNI_SRC_FILES := \
+ com_android_inputmethod_keyboard_ProximityInfo.cpp \
+ com_android_inputmethod_latin_BinaryDictionary.cpp \
+ jni_common.cpp
+
+LATIN_IME_CORE_SRC_FILES := \
+ additional_proximity_chars.cpp \
+ basechars.cpp \
+ bigram_dictionary.cpp \
+ char_utils.cpp \
+ correction.cpp \
+ dictionary.cpp \
+ proximity_info.cpp \
+ unigram_dictionary.cpp
+
+LOCAL_SRC_FILES := \
+ $(LATIN_IME_JNI_SRC_FILES) \
+ $(addprefix $(LATIN_IME_SRC_DIR)/,$(LATIN_IME_CORE_SRC_FILES))
+
+ifeq ($(TARGETING_UNBUNDLED_FROYO), true)
+ LOCAL_NDK_VERSION := 4
+ LOCAL_SDK_VERSION := 8
+endif
+
+ifeq ($(FLAG_DO_PROFILE), true)
+ $(warning Making profiling version of native library)
+ LOCAL_CFLAGS += -DFLAG_DO_PROFILE
+else # FLAG_DO_PROFILE
+ifeq ($(FLAG_DBG), true)
+ $(warning Making debug version of native library)
+ LOCAL_CFLAGS += -DFLAG_DBG
+endif # FLAG_DBG
+endif # FLAG_DO_PROFILE
+
+LOCAL_MODULE := libjni_latinime_static
+LOCAL_MODULE_TAGS := optional
+
+ifdef HISTORICAL_NDK_VERSIONS_ROOT # In the platform build system
+include external/stlport/libstlport.mk
+else # In the NDK build system
+LOCAL_C_INCLUDES += external/stlport/stlport bionic
+endif
+
+include $(BUILD_STATIC_LIBRARY)
+
+######################################
+include $(CLEAR_VARS)
+
+# All code in LOCAL_WHOLE_STATIC_LIBRARIES will be built into this shared library.
+LOCAL_WHOLE_STATIC_LIBRARIES := libjni_latinime_static
+
+ifdef HISTORICAL_NDK_VERSIONS_ROOT # In the platform build system
+LOCAL_SHARED_LIBRARIES := libstlport
+else # In the NDK build system
+LOCAL_SHARED_LIBRARIES := libstlport_static
+endif
+
+ifeq ($(FLAG_DO_PROFILE), true)
+ $(warning Making profiling version of native library)
+ LOCAL_SHARED_LIBRARIES += libcutils libutils
+else # FLAG_DO_PROFILE
+ifeq ($(FLAG_DBG), true)
+ $(warning Making debug version of native library)
+ LOCAL_SHARED_LIBRARIES += libcutils libutils
+endif # FLAG_DBG
+endif # FLAG_DO_PROFILE
+
+ifeq ($(TARGETING_UNBUNDLED_FROYO), true)
+ LOCAL_NDK_VERSION := 4
+ LOCAL_SDK_VERSION := 8
+endif
+
+LOCAL_MODULE := libjni_latinime
+LOCAL_MODULE_TAGS := optional
+
+ifdef HISTORICAL_NDK_VERSIONS_ROOT # In the platform build system
+include external/stlport/libstlport.mk
+endif
+
+include $(BUILD_SHARED_LIBRARY)
+
+#################### Clean up the tmp vars
+LATIN_IME_CORE_SRC_FILES :=
+LATIN_IME_JNI_SRC_FILES :=
+LATIN_IME_SRC_DIR :=
+TARGETING_UNBUNDLED_FROYO :=
diff --git a/native/jni/Application.mk b/native/jni/Application.mk
new file mode 100644
index 000000000..caf3b2622
--- /dev/null
+++ b/native/jni/Application.mk
@@ -0,0 +1 @@
+APP_STL := stlport_static
diff --git a/native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp b/native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp
index 595ea2fdc..9eb437c06 100644
--- a/native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp
+++ b/native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp
@@ -25,17 +25,20 @@
#include <assert.h>
#include <errno.h>
#include <stdio.h>
+#include <string>
namespace latinime {
-static jint latinime_Keyboard_setProximityInfo(JNIEnv *env, jobject object,
- jint maxProximityCharsSize, jint displayWidth, jint displayHeight, jint gridWidth,
- jint gridHeight, jintArray proximityCharsArray, jint keyCount,
- jintArray keyXCoordinateArray, jintArray keyYCoordinateArray, jintArray keyWidthArray,
- jintArray keyHeightArray, jintArray keyCharCodeArray,
+static jlong latinime_Keyboard_setProximityInfo(JNIEnv *env, jobject object,
+ jstring localejStr, jint maxProximityCharsSize, jint displayWidth, jint displayHeight,
+ jint gridWidth, jint gridHeight, jint mostCommonkeyWidth, jintArray proximityCharsArray,
+ jint keyCount, jintArray keyXCoordinateArray, jintArray keyYCoordinateArray,
+ jintArray keyWidthArray, jintArray keyHeightArray, jintArray keyCharCodeArray,
jfloatArray sweetSpotCenterXArray, jfloatArray sweetSpotCenterYArray,
jfloatArray sweetSpotRadiusArray) {
- jint *proximityChars = env->GetIntArrayElements(proximityCharsArray, NULL);
+ const char *localeStrPtr = env->GetStringUTFChars(localejStr, 0);
+ const std::string localeStr(localeStrPtr);
+ jint *proximityChars = env->GetIntArrayElements(proximityCharsArray, 0);
jint *keyXCoordinates = safeGetIntArrayElements(env, keyXCoordinateArray);
jint *keyYCoordinates = safeGetIntArrayElements(env, keyYCoordinateArray);
jint *keyWidths = safeGetIntArrayElements(env, keyWidthArray);
@@ -44,8 +47,10 @@ static jint latinime_Keyboard_setProximityInfo(JNIEnv *env, jobject object,
jfloat *sweetSpotCenterXs = safeGetFloatArrayElements(env, sweetSpotCenterXArray);
jfloat *sweetSpotCenterYs = safeGetFloatArrayElements(env, sweetSpotCenterYArray);
jfloat *sweetSpotRadii = safeGetFloatArrayElements(env, sweetSpotRadiusArray);
- ProximityInfo *proximityInfo = new ProximityInfo(maxProximityCharsSize, displayWidth,
- displayHeight, gridWidth, gridHeight, (const uint32_t*)proximityChars,
+ ProximityInfo *proximityInfo = new ProximityInfo(
+ localeStr, maxProximityCharsSize, displayWidth,
+ displayHeight, gridWidth, gridHeight, mostCommonkeyWidth,
+ (const int32_t*)proximityChars,
keyCount, (const int32_t*)keyXCoordinates, (const int32_t*)keyYCoordinates,
(const int32_t*)keyWidths, (const int32_t*)keyHeights, (const int32_t*)keyCharCodes,
(const float*)sweetSpotCenterXs, (const float*)sweetSpotCenterYs,
@@ -59,19 +64,20 @@ static jint latinime_Keyboard_setProximityInfo(JNIEnv *env, jobject object,
safeReleaseIntArrayElements(env, keyYCoordinateArray, keyYCoordinates);
safeReleaseIntArrayElements(env, keyXCoordinateArray, keyXCoordinates);
env->ReleaseIntArrayElements(proximityCharsArray, proximityChars, 0);
- return (jint)proximityInfo;
+ env->ReleaseStringUTFChars(localejStr, localeStrPtr);
+ return (jlong)proximityInfo;
}
-static void latinime_Keyboard_release(JNIEnv *env, jobject object, jint proximityInfo) {
+static void latinime_Keyboard_release(JNIEnv *env, jobject object, jlong proximityInfo) {
ProximityInfo *pi = (ProximityInfo*)proximityInfo;
if (!pi) return;
delete pi;
}
static JNINativeMethod sKeyboardMethods[] = {
- {"setProximityInfoNative", "(IIIII[II[I[I[I[I[I[F[F[F)I",
+ {"setProximityInfoNative", "(Ljava/lang/String;IIIIII[II[I[I[I[I[I[F[F[F)J",
(void*)latinime_Keyboard_setProximityInfo},
- {"releaseProximityInfoNative", "(I)V", (void*)latinime_Keyboard_release}
+ {"releaseProximityInfoNative", "(J)V", (void*)latinime_Keyboard_release}
};
int register_ProximityInfo(JNIEnv *env) {
diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
index 18c972444..20e44c2b0 100644
--- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
+++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
@@ -18,6 +18,7 @@
#define LOG_TAG "LatinIME: jni: BinaryDictionary"
#include "binary_format.h"
+#include "correction.h"
#include "com_android_inputmethod_latin_BinaryDictionary.h"
#include "dictionary.h"
#include "jni.h"
@@ -33,6 +34,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
+#include <unistd.h>
#else // USE_MMAP_FOR_DICTIONARY
#include <stdlib.h>
#endif // USE_MMAP_FOR_DICTIONARY
@@ -41,75 +43,74 @@ namespace latinime {
void releaseDictBuf(void* dictBuf, const size_t length, int fd);
-static jint latinime_BinaryDictionary_open(JNIEnv *env, jobject object,
+static jlong latinime_BinaryDictionary_open(JNIEnv *env, jobject object,
jstring sourceDir, jlong dictOffset, jlong dictSize,
- jint typedLetterMultiplier, jint fullWordMultiplier, jint maxWordLength, jint maxWords,
- jint maxAlternatives) {
+ jint typedLetterMultiplier, jint fullWordMultiplier, jint maxWordLength, jint maxWords) {
PROF_OPEN;
PROF_START(66);
- const char *sourceDirChars = env->GetStringUTFChars(sourceDir, NULL);
- if (sourceDirChars == NULL) {
- LOGE("DICT: Can't get sourceDir string");
+ const char *sourceDirChars = env->GetStringUTFChars(sourceDir, 0);
+ if (sourceDirChars == 0) {
+ AKLOGE("DICT: Can't get sourceDir string");
return 0;
}
int fd = 0;
- void *dictBuf = NULL;
+ void *dictBuf = 0;
int adjust = 0;
#ifdef USE_MMAP_FOR_DICTIONARY
/* mmap version */
fd = open(sourceDirChars, O_RDONLY);
if (fd < 0) {
- LOGE("DICT: Can't open sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno);
+ AKLOGE("DICT: Can't open sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno);
return 0;
}
int pagesize = getpagesize();
adjust = dictOffset % pagesize;
int adjDictOffset = dictOffset - adjust;
int adjDictSize = dictSize + adjust;
- dictBuf = mmap(NULL, sizeof(char) * adjDictSize, PROT_READ, MAP_PRIVATE, fd, adjDictOffset);
+ dictBuf = mmap(0, sizeof(char) * adjDictSize, PROT_READ, MAP_PRIVATE, fd, adjDictOffset);
if (dictBuf == MAP_FAILED) {
- LOGE("DICT: Can't mmap dictionary. errno=%d", errno);
+ AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno);
return 0;
}
dictBuf = (void *)((char *)dictBuf + adjust);
#else // USE_MMAP_FOR_DICTIONARY
/* malloc version */
- FILE *file = NULL;
+ FILE *file = 0;
file = fopen(sourceDirChars, "rb");
- if (file == NULL) {
- LOGE("DICT: Can't fopen sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno);
+ if (file == 0) {
+ AKLOGE("DICT: Can't fopen sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno);
return 0;
}
dictBuf = malloc(sizeof(char) * dictSize);
if (!dictBuf) {
- LOGE("DICT: Can't allocate memory region for dictionary. errno=%d", errno);
+ AKLOGE("DICT: Can't allocate memory region for dictionary. errno=%d", errno);
return 0;
}
int ret = fseek(file, (long)dictOffset, SEEK_SET);
if (ret != 0) {
- LOGE("DICT: Failure in fseek. ret=%d errno=%d", ret, errno);
+ AKLOGE("DICT: Failure in fseek. ret=%d errno=%d", ret, errno);
return 0;
}
ret = fread(dictBuf, sizeof(char) * dictSize, 1, file);
if (ret != 1) {
- LOGE("DICT: Failure in fread. ret=%d errno=%d", ret, errno);
+ AKLOGE("DICT: Failure in fread. ret=%d errno=%d", ret, errno);
return 0;
}
ret = fclose(file);
if (ret != 0) {
- LOGE("DICT: Failure in fclose. ret=%d errno=%d", ret, errno);
+ AKLOGE("DICT: Failure in fclose. ret=%d errno=%d", ret, errno);
return 0;
}
#endif // USE_MMAP_FOR_DICTIONARY
env->ReleaseStringUTFChars(sourceDir, sourceDirChars);
if (!dictBuf) {
- LOGE("DICT: dictBuf is null");
+ AKLOGE("DICT: dictBuf is null");
return 0;
}
- Dictionary *dictionary = NULL;
+ Dictionary *dictionary = 0;
if (BinaryFormat::UNKNOWN_FORMAT == BinaryFormat::detectFormat((uint8_t*)dictBuf)) {
- LOGE("DICT: dictionary format is unknown, bad magic number");
+ AKLOGE("DICT: dictionary format is unknown, bad magic number");
#ifdef USE_MMAP_FOR_DICTIONARY
releaseDictBuf(((char*)dictBuf) - adjust, adjDictSize, fd);
#else // USE_MMAP_FOR_DICTIONARY
@@ -117,27 +118,27 @@ static jint latinime_BinaryDictionary_open(JNIEnv *env, jobject object,
#endif // USE_MMAP_FOR_DICTIONARY
} else {
dictionary = new Dictionary(dictBuf, dictSize, fd, adjust, typedLetterMultiplier,
- fullWordMultiplier, maxWordLength, maxWords, maxAlternatives);
+ fullWordMultiplier, maxWordLength, maxWords);
}
PROF_END(66);
PROF_CLOSE;
- return (jint)dictionary;
+ return (jlong)dictionary;
}
-static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jobject object, jint dict,
- jint proximityInfo, jintArray xCoordinatesArray, jintArray yCoordinatesArray,
+static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jobject object, jlong dict,
+ jlong proximityInfo, jintArray xCoordinatesArray, jintArray yCoordinatesArray,
jintArray inputArray, jint arraySize, jint flags,
jcharArray outputArray, jintArray frequencyArray) {
Dictionary *dictionary = (Dictionary*)dict;
if (!dictionary) return 0;
ProximityInfo *pInfo = (ProximityInfo*)proximityInfo;
- int *xCoordinates = env->GetIntArrayElements(xCoordinatesArray, NULL);
- int *yCoordinates = env->GetIntArrayElements(yCoordinatesArray, NULL);
+ int *xCoordinates = env->GetIntArrayElements(xCoordinatesArray, 0);
+ int *yCoordinates = env->GetIntArrayElements(yCoordinatesArray, 0);
- int *frequencies = env->GetIntArrayElements(frequencyArray, NULL);
- int *inputCodes = env->GetIntArrayElements(inputArray, NULL);
- jchar *outputChars = env->GetCharArrayElements(outputArray, NULL);
+ int *frequencies = env->GetIntArrayElements(frequencyArray, 0);
+ int *inputCodes = env->GetIntArrayElements(inputArray, 0);
+ jchar *outputChars = env->GetCharArrayElements(outputArray, 0);
int count = dictionary->getSuggestions(pInfo, xCoordinates, yCoordinates, inputCodes,
arraySize, flags, (unsigned short*) outputChars, frequencies);
@@ -151,21 +152,19 @@ static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jobject object,
return count;
}
-static int latinime_BinaryDictionary_getBigrams(JNIEnv *env, jobject object, jint dict,
+static int latinime_BinaryDictionary_getBigrams(JNIEnv *env, jobject object, jlong dict,
jcharArray prevWordArray, jint prevWordLength, jintArray inputArray, jint inputArraySize,
- jcharArray outputArray, jintArray frequencyArray, jint maxWordLength, jint maxBigrams,
- jint maxAlternatives) {
+ jcharArray outputArray, jintArray frequencyArray, jint maxWordLength, jint maxBigrams) {
Dictionary *dictionary = (Dictionary*)dict;
if (!dictionary) return 0;
- jchar *prevWord = env->GetCharArrayElements(prevWordArray, NULL);
- int *inputCodes = env->GetIntArrayElements(inputArray, NULL);
- jchar *outputChars = env->GetCharArrayElements(outputArray, NULL);
- int *frequencies = env->GetIntArrayElements(frequencyArray, NULL);
+ jchar *prevWord = env->GetCharArrayElements(prevWordArray, 0);
+ int *inputCodes = env->GetIntArrayElements(inputArray, 0);
+ jchar *outputChars = env->GetCharArrayElements(outputArray, 0);
+ int *frequencies = env->GetIntArrayElements(frequencyArray, 0);
int count = dictionary->getBigrams((unsigned short*) prevWord, prevWordLength, inputCodes,
- inputArraySize, (unsigned short*) outputChars, frequencies, maxWordLength, maxBigrams,
- maxAlternatives);
+ inputArraySize, (unsigned short*) outputChars, frequencies, maxWordLength, maxBigrams);
env->ReleaseCharArrayElements(prevWordArray, prevWord, JNI_ABORT);
env->ReleaseIntArrayElements(inputArray, inputCodes, JNI_ABORT);
@@ -175,19 +174,42 @@ static int latinime_BinaryDictionary_getBigrams(JNIEnv *env, jobject object, jin
return count;
}
-static jboolean latinime_BinaryDictionary_isValidWord(JNIEnv *env, jobject object, jint dict,
+static jboolean latinime_BinaryDictionary_isValidWord(JNIEnv *env, jobject object, jlong dict,
jcharArray wordArray, jint wordLength) {
Dictionary *dictionary = (Dictionary*)dict;
if (!dictionary) return (jboolean) false;
- jchar *word = env->GetCharArrayElements(wordArray, NULL);
+ jchar *word = env->GetCharArrayElements(wordArray, 0);
jboolean result = dictionary->isValidWord((unsigned short*) word, wordLength);
env->ReleaseCharArrayElements(wordArray, word, JNI_ABORT);
return result;
}
-static void latinime_BinaryDictionary_close(JNIEnv *env, jobject object, jint dict) {
+static jdouble latinime_BinaryDictionary_calcNormalizedScore(JNIEnv *env, jobject object,
+ jcharArray before, jint beforeLength, jcharArray after, jint afterLength, jint score) {
+ jchar *beforeChars = env->GetCharArrayElements(before, 0);
+ jchar *afterChars = env->GetCharArrayElements(after, 0);
+ jdouble result = Correction::RankingAlgorithm::calcNormalizedScore(
+ (unsigned short*)beforeChars, beforeLength, (unsigned short*)afterChars, afterLength,
+ score);
+ env->ReleaseCharArrayElements(before, beforeChars, JNI_ABORT);
+ env->ReleaseCharArrayElements(after, afterChars, JNI_ABORT);
+ return result;
+}
+
+static jint latinime_BinaryDictionary_editDistance(JNIEnv *env, jobject object,
+ jcharArray before, jint beforeLength, jcharArray after, jint afterLength) {
+ jchar *beforeChars = env->GetCharArrayElements(before, 0);
+ jchar *afterChars = env->GetCharArrayElements(after, 0);
+ jint result = Correction::RankingAlgorithm::editDistance(
+ (unsigned short*)beforeChars, beforeLength, (unsigned short*)afterChars, afterLength);
+ env->ReleaseCharArrayElements(before, beforeChars, JNI_ABORT);
+ env->ReleaseCharArrayElements(after, afterChars, JNI_ABORT);
+ return result;
+}
+
+static void latinime_BinaryDictionary_close(JNIEnv *env, jobject object, jlong dict) {
Dictionary *dictionary = (Dictionary*)dict;
if (!dictionary) return;
void *dictBuf = dictionary->getDict();
@@ -205,11 +227,11 @@ void releaseDictBuf(void* dictBuf, const size_t length, int fd) {
#ifdef USE_MMAP_FOR_DICTIONARY
int ret = munmap(dictBuf, length);
if (ret != 0) {
- LOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno);
+ AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno);
}
ret = close(fd);
if (ret != 0) {
- LOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno);
+ AKLOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno);
}
#else // USE_MMAP_FOR_DICTIONARY
free(dictBuf);
@@ -217,11 +239,14 @@ void releaseDictBuf(void* dictBuf, const size_t length, int fd) {
}
static JNINativeMethod sMethods[] = {
- {"openNative", "(Ljava/lang/String;JJIIIII)I", (void*)latinime_BinaryDictionary_open},
- {"closeNative", "(I)V", (void*)latinime_BinaryDictionary_close},
- {"getSuggestionsNative", "(II[I[I[III[C[I)I", (void*)latinime_BinaryDictionary_getSuggestions},
- {"isValidWordNative", "(I[CI)Z", (void*)latinime_BinaryDictionary_isValidWord},
- {"getBigramsNative", "(I[CI[II[C[IIII)I", (void*)latinime_BinaryDictionary_getBigrams}
+ {"openNative", "(Ljava/lang/String;JJIIII)J", (void*)latinime_BinaryDictionary_open},
+ {"closeNative", "(J)V", (void*)latinime_BinaryDictionary_close},
+ {"getSuggestionsNative", "(JJ[I[I[III[C[I)I", (void*)latinime_BinaryDictionary_getSuggestions},
+ {"isValidWordNative", "(J[CI)Z", (void*)latinime_BinaryDictionary_isValidWord},
+ {"getBigramsNative", "(J[CI[II[C[III)I", (void*)latinime_BinaryDictionary_getBigrams},
+ {"calcNormalizedScoreNative", "([CI[CII)D",
+ (void*)latinime_BinaryDictionary_calcNormalizedScore},
+ {"editDistanceNative", "([CI[CI)I", (void*)latinime_BinaryDictionary_editDistance}
};
int register_BinaryDictionary(JNIEnv *env) {
diff --git a/native/jni/jni_common.cpp b/native/jni/jni_common.cpp
index 8643f723f..85d26836a 100644
--- a/native/jni/jni_common.cpp
+++ b/native/jni/jni_common.cpp
@@ -32,22 +32,22 @@ using namespace latinime;
* Returns the JNI version on success, -1 on failure.
*/
jint JNI_OnLoad(JavaVM* vm, void* reserved) {
- JNIEnv* env = NULL;
+ JNIEnv* env = 0;
jint result = -1;
if (vm->GetEnv((void**) &env, JNI_VERSION_1_4) != JNI_OK) {
- LOGE("ERROR: GetEnv failed");
+ AKLOGE("ERROR: GetEnv failed");
goto bail;
}
- assert(env != NULL);
+ assert(env != 0);
if (!register_BinaryDictionary(env)) {
- LOGE("ERROR: BinaryDictionary native registration failed");
+ AKLOGE("ERROR: BinaryDictionary native registration failed");
goto bail;
}
if (!register_ProximityInfo(env)) {
- LOGE("ERROR: ProximityInfo native registration failed");
+ AKLOGE("ERROR: ProximityInfo native registration failed");
goto bail;
}
@@ -63,12 +63,12 @@ namespace latinime {
int registerNativeMethods(JNIEnv* env, const char* className, JNINativeMethod* methods,
int numMethods) {
jclass clazz = env->FindClass(className);
- if (clazz == NULL) {
- LOGE("Native registration unable to find class '%s'", className);
+ if (clazz == 0) {
+ AKLOGE("Native registration unable to find class '%s'", className);
return JNI_FALSE;
}
if (env->RegisterNatives(clazz, methods, numMethods) < 0) {
- LOGE("RegisterNatives failed for '%s'", className);
+ AKLOGE("RegisterNatives failed for '%s'", className);
env->DeleteLocalRef(clazz);
return JNI_FALSE;
}
diff --git a/native/jni/jni_common.h b/native/jni/jni_common.h
index 9548e1b3f..6741443ac 100644
--- a/native/jni/jni_common.h
+++ b/native/jni/jni_common.h
@@ -29,17 +29,17 @@ int registerNativeMethods(JNIEnv *env, const char *className, JNINativeMethod *m
inline jint *safeGetIntArrayElements(JNIEnv *env, jintArray jArray) {
if (jArray) {
- return env->GetIntArrayElements(jArray, NULL);
+ return env->GetIntArrayElements(jArray, 0);
} else {
- return NULL;
+ return 0;
}
}
inline jfloat *safeGetFloatArrayElements(JNIEnv *env, jfloatArray jArray) {
if (jArray) {
- return env->GetFloatArrayElements(jArray, NULL);
+ return env->GetFloatArrayElements(jArray, 0);
} else {
- return NULL;
+ return 0;
}
}
diff --git a/native/jni/src/additional_proximity_chars.cpp b/native/jni/src/additional_proximity_chars.cpp
new file mode 100644
index 000000000..224f020f2
--- /dev/null
+++ b/native/jni/src/additional_proximity_chars.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "additional_proximity_chars.h"
+
+namespace latinime {
+const std::string AdditionalProximityChars::LOCALE_EN_US("en");
+
+const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_A[EN_US_ADDITIONAL_A_SIZE] = {
+ 'e', 'i', 'o', 'u'
+};
+
+const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_E[EN_US_ADDITIONAL_E_SIZE] = {
+ 'a', 'i', 'o', 'u'
+};
+
+const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_I[EN_US_ADDITIONAL_I_SIZE] = {
+ 'a', 'e', 'o', 'u'
+};
+
+const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_O[EN_US_ADDITIONAL_O_SIZE] = {
+ 'a', 'e', 'i', 'u'
+};
+
+const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_U[EN_US_ADDITIONAL_U_SIZE] = {
+ 'a', 'e', 'i', 'o'
+};
+}
diff --git a/native/jni/src/additional_proximity_chars.h b/native/jni/src/additional_proximity_chars.h
new file mode 100644
index 000000000..e0ecc0e1d
--- /dev/null
+++ b/native/jni/src/additional_proximity_chars.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_ADDITIONAL_PROXIMITY_CHARS_H
+#define LATINIME_ADDITIONAL_PROXIMITY_CHARS_H
+
+#include <stdint.h>
+#include <string>
+
+#include "defines.h"
+
+namespace latinime {
+
+class AdditionalProximityChars {
+ private:
+ static const std::string LOCALE_EN_US;
+ static const int EN_US_ADDITIONAL_A_SIZE = 4;
+ static const int32_t EN_US_ADDITIONAL_A[];
+ static const int EN_US_ADDITIONAL_E_SIZE = 4;
+ static const int32_t EN_US_ADDITIONAL_E[];
+ static const int EN_US_ADDITIONAL_I_SIZE = 4;
+ static const int32_t EN_US_ADDITIONAL_I[];
+ static const int EN_US_ADDITIONAL_O_SIZE = 4;
+ static const int32_t EN_US_ADDITIONAL_O[];
+ static const int EN_US_ADDITIONAL_U_SIZE = 4;
+ static const int32_t EN_US_ADDITIONAL_U[];
+
+ static bool isEnLocale(const std::string *locale_str) {
+ return locale_str && locale_str->size() >= LOCALE_EN_US.size()
+ && LOCALE_EN_US.compare(0, LOCALE_EN_US.size(), *locale_str);
+ }
+
+ public:
+ static int getAdditionalCharsSize(const std::string* locale_str, const int32_t c) {
+ if (!isEnLocale(locale_str)) {
+ return 0;
+ }
+ switch(c) {
+ case 'a':
+ return EN_US_ADDITIONAL_A_SIZE;
+ case 'e':
+ return EN_US_ADDITIONAL_E_SIZE;
+ case 'i':
+ return EN_US_ADDITIONAL_I_SIZE;
+ case 'o':
+ return EN_US_ADDITIONAL_O_SIZE;
+ case 'u':
+ return EN_US_ADDITIONAL_U_SIZE;
+ default:
+ return 0;
+ }
+ }
+
+ static const int32_t* getAdditionalChars(const std::string *locale_str, const int32_t c) {
+ if (!isEnLocale(locale_str)) {
+ return 0;
+ }
+ switch(c) {
+ case 'a':
+ return EN_US_ADDITIONAL_A;
+ case 'e':
+ return EN_US_ADDITIONAL_E;
+ case 'i':
+ return EN_US_ADDITIONAL_I;
+ case 'o':
+ return EN_US_ADDITIONAL_O;
+ case 'u':
+ return EN_US_ADDITIONAL_U;
+ default:
+ return 0;
+ }
+ }
+
+ static bool hasAdditionalChars(const std::string *locale_str, const int32_t c) {
+ return getAdditionalCharsSize(locale_str, c) > 0;
+ }
+};
+
+}
+
+#endif // LATINIME_ADDITIONAL_PROXIMITY_CHARS_H
diff --git a/native/src/basechars.h b/native/jni/src/basechars.cpp
index 3843e11c5..31f1e18a8 100644
--- a/native/src/basechars.h
+++ b/native/jni/src/basechars.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009 The Android Open Source Project
+ * Copyright (C) 2011 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,8 +14,9 @@
* limitations under the License.
*/
-#ifndef LATINIME_BASECHARS_H
-#define LATINIME_BASECHARS_H
+#include "char_utils.h"
+
+namespace latinime {
/**
* Table mapping most combined Latin, Greek, and Cyrillic characters
@@ -23,7 +24,7 @@
* if c is not a combined character, or the base character if it
* is combined.
*/
-static unsigned short BASE_CHARS[] = {
+const unsigned short BASE_CHARS[BASE_CHARS_SIZE] = {
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
@@ -189,4 +190,5 @@ static unsigned short BASE_CHARS[] = {
// generated with:
// cat UnicodeData.txt | perl -e 'while (<>) { @foo = split(/;/); $foo[5] =~ s/<.*> //; $base[hex($foo[0])] = hex($foo[5]);} for ($i = 0; $i < 0x500; $i += 8) { for ($j = $i; $j < $i + 8; $j++) { printf("0x%04x, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }'
-#endif // LATINIME_BASECHARS_H
+
+} // namespace latinime
diff --git a/native/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp
index c340c6c1a..f7a3d3e60 100644
--- a/native/src/bigram_dictionary.cpp
+++ b/native/jni/src/bigram_dictionary.cpp
@@ -26,14 +26,14 @@
namespace latinime {
BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
- int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram,
+ const bool isLatestDictVersion, const bool hasBigram,
Dictionary *parentDictionary)
- : DICT(dict + NEW_DICTIONARY_HEADER_SIZE), MAX_WORD_LENGTH(maxWordLength),
- MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
+ : DICT(dict), MAX_WORD_LENGTH(maxWordLength),
+ IS_LATEST_DICT_VERSION(isLatestDictVersion),
HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
if (DEBUG_DICT) {
- LOGI("BigramDictionary - constructor");
- LOGI("Has Bigram : %d", hasBigram);
+ AKLOGI("BigramDictionary - constructor");
+ AKLOGI("Has Bigram : %d", hasBigram);
}
}
@@ -46,7 +46,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
#ifdef FLAG_DBG
char s[length + 1];
for (int i = 0; i <= length; i++) s[i] = word[i];
- LOGI("Bigram: Found word = %s, freq = %d :", s, frequency);
+ AKLOGI("Bigram: Found word = %s, freq = %d :", s, frequency);
#endif
}
@@ -60,7 +60,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
insertAt++;
}
if (DEBUG_DICT) {
- LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams);
+ AKLOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams);
}
if (insertAt < mMaxBigrams) {
memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
@@ -76,7 +76,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
}
*dest = 0; // NULL terminate
if (DEBUG_DICT) {
- LOGI("Bigram: Added word at %d", insertAt);
+ AKLOGI("Bigram: Added word at %d", insertAt);
}
return true;
}
@@ -92,7 +92,6 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
* bigramFreq: an array to output frequencies.
* maxWordLength: the maximum size of a word.
* maxBigrams: the maximum number of bigrams fitting in the bigramChars array.
- * maxAlteratives: unused.
* This method returns the number of bigrams this word has, for backward compatibility.
* Note: this is not the number of bigrams output in the array, which is the number of
* bigrams this word has WHOSE first letter also matches the letter the user typed.
@@ -103,7 +102,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
*/
int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes,
int codesSize, unsigned short *bigramChars, int *bigramFreq, int maxWordLength,
- int maxBigrams, int maxAlternatives) {
+ int maxBigrams) {
// TODO: remove unused arguments, and refrain from storing stuff in members of this class
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name
mBigramFreq = bigramFreq;
@@ -134,11 +133,13 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i
const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH,
bigramBuffer);
- if (checkFirstCharacter(bigramBuffer)) {
+ // codesSize == 0 means we are trying to find bigram predictions.
+ if (codesSize < 1 || checkFirstCharacter(bigramBuffer)) {
const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
- addWordBigram(bigramBuffer, length, frequency);
+ if (addWordBigram(bigramBuffer, length, frequency)) {
+ ++bigramCount;
+ }
}
- ++bigramCount;
} while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
return bigramCount;
}
@@ -149,8 +150,9 @@ bool BigramDictionary::checkFirstCharacter(unsigned short *word) {
int *inputCodes = mInputCodes;
int maxAlt = MAX_ALTERNATIVES;
+ const unsigned short firstBaseChar = toBaseLowerCase(*word);
while (maxAlt > 0) {
- if ((unsigned int) *inputCodes == (unsigned int) *word) {
+ if (toBaseLowerCase(*inputCodes) == firstBaseChar) {
return true;
}
inputCodes++;
diff --git a/native/src/bigram_dictionary.h b/native/jni/src/bigram_dictionary.h
index c07458a38..8132fbc59 100644
--- a/native/src/bigram_dictionary.h
+++ b/native/jni/src/bigram_dictionary.h
@@ -21,14 +21,13 @@ namespace latinime {
class Dictionary;
class BigramDictionary {
-public:
- BigramDictionary(const unsigned char *dict, int maxWordLength, int maxAlternatives,
+ public:
+ BigramDictionary(const unsigned char *dict, int maxWordLength,
const bool isLatestDictVersion, const bool hasBigram, Dictionary *parentDictionary);
int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
- unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
- int maxAlternatives);
+ unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams);
~BigramDictionary();
-private:
+ private:
bool addWordBigram(unsigned short *word, int length, int frequency);
int getBigramAddress(int *pos, bool advance);
int getBigramFreq(int *pos);
@@ -39,7 +38,8 @@ private:
const unsigned char *DICT;
const int MAX_WORD_LENGTH;
- const int MAX_ALTERNATIVES;
+ // TODO: Re-implement proximity correction for bigram correction
+ static const int MAX_ALTERNATIVES = 1;
const bool IS_LATEST_DICT_VERSION;
const bool HAS_BIGRAM;
diff --git a/native/src/binary_format.h b/native/jni/src/binary_format.h
index 6f65088db..ab033ad90 100644
--- a/native/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -17,22 +17,31 @@
#ifndef LATINIME_BINARY_FORMAT_H
#define LATINIME_BINARY_FORMAT_H
+#include <limits>
#include "unigram_dictionary.h"
namespace latinime {
class BinaryFormat {
-private:
+ private:
const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
-public:
+ public:
const static int UNKNOWN_FORMAT = -1;
- const static int FORMAT_VERSION_1 = 1;
- const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1;
+ // Originally, format version 1 had a 16-bit magic number, then the version number `01'
+ // then options that must be 0. Hence the first 32-bits of the format are always as follow
+ // and it's okay to consider them a magic number as a whole.
+ const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
+ const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5;
+ // The versions of Latin IME that only handle format version 1 only test for the magic
+ // number, so we had to change it so that version 2 files would be rejected by older
+ // implementations. On this occasion, we made the magic number 32 bits long.
+ const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
static int detectFormat(const uint8_t* const dict);
+ static unsigned int getHeaderSize(const uint8_t* const dict);
static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
@@ -55,13 +64,43 @@ public:
};
inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
- const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian
- if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1;
- return UNKNOWN_FORMAT;
+ // The magic number is stored big-endian.
+ const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3];
+ switch (magicNumber) {
+ case FORMAT_VERSION_1_MAGIC_NUMBER:
+ // Format 1 header is exactly 5 bytes long and looks like:
+ // Magic number (2 bytes) 0x78 0xB1
+ // Version number (1 byte) 0x01
+ // Options (2 bytes) must be 0x00 0x00
+ return 1;
+ case FORMAT_VERSION_2_MAGIC_NUMBER:
+ // Format 2 header is as follows:
+ // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
+ // Version number (2 bytes) 0x00 0x02
+ // Options (2 bytes) must be 0x00 0x00
+ // Header size (4 bytes) : integer, big endian
+ return (dict[4] << 8) + dict[5];
+ default:
+ return UNKNOWN_FORMAT;
+ }
+}
+
+inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) {
+ switch (detectFormat(dict)) {
+ case 1:
+ return FORMAT_VERSION_1_HEADER_SIZE;
+ case 2:
+ // See the format of the header in the comment in detectFormat() above
+ return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11];
+ default:
+ return std::numeric_limits<unsigned int>::max();
+ }
}
inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
- return dict[(*pos)++];
+ const int msb = dict[(*pos)++];
+ if (msb < 0x80) return msb;
+ return ((msb & 0x7F) << 8) | dict[(*pos)++];
}
inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
@@ -145,15 +184,15 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
const int pos) {
- // This function skips all attributes. The format makes provision for future extension
- // with other attributes (notably shortcuts) but for the time being, bigrams are the
- // only attributes that may be found in a character group, so we only look at bigrams
- // in this version.
+ // This function skips all attributes: shortcuts and bigrams.
+ int newPos = pos;
+ if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
+ newPos = skipAttributes(dict, newPos);
+ }
if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
- return skipAttributes(dict, pos);
- } else {
- return pos;
+ newPos = skipAttributes(dict, newPos);
}
+ return newPos;
}
inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
diff --git a/native/src/char_utils.cpp b/native/jni/src/char_utils.cpp
index a31a0632c..a31a0632c 100644
--- a/native/src/char_utils.cpp
+++ b/native/jni/src/char_utils.cpp
diff --git a/native/jni/src/char_utils.h b/native/jni/src/char_utils.h
new file mode 100644
index 000000000..607dc5195
--- /dev/null
+++ b/native/jni/src/char_utils.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_CHAR_UTILS_H
+#define LATINIME_CHAR_UTILS_H
+
+namespace latinime {
+
+inline static int isAsciiUpper(unsigned short c) {
+ return c >= 'A' && c <= 'Z';
+}
+
+inline static unsigned short toAsciiLower(unsigned short c) {
+ return c - 'A' + 'a';
+}
+
+inline static int isAscii(unsigned short c) {
+ return c <= 127;
+}
+
+unsigned short latin_tolower(unsigned short c);
+
+/**
+ * Table mapping most combined Latin, Greek, and Cyrillic characters
+ * to their base characters. If c is in range, BASE_CHARS[c] == c
+ * if c is not a combined character, or the base character if it
+ * is combined.
+ */
+
+static const int BASE_CHARS_SIZE = 0x0500;
+extern const unsigned short BASE_CHARS[BASE_CHARS_SIZE];
+
+inline static unsigned short toBaseChar(unsigned short c) {
+ if (c < BASE_CHARS_SIZE) {
+ return BASE_CHARS[c];
+ }
+ return c;
+}
+
+inline static unsigned short toBaseLowerCase(unsigned short c) {
+ c = toBaseChar(c);
+ if (isAsciiUpper(c)) {
+ return toAsciiLower(c);
+ } else if (isAscii(c)) {
+ return c;
+ }
+ return latin_tolower(c);
+}
+
+} // namespace latinime
+
+#endif // LATINIME_CHAR_UTILS_H
diff --git a/native/src/correction.cpp b/native/jni/src/correction.cpp
index 27dc40745..087219ed4 100644
--- a/native/src/correction.cpp
+++ b/native/jni/src/correction.cpp
@@ -16,12 +16,15 @@
#include <assert.h>
#include <ctype.h>
+#include <math.h>
#include <stdio.h>
#include <string.h>
#define LOG_TAG "LatinIME: correction.cpp"
+#include "char_utils.h"
#include "correction.h"
+#include "defines.h"
#include "dictionary.h"
#include "proximity_info.h"
@@ -31,81 +34,60 @@ namespace latinime {
// edit distance funcitons //
/////////////////////////////
-#if 0 /* no longer used */
-inline static int editDistance(
- int* editDistanceTable, const unsigned short* input,
- const int inputLength, const unsigned short* output, const int outputLength) {
- // dp[li][lo] dp[a][b] = dp[ a * lo + b]
- int* dp = editDistanceTable;
- const int li = inputLength + 1;
- const int lo = outputLength + 1;
- for (int i = 0; i < li; ++i) {
- dp[lo * i] = i;
- }
- for (int i = 0; i < lo; ++i) {
- dp[i] = i;
- }
-
- for (int i = 0; i < li - 1; ++i) {
- for (int j = 0; j < lo - 1; ++j) {
- const uint32_t ci = Dictionary::toBaseLowerCase(input[i]);
- const uint32_t co = Dictionary::toBaseLowerCase(output[j]);
- const uint16_t cost = (ci == co) ? 0 : 1;
- dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
- min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
- if (i > 0 && j > 0 && ci == Dictionary::toBaseLowerCase(output[j - 1])
- && co == Dictionary::toBaseLowerCase(input[i - 1])) {
- dp[(i + 1) * lo + (j + 1)] = min(
- dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
- }
- }
+inline static void initEditDistance(int *editDistanceTable) {
+ for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) {
+ editDistanceTable[i] = i;
}
+}
- if (DEBUG_EDIT_DISTANCE) {
- LOGI("IN = %d, OUT = %d", inputLength, outputLength);
- for (int i = 0; i < li; ++i) {
- for (int j = 0; j < lo; ++j) {
- LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
+inline static void dumpEditDistance10ForDebug(int *editDistanceTable,
+ const int editDistanceTableWidth, const int outputLength) {
+ if (DEBUG_DICT) {
+ AKLOGI("EditDistanceTable");
+ for (int i = 0; i <= 10; ++i) {
+ int c[11];
+ for (int j = 0; j <= 10; ++j) {
+ if (j < editDistanceTableWidth + 1 && i < outputLength + 1) {
+ c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j];
+ } else {
+ c[j] = -1;
+ }
}
+ AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]",
+ c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]);
}
}
- return dp[li * lo - 1];
-}
-#endif
-
-inline static void initEditDistance(int *editDistanceTable) {
- for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) {
- editDistanceTable[i] = i;
- }
}
inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigned short *input,
const int inputLength, const unsigned short *output, const int outputLength) {
+ // TODO: Make sure that editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] is not touched.
// Let dp[i][j] be editDistanceTable[i * (inputLength + 1) + j].
// Assuming that dp[0][0] ... dp[outputLength - 1][inputLength] are already calculated,
// and calculate dp[ouputLength][0] ... dp[outputLength][inputLength].
int *const current = editDistanceTable + outputLength * (inputLength + 1);
const int *const prev = editDistanceTable + (outputLength - 1) * (inputLength + 1);
const int *const prevprev =
- outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : NULL;
+ outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0;
current[0] = outputLength;
- const uint32_t co = Dictionary::toBaseLowerCase(output[outputLength - 1]);
- const uint32_t prevCO =
- outputLength >= 2 ? Dictionary::toBaseLowerCase(output[outputLength - 2]) : 0;
+ const uint32_t co = toBaseLowerCase(output[outputLength - 1]);
+ const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0;
for (int i = 1; i <= inputLength; ++i) {
- const uint32_t ci = Dictionary::toBaseLowerCase(input[i - 1]);
+ const uint32_t ci = toBaseLowerCase(input[i - 1]);
const uint16_t cost = (ci == co) ? 0 : 1;
current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost));
- if (i >= 2 && prevprev && ci == prevCO
- && co == Dictionary::toBaseLowerCase(input[i - 2])) {
+ if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) {
current[i] = min(current[i], prevprev[i - 2] + 1);
}
}
}
-inline static int getCurrentEditDistance(
- int *editDistanceTable, const int inputLength, const int outputLength) {
- return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1];
+inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth,
+ const int outputLength, const int inputLength) {
+ if (DEBUG_EDIT_DISTANCE) {
+ AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength);
+ }
+ return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputLength];
}
//////////////////////
@@ -133,6 +115,9 @@ void Correction::initCorrection(const ProximityInfo *pi, const int inputLength,
mInputLength = inputLength;
mMaxDepth = maxDepth;
mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
+ // TODO: This is not supposed to be required. Check what's going wrong with
+ // editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL]
+ initEditDistance(mEditDistanceTable);
}
void Correction::initCorrectionState(
@@ -146,7 +131,7 @@ void Correction::initCorrectionState(
void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
- const bool useFullEditDistance) {
+ const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) {
// TODO: remove
mTransposedPos = transposedPos;
mExcessivePos = excessivePos;
@@ -159,6 +144,8 @@ void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
mSpaceProximityPos = spaceProximityPos;
mMissingSpacePos = missingSpacePos;
mUseFullEditDistance = useFullEditDistance;
+ mDoAutoCompletion = doAutoCompletion;
+ mMaxErrors = maxErrors;
}
void Correction::checkState() {
@@ -172,23 +159,34 @@ void Correction::checkState() {
}
}
-int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
- const unsigned short *word) {
- return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
- firstFreq, secondFreq, this, word);
+int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
+ const int wordCount, const bool isSpaceProximity, const unsigned short *word) {
+ return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray,
+ wordCount, this, isSpaceProximity, word);
}
int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
+ return getFinalFreqInternal(freq, word, wordLength, mInputLength);
+}
+
+int Correction::getFinalFreqForSubQueue(const int freq, unsigned short **word, int *wordLength,
+ const int inputLength) {
+ return getFinalFreqInternal(freq, word, wordLength, inputLength);
+}
+
+int Correction::getFinalFreqInternal(const int freq, unsigned short **word, int *wordLength,
+ const int inputLength) {
const int outputIndex = mTerminalOutputIndex;
const int inputIndex = mTerminalInputIndex;
*wordLength = outputIndex + 1;
- if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) {
- return -1;
+ if (outputIndex < MIN_SUGGEST_DEPTH) {
+ return NOT_A_FREQUENCY;
}
*word = mWord;
- return Correction::RankingAlgorithm::calculateFinalFreq(
- inputIndex, outputIndex, freq, mEditDistanceTable, this);
+ int finalFreq = Correction::RankingAlgorithm::calculateFinalFreq(
+ inputIndex, outputIndex, freq, mEditDistanceTable, this, inputLength);
+ return finalFreq;
}
bool Correction::initProcessState(const int outputIndex) {
@@ -213,6 +211,7 @@ bool Correction::initProcessState(const int outputIndex) {
mMatching = false;
mProximityMatching = false;
+ mAdditionalProximityMatching = false;
mTransposing = false;
mExceeding = false;
mSkipping = false;
@@ -229,20 +228,10 @@ int Correction::goDownTree(
}
// TODO: remove
-int Correction::getOutputIndex() {
- return mOutputIndex;
-}
-
-// TODO: remove
int Correction::getInputIndex() {
return mInputIndex;
}
-// TODO: remove
-bool Correction::needsToTraverseAllNodes() {
- return mNeedsToTraverseAllNodes;
-}
-
void Correction::incrementInputIndex() {
++mInputIndex;
}
@@ -269,6 +258,7 @@ void Correction::incrementOutputIndex() {
mCorrectionStates[mOutputIndex].mMatching = mMatching;
mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching;
+ mCorrectionStates[mOutputIndex].mAdditionalProximityMatching = mAdditionalProximityMatching;
mCorrectionStates[mOutputIndex].mTransposing = mTransposing;
mCorrectionStates[mOutputIndex].mExceeding = mExceeding;
mCorrectionStates[mOutputIndex].mSkipping = mSkipping;
@@ -280,7 +270,9 @@ void Correction::startToTraverseAllNodes() {
bool Correction::needsToPrune() const {
// TODO: use edit distance here
- return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance;
+ return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance
+ // Allow one char longer word for missing character
+ || (!mDoAutoCompletion && (mOutputIndex > mInputLength));
}
void Correction::addCharToCurrentWord(const int32_t c) {
@@ -290,13 +282,12 @@ void Correction::addCharToCurrentWord(const int32_t c) {
mWord, mOutputIndex + 1);
}
-// TODO: inline?
Correction::CorrectionType Correction::processSkipChar(
const int32_t c, const bool isTerminal, const bool inputIndexIncremented) {
addCharToCurrentWord(c);
- if (needsToTraverseAllNodes() && isTerminal) {
- mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0);
- mTerminalOutputIndex = mOutputIndex;
+ mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0);
+ mTerminalOutputIndex = mOutputIndex;
+ if (mNeedsToTraverseAllNodes && isTerminal) {
incrementOutputIndex();
return TRAVERSE_ALL_ON_TERMINAL;
} else {
@@ -305,19 +296,36 @@ Correction::CorrectionType Correction::processSkipChar(
}
}
+Correction::CorrectionType Correction::processUnrelatedCorrectionType() {
+ // Needs to set mTerminalInputIndex and mTerminalOutputIndex before returning any CorrectionType
+ mTerminalInputIndex = mInputIndex;
+ mTerminalOutputIndex = mOutputIndex;
+ return UNRELATED;
+}
+
inline bool isEquivalentChar(ProximityInfo::ProximityType type) {
return type == ProximityInfo::EQUIVALENT_CHAR;
}
+inline bool isProximityCharOrEquivalentChar(ProximityInfo::ProximityType type) {
+ return type == ProximityInfo::EQUIVALENT_CHAR
+ || type == ProximityInfo::NEAR_PROXIMITY_CHAR;
+}
+
Correction::CorrectionType Correction::processCharAndCalcState(
const int32_t c, const bool isTerminal) {
const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
+ if (correctionCount > mMaxErrors) {
+ return processUnrelatedCorrectionType();
+ }
+
// TODO: Change the limit if we'll allow two or more corrections
const bool noCorrectionsHappenedSoFar = correctionCount == 0;
const bool canTryCorrection = noCorrectionsHappenedSoFar;
int proximityIndex = 0;
mDistances[mOutputIndex] = NOT_A_DISTANCE;
+ // Skip checking this node
if (mNeedsToTraverseAllNodes || isQuote(c)) {
bool incremented = false;
if (mLastCharExceeded && mInputIndex == mInputLength - 1) {
@@ -342,6 +350,7 @@ Correction::CorrectionType Correction::processCharAndCalcState(
return processSkipChar(c, isTerminal, incremented);
}
+ // Check possible corrections.
if (mExcessivePos >= 0) {
if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
mExcessivePos = mOutputIndex;
@@ -382,30 +391,42 @@ Correction::CorrectionType Correction::processCharAndCalcState(
incrementInputIndex();
} else {
--mTransposedCount;
- if (DEBUG_CORRECTION) {
+ if (DEBUG_CORRECTION
+ && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
+ && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
+ || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
DUMP_WORD(mWord, mOutputIndex);
- LOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
+ AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
mTransposedCount, mExcessiveCount, c);
}
- return UNRELATED;
+ return processUnrelatedCorrectionType();
}
}
// TODO: Change the limit if we'll allow two or more proximity chars with corrections
- const bool checkProximityChars = noCorrectionsHappenedSoFar || mProximityCount == 0;
+ // Work around: When the mMaxErrors is 1, we only allow just one error
+ // including proximity correction.
+ const bool checkProximityChars = (mMaxErrors > 1)
+ ? (noCorrectionsHappenedSoFar || mProximityCount == 0)
+ : (noCorrectionsHappenedSoFar && mProximityCount == 0);
+
ProximityInfo::ProximityType matchedProximityCharId = secondTransposing
? ProximityInfo::EQUIVALENT_CHAR
: mProximityInfo->getMatchedProximityId(
mInputIndex, c, checkProximityChars, &proximityIndex);
- if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {
+ if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId
+ || ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
if (canTryCorrection && mOutputIndex > 0
&& mCorrectionStates[mOutputIndex].mProximityMatching
&& mCorrectionStates[mOutputIndex].mExceeding
&& isEquivalentChar(mProximityInfo->getMatchedProximityId(
mInputIndex, mWord[mOutputIndex - 1], false))) {
- if (DEBUG_CORRECTION) {
- LOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]);
+ if (DEBUG_CORRECTION
+ && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
+ && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
+ || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
+ AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]);
}
// Conversion p->e
// Example:
@@ -423,7 +444,11 @@ Correction::CorrectionType Correction::processCharAndCalcState(
}
}
- if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {
+ if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId
+ || ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
+ if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
+ mAdditionalProximityMatching = true;
+ }
// TODO: Optimize
// As the current char turned out to be an unrelated char,
// we will try other correction-types. Please note that mCorrectionStates[mOutputIndex]
@@ -465,6 +490,18 @@ Correction::CorrectionType Correction::processCharAndCalcState(
++mSkippedCount;
--mProximityCount;
return processSkipChar(c, isTerminal, false);
+ } else if (mInputIndex - 1 < mInputLength
+ && mSkippedCount > 0
+ && mCorrectionStates[mOutputIndex].mSkipping
+ && mCorrectionStates[mOutputIndex].mAdditionalProximityMatching
+ && isProximityCharOrEquivalentChar(
+ mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) {
+ // Conversion s->a
+ incrementInputIndex();
+ --mSkippedCount;
+ mProximityMatching = true;
+ ++mProximityCount;
+ mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO;
} else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputLength
&& isEquivalentChar(
mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) {
@@ -475,17 +512,52 @@ Correction::CorrectionType Correction::processCharAndCalcState(
++mExcessiveCount;
incrementInputIndex();
}
+ if (DEBUG_CORRECTION
+ && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
+ && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
+ || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
+ DUMP_WORD(mWord, mOutputIndex);
+ if (mTransposing) {
+ AKLOGI("TRANSPOSE: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
+ mTransposedCount, mExcessiveCount, c);
+ } else {
+ AKLOGI("EXCEED: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
+ mTransposedCount, mExcessiveCount, c);
+ }
+ }
} else if (mSkipping) {
// 3. Skip correction
++mSkippedCount;
+ if (DEBUG_CORRECTION
+ && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
+ && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
+ || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
+ AKLOGI("SKIP: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
+ mTransposedCount, mExcessiveCount, c);
+ }
return processSkipChar(c, isTerminal, false);
+ } else if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
+ // As a last resort, use additional proximity characters
+ mProximityMatching = true;
+ ++mProximityCount;
+ mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO;
+ if (DEBUG_CORRECTION
+ && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
+ && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
+ || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
+ AKLOGI("ADDITIONALPROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
+ mTransposedCount, mExcessiveCount, c);
+ }
} else {
- if (DEBUG_CORRECTION) {
+ if (DEBUG_CORRECTION
+ && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
+ && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
+ || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
DUMP_WORD(mWord, mOutputIndex);
- LOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
+ AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
mTransposedCount, mExcessiveCount, c);
}
- return UNRELATED;
+ return processUnrelatedCorrectionType();
}
} else if (secondTransposing) {
// If inputIndex is greater than mInputLength, that means there is no
@@ -500,6 +572,13 @@ Correction::CorrectionType Correction::processCharAndCalcState(
++mProximityCount;
mDistances[mOutputIndex] =
mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex);
+ if (DEBUG_CORRECTION
+ && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
+ && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
+ || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
+ AKLOGI("PROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
+ mTransposedCount, mExcessiveCount, c);
+ }
}
addCharToCurrentWord(c);
@@ -533,13 +612,17 @@ Correction::CorrectionType Correction::processCharAndCalcState(
|| isSameAsUserTypedLength) && isTerminal) {
mTerminalInputIndex = mInputIndex - 1;
mTerminalOutputIndex = mOutputIndex - 1;
- if (DEBUG_CORRECTION) {
+ if (DEBUG_CORRECTION
+ && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
+ && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
DUMP_WORD(mWord, mOutputIndex);
- LOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
+ AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
mTransposedCount, mExcessiveCount, c);
}
return ON_TERMINAL;
} else {
+ mTerminalInputIndex = mInputIndex - 1;
+ mTerminalOutputIndex = mOutputIndex - 1;
return NOT_ON_TERMINAL;
}
}
@@ -547,55 +630,6 @@ Correction::CorrectionType Correction::processCharAndCalcState(
Correction::~Correction() {
}
-/////////////////////////
-// static inline utils //
-/////////////////////////
-
-static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
-static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
- return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
-}
-
-static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
-inline static void multiplyIntCapped(const int multiplier, int *base) {
- const int temp = *base;
- if (temp != S_INT_MAX) {
- // Branch if multiplier == 2 for the optimization
- if (multiplier == 2) {
- *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
- } else {
- // TODO: This overflow check gives a wrong answer when, for example,
- // temp = 2^16 + 1 and multiplier = 2^17 + 1.
- // Fix this behavior.
- const int tempRetval = temp * multiplier;
- *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
- }
- }
-}
-
-inline static int powerIntCapped(const int base, const int n) {
- if (n <= 0) return 1;
- if (base == 2) {
- return n < 31 ? 1 << n : S_INT_MAX;
- } else {
- int ret = base;
- for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
- return ret;
- }
-}
-
-inline static void multiplyRate(const int rate, int *freq) {
- if (*freq != S_INT_MAX) {
- if (*freq > 1000000) {
- *freq /= 100;
- multiplyIntCapped(rate, freq);
- } else {
- multiplyIntCapped(rate, freq);
- *freq /= 100;
- }
- }
-}
-
inline static int getQuoteCount(const unsigned short* word, const int length) {
int quoteCount = 0;
for (int i = 0; i < length; ++i) {
@@ -607,13 +641,7 @@ inline static int getQuoteCount(const unsigned short* word, const int length) {
}
inline static bool isUpperCase(unsigned short c) {
- if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
- c = BASE_CHARS[c];
- }
- if (isupper(c)) {
- return true;
- }
- return false;
+ return isAsciiUpper(toBaseChar(c));
}
//////////////////////
@@ -622,9 +650,9 @@ inline static bool isUpperCase(unsigned short c) {
/* static */
int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex,
- const int freq, int* editDistanceTable, const Correction* correction) {
+ const int freq, int* editDistanceTable, const Correction* correction,
+ const int inputLength) {
const int excessivePos = correction->getExcessivePos();
- const int inputLength = correction->mInputLength;
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER;
const ProximityInfo *proximityInfo = correction->mProximityInfo;
@@ -649,45 +677,55 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
const unsigned short* word = correction->mWord;
const bool skipped = skippedCount > 0;
- const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1)
+ const int quoteDiffCount = max(0, getQuoteCount(word, outputLength)
- getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength));
// TODO: Calculate edit distance for transposed and excessive
int ed = 0;
+ if (DEBUG_DICT_FULL) {
+ dumpEditDistance10ForDebug(editDistanceTable, correction->mInputLength, outputLength);
+ }
int adjustedProximityMatchedCount = proximityMatchedCount;
int finalFreq = freq;
+ if (DEBUG_CORRECTION_FREQ
+ && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) {
+ AKLOGI("FinalFreq0: %d", finalFreq);
+ }
// TODO: Optimize this.
- // TODO: Ignoring edit distance for transposed char, for now
- if (transposedCount == 0 && (proximityMatchedCount > 0 || skipped || excessiveCount > 0)) {
- ed = getCurrentEditDistance(editDistanceTable, inputLength, outputIndex + 1);
+ if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) {
+ ed = getCurrentEditDistance(editDistanceTable, correction->mInputLength, outputLength,
+ inputLength) - transposedCount;
+
const int matchWeight = powerIntCapped(typedLetterMultiplier,
- max(inputLength, outputIndex + 1) - ed);
+ max(inputLength, outputLength) - ed);
multiplyIntCapped(matchWeight, &finalFreq);
// TODO: Demote further if there are two or more excessive chars with longer user input?
- if (inputLength > outputIndex + 1) {
+ if (inputLength > outputLength) {
multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq);
}
ed = max(0, ed - quoteDiffCount);
-
- if (ed == 1 && (inputLength == outputIndex || inputLength == outputIndex + 2)) {
- // Promote a word with just one skipped or excessive char
- if (sameLength) {
- multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq);
- } else {
+ adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputLength)),
+ proximityMatchedCount);
+ if (transposedCount < 1) {
+ if (ed == 1 && (inputLength == outputLength - 1 || inputLength == outputLength + 1)) {
+ // Promote a word with just one skipped or excessive char
+ if (sameLength) {
+ multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE
+ + WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER * outputLength,
+ &finalFreq);
+ } else {
+ multiplyIntCapped(typedLetterMultiplier, &finalFreq);
+ }
+ } else if (ed == 0) {
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
+ sameLength = true;
}
- } else if (ed == 0) {
- multiplyIntCapped(typedLetterMultiplier, &finalFreq);
- sameLength = true;
}
- adjustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)),
- proximityMatchedCount);
} else {
- // TODO: Calculate the edit distance for transposed char
const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
multiplyIntCapped(matchWeight, &finalFreq);
}
@@ -707,7 +745,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
/ (10 * inputLength
- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
if (DEBUG_DICT_FULL) {
- LOGI("Demotion rate for missing character is %d.", demotionRate);
+ AKLOGI("Demotion rate for missing character is %d.", demotionRate);
}
multiplyRate(demotionRate, &finalFreq);
}
@@ -720,8 +758,8 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
if (excessiveCount > 0) {
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
if (!lastCharExceeded && !proximityInfo->existsAdjacentProximityChars(excessivePos)) {
- if (DEBUG_CORRECTION_FREQ) {
- LOGI("Double excessive demotion");
+ if (DEBUG_DICT_FULL) {
+ AKLOGI("Double excessive demotion");
}
// If an excessive character is not adjacent to the left char or the right char,
// we will demote this word.
@@ -729,11 +767,14 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
}
}
+ const bool performTouchPositionCorrection =
+ CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled()
+ && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0;
// Score calibration by touch coordinates is being done only for pure-fat finger typing error
// cases.
+ int additionalProximityCount = 0;
// TODO: Remove this constraint.
- if (CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled()
- && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0) {
+ if (performTouchPositionCorrection) {
for (int i = 0; i < outputLength; ++i) {
const int squaredDistance = correction->mDistances[i];
if (i < adjustedProximityMatchedCount) {
@@ -744,40 +785,60 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
static const float A = ZERO_DISTANCE_PROMOTION_RATE / 100.0f;
static const float B = 1.0f;
static const float C = 0.5f;
+ static const float MIN = 0.3f;
static const float R1 = NEUTRAL_SCORE_SQUARED_RADIUS;
static const float R2 = HALF_SCORE_SQUARED_RADIUS;
const float x = (float)squaredDistance
/ ProximityInfo::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR;
- const float factor = (x < R1)
+ const float factor = max((x < R1)
? (A * (R1 - x) + B * x) / R1
- : (B * (R2 - x) + C * (x - R1)) / (R2 - R1);
+ : (B * (R2 - x) + C * (x - R1)) / (R2 - R1), MIN);
// factor is piecewise linear function like:
// A -_ .
// ^-_ .
// B \ .
- // \ .
- // C \ .
- // 0 R1 R2
- if (factor <= 0) {
- return -1;
- }
+ // \_ .
+ // C ------------.
+ // .
+ // 0 R1 R2 .
multiplyRate((int)(factor * 100), &finalFreq);
} else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) {
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
+ } else if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) {
+ ++additionalProximityCount;
+ multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
}
}
} else {
+ // Demote additional proximity characters
+ for (int i = 0; i < outputLength; ++i) {
+ const int squaredDistance = correction->mDistances[i];
+ if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) {
+ ++additionalProximityCount;
+ }
+ }
// Promotion for a word with proximity characters
for (int i = 0; i < adjustedProximityMatchedCount; ++i) {
// A word with proximity corrections
if (DEBUG_DICT_FULL) {
- LOGI("Found a proximity correction.");
+ AKLOGI("Found a proximity correction.");
}
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
- multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
+ if (i < additionalProximityCount) {
+ multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
+ } else {
+ multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
+ }
}
}
+ // If the user types too many(three or more) proximity characters with additional proximity
+ // character,do not treat as the same length word.
+ if (sameLength && additionalProximityCount > 0 && (adjustedProximityMatchedCount >= 3
+ || transposedCount > 0 || skipped || excessiveCount > 0)) {
+ sameLength = false;
+ }
+
const int errorCount = adjustedProximityMatchedCount > 0
? adjustedProximityMatchedCount
: (proximityMatchedCount + transposedCount);
@@ -787,13 +848,15 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
// Promotion for an exactly matched word
if (ed == 0) {
// Full exact match
- if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0) {
+ if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0
+ && quoteDiffCount == 0 && additionalProximityCount == 0) {
finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
}
}
// Promote a word with no correction
- if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0) {
+ if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0
+ && additionalProximityCount == 0) {
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
}
@@ -832,71 +895,101 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
}
if (DEBUG_DICT_FULL) {
- LOGI("calc: %d, %d", outputIndex, sameLength);
+ AKLOGI("calc: %d, %d", outputLength, sameLength);
}
- if (DEBUG_CORRECTION_FREQ) {
- DUMP_WORD(correction->mWord, outputIndex + 1);
- LOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount,
- skippedCount, transposedCount, excessiveCount, lastCharExceeded, sameLength,
- quoteDiffCount, ed, finalFreq);
+ if (DEBUG_CORRECTION_FREQ
+ && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) {
+ DUMP_WORD(proximityInfo->getPrimaryInputWord(), inputLength);
+ DUMP_WORD(correction->mWord, outputLength);
+ AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d, A%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount,
+ skippedCount, transposedCount, excessiveCount, additionalProximityCount,
+ outputLength, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq);
}
return finalFreq;
}
/* static */
-int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
- const int firstFreq, const int secondFreq, const Correction* correction,
- const unsigned short *word) {
- const int spaceProximityPos = correction->mSpaceProximityPos;
- const int missingSpacePos = correction->mMissingSpacePos;
- if (DEBUG_DICT) {
- int inputCount = 0;
- if (spaceProximityPos >= 0) ++inputCount;
- if (missingSpacePos >= 0) ++inputCount;
- assert(inputCount <= 1);
- }
- const bool isSpaceProximity = spaceProximityPos >= 0;
- const int inputLength = correction->mInputLength;
- const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
- const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
- : (inputLength - missingSpacePos);
+int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(
+ const int *freqArray, const int *wordLengthArray, const int wordCount,
+ const Correction* correction, const bool isSpaceProximity, const unsigned short *word) {
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
bool firstCapitalizedWordDemotion = false;
- if (firstWordLength >= 2) {
- firstCapitalizedWordDemotion = isUpperCase(word[0]);
- }
-
bool secondCapitalizedWordDemotion = false;
- if (secondWordLength >= 2) {
- secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
+
+ {
+ // TODO: Handle multiple capitalized word demotion properly
+ const int firstWordLength = wordLengthArray[0];
+ const int secondWordLength = wordLengthArray[1];
+ if (firstWordLength >= 2) {
+ firstCapitalizedWordDemotion = isUpperCase(word[0]);
+ }
+
+ if (secondWordLength >= 2) {
+ // FIXME: word[firstWordLength + 1] is incorrect.
+ secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
+ }
}
+
const bool capitalizedWordDemotion =
firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
- if (DEBUG_DICT_FULL) {
- LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion);
+ int totalLength = 0;
+ int totalFreq = 0;
+ for (int i = 0; i < wordCount; ++i){
+ const int wordLength = wordLengthArray[i];
+ if (wordLength <= 0) {
+ return 0;
+ }
+ totalLength += wordLength;
+ const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1);
+ int tempFirstFreq = freqArray[i];
+ multiplyRate(demotionRate, &tempFirstFreq);
+ totalFreq += tempFirstFreq;
}
- if (firstWordLength == 0 || secondWordLength == 0) {
+ if (totalLength <= 0 || totalFreq <= 0) {
return 0;
}
- const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
- int tempFirstFreq = firstFreq;
- multiplyRate(firstDemotionRate, &tempFirstFreq);
-
- const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
- int tempSecondFreq = secondFreq;
- multiplyRate(secondDemotionRate, &tempSecondFreq);
-
- const int totalLength = firstWordLength + secondWordLength;
+ // TODO: Currently totalFreq is adjusted to two word metrix.
// Promote pairFreq with multiplying by 2, because the word length is the same as the typed
// length.
- int totalFreq = tempFirstFreq + tempSecondFreq;
+ totalFreq = totalFreq * 2 / wordCount;
+ if (wordCount > 2) {
+ // Safety net for 3+ words -- Caveats: many heuristics and workarounds here.
+ int oneLengthCounter = 0;
+ int twoLengthCounter = 0;
+ for (int i = 0; i < wordCount; ++i) {
+ const int wordLength = wordLengthArray[i];
+ // TODO: Use bigram instead of this safety net
+ if (i < wordCount - 1) {
+ const int nextWordLength = wordLengthArray[i + 1];
+ if (wordLength == 1 && nextWordLength == 2) {
+ // Safety net to filter 1 length and 2 length sequential words
+ return 0;
+ }
+ }
+ const int freq = freqArray[i];
+ // Demote too short weak words
+ if (wordLength <= 4 && freq <= MAX_FREQ * 2 / 3 /* heuristic... */) {
+ multiplyRate(100 * freq / MAX_FREQ, &totalFreq);
+ }
+ if (wordLength == 1) {
+ ++oneLengthCounter;
+ } else if (wordLength == 2) {
+ ++twoLengthCounter;
+ }
+ if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) {
+ // Safety net to filter too many short words
+ return 0;
+ }
+ }
+ multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq);
+ }
// This is a workaround to try offsetting the not-enough-demotion which will be done in
// calcNormalizedScore in Utils.java.
@@ -923,19 +1016,128 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
if (isSpaceProximity) {
// A word pair with one space proximity correction
if (DEBUG_DICT) {
- LOGI("Found a word pair with space proximity correction.");
+ AKLOGI("Found a word pair with space proximity correction.");
}
multiplyIntCapped(typedLetterMultiplier, &totalFreq);
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
}
- multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
+ if (isSpaceProximity) {
+ multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq);
+ } else {
+ multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
+ }
if (capitalizedWordDemotion) {
multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
}
+ if (DEBUG_CORRECTION_FREQ) {
+ AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1],
+ wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq);
+ DUMP_WORD(word, wordLengthArray[0]);
+ }
+
return totalFreq;
}
+/* Damerau-Levenshtein distance */
+inline static int editDistanceInternal(
+ int* editDistanceTable, const unsigned short* before,
+ const int beforeLength, const unsigned short* after, const int afterLength) {
+ // dp[li][lo] dp[a][b] = dp[ a * lo + b]
+ int* dp = editDistanceTable;
+ const int li = beforeLength + 1;
+ const int lo = afterLength + 1;
+ for (int i = 0; i < li; ++i) {
+ dp[lo * i] = i;
+ }
+ for (int i = 0; i < lo; ++i) {
+ dp[i] = i;
+ }
+
+ for (int i = 0; i < li - 1; ++i) {
+ for (int j = 0; j < lo - 1; ++j) {
+ const uint32_t ci = toBaseLowerCase(before[i]);
+ const uint32_t co = toBaseLowerCase(after[j]);
+ const uint16_t cost = (ci == co) ? 0 : 1;
+ dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
+ min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
+ if (i > 0 && j > 0 && ci == toBaseLowerCase(after[j - 1])
+ && co == toBaseLowerCase(before[i - 1])) {
+ dp[(i + 1) * lo + (j + 1)] = min(
+ dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
+ }
+ }
+ }
+
+ if (DEBUG_EDIT_DISTANCE) {
+ AKLOGI("IN = %d, OUT = %d", beforeLength, afterLength);
+ for (int i = 0; i < li; ++i) {
+ for (int j = 0; j < lo; ++j) {
+ AKLOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
+ }
+ }
+ }
+ return dp[li * lo - 1];
+}
+
+int Correction::RankingAlgorithm::editDistance(const unsigned short* before,
+ const int beforeLength, const unsigned short* after, const int afterLength) {
+ int table[(beforeLength + 1) * (afterLength + 1)];
+ return editDistanceInternal(table, before, beforeLength, after, afterLength);
+}
+
+
+// In dictionary.cpp, getSuggestion() method,
+// suggestion scores are computed using the below formula.
+// original score
+// := pow(mTypedLetterMultiplier (this is defined 2),
+// (the number of matched characters between typed word and suggested word))
+// * (individual word's score which defined in the unigram dictionary,
+// and this score is defined in range [0, 255].)
+// Then, the following processing is applied.
+// - If the dictionary word is matched up to the point of the user entry
+// (full match up to min(before.length(), after.length())
+// => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2)
+// - If the word is a true full match except for differences in accents or
+// capitalization, then treat it as if the score was 255.
+// - If before.length() == after.length()
+// => multiply by mFullWordMultiplier (this is defined 2))
+// So, maximum original score is pow(2, min(before.length(), after.length())) * 255 * 2 * 1.2
+// For historical reasons we ignore the 1.2 modifier (because the measure for a good
+// autocorrection threshold was done at a time when it didn't exist). This doesn't change
+// the result.
+// So, we can normalize original score by dividing pow(2, min(b.l(),a.l())) * 255 * 2.
+
+/* static */
+double Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short* before,
+ const int beforeLength, const unsigned short* after, const int afterLength,
+ const int score) {
+ if (0 == beforeLength || 0 == afterLength) {
+ return 0;
+ }
+ const int distance = editDistance(before, beforeLength, after, afterLength);
+ int spaceCount = 0;
+ for (int i = 0; i < afterLength; ++i) {
+ if (after[i] == CODE_SPACE) {
+ ++spaceCount;
+ }
+ }
+
+ if (spaceCount == afterLength) {
+ return 0;
+ }
+
+ const double maxScore = score >= S_INT_MAX ? S_INT_MAX : MAX_INITIAL_SCORE
+ * pow((double)TYPED_LETTER_MULTIPLIER,
+ (double)min(beforeLength, afterLength - spaceCount)) * FULL_WORD_MULTIPLIER;
+
+ // add a weight based on edit distance.
+ // distance <= max(afterLength, beforeLength) == afterLength,
+ // so, 0 <= distance / afterLength <= 1
+ const double weight = 1.0 - (double) distance / afterLength;
+ return (score / maxScore) * weight;
+}
+
} // namespace latinime
diff --git a/native/src/correction.h b/native/jni/src/correction.h
index d4e99f0ce..ee55c9604 100644
--- a/native/src/correction.h
+++ b/native/jni/src/correction.h
@@ -17,6 +17,7 @@
#ifndef LATINIME_CORRECTION_H
#define LATINIME_CORRECTION_H
+#include <assert.h>
#include <stdint.h>
#include "correction_state.h"
@@ -27,8 +28,7 @@ namespace latinime {
class ProximityInfo;
class Correction {
-
-public:
+ public:
typedef enum {
TRAVERSE_ALL_ON_TERMINAL,
TRAVERSE_ALL_NOT_ON_TERMINAL,
@@ -37,6 +37,62 @@ public:
NOT_ON_TERMINAL
} CorrectionType;
+ /////////////////////////
+ // static inline utils //
+ /////////////////////////
+
+ static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
+ static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
+ return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
+ }
+
+ static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
+ inline static void multiplyIntCapped(const int multiplier, int *base) {
+ const int temp = *base;
+ if (temp != S_INT_MAX) {
+ // Branch if multiplier == 2 for the optimization
+ if (multiplier < 0) {
+ if (DEBUG_DICT) {
+ assert(false);
+ }
+ AKLOGI("--- Invalid multiplier: %d", multiplier);
+ } else if (multiplier == 0) {
+ *base = 0;
+ } else if (multiplier == 2) {
+ *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
+ } else {
+ // TODO: This overflow check gives a wrong answer when, for example,
+ // temp = 2^16 + 1 and multiplier = 2^17 + 1.
+ // Fix this behavior.
+ const int tempRetval = temp * multiplier;
+ *base = tempRetval >= temp ? tempRetval : S_INT_MAX;
+ }
+ }
+ }
+
+ inline static int powerIntCapped(const int base, const int n) {
+ if (n <= 0) return 1;
+ if (base == 2) {
+ return n < 31 ? 1 << n : S_INT_MAX;
+ } else {
+ int ret = base;
+ for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
+ return ret;
+ }
+ }
+
+ inline static void multiplyRate(const int rate, int *freq) {
+ if (*freq != S_INT_MAX) {
+ if (*freq > 1000000) {
+ *freq /= 100;
+ multiplyIntCapped(rate, freq);
+ } else {
+ multiplyIntCapped(rate, freq);
+ *freq /= 100;
+ }
+ }
+ }
+
Correction(const int typedLetterMultiplier, const int fullWordMultiplier);
void initCorrection(
const ProximityInfo *pi, const int inputLength, const int maxWordLength);
@@ -44,11 +100,11 @@ public:
// TODO: remove
void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos,
- const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance);
+ const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance,
+ const bool doAutoCompletion, const int maxErrors);
void checkState();
bool initProcessState(const int index);
- int getOutputIndex();
int getInputIndex();
virtual ~Correction();
@@ -73,9 +129,12 @@ public:
bool needsToPrune() const;
- int getFreqForSplitTwoWords(
- const int firstFreq, const int secondFreq, const unsigned short *word);
+ int getFreqForSplitMultipleWords(
+ const int *freqArray, const int *wordLengthArray, const int wordCount,
+ const bool isSpaceProximity, const unsigned short *word);
int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
+ int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength,
+ const int inputLength);
CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal);
@@ -94,21 +153,44 @@ public:
inline int getTreeParentIndex(const int index) const {
return mCorrectionStates[index].mParentIndex;
}
-private:
+
+ class RankingAlgorithm {
+ public:
+ static int calculateFinalFreq(const int inputIndex, const int depth,
+ const int freq, int *editDistanceTable, const Correction* correction,
+ const int inputLength);
+ static int calcFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
+ const int wordCount, const Correction* correction, const bool isSpaceProximity,
+ const unsigned short *word);
+ static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
+ const unsigned short* after, const int afterLength, const int score);
+ static int editDistance(const unsigned short* before,
+ const int beforeLength, const unsigned short* after, const int afterLength);
+ private:
+ static const int CODE_SPACE = ' ';
+ static const int MAX_INITIAL_SCORE = 255;
+ static const int TYPED_LETTER_MULTIPLIER = 2;
+ static const int FULL_WORD_MULTIPLIER = 2;
+ };
+
+ private:
inline void incrementInputIndex();
inline void incrementOutputIndex();
- inline bool needsToTraverseAllNodes();
inline void startToTraverseAllNodes();
inline bool isQuote(const unsigned short c);
inline CorrectionType processSkipChar(
const int32_t c, const bool isTerminal, const bool inputIndexIncremented);
+ inline CorrectionType processUnrelatedCorrectionType();
inline void addCharToCurrentWord(const int32_t c);
+ inline int getFinalFreqInternal(const int freq, unsigned short **word, int* wordLength,
+ const int inputLength);
const int TYPED_LETTER_MULTIPLIER;
const int FULL_WORD_MULTIPLIER;
const ProximityInfo *mProximityInfo;
bool mUseFullEditDistance;
+ bool mDoAutoCompletion;
int mMaxEditDistance;
int mMaxDepth;
int mInputLength;
@@ -116,6 +198,7 @@ private:
int mMissingSpacePos;
int mTerminalInputIndex;
int mTerminalOutputIndex;
+ int mMaxErrors;
// The following arrays are state buffer.
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
@@ -146,17 +229,11 @@ private:
bool mMatching;
bool mProximityMatching;
+ bool mAdditionalProximityMatching;
bool mExceeding;
bool mTransposing;
bool mSkipping;
- class RankingAlgorithm {
- public:
- static int calculateFinalFreq(const int inputIndex, const int depth,
- const int freq, int *editDistanceTable, const Correction* correction);
- static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
- const Correction* correction, const unsigned short *word);
- };
};
} // namespace latinime
#endif // LATINIME_CORRECTION_H
diff --git a/native/src/correction_state.h b/native/jni/src/correction_state.h
index c04146e54..5b2cbd3a2 100644
--- a/native/src/correction_state.h
+++ b/native/jni/src/correction_state.h
@@ -47,9 +47,9 @@ struct CorrectionState {
bool mExceeding;
bool mSkipping;
bool mProximityMatching;
+ bool mAdditionalProximityMatching;
bool mNeedsToTraverseAllNodes;
-
};
inline static void initCorrectionState(CorrectionState *state, const int rootPos,
@@ -77,7 +77,7 @@ inline static void initCorrectionState(CorrectionState *state, const int rootPos
state->mTransposing = false;
state->mExceeding = false;
state->mSkipping = false;
-
+ state->mAdditionalProximityMatching = false;
}
} // namespace latinime
diff --git a/native/src/debug.h b/native/jni/src/debug.h
index 38b2f107a..b13052c95 100644
--- a/native/src/debug.h
+++ b/native/jni/src/debug.h
@@ -42,7 +42,7 @@ static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned s
static inline void LOGI_S16(unsigned short* string, const unsigned int length) {
unsigned char tmp_buffer[length];
convertToUnibyteString(string, tmp_buffer, length);
- LOGI(">> %s", tmp_buffer);
+ AKLOGI(">> %s", tmp_buffer);
// The log facility is throwing out log that comes too fast. The following
// is a dirty way of slowing down processing so that we can see all log.
// TODO : refactor this in a blocking log or something.
@@ -53,7 +53,7 @@ static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int leng
unsigned char c) {
unsigned char tmp_buffer[length+1];
convertToUnibyteStringAndReplaceLastChar(string, tmp_buffer, length, c);
- LOGI(">> %s", tmp_buffer);
+ AKLOGI(">> %s", tmp_buffer);
// Likewise
// usleep(10);
}
@@ -64,7 +64,7 @@ static inline void printDebug(const char* tag, int* codes, int codesSize, int MA
buf[codesSize] = 0;
while (--codesSize >= 0)
buf[codesSize] = (unsigned char)codes[codesSize * MAX_PROXIMITY_CHARS];
- LOGI("%s, WORD = %s", tag, buf);
+ AKLOGI("%s, WORD = %s", tag, buf);
free(buf);
}
diff --git a/native/src/defines.h b/native/jni/src/defines.h
index ef1beb92f..e882c3714 100644
--- a/native/src/defines.h
+++ b/native/jni/src/defines.h
@@ -20,15 +20,31 @@
#if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)
#include <cutils/log.h>
+#define AKLOGE ALOGE
+#define AKLOGI ALOGI
+
+#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0)
+
+static inline void dumpWord(const unsigned short* word, const int length) {
+ static char charBuf[50];
+
+ for (int i = 0; i < length; ++i) {
+ charBuf[i] = word[i];
+ }
+ charBuf[length] = 0;
+ AKLOGI("[ %s ]", charBuf);
+}
+
#else
-#define LOGE(fmt, ...)
-#define LOGI(fmt, ...)
+#define AKLOGE(fmt, ...)
+#define AKLOGI(fmt, ...)
+#define DUMP_WORD(word, length)
#endif
#ifdef FLAG_DO_PROFILE
// Profiler
-#include <cutils/log.h>
#include <time.h>
+
#define PROF_BUF_SIZE 100
static double profile_buf[PROF_BUF_SIZE];
static double profile_old[PROF_BUF_SIZE];
@@ -42,10 +58,10 @@ static unsigned int profile_counter[PROF_BUF_SIZE];
#define PROF_CLOSE do { PROF_END(PROF_BUF_SIZE - 1); PROF_OUTALL; } while(0)
#define PROF_END(prof_buf_id) profile_buf[prof_buf_id] += ((clock()) - profile_old[prof_buf_id])
#define PROF_CLOCKOUT(prof_buf_id) \
- LOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id]))
-#define PROF_OUTALL do { LOGI("--- %s ---", __FUNCTION__); prof_out(); } while(0)
+ AKLOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id]))
+#define PROF_OUTALL do { AKLOGI("--- %s ---", __FUNCTION__); prof_out(); } while(0)
-static void prof_reset(void) {
+static inline void prof_reset(void) {
for (int i = 0; i < PROF_BUF_SIZE; ++i) {
profile_buf[i] = 0;
profile_old[i] = 0;
@@ -53,11 +69,11 @@ static void prof_reset(void) {
}
}
-static void prof_out(void) {
+static inline void prof_out(void) {
if (profile_counter[PROF_BUF_SIZE - 1] != 1) {
- LOGI("Error: You must call PROF_OPEN before PROF_CLOSE.");
+ AKLOGI("Error: You must call PROF_OPEN before PROF_CLOSE.");
}
- LOGI("Total time is %6.3f ms.",
+ AKLOGI("Total time is %6.3f ms.",
profile_buf[PROF_BUF_SIZE - 1] * 1000 / (double)CLOCKS_PER_SEC);
double all = 0;
for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) {
@@ -66,7 +82,7 @@ static void prof_out(void) {
if (all == 0) all = 1;
for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) {
if (profile_buf[i] != 0) {
- LOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.",
+ AKLOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.",
i, (profile_buf[i] * 100 / all),
profile_buf[i] * 1000 / (double)CLOCKS_PER_SEC, profile_counter[i]);
}
@@ -98,21 +114,11 @@ static void prof_out(void) {
#define DEBUG_SHOW_FOUND_WORD false
#define DEBUG_NODE DEBUG_DICT_FULL
#define DEBUG_TRACE DEBUG_DICT_FULL
-#define DEBUG_PROXIMITY_INFO true
+#define DEBUG_PROXIMITY_INFO false
+#define DEBUG_PROXIMITY_CHARS false
#define DEBUG_CORRECTION false
-#define DEBUG_CORRECTION_FREQ true
-
-#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0)
-
-static char charBuf[50];
-
-static void dumpWord(const unsigned short* word, const int length) {
- for (int i = 0; i < length; ++i) {
- charBuf[i] = word[i];
- }
- charBuf[length] = 0;
- LOGI("[ %s ]", charBuf);
-}
+#define DEBUG_CORRECTION_FREQ false
+#define DEBUG_WORDS_PRIORITY_QUEUE false
#else // FLAG_DBG
@@ -123,10 +129,11 @@ static void dumpWord(const unsigned short* word, const int length) {
#define DEBUG_NODE false
#define DEBUG_TRACE false
#define DEBUG_PROXIMITY_INFO false
+#define DEBUG_PROXIMITY_CHARS false
#define DEBUG_CORRECTION false
#define DEBUG_CORRECTION_FREQ false
+#define DEBUG_WORDS_PRIORITY_QUEUE false
-#define DUMP_WORD(word, length)
#endif // FLAG_DBG
@@ -157,64 +164,91 @@ static void dumpWord(const unsigned short* word, const int length) {
#define FLAG_BIGRAM_FREQ 0x7F
#define DICTIONARY_VERSION_MIN 200
-// TODO: remove this constant when the switch to the new dict format is over
-#define DICTIONARY_HEADER_SIZE 2
-#define NEW_DICTIONARY_HEADER_SIZE 5
#define NOT_VALID_WORD -99
#define NOT_A_CHARACTER -1
#define NOT_A_DISTANCE -1
+#define NOT_A_COORDINATE -1
#define EQUIVALENT_CHAR_WITHOUT_DISTANCE_INFO -2
#define PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO -3
-#define NOT_A_INDEX -1
+#define ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO -4
+#define NOT_AN_INDEX -1
+#define NOT_A_FREQUENCY -1
#define KEYCODE_SPACE ' '
#define CALIBRATE_SCORE_BY_TOUCH_COORDINATES true
#define SUGGEST_WORDS_WITH_MISSING_CHARACTER true
-#define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true
#define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
#define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
-#define SUGGEST_WORDS_WITH_SPACE_PROXIMITY true
+#define SUGGEST_MULTIPLE_WORDS true
// The following "rate"s are used as a multiplier before dividing by 100, so they are in percent.
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X 12
-#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 67
+#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 58
+#define WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE 50
#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
#define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75
-#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
+#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 70
#define FULL_MATCHED_WORDS_PROMOTION_RATE 120
#define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90
+#define WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE 70
#define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105
-#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 160
+#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 148
+#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER 3
#define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 45
#define INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE 70
#define FIRST_CHAR_DIFFERENT_DEMOTION_RATE 96
#define TWO_WORDS_CAPITALIZED_DEMOTION_RATE 50
+#define TWO_WORDS_CORRECTION_DEMOTION_BASE 80
+#define TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER 1
#define ZERO_DISTANCE_PROMOTION_RATE 110
#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
#define HALF_SCORE_SQUARED_RADIUS 32.0f
+#define MAX_FREQ 255
-// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
+// This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions.
#define MAX_WORD_LENGTH_INTERNAL 48
+// This must be equal to ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE in KeyDetector.java
+#define ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE 2
+
+// Word limit for sub queues used in WordsPriorityQueuePool. Sub queues are temporary queues used
+// for better performance.
+// Holds up to 1 candidate for each word
+#define SUB_QUEUE_MAX_WORDS 1
+#define SUB_QUEUE_MAX_COUNT 10
+#define SUB_QUEUE_MIN_WORD_LENGTH 4
+#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 10
+#define MULTIPLE_WORDS_DEMOTION_RATE 80
+#define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6
+
+#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
+#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22
+
#define MAX_DEPTH_MULTIPLIER 3
-// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
-// word in the dictionary
-#define DEFAULT_MAX_UMLAUT_SEARCH_DEPTH 5
+#define FIRST_WORD_INDEX 0
+
+// TODO: Reduce this constant if possible; check the maximum number of digraphs in the same
+// word in the dictionary for languages with digraphs, like German and French
+#define DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH 5
// Minimum suggest depth for one word for all cases except for missing space suggestions.
#define MIN_SUGGEST_DEPTH 1
-#define MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION 3
+#define MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION 3
#define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3
-#define min(a,b) ((a)<(b)?(a):(b))
-#define max(a,b) ((a)>(b)?(a):(b))
+template<typename T> inline T min(T a, T b) { return a < b ? a : b; }
+template<typename T> inline T max(T a, T b) { return a > b ? a : b; }
// The ratio of neutral area radius to sweet spot radius.
#define NEUTRAL_AREA_RADIUS_RATIO 1.3f
+// DEBUG
+#define INPUTLENGTH_FOR_DEBUG -1
+#define MIN_OUTPUT_INDEX_FOR_DEBUG -1
+
#endif // LATINIME_DEFINES_H
diff --git a/native/src/dictionary.cpp b/native/jni/src/dictionary.cpp
index a49769bdb..981a983ee 100644
--- a/native/src/dictionary.cpp
+++ b/native/jni/src/dictionary.cpp
@@ -19,6 +19,7 @@
#define LOG_TAG "LatinIME: dictionary.cpp"
+#include "binary_format.h"
#include "dictionary.h"
namespace latinime {
@@ -26,33 +27,35 @@ namespace latinime {
// TODO: Change the type of all keyCodes to uint32_t
Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust,
int typedLetterMultiplier, int fullWordMultiplier,
- int maxWordLength, int maxWords, int maxAlternatives)
+ int maxWordLength, int maxWords)
: mDict((unsigned char*) dict), mDictSize(dictSize),
mMmapFd(mmapFd), mDictBufAdjust(dictBufAdjust),
// Checks whether it has the latest dictionary or the old dictionary
IS_LATEST_DICT_VERSION((((unsigned char*) dict)[0] & 0xFF) >= DICTIONARY_VERSION_MIN) {
if (DEBUG_DICT) {
if (MAX_WORD_LENGTH_INTERNAL < maxWordLength) {
- LOGI("Max word length (%d) is greater than %d",
+ AKLOGI("Max word length (%d) is greater than %d",
maxWordLength, MAX_WORD_LENGTH_INTERNAL);
- LOGI("IN NATIVE SUGGEST Version: %d", (mDict[0] & 0xFF));
+ AKLOGI("IN NATIVE SUGGEST Version: %d", (mDict[0] & 0xFF));
}
}
- mUnigramDictionary = new UnigramDictionary(mDict, typedLetterMultiplier, fullWordMultiplier,
- maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION);
- mBigramDictionary = new BigramDictionary(mDict, maxWordLength, maxAlternatives,
- IS_LATEST_DICT_VERSION, hasBigram(), this);
+ mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier);
+ mWordsPriorityQueuePool = new WordsPriorityQueuePool(
+ maxWords, SUB_QUEUE_MAX_WORDS, maxWordLength);
+ const unsigned int headerSize = BinaryFormat::getHeaderSize(mDict);
+ mUnigramDictionary = new UnigramDictionary(mDict + headerSize, typedLetterMultiplier,
+ fullWordMultiplier, maxWordLength, maxWords, IS_LATEST_DICT_VERSION);
+ mBigramDictionary = new BigramDictionary(mDict + headerSize, maxWordLength,
+ IS_LATEST_DICT_VERSION, true /* hasBigram */, this);
}
Dictionary::~Dictionary() {
+ delete mCorrection;
+ delete mWordsPriorityQueuePool;
delete mUnigramDictionary;
delete mBigramDictionary;
}
-bool Dictionary::hasBigram() {
- return ((mDict[1] & 0xFF) == 1);
-}
-
bool Dictionary::isValidWord(unsigned short *word, int length) {
return mUnigramDictionary->isValidWord(word, length);
}
diff --git a/native/jni/src/dictionary.h b/native/jni/src/dictionary.h
new file mode 100644
index 000000000..139d3f0d7
--- /dev/null
+++ b/native/jni/src/dictionary.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICTIONARY_H
+#define LATINIME_DICTIONARY_H
+
+#include "bigram_dictionary.h"
+#include "char_utils.h"
+#include "correction.h"
+#include "defines.h"
+#include "proximity_info.h"
+#include "unigram_dictionary.h"
+#include "words_priority_queue_pool.h"
+
+namespace latinime {
+
+class Dictionary {
+ public:
+ Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler,
+ int fullWordMultiplier, int maxWordLength, int maxWords);
+
+ int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates,
+ int *codes, int codesSize, int flags, unsigned short *outWords, int *frequencies) {
+ return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool,
+ mCorrection, xcoordinates, ycoordinates, codes,
+ codesSize, flags, outWords, frequencies);
+ }
+
+ // TODO: Call mBigramDictionary instead of mUnigramDictionary
+ int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
+ unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams) {
+ return mBigramDictionary->getBigrams(word, length, codes, codesSize, outWords, frequencies,
+ maxWordLength, maxBigrams);
+ }
+
+ bool isValidWord(unsigned short *word, int length);
+ void *getDict() { return (void *)mDict; }
+ int getDictSize() { return mDictSize; }
+ int getMmapFd() { return mMmapFd; }
+ int getDictBufAdjust() { return mDictBufAdjust; }
+ ~Dictionary();
+
+ // public static utility methods
+ // static inline methods should be defined in the header file
+ static int wideStrLen(unsigned short *str);
+
+ private:
+ bool hasBigram();
+
+ const unsigned char *mDict;
+
+ // Used only for the mmap version of dictionary loading, but we use these as dummy variables
+ // also for the malloc version.
+ const int mDictSize;
+ const int mMmapFd;
+ const int mDictBufAdjust;
+
+ const bool IS_LATEST_DICT_VERSION;
+ UnigramDictionary *mUnigramDictionary;
+ BigramDictionary *mBigramDictionary;
+ WordsPriorityQueuePool *mWordsPriorityQueuePool;
+ Correction *mCorrection;
+};
+
+// public static utility methods
+// static inline methods should be defined in the header file
+inline int Dictionary::wideStrLen(unsigned short *str) {
+ if (!str) return 0;
+ unsigned short *end = str;
+ while (*end)
+ end++;
+ return end - str;
+}
+} // namespace latinime
+
+#endif // LATINIME_DICTIONARY_H
diff --git a/native/src/proximity_info.cpp b/native/jni/src/proximity_info.cpp
index 763a3a174..c00c4c20f 100644
--- a/native/src/proximity_info.cpp
+++ b/native/jni/src/proximity_info.cpp
@@ -16,10 +16,11 @@
#include <assert.h>
#include <stdio.h>
-#include <string.h>
+#include <string>
#define LOG_TAG "LatinIME: proximity_info.cpp"
+#include "additional_proximity_chars.h"
#include "dictionary.h"
#include "proximity_info.h"
@@ -33,26 +34,30 @@ inline void copyOrFillZero(void *to, const void *from, size_t size) {
}
}
-ProximityInfo::ProximityInfo(const int maxProximityCharsSize, const int keyboardWidth,
- const int keyboardHeight, const int gridWidth, const int gridHeight,
- const uint32_t *proximityCharsArray, const int keyCount, const int32_t *keyXCoordinates,
+ProximityInfo::ProximityInfo(const std::string localeStr, const int maxProximityCharsSize,
+ const int keyboardWidth, const int keyboardHeight, const int gridWidth,
+ const int gridHeight, const int mostCommonKeyWidth,
+ const int32_t *proximityCharsArray, const int keyCount, const int32_t *keyXCoordinates,
const int32_t *keyYCoordinates, const int32_t *keyWidths, const int32_t *keyHeights,
const int32_t *keyCharCodes, const float *sweetSpotCenterXs, const float *sweetSpotCenterYs,
const float *sweetSpotRadii)
: MAX_PROXIMITY_CHARS_SIZE(maxProximityCharsSize), KEYBOARD_WIDTH(keyboardWidth),
KEYBOARD_HEIGHT(keyboardHeight), GRID_WIDTH(gridWidth), GRID_HEIGHT(gridHeight),
+ MOST_COMMON_KEY_WIDTH_SQUARE(mostCommonKeyWidth * mostCommonKeyWidth),
CELL_WIDTH((keyboardWidth + gridWidth - 1) / gridWidth),
CELL_HEIGHT((keyboardHeight + gridHeight - 1) / gridHeight),
KEY_COUNT(min(keyCount, MAX_KEY_COUNT_IN_A_KEYBOARD)),
HAS_TOUCH_POSITION_CORRECTION_DATA(keyCount > 0 && keyXCoordinates && keyYCoordinates
&& keyWidths && keyHeights && keyCharCodes && sweetSpotCenterXs
&& sweetSpotCenterYs && sweetSpotRadii),
- mInputXCoordinates(NULL), mInputYCoordinates(NULL),
+ mLocaleStr(localeStr),
+ mInputXCoordinates(0), mInputYCoordinates(0),
mTouchPositionCorrectionEnabled(false) {
const int proximityGridLength = GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE;
- mProximityCharsArray = new uint32_t[proximityGridLength];
+ mProximityCharsArray = new int32_t[proximityGridLength];
+ mInputCodes = new int32_t[MAX_PROXIMITY_CHARS_SIZE * MAX_WORD_LENGTH_INTERNAL];
if (DEBUG_PROXIMITY_INFO) {
- LOGI("Create proximity info array %d", proximityGridLength);
+ AKLOGI("Create proximity info array %d", proximityGridLength);
}
memcpy(mProximityCharsArray, proximityCharsArray,
proximityGridLength * sizeof(mProximityCharsArray[0]));
@@ -92,6 +97,7 @@ void ProximityInfo::initializeCodeToKeyIndex() {
ProximityInfo::~ProximityInfo() {
delete[] mNormalizedSquaredDistances;
delete[] mProximityCharsArray;
+ delete[] mInputCodes;
}
inline int ProximityInfo::getStartIndexFromCoordinates(const int x, const int y) const {
@@ -100,13 +106,21 @@ inline int ProximityInfo::getStartIndexFromCoordinates(const int x, const int y)
}
bool ProximityInfo::hasSpaceProximity(const int x, const int y) const {
+ if (x < 0 || y < 0) {
+ if (DEBUG_DICT) {
+ AKLOGI("HasSpaceProximity: Illegal coordinates (%d, %d)", x, y);
+ assert(false);
+ }
+ return false;
+ }
+
const int startIndex = getStartIndexFromCoordinates(x, y);
if (DEBUG_PROXIMITY_INFO) {
- LOGI("hasSpaceProximity: index %d", startIndex);
+ AKLOGI("hasSpaceProximity: index %d, %d, %d", startIndex, x, y);
}
for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) {
if (DEBUG_PROXIMITY_INFO) {
- LOGI("Index: %d", mProximityCharsArray[startIndex + i]);
+ AKLOGI("Index: %d", mProximityCharsArray[startIndex + i]);
}
if (mProximityCharsArray[startIndex + i] == KEYCODE_SPACE) {
return true;
@@ -115,10 +129,126 @@ bool ProximityInfo::hasSpaceProximity(const int x, const int y) const {
return false;
}
-// TODO: Calculate nearby codes here.
-void ProximityInfo::setInputParams(const int* inputCodes, const int inputLength,
+bool ProximityInfo::isOnKey(const int keyId, const int x, const int y) const {
+ if (keyId < 0) return true; // NOT_A_ID is -1, but return whenever < 0 just in case
+ const int left = mKeyXCoordinates[keyId];
+ const int top = mKeyYCoordinates[keyId];
+ const int right = left + mKeyWidths[keyId] + 1;
+ const int bottom = top + mKeyHeights[keyId];
+ return left < right && top < bottom && x >= left && x < right && y >= top && y < bottom;
+}
+
+int ProximityInfo::squaredDistanceToEdge(const int keyId, const int x, const int y) const {
+ if (keyId < 0) return true; // NOT_A_ID is -1, but return whenever < 0 just in case
+ const int left = mKeyXCoordinates[keyId];
+ const int top = mKeyYCoordinates[keyId];
+ const int right = left + mKeyWidths[keyId];
+ const int bottom = top + mKeyHeights[keyId];
+ const int edgeX = x < left ? left : (x > right ? right : x);
+ const int edgeY = y < top ? top : (y > bottom ? bottom : y);
+ const int dx = x - edgeX;
+ const int dy = y - edgeY;
+ return dx * dx + dy * dy;
+}
+
+void ProximityInfo::calculateNearbyKeyCodes(
+ const int x, const int y, const int32_t primaryKey, int *inputCodes) const {
+ int insertPos = 0;
+ inputCodes[insertPos++] = primaryKey;
+ const int startIndex = getStartIndexFromCoordinates(x, y);
+ if (startIndex >= 0) {
+ for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) {
+ const int32_t c = mProximityCharsArray[startIndex + i];
+ if (c < KEYCODE_SPACE || c == primaryKey) {
+ continue;
+ }
+ const int keyIndex = getKeyIndex(c);
+ const bool onKey = isOnKey(keyIndex, x, y);
+ const int distance = squaredDistanceToEdge(keyIndex, x, y);
+ if (onKey || distance < MOST_COMMON_KEY_WIDTH_SQUARE) {
+ inputCodes[insertPos++] = c;
+ if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) {
+ if (DEBUG_DICT) {
+ assert(false);
+ }
+ return;
+ }
+ }
+ }
+ const int additionalProximitySize =
+ AdditionalProximityChars::getAdditionalCharsSize(&mLocaleStr, primaryKey);
+ if (additionalProximitySize > 0) {
+ inputCodes[insertPos++] = ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE;
+ if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) {
+ if (DEBUG_DICT) {
+ assert(false);
+ }
+ return;
+ }
+
+ const int32_t* additionalProximityChars =
+ AdditionalProximityChars::getAdditionalChars(&mLocaleStr, primaryKey);
+ for (int j = 0; j < additionalProximitySize; ++j) {
+ const int32_t ac = additionalProximityChars[j];
+ int k = 0;
+ for (; k < insertPos; ++k) {
+ if ((int)ac == inputCodes[k]) {
+ break;
+ }
+ }
+ if (k < insertPos) {
+ continue;
+ }
+ inputCodes[insertPos++] = ac;
+ if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) {
+ if (DEBUG_DICT) {
+ assert(false);
+ }
+ return;
+ }
+ }
+ }
+ }
+ // Add a delimiter for the proximity characters
+ for (int i = insertPos; i < MAX_PROXIMITY_CHARS_SIZE; ++i) {
+ inputCodes[i] = NOT_A_CODE;
+ }
+}
+
+void ProximityInfo::setInputParams(const int32_t* inputCodes, const int inputLength,
const int* xCoordinates, const int* yCoordinates) {
- mInputCodes = inputCodes;
+ memset(mInputCodes, 0,
+ MAX_WORD_LENGTH_INTERNAL * MAX_PROXIMITY_CHARS_SIZE * sizeof(mInputCodes[0]));
+
+ for (int i = 0; i < inputLength; ++i) {
+ const int32_t primaryKey = inputCodes[i];
+ const int x = xCoordinates[i];
+ const int y = yCoordinates[i];
+ int *proximities = &mInputCodes[i * MAX_PROXIMITY_CHARS_SIZE];
+ calculateNearbyKeyCodes(x, y, primaryKey, proximities);
+ }
+
+ if (DEBUG_PROXIMITY_CHARS) {
+ for (int i = 0; i < inputLength; ++i) {
+ AKLOGI("---");
+ for (int j = 0; j < MAX_PROXIMITY_CHARS_SIZE; ++j) {
+ int icc = mInputCodes[i * MAX_PROXIMITY_CHARS_SIZE + j];
+ int icfjc = inputCodes[i * MAX_PROXIMITY_CHARS_SIZE + j];
+ icc+= 0;
+ icfjc += 0;
+ AKLOGI("--- (%d)%c,%c", i, icc, icfjc);
+ AKLOGI("--- A<%d>,B<%d>", icc, icfjc);
+ }
+ }
+ }
+ //Keep for debug, sorry
+ //for (int i = 0; i < MAX_WORD_LENGTH_INTERNAL * MAX_PROXIMITY_CHARS_SIZE; ++i) {
+ //if (i < inputLength * MAX_PROXIMITY_CHARS_SIZE) {
+ //mInputCodes[i] = mInputCodesFromJava[i];
+ //} else {
+ // mInputCodes[i] = 0;
+ // }
+ //}
mInputXCoordinates = xCoordinates;
mInputYCoordinates = yCoordinates;
mTouchPositionCorrectionEnabled =
@@ -128,12 +258,35 @@ void ProximityInfo::setInputParams(const int* inputCodes, const int inputLength,
mPrimaryInputWord[i] = getPrimaryCharAt(i);
}
mPrimaryInputWord[inputLength] = 0;
+ if (DEBUG_PROXIMITY_CHARS) {
+ AKLOGI("--- setInputParams");
+ }
for (int i = 0; i < mInputLength; ++i) {
const int *proximityChars = getProximityCharsAt(i);
+ const int primaryKey = proximityChars[0];
+ const int x = xCoordinates[i];
+ const int y = yCoordinates[i];
+ if (DEBUG_PROXIMITY_CHARS) {
+ int a = x + y + primaryKey;
+ a += 0;
+ AKLOGI("--- Primary = %c, x = %d, y = %d", primaryKey, x, y);
+ // Keep debug code just in case
+ //int proximities[50];
+ //for (int m = 0; m < 50; ++m) {
+ //proximities[m] = 0;
+ //}
+ //calculateNearbyKeyCodes(x, y, primaryKey, proximities);
+ //for (int l = 0; l < 50 && proximities[l] > 0; ++l) {
+ //if (DEBUG_PROXIMITY_CHARS) {
+ //AKLOGI("--- native Proximity (%d) = %c", l, proximities[l]);
+ //}
+ //}
+ }
for (int j = 0; j < MAX_PROXIMITY_CHARS_SIZE && proximityChars[j] > 0; ++j) {
const int currentChar = proximityChars[j];
- const int keyIndex = getKeyIndex(currentChar);
- const float squaredDistance = calculateNormalizedSquaredDistance(keyIndex, i);
+ const float squaredDistance = hasInputCoordinates()
+ ? calculateNormalizedSquaredDistance(getKeyIndex(currentChar), i)
+ : NOT_A_DISTANCE_FLOAT;
if (squaredDistance >= 0.0f) {
mNormalizedSquaredDistances[i * MAX_PROXIMITY_CHARS_SIZE + j] =
(int)(squaredDistance * NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR);
@@ -142,6 +295,9 @@ void ProximityInfo::setInputParams(const int* inputCodes, const int inputLength,
? EQUIVALENT_CHAR_WITHOUT_DISTANCE_INFO
: PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO;
}
+ if (DEBUG_PROXIMITY_CHARS) {
+ AKLOGI("--- Proximity (%d) = %c", j, currentChar);
+ }
}
}
}
@@ -150,26 +306,32 @@ inline float square(const float x) { return x * x; }
float ProximityInfo::calculateNormalizedSquaredDistance(
const int keyIndex, const int inputIndex) const {
- static const float NOT_A_DISTANCE_FLOAT = -1.0f;
- if (keyIndex == NOT_A_INDEX) {
+ if (keyIndex == NOT_AN_INDEX) {
return NOT_A_DISTANCE_FLOAT;
}
if (!hasSweetSpotData(keyIndex)) {
return NOT_A_DISTANCE_FLOAT;
}
+ if (NOT_A_COORDINATE == mInputXCoordinates[inputIndex]) {
+ return NOT_A_DISTANCE_FLOAT;
+ }
const float squaredDistance = calculateSquaredDistanceFromSweetSpotCenter(keyIndex, inputIndex);
const float squaredRadius = square(mSweetSpotRadii[keyIndex]);
return squaredDistance / squaredRadius;
}
+bool ProximityInfo::hasInputCoordinates() const {
+ return mInputXCoordinates && mInputYCoordinates;
+}
+
int ProximityInfo::getKeyIndex(const int c) const {
- if (KEY_COUNT == 0 || !mInputXCoordinates || !mInputYCoordinates) {
+ if (KEY_COUNT == 0) {
// We do not have the coordinate data
- return NOT_A_INDEX;
+ return NOT_AN_INDEX;
}
- const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
+ const unsigned short baseLowerC = toBaseLowerCase(c);
if (baseLowerC > MAX_CHAR_CODE) {
- return NOT_A_INDEX;
+ return NOT_AN_INDEX;
}
return mCodeToKeyIndex[baseLowerC];
}
@@ -232,7 +394,7 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde
const unsigned short c, const bool checkProximityChars, int *proximityIndex) const {
const int *currentChars = getProximityCharsAt(index);
const int firstChar = currentChars[0];
- const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
+ const unsigned short baseLowerC = toBaseLowerCase(c);
// The first char in the array is what user typed. If it matches right away,
// that means the user typed that same char for this pos.
@@ -245,12 +407,13 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde
// If the non-accented, lowercased version of that first character matches c,
// then we have a non-accented version of the accented character the user
// typed. Treat it as a close char.
- if (Dictionary::toBaseLowerCase(firstChar) == baseLowerC)
+ if (toBaseLowerCase(firstChar) == baseLowerC)
return NEAR_PROXIMITY_CHAR;
// Not an exact nor an accent-alike match: search the list of close keys
int j = 1;
- while (j < MAX_PROXIMITY_CHARS_SIZE && currentChars[j] > 0) {
+ while (j < MAX_PROXIMITY_CHARS_SIZE
+ && currentChars[j] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) {
const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
if (matched) {
if (proximityIndex) {
@@ -260,6 +423,21 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde
}
++j;
}
+ if (j < MAX_PROXIMITY_CHARS_SIZE
+ && currentChars[j] == ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) {
+ ++j;
+ while (j < MAX_PROXIMITY_CHARS_SIZE
+ && currentChars[j] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) {
+ const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
+ if (matched) {
+ if (proximityIndex) {
+ *proximityIndex = j;
+ }
+ return ADDITIONAL_PROXIMITY_CHAR;
+ }
+ ++j;
+ }
+ }
// Was not included, signal this as an unrelated character.
return UNRELATED_CHAR;
diff --git a/native/src/proximity_info.h b/native/jni/src/proximity_info.h
index 35e354c6e..c1eefeacc 100644
--- a/native/src/proximity_info.h
+++ b/native/jni/src/proximity_info.h
@@ -18,6 +18,7 @@
#define LATINIME_PROXIMITY_INFO_H
#include <stdint.h>
+#include <string>
#include "defines.h"
@@ -26,7 +27,7 @@ namespace latinime {
class Correction;
class ProximityInfo {
-public:
+ public:
static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2 = 10;
static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR =
1 << NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2;
@@ -38,25 +39,28 @@ public:
// It is a char located nearby on the keyboard
NEAR_PROXIMITY_CHAR,
// It is an unrelated char
- UNRELATED_CHAR
+ UNRELATED_CHAR,
+ // Additional proximity char which can differ by language.
+ ADDITIONAL_PROXIMITY_CHAR
} ProximityType;
- ProximityInfo(const int maxProximityCharsSize, const int keyboardWidth,
- const int keybaordHeight, const int gridWidth, const int gridHeight,
- const uint32_t *proximityCharsArray, const int keyCount, const int32_t *keyXCoordinates,
+ ProximityInfo(const std::string localeStr, const int maxProximityCharsSize,
+ const int keyboardWidth, const int keybaordHeight, const int gridWidth,
+ const int gridHeight, const int mostCommonkeyWidth,
+ const int32_t *proximityCharsArray, const int keyCount, const int32_t *keyXCoordinates,
const int32_t *keyYCoordinates, const int32_t *keyWidths, const int32_t *keyHeights,
const int32_t *keyCharCodes, const float *sweetSpotCenterXs,
const float *sweetSpotCenterYs, const float *sweetSpotRadii);
~ProximityInfo();
bool hasSpaceProximity(const int x, const int y) const;
- void setInputParams(const int* inputCodes, const int inputLength,
+ void setInputParams(const int32_t *inputCodes, const int inputLength,
const int *xCoordinates, const int *yCoordinates);
const int* getProximityCharsAt(const int index) const;
unsigned short getPrimaryCharAt(const int index) const;
bool existsCharInProximityAt(const int index, const int c) const;
bool existsAdjacentProximityChars(const int index) const;
ProximityType getMatchedProximityId(const int index, const unsigned short c,
- const bool checkProximityChars, int *proximityIndex = NULL) const;
+ const bool checkProximityChars, int *proximityIndex = 0) const;
int getNormalizedSquaredDistance(const int inputIndex, const int proximityIndex) const {
return mNormalizedSquaredDistances[inputIndex * MAX_PROXIMITY_CHARS_SIZE + proximityIndex];
}
@@ -68,38 +72,49 @@ public:
return mTouchPositionCorrectionEnabled;
}
-private:
+ private:
// The max number of the keys in one keyboard layout
static const int MAX_KEY_COUNT_IN_A_KEYBOARD = 64;
// The upper limit of the char code in mCodeToKeyIndex
static const int MAX_CHAR_CODE = 127;
+ static const float NOT_A_DISTANCE_FLOAT = -1.0f;
+ static const int NOT_A_CODE = -1;
int getStartIndexFromCoordinates(const int x, const int y) const;
void initializeCodeToKeyIndex();
float calculateNormalizedSquaredDistance(const int keyIndex, const int inputIndex) const;
float calculateSquaredDistanceFromSweetSpotCenter(
const int keyIndex, const int inputIndex) const;
+ bool hasInputCoordinates() const;
int getKeyIndex(const int c) const;
bool hasSweetSpotData(const int keyIndex) const {
// When there are no calibration data for a key,
// the radius of the key is assigned to zero.
return mSweetSpotRadii[keyIndex] > 0.0;
}
+ bool isOnKey(const int keyId, const int x, const int y) const;
+ int squaredDistanceToEdge(const int keyId, const int x, const int y) const;
+ void calculateNearbyKeyCodes(
+ const int x, const int y, const int32_t primaryKey, int *inputCodes) const;
const int MAX_PROXIMITY_CHARS_SIZE;
const int KEYBOARD_WIDTH;
const int KEYBOARD_HEIGHT;
const int GRID_WIDTH;
const int GRID_HEIGHT;
+ const int MOST_COMMON_KEY_WIDTH_SQUARE;
const int CELL_WIDTH;
const int CELL_HEIGHT;
const int KEY_COUNT;
const bool HAS_TOUCH_POSITION_CORRECTION_DATA;
- const int *mInputCodes;
+ const std::string mLocaleStr;
+ // TODO: remove this
+ const int *mInputCodesFromJava;
+ int32_t *mInputCodes;
const int *mInputXCoordinates;
const int *mInputYCoordinates;
bool mTouchPositionCorrectionEnabled;
- uint32_t *mProximityCharsArray;
+ int32_t *mProximityCharsArray;
int *mNormalizedSquaredDistances;
int32_t mKeyXCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD];
int32_t mKeyYCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD];
diff --git a/native/jni/src/terminal_attributes.h b/native/jni/src/terminal_attributes.h
new file mode 100644
index 000000000..1f9815936
--- /dev/null
+++ b/native/jni/src/terminal_attributes.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_TERMINAL_ATTRIBUTES_H
+#define LATINIME_TERMINAL_ATTRIBUTES_H
+
+#include "unigram_dictionary.h"
+
+namespace latinime {
+
+/**
+ * This class encapsulates information about a terminal that allows to
+ * retrieve local node attributes like the list of shortcuts without
+ * exposing the format structure to the client.
+ */
+class TerminalAttributes {
+ public:
+ class ShortcutIterator {
+ const uint8_t* const mDict;
+ bool mHasNextShortcutTarget;
+ int mPos;
+
+ public:
+ ShortcutIterator(const uint8_t* dict, const int pos, const uint8_t flags) : mDict(dict),
+ mPos(pos) {
+ mHasNextShortcutTarget = (0 != (flags & UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS));
+ }
+
+ inline bool hasNextShortcutTarget() const {
+ return mHasNextShortcutTarget;
+ }
+
+ // Gets the shortcut target itself as a uint16_t string. For parameters and return value
+ // see BinaryFormat::getWordAtAddress.
+ inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) {
+ const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
+ mHasNextShortcutTarget =
+ 0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
+ int shortcutAddress =
+ BinaryFormat::getAttributeAddressAndForwardPointer(mDict, shortcutFlags, &mPos);
+ return BinaryFormat::getWordAtAddress(mDict, shortcutAddress, maxDepth, outWord);
+ }
+ };
+
+ private:
+ const uint8_t* const mDict;
+ const uint8_t mFlags;
+ const int mStartPos;
+
+ public:
+ TerminalAttributes(const uint8_t* const dict, const uint8_t flags, const int pos) :
+ mDict(dict), mFlags(flags), mStartPos(pos) {
+ }
+
+ inline bool isShortcutOnly() const {
+ return 0 != (mFlags & UnigramDictionary::FLAG_IS_SHORTCUT_ONLY);
+ }
+
+ inline ShortcutIterator getShortcutIterator() const {
+ return ShortcutIterator(mDict, mStartPos, mFlags);
+ }
+};
+} // namespace latinime
+
+#endif // LATINIME_TERMINAL_ATTRIBUTES_H
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp
new file mode 100644
index 000000000..ed4c066f3
--- /dev/null
+++ b/native/jni/src/unigram_dictionary.cpp
@@ -0,0 +1,894 @@
+/*
+**
+** Copyright 2010, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+** http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#include <assert.h>
+#include <string.h>
+
+#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
+
+#include "char_utils.h"
+#include "dictionary.h"
+#include "unigram_dictionary.h"
+
+#include "binary_format.h"
+#include "terminal_attributes.h"
+
+namespace latinime {
+
+const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
+ { { 'a', 'e', 0x00E4 }, // U+00E4 : LATIN SMALL LETTER A WITH DIAERESIS
+ { 'o', 'e', 0x00F6 }, // U+00F6 : LATIN SMALL LETTER O WITH DIAERESIS
+ { 'u', 'e', 0x00FC } }; // U+00FC : LATIN SMALL LETTER U WITH DIAERESIS
+
+const UnigramDictionary::digraph_t UnigramDictionary::FRENCH_LIGATURES_DIGRAPHS[] =
+ { { 'a', 'e', 0x00E6 }, // U+00E6 : LATIN SMALL LETTER AE
+ { 'o', 'e', 0x0153 } }; // U+0153 : LATIN SMALL LIGATURE OE
+
+// TODO: check the header
+UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
+ int fullWordMultiplier, int maxWordLength, int maxWords,
+ const bool isLatestDictVersion)
+ : DICT_ROOT(streamStart), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
+ IS_LATEST_DICT_VERSION(isLatestDictVersion),
+ TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
+ // TODO : remove this variable.
+ ROOT_POS(0),
+ BYTES_IN_ONE_CHAR(sizeof(int)),
+ MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH) {
+ if (DEBUG_DICT) {
+ AKLOGI("UnigramDictionary - constructor");
+ }
+}
+
+UnigramDictionary::~UnigramDictionary() {
+}
+
+static inline unsigned int getCodesBufferSize(const int *codes, const int codesSize) {
+ return sizeof(*codes) * codesSize;
+}
+
+// TODO: This needs to take a const unsigned short* and not tinker with its contents
+static inline void addWord(
+ unsigned short *word, int length, int frequency, WordsPriorityQueue *queue) {
+ queue->push(frequency, word, length);
+}
+
+// Return the replacement code point for a digraph, or 0 if none.
+int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, const int codesSize,
+ const digraph_t* const digraphs, const unsigned int digraphsSize) const {
+
+ // There can't be a digraph if we don't have at least 2 characters to examine
+ if (i + 2 > codesSize) return false;
+
+ // Search for the first char of some digraph
+ int lastDigraphIndex = -1;
+ const int thisChar = codes[i];
+ for (lastDigraphIndex = digraphsSize - 1; lastDigraphIndex >= 0; --lastDigraphIndex) {
+ if (thisChar == digraphs[lastDigraphIndex].first) break;
+ }
+ // No match: return early
+ if (lastDigraphIndex < 0) return 0;
+
+ // It's an interesting digraph if the second char matches too.
+ if (digraphs[lastDigraphIndex].second == codes[i + 1]) {
+ return digraphs[lastDigraphIndex].replacement;
+ } else {
+ return 0;
+ }
+}
+
+// Mostly the same arguments as the non-recursive version, except:
+// codes is the original value. It points to the start of the work buffer, and gets passed as is.
+// codesSize is the size of the user input (thus, it is the size of codesSrc).
+// codesDest is the current point in the work buffer.
+// codesSrc is the current point in the user-input, original, content-unmodified buffer.
+// codesRemain is the remaining size in codesSrc.
+void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
+ const int *xcoordinates, const int *ycoordinates, const int *codesBuffer,
+ int *xCoordinatesBuffer, int *yCoordinatesBuffer,
+ const int codesBufferSize, const int flags, const int *codesSrc,
+ const int codesRemain, const int currentDepth, int *codesDest, Correction *correction,
+ WordsPriorityQueuePool *queuePool,
+ const digraph_t* const digraphs, const unsigned int digraphsSize) {
+
+ const int startIndex = codesDest - codesBuffer;
+ if (currentDepth < MAX_DIGRAPH_SEARCH_DEPTH) {
+ for (int i = 0; i < codesRemain; ++i) {
+ xCoordinatesBuffer[startIndex + i] = xcoordinates[codesBufferSize - codesRemain + i];
+ yCoordinatesBuffer[startIndex + i] = ycoordinates[codesBufferSize - codesRemain + i];
+ const int replacementCodePoint =
+ getDigraphReplacement(codesSrc, i, codesRemain, digraphs, digraphsSize);
+ if (0 != replacementCodePoint) {
+ // Found a digraph. We will try both spellings. eg. the word is "pruefen"
+
+ // Copy the word up to the first char of the digraph, including proximity chars,
+ // and overwrite the primary code with the replacement code point. Then, continue
+ // processing on the remaining part of the word, skipping the second char of the
+ // digraph.
+ // In our example, copy "pru", replace "u" with the version with the diaeresis and
+ // continue running on "fen".
+ // Make i the index of the second char of the digraph for simplicity. Forgetting
+ // to do that results in an infinite recursion so take care!
+ ++i;
+ memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
+ codesDest[(i - 1) * (BYTES_IN_ONE_CHAR / sizeof(codesDest[0]))] =
+ replacementCodePoint;
+ getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
+ codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, flags,
+ codesSrc + i + 1, codesRemain - i - 1,
+ currentDepth + 1, codesDest + i, correction,
+ queuePool, digraphs, digraphsSize);
+
+ // Copy the second char of the digraph in place, then continue processing on
+ // the remaining part of the word.
+ // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
+ memcpy(codesDest + i, codesSrc + i, BYTES_IN_ONE_CHAR);
+ getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
+ codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, flags,
+ codesSrc + i, codesRemain - i, currentDepth + 1,
+ codesDest + i, correction, queuePool,
+ digraphs, digraphsSize);
+ return;
+ }
+ }
+ }
+
+ // If we come here, we hit the end of the word: let's check it against the dictionary.
+ // In our example, we'll come here once for "prufen" and then once for "pruefen".
+ // If the word contains several digraphs, we'll come it for the product of them.
+ // eg. if the word is "ueberpruefen" we'll test, in order, against
+ // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
+ const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
+ if (0 != remainingBytes) {
+ memcpy(codesDest, codesSrc, remainingBytes);
+ memcpy(&xCoordinatesBuffer[startIndex], &xcoordinates[codesBufferSize - codesRemain],
+ sizeof(int) * codesRemain);
+ memcpy(&yCoordinatesBuffer[startIndex], &ycoordinates[codesBufferSize - codesRemain],
+ sizeof(int) * codesRemain);
+ }
+
+ getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer,
+ startIndex + codesRemain, flags, correction,
+ queuePool);
+}
+
+int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo,
+ WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates,
+ const int *ycoordinates, const int *codes, const int codesSize, const int flags,
+ unsigned short *outWords, int *frequencies) {
+
+ queuePool->clearAll();
+ Correction* masterCorrection = correction;
+ if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
+ { // Incrementally tune the word and try all possibilities
+ int codesBuffer[getCodesBufferSize(codes, codesSize)];
+ int xCoordinatesBuffer[codesSize];
+ int yCoordinatesBuffer[codesSize];
+ getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
+ xCoordinatesBuffer, yCoordinatesBuffer,
+ codesSize, flags, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool,
+ GERMAN_UMLAUT_DIGRAPHS,
+ sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]));
+ } else if (REQUIRES_FRENCH_LIGATURES_PROCESSING & flags) {
+ int codesBuffer[getCodesBufferSize(codes, codesSize)];
+ int xCoordinatesBuffer[codesSize];
+ int yCoordinatesBuffer[codesSize];
+ getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
+ xCoordinatesBuffer, yCoordinatesBuffer,
+ codesSize, flags, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool,
+ FRENCH_LIGATURES_DIGRAPHS,
+ sizeof(FRENCH_LIGATURES_DIGRAPHS) / sizeof(FRENCH_LIGATURES_DIGRAPHS[0]));
+ } else { // Normal processing
+ getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, flags,
+ masterCorrection, queuePool);
+ }
+
+ PROF_START(20);
+ if (DEBUG_DICT) {
+ double ns = queuePool->getMasterQueue()->getHighestNormalizedScore(
+ proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0);
+ ns += 0;
+ AKLOGI("Max normalized score = %f", ns);
+ }
+ const int suggestedWordsCount =
+ queuePool->getMasterQueue()->outputSuggestions(frequencies, outWords);
+
+ if (DEBUG_DICT) {
+ double ns = queuePool->getMasterQueue()->getHighestNormalizedScore(
+ proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0);
+ ns += 0;
+ AKLOGI("Returning %d words", suggestedWordsCount);
+ /// Print the returned words
+ for (int j = 0; j < suggestedWordsCount; ++j) {
+ short unsigned int* w = outWords + j * MAX_WORD_LENGTH;
+ char s[MAX_WORD_LENGTH];
+ for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
+ AKLOGI("%s %i", s, frequencies[j]);
+ }
+ }
+ PROF_END(20);
+ PROF_CLOSE;
+ return suggestedWordsCount;
+}
+
+void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
+ const int *xcoordinates, const int *ycoordinates, const int *codes,
+ const int inputLength, const int flags, Correction *correction,
+ WordsPriorityQueuePool *queuePool) {
+
+ PROF_OPEN;
+ PROF_START(0);
+ PROF_END(0);
+
+ PROF_START(1);
+ const bool useFullEditDistance = USE_FULL_EDIT_DISTANCE & flags;
+ getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance,
+ inputLength, correction, queuePool);
+ PROF_END(1);
+
+ PROF_START(2);
+ // Note: This line is intentionally left blank
+ PROF_END(2);
+
+ PROF_START(3);
+ // Note: This line is intentionally left blank
+ PROF_END(3);
+
+ PROF_START(4);
+ bool hasAutoCorrectionCandidate = false;
+ WordsPriorityQueue* masterQueue = queuePool->getMasterQueue();
+ if (masterQueue->size() > 0) {
+ double nsForMaster = masterQueue->getHighestNormalizedScore(
+ proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0);
+ hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD);
+ }
+ PROF_END(4);
+
+ PROF_START(5);
+ // Multiple word suggestions
+ if (SUGGEST_MULTIPLE_WORDS
+ && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) {
+ getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
+ useFullEditDistance, inputLength, correction, queuePool,
+ hasAutoCorrectionCandidate);
+ }
+ PROF_END(5);
+
+ PROF_START(6);
+ // Note: This line is intentionally left blank
+ PROF_END(6);
+
+ if (DEBUG_DICT) {
+ queuePool->dumpSubQueue1TopSuggestions();
+ for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
+ WordsPriorityQueue* queue = queuePool->getSubQueue(FIRST_WORD_INDEX, i);
+ if (queue->size() > 0) {
+ WordsPriorityQueue::SuggestedWord* sw = queue->top();
+ const int score = sw->mScore;
+ const unsigned short* word = sw->mWord;
+ const int wordLength = sw->mWordLength;
+ double ns = Correction::RankingAlgorithm::calcNormalizedScore(
+ proximityInfo->getPrimaryInputWord(), i, word, wordLength, score);
+ ns += 0;
+ AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns,
+ (ns > TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD));
+ DUMP_WORD(proximityInfo->getPrimaryInputWord(), i);
+ DUMP_WORD(word, wordLength);
+ }
+ }
+ }
+}
+
+void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates,
+ const int *yCoordinates, const int *codes, const int inputLength, Correction *correction) {
+ if (DEBUG_DICT) {
+ AKLOGI("initSuggest");
+ }
+ proximityInfo->setInputParams(codes, inputLength, xCoordinates, yCoordinates);
+ const int maxDepth = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
+ correction->initCorrection(proximityInfo, inputLength, maxDepth);
+}
+
+static const char QUOTE = '\'';
+static const char SPACE = ' ';
+
+void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo,
+ const int *xcoordinates, const int *ycoordinates, const int *codes,
+ const bool useFullEditDistance, const int inputLength, Correction *correction,
+ WordsPriorityQueuePool *queuePool) {
+ initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);
+ getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool,
+ true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX);
+}
+
+void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
+ const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool,
+ const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) {
+ // TODO: Remove setCorrectionParams
+ correction->setCorrectionParams(0, 0, 0,
+ -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
+ doAutoCompletion, maxErrors);
+ int rootPosition = ROOT_POS;
+ // Get the number of children of root, then increment the position
+ int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition);
+ int outputIndex = 0;
+
+ correction->initCorrectionState(rootPosition, childCount, (inputLength <= 0));
+
+ // Depth first search
+ while (outputIndex >= 0) {
+ if (correction->initProcessState(outputIndex)) {
+ int siblingPos = correction->getTreeSiblingPos(outputIndex);
+ int firstChildPos;
+
+ const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos,
+ correction, &childCount, &firstChildPos, &siblingPos, queuePool,
+ currentWordIndex);
+ // Update next sibling pos
+ correction->setTreeSiblingPos(outputIndex, siblingPos);
+
+ if (needsToTraverseChildrenNodes) {
+ // Goes to child node
+ outputIndex = correction->goDownTree(outputIndex, childCount, firstChildPos);
+ }
+ } else {
+ // Goes to parent sibling node
+ outputIndex = correction->getTreeParentIndex(outputIndex);
+ }
+ }
+}
+
+inline void UnigramDictionary::onTerminal(const int freq,
+ const TerminalAttributes& terminalAttributes, Correction *correction,
+ WordsPriorityQueuePool *queuePool, const bool addToMasterQueue,
+ const int currentWordIndex) {
+ const int inputIndex = correction->getInputIndex();
+ const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT;
+
+ int wordLength;
+ unsigned short* wordPointer;
+
+ if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) {
+ WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
+ const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
+ if (finalFreq != NOT_A_FREQUENCY) {
+ if (!terminalAttributes.isShortcutOnly()) {
+ addWord(wordPointer, wordLength, finalFreq, masterQueue);
+ }
+
+ // Please note that the shortcut candidates will be added to the master queue only.
+ TerminalAttributes::ShortcutIterator iterator =
+ terminalAttributes.getShortcutIterator();
+ while (iterator.hasNextShortcutTarget()) {
+ // TODO: addWord only supports weak ordering, meaning we have no means
+ // to control the order of the shortcuts relative to one another or to the word.
+ // We need to either modulate the frequency of each shortcut according
+ // to its own shortcut frequency or to make the queue
+ // so that the insert order is protected inside the queue for words
+ // with the same score.
+ uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL];
+ const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
+ MAX_WORD_LENGTH_INTERNAL, shortcutTarget);
+ addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue);
+ }
+ }
+ }
+
+ // We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH
+ // or more length.
+ if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) {
+ WordsPriorityQueue *subQueue;
+ subQueue = queuePool->getSubQueue(currentWordIndex, inputIndex);
+ if (!subQueue) {
+ return;
+ }
+ const int finalFreq = correction->getFinalFreqForSubQueue(freq, &wordPointer, &wordLength,
+ inputIndex);
+ addWord(wordPointer, wordLength, finalFreq, subQueue);
+ }
+}
+
+bool UnigramDictionary::getSubStringSuggestion(
+ ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates,
+ const int *codes, const bool useFullEditDistance, Correction *correction,
+ WordsPriorityQueuePool* queuePool, const int inputLength,
+ const bool hasAutoCorrectionCandidate, const int currentWordIndex,
+ const int inputWordStartPos, const int inputWordLength,
+ const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
+ int*wordLengthArray, unsigned short* outputWord, int *outputWordLength) {
+ unsigned short* tempOutputWord = 0;
+ int nextWordLength = 0;
+ // TODO: Optimize init suggestion
+ initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
+ inputLength, correction);
+
+ int freq = getMostFrequentWordLike(
+ inputWordStartPos, inputWordLength, proximityInfo, mWord);
+ if (freq > 0) {
+ nextWordLength = inputWordLength;
+ tempOutputWord = mWord;
+ } else if (!hasAutoCorrectionCandidate) {
+ if (inputWordStartPos > 0) {
+ const int offset = inputWordStartPos;
+ initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset],
+ codes + offset, inputWordLength, correction);
+ queuePool->clearSubQueue(currentWordIndex);
+ getSuggestionCandidates(useFullEditDistance, inputWordLength, correction,
+ queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex);
+ if (DEBUG_DICT) {
+ if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) {
+ AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength);
+ for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
+ queuePool->getSubQueue(currentWordIndex, i)->dumpTopWord();
+ }
+ }
+ }
+ }
+ WordsPriorityQueue* queue = queuePool->getSubQueue(currentWordIndex, inputWordLength);
+ if (!queue || queue->size() < 1) {
+ return false;
+ }
+ int score = 0;
+ const double ns = queue->getHighestNormalizedScore(
+ proximityInfo->getPrimaryInputWord(), inputWordLength,
+ &tempOutputWord, &score, &nextWordLength);
+ if (DEBUG_DICT) {
+ AKLOGI("NS(%d) = %f, Score = %d", currentWordIndex, ns, score);
+ }
+ // Two words correction won't be done if the score of the first word doesn't exceed the
+ // threshold.
+ if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD
+ || nextWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
+ return false;
+ }
+ freq = score >> (nextWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
+ }
+ if (DEBUG_DICT) {
+ AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d (%d)"
+ , currentWordIndex, freq, nextWordLength, inputWordLength, inputWordStartPos,
+ wordLengthArray[0]);
+ }
+ if (freq <= 0 || nextWordLength <= 0
+ || MAX_WORD_LENGTH <= (outputWordStartPos + nextWordLength)) {
+ return false;
+ }
+ for (int i = 0; i < nextWordLength; ++i) {
+ outputWord[outputWordStartPos + i] = tempOutputWord[i];
+ }
+
+ // Put output values
+ freqArray[currentWordIndex] = freq;
+ // TODO: put output length instead of input length
+ wordLengthArray[currentWordIndex] = inputWordLength;
+ const int tempOutputWordLength = outputWordStartPos + nextWordLength;
+ if (outputWordLength) {
+ *outputWordLength = tempOutputWordLength;
+ }
+
+ if ((inputWordStartPos + inputWordLength) < inputLength) {
+ if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) {
+ return false;
+ }
+ outputWord[tempOutputWordLength] = SPACE;
+ if (outputWordLength) {
+ ++*outputWordLength;
+ }
+ } else if (currentWordIndex >= 1) {
+ // TODO: Handle 3 or more words
+ const int pairFreq = correction->getFreqForSplitMultipleWords(
+ freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord);
+ if (DEBUG_DICT) {
+ DUMP_WORD(outputWord, tempOutputWordLength);
+ AKLOGI("Split two words: %d, %d, %d, %d, (%d) %d", freqArray[0], freqArray[1], pairFreq,
+ inputLength, wordLengthArray[0], tempOutputWordLength);
+ }
+ addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue());
+ }
+ return true;
+}
+
+void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
+ const int *xcoordinates, const int *ycoordinates, const int *codes,
+ const bool useFullEditDistance, const int inputLength,
+ Correction *correction, WordsPriorityQueuePool* queuePool,
+ const bool hasAutoCorrectionCandidate, const int startInputPos, const int startWordIndex,
+ const int outputWordLength, int *freqArray, int* wordLengthArray,
+ unsigned short* outputWord) {
+ if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) {
+ // Return if the last word index
+ return;
+ }
+ if (startWordIndex >= 1
+ && (hasAutoCorrectionCandidate
+ || inputLength < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) {
+ // Do not suggest 3+ words if already has auto correction candidate
+ return;
+ }
+ for (int i = startInputPos + 1; i < inputLength; ++i) {
+ if (DEBUG_CORRECTION_FREQ) {
+ AKLOGI("Multi words(%d), start in %d sep %d start out %d",
+ startWordIndex, startInputPos, i, outputWordLength);
+ DUMP_WORD(outputWord, outputWordLength);
+ }
+ int tempOutputWordLength = 0;
+ // Current word
+ int inputWordStartPos = startInputPos;
+ int inputWordLength = i - startInputPos;
+ if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
+ useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
+ startWordIndex, inputWordStartPos, inputWordLength, outputWordLength,
+ true /* not used */, freqArray, wordLengthArray, outputWord,
+ &tempOutputWordLength)) {
+ continue;
+ }
+
+ if (DEBUG_CORRECTION_FREQ) {
+ AKLOGI("Do missing space correction");
+ }
+ // Next word
+ // Missing space
+ inputWordStartPos = i;
+ inputWordLength = inputLength - i;
+ if(!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
+ useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
+ startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
+ false /* missing space */, freqArray, wordLengthArray, outputWord, 0)) {
+ getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
+ useFullEditDistance, inputLength, correction, queuePool,
+ hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1,
+ tempOutputWordLength, freqArray, wordLengthArray, outputWord);
+ }
+
+ // Mistyped space
+ ++inputWordStartPos;
+ --inputWordLength;
+
+ if (inputWordLength <= 0) {
+ continue;
+ }
+
+ const int x = xcoordinates[inputWordStartPos - 1];
+ const int y = ycoordinates[inputWordStartPos - 1];
+ if (!proximityInfo->hasSpaceProximity(x, y)) {
+ continue;
+ }
+
+ if (DEBUG_CORRECTION_FREQ) {
+ AKLOGI("Do mistyped space correction");
+ }
+ getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
+ useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
+ startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
+ true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0);
+ }
+}
+
+void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
+ const int *xcoordinates, const int *ycoordinates, const int *codes,
+ const bool useFullEditDistance, const int inputLength,
+ Correction *correction, WordsPriorityQueuePool* queuePool,
+ const bool hasAutoCorrectionCandidate) {
+ if (inputLength >= MAX_WORD_LENGTH) return;
+ if (DEBUG_DICT) {
+ AKLOGI("--- Suggest multiple words");
+ }
+
+ // Allocating fixed length array on stack
+ unsigned short outputWord[MAX_WORD_LENGTH];
+ int freqArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS];
+ int wordLengthArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS];
+ const int outputWordLength = 0;
+ const int startInputPos = 0;
+ const int startWordIndex = 0;
+ getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
+ useFullEditDistance, inputLength, correction, queuePool, hasAutoCorrectionCandidate,
+ startInputPos, startWordIndex, outputWordLength, freqArray, wordLengthArray,
+ outputWord);
+}
+
+// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
+// interface.
+inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
+ const int inputLength, ProximityInfo *proximityInfo, unsigned short *word) {
+ uint16_t inWord[inputLength];
+
+ for (int i = 0; i < inputLength; ++i) {
+ inWord[i] = (uint16_t)proximityInfo->getPrimaryCharAt(startInputIndex + i);
+ }
+ return getMostFrequentWordLikeInner(inWord, inputLength, word);
+}
+
+// This function will take the position of a character array within a CharGroup,
+// and check it actually like-matches the word in inWord starting at startInputIndex,
+// that is, it matches it with case and accents squashed.
+// The function returns true if there was a full match, false otherwise.
+// The function will copy on-the-fly the characters in the CharGroup to outNewWord.
+// It will also place the end position of the array in outPos; in outInputIndex,
+// it will place the index of the first char AFTER the match if there was a match,
+// and the initial position if there was not. It makes sense because if there was
+// a match we want to continue searching, but if there was not, we want to go to
+// the next CharGroup.
+// In and out parameters may point to the same location. This function takes care
+// not to use any input parameters after it wrote into its outputs.
+static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
+ const uint8_t* const root, const int startPos,
+ const uint16_t* const inWord, const int startInputIndex,
+ int32_t* outNewWord, int* outInputIndex, int* outPos) {
+ const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
+ int pos = startPos;
+ int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
+ int32_t baseChar = toBaseLowerCase(character);
+ const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]);
+
+ if (baseChar != wChar) {
+ *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
+ *outInputIndex = startInputIndex;
+ return false;
+ }
+ int inputIndex = startInputIndex;
+ outNewWord[inputIndex] = character;
+ if (hasMultipleChars) {
+ character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
+ while (NOT_A_CHARACTER != character) {
+ baseChar = toBaseLowerCase(character);
+ if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
+ *outPos = BinaryFormat::skipOtherCharacters(root, pos);
+ *outInputIndex = startInputIndex;
+ return false;
+ }
+ outNewWord[inputIndex] = character;
+ character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
+ }
+ }
+ *outInputIndex = inputIndex + 1;
+ *outPos = pos;
+ return true;
+}
+
+// This function is invoked when a word like the word searched for is found.
+// It will compare the frequency to the max frequency, and if greater, will
+// copy the word into the output buffer. In output value maxFreq, it will
+// write the new maximum frequency if it changed.
+static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length,
+ short unsigned int* outWord, int* maxFreq) {
+ if (freq > *maxFreq) {
+ for (int q = 0; q < length; ++q)
+ outWord[q] = newWord[q];
+ outWord[length] = 0;
+ *maxFreq = freq;
+ }
+}
+
+// Will find the highest frequency of the words like the one passed as an argument,
+// that is, everything that only differs by case/accents.
+int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord,
+ const int length, short unsigned int* outWord) {
+ int32_t newWord[MAX_WORD_LENGTH_INTERNAL];
+ int depth = 0;
+ int maxFreq = -1;
+ const uint8_t* const root = DICT_ROOT;
+
+ int startPos = 0;
+ mStackChildCount[0] = BinaryFormat::getGroupCountAndForwardPointer(root, &startPos);
+ mStackInputIndex[0] = 0;
+ mStackSiblingPos[0] = startPos;
+ while (depth >= 0) {
+ const int charGroupCount = mStackChildCount[depth];
+ int pos = mStackSiblingPos[depth];
+ for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {
+ int inputIndex = mStackInputIndex[depth];
+ const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
+ // Test whether all chars in this group match with the word we are searching for. If so,
+ // we want to traverse its children (or if the length match, evaluate its frequency).
+ // Note that this function will output the position regardless, but will only write
+ // into inputIndex if there is a match.
+ const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
+ inputIndex, newWord, &inputIndex, &pos);
+ if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
+ const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
+ onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
+ }
+ pos = BinaryFormat::skipFrequency(flags, pos);
+ const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
+ const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);
+ // If we had a match and the word has children, we want to traverse them. We don't have
+ // to traverse words longer than the one we are searching for, since they will not match
+ // anyway, so don't traverse unless inputIndex < length.
+ if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) {
+ // Save position for this depth, to get back to this once children are done
+ mStackChildCount[depth] = charGroupIndex;
+ mStackSiblingPos[depth] = siblingPos;
+ // Prepare stack values for next depth
+ ++depth;
+ int childrenPos = childrenNodePos;
+ mStackChildCount[depth] =
+ BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);
+ mStackSiblingPos[depth] = childrenPos;
+ mStackInputIndex[depth] = inputIndex;
+ pos = childrenPos;
+ // Go to the next depth level.
+ ++depth;
+ break;
+ } else {
+ // No match, or no children, or word too long to ever match: go the next sibling.
+ pos = siblingPos;
+ }
+ }
+ --depth;
+ }
+ return maxFreq;
+}
+
+bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
+ return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);
+}
+
+// TODO: remove this function.
+int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
+ int length) const {
+ return -1;
+}
+
+// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.
+// If the return value is false, then the caller should read in the output "nextSiblingPosition"
+// to find out the address of the next sibling node and pass it to a new call of processCurrentNode.
+// It is worthy to note that when false is returned, the output values other than
+// nextSiblingPosition are undefined.
+// If the return value is true, then the caller must proceed to traverse the children of this
+// node. processCurrentNode will output the information about the children: their count in
+// newCount, their position in newChildrenPosition, the traverseAllNodes flag in
+// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the
+// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into
+// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when
+// there aren't any more nodes at this level, it merely returns the address of the first byte after
+// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any
+// given level, as output into newCount when traversing this level's parent.
+inline bool UnigramDictionary::processCurrentNode(const int initialPos,
+ Correction *correction, int *newCount,
+ int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool,
+ const int currentWordIndex) {
+ if (DEBUG_DICT) {
+ correction->checkState();
+ }
+ int pos = initialPos;
+
+ // Flags contain the following information:
+ // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:
+ // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address
+ // is on the specified number of bytes.
+ // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.
+ // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
+ // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
+ // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
+ const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
+ const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
+ const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags));
+
+ bool needsToInvokeOnTerminal = false;
+
+ // This gets only ONE character from the stream. Next there will be:
+ // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node
+ // else if FLAG_IS_TERMINAL: the frequency
+ // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
+ // Note that you can't have a node that both is not a terminal and has no children.
+ int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);
+ assert(NOT_A_CHARACTER != c);
+
+ // We are going to loop through each character and make it look like it's a different
+ // node each time. To do that, we will process characters in this node in order until
+ // we find the character terminator. This is signalled by getCharCode* returning
+ // NOT_A_CHARACTER.
+ // As a special case, if there is only one character in this node, we must not read the
+ // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags.
+ // This way, each loop run will look like a "virtual node".
+ do {
+ // We prefetch the next char. If 'c' is the last char of this node, we will have
+ // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node
+ // should behave as a terminal or not and whether we have children.
+ const int32_t nextc = hasMultipleChars
+ ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER;
+ const bool isLastChar = (NOT_A_CHARACTER == nextc);
+ // If there are more chars in this nodes, then this virtual node is not a terminal.
+ // If we are on the last char, this virtual node is a terminal if this node is.
+ const bool isTerminal = isLastChar && isTerminalNode;
+
+ Correction::CorrectionType stateType = correction->processCharAndCalcState(
+ c, isTerminal);
+ if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL
+ || stateType == Correction::ON_TERMINAL) {
+ needsToInvokeOnTerminal = true;
+ } else if (stateType == Correction::UNRELATED || correction->needsToPrune()) {
+ // We found that this is an unrelated character, so we should give up traversing
+ // this node and its children entirely.
+ // However we may not be on the last virtual node yet so we skip the remaining
+ // characters in this node, the frequency if it's there, read the next sibling
+ // position to output it, then return false.
+ // We don't have to output other values because we return false, as in
+ // "don't traverse children".
+ if (!isLastChar) {
+ pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
+ }
+ pos = BinaryFormat::skipFrequency(flags, pos);
+ *nextSiblingPosition =
+ BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
+ return false;
+ }
+
+ // Prepare for the next character. Promote the prefetched char to current char - the loop
+ // will take care of prefetching the next. If we finally found our last char, nextc will
+ // contain NOT_A_CHARACTER.
+ c = nextc;
+ } while (NOT_A_CHARACTER != c);
+
+ if (isTerminalNode) {
+ // The frequency should be here, because we come here only if this is actually
+ // a terminal node, and we are on its last char.
+ const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
+ const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos);
+ const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos);
+ TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
+ onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
+ currentWordIndex);
+
+ // If there are more chars in this node, then this virtual node has children.
+ // If we are on the last char, this virtual node has children if this node has.
+ const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);
+
+ // This character matched the typed character (enough to traverse the node at least)
+ // so we just evaluated it. Now we should evaluate this virtual node's children - that
+ // is, if it has any. If it has no children, we're done here - so we skip the end of
+ // the node, output the siblings position, and return false "don't traverse children".
+ // Note that !hasChildren implies isLastChar, so we know we don't have to skip any
+ // remaining char in this group for there can't be any.
+ if (!hasChildren) {
+ pos = BinaryFormat::skipFrequency(flags, pos);
+ *nextSiblingPosition =
+ BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
+ return false;
+ }
+
+ // Optimization: Prune out words that are too long compared to how much was typed.
+ if (correction->needsToPrune()) {
+ pos = BinaryFormat::skipFrequency(flags, pos);
+ *nextSiblingPosition =
+ BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
+ if (DEBUG_DICT_FULL) {
+ AKLOGI("Traversing was pruned.");
+ }
+ return false;
+ }
+ }
+
+ // Now we finished processing this node, and we want to traverse children. If there are no
+ // children, we can't come here.
+ assert(BinaryFormat::hasChildrenInFlags(flags));
+
+ // If this node was a terminal it still has the frequency under the pointer (it may have been
+ // read, but not skipped - see readFrequencyWithoutMovingPointer).
+ // Next come the children position, then possibly attributes (attributes are bigrams only for
+ // now, maybe something related to shortcuts in the future).
+ // Once this is read, we still need to output the number of nodes in the immediate children of
+ // this node, so we read and output it before returning true, as in "please traverse children".
+ pos = BinaryFormat::skipFrequency(flags, pos);
+ int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
+ *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
+ *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
+ *newChildrenPosition = childrenPos;
+ return true;
+}
+
+} // namespace latinime
diff --git a/native/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h
index ef9709a89..c8f15566c 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/jni/src/unigram_dictionary.h
@@ -22,17 +22,16 @@
#include "correction_state.h"
#include "defines.h"
#include "proximity_info.h"
-
-#ifndef NULL
-#define NULL 0
-#endif
+#include "words_priority_queue.h"
+#include "words_priority_queue_pool.h"
namespace latinime {
+class TerminalAttributes;
class UnigramDictionary {
+ typedef struct { int first; int second; int replacement; } digraph_t;
-public:
-
+ public:
// Mask and flags for children address type selection.
static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
@@ -46,8 +45,14 @@ public:
// Flag for terminal groups
static const int FLAG_IS_TERMINAL = 0x10;
+ // Flag for shortcut targets presence
+ static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
// Flag for bigram presence
static const int FLAG_HAS_BIGRAMS = 0x04;
+ // Flag for shortcut-only words. Some words are shortcut-only, which means they match when
+ // the user types them but they don't pop in the suggestion strip, only the words they are
+ // shortcuts for do.
+ static const int FLAG_IS_SHORTCUT_ONLY = 0x02;
// Attribute (bigram/shortcut) related flags:
// Flag for presence of more attributes
@@ -64,58 +69,86 @@ public:
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
+ // Error tolerances
+ static const int DEFAULT_MAX_ERRORS = 2;
+ static const int MAX_ERRORS_FOR_TWO_WORDS = 1;
+
UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
- int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
+ int fullWordMultiplier, int maxWordLength, int maxWords,
const bool isLatestDictVersion);
bool isValidWord(const uint16_t* const inWord, const int length) const;
int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
- int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
+ int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool,
+ Correction *correction, const int *xcoordinates,
const int *ycoordinates, const int *codes, const int codesSize, const int flags,
unsigned short *outWords, int *frequencies);
virtual ~UnigramDictionary();
-private:
-
+ private:
void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
- const int *ycoordinates, const int *codes, const int codesSize,
- unsigned short *outWords, int *frequencies, const int flags);
- bool isDigraph(const int* codes, const int i, const int codesSize) const;
+ const int *ycoordinates, const int *codes, const int inputLength,
+ const int flags, Correction *correction, WordsPriorityQueuePool *queuePool);
+ int getDigraphReplacement(const int *codes, const int i, const int codesSize,
+ const digraph_t* const digraphs, const unsigned int digraphsSize) const;
void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
- const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
- const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
+ int *xCoordinatesBuffer, int *yCoordinatesBuffer,
+ const int codesBufferSize, const int flags, const int* codesSrc,
+ const int codesRemain, const int currentDepth, int* codesDest, Correction *correction,
+ WordsPriorityQueuePool* queuePool, const digraph_t* const digraphs,
+ const unsigned int digraphsSize);
void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
- const int *ycoordinates, const int *codes, const int codesSize,
- unsigned short *outWords, int *frequencies);
- void getSuggestionCandidates(const bool useFullEditDistance);
- bool addWord(unsigned short *word, int length, int frequency);
- void getSplitTwoWordsSuggestion(const int inputLength, Correction *correction);
- void getMissingSpaceWords(const int inputLength, const int missingSpacePos,
- Correction *correction, const bool useFullEditDistance);
- void getMistypedSpaceWords(const int inputLength, const int spaceProximityPos,
- Correction *correction, const bool useFullEditDistance);
- void onTerminal(const int freq, Correction *correction);
+ const int *ycoordinates, const int *codes, const int codesSize, Correction *correction);
+ void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
+ const int *ycoordinates, const int *codes, const bool useFullEditDistance,
+ const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool);
+ void getSuggestionCandidates(
+ const bool useFullEditDistance, const int inputLength, Correction *correction,
+ WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors,
+ const int currentWordIndex);
+ void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
+ const int *xcoordinates, const int *ycoordinates, const int *codes,
+ const bool useFullEditDistance, const int inputLength,
+ Correction *correction, WordsPriorityQueuePool* queuePool,
+ const bool hasAutoCorrectionCandidate);
+ void onTerminal(const int freq, const TerminalAttributes& terminalAttributes,
+ Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue,
+ const int currentWordIndex);
bool needsToSkipCurrentNode(const unsigned short c,
const int inputIndex, const int skipPos, const int depth);
// Process a node by considering proximity, missing and excessive character
- bool processCurrentNode(const int initialPos,
- Correction *correction, int *newCount,
- int *newChildPosition, int *nextSiblingPosition);
+ bool processCurrentNode(const int initialPos, Correction *correction, int *newCount,
+ int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool,
+ const int currentWordIndex);
int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
- unsigned short *word);
+ ProximityInfo *proximityInfo, unsigned short *word);
int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
- short unsigned int* outWord);
+ short unsigned int *outWord);
+ bool getSubStringSuggestion(
+ ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates,
+ const int *codes, const bool useFullEditDistance, Correction *correction,
+ WordsPriorityQueuePool* queuePool, const int inputLength,
+ const bool hasAutoCorrectionCandidate, const int currentWordIndex,
+ const int inputWordStartPos, const int inputWordLength,
+ const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
+ int *wordLengthArray, unsigned short* outputWord, int *outputWordLength);
+ void getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
+ const int *xcoordinates, const int *ycoordinates, const int *codes,
+ const bool useFullEditDistance, const int inputLength,
+ Correction *correction, WordsPriorityQueuePool* queuePool,
+ const bool hasAutoCorrectionCandidate, const int startPos, const int startWordIndex,
+ const int outputWordLength, int *freqArray, int* wordLengthArray,
+ unsigned short* outputWord);
const uint8_t* const DICT_ROOT;
const int MAX_WORD_LENGTH;
const int MAX_WORDS;
- const int MAX_PROXIMITY_CHARS;
const bool IS_LATEST_DICT_VERSION;
const int TYPED_LETTER_MULTIPLIER;
const int FULL_WORD_MULTIPLIER;
const int ROOT_POS;
const unsigned int BYTES_IN_ONE_CHAR;
- const int MAX_UMLAUT_SEARCH_DEPTH;
+ const int MAX_DIGRAPH_SEARCH_DEPTH;
// Flags for special processing
// Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
@@ -123,18 +156,14 @@ private:
// Please update both at the same time.
enum {
REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1,
- USE_FULL_EDIT_DISTANCE = 0x2
+ USE_FULL_EDIT_DISTANCE = 0x2,
+ REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4
};
- static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];
-
- int *mFrequencies;
- unsigned short *mOutputChars;
- ProximityInfo *mProximityInfo;
- Correction *mCorrection;
- int mInputLength;
- // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
- unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
+ static const digraph_t GERMAN_UMLAUT_DIGRAPHS[];
+ static const digraph_t FRENCH_LIGATURES_DIGRAPHS[];
+ // Still bundled members
+ unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
diff --git a/native/jni/src/words_priority_queue.h b/native/jni/src/words_priority_queue.h
new file mode 100644
index 000000000..249962eec
--- /dev/null
+++ b/native/jni/src/words_priority_queue.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_WORDS_PRIORITY_QUEUE_H
+#define LATINIME_WORDS_PRIORITY_QUEUE_H
+
+#include <cstring> // for memcpy()
+#include <iostream>
+#include <queue>
+#include "defines.h"
+
+namespace latinime {
+
+class WordsPriorityQueue {
+ public:
+ class SuggestedWord {
+ public:
+ int mScore;
+ unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
+ int mWordLength;
+ bool mUsed;
+
+ void setParams(int score, unsigned short* word, int wordLength) {
+ mScore = score;
+ mWordLength = wordLength;
+ memcpy(mWord, word, sizeof(unsigned short) * wordLength);
+ mUsed = true;
+ }
+ };
+
+ WordsPriorityQueue(int maxWords, int maxWordLength) :
+ MAX_WORDS((unsigned int) maxWords), MAX_WORD_LENGTH(
+ (unsigned int) maxWordLength) {
+ mSuggestedWords = new SuggestedWord[maxWordLength];
+ for (int i = 0; i < maxWordLength; ++i) {
+ mSuggestedWords[i].mUsed = false;
+ }
+ mHighestSuggestedWord = 0;
+ }
+
+ ~WordsPriorityQueue() {
+ delete[] mSuggestedWords;
+ }
+
+ void push(int score, unsigned short* word, int wordLength) {
+ SuggestedWord* sw = 0;
+ if (mSuggestions.size() >= MAX_WORDS) {
+ sw = mSuggestions.top();
+ const int minScore = sw->mScore;
+ if (minScore >= score) {
+ return;
+ } else {
+ sw->mUsed = false;
+ mSuggestions.pop();
+ }
+ }
+ if (sw == 0) {
+ sw = getFreeSuggestedWord(score, word, wordLength);
+ } else {
+ sw->setParams(score, word, wordLength);
+ }
+ if (sw == 0) {
+ AKLOGE("SuggestedWord is accidentally null.");
+ return;
+ }
+ if (DEBUG_WORDS_PRIORITY_QUEUE) {
+ AKLOGI("Push word. %d, %d", score, wordLength);
+ DUMP_WORD(word, wordLength);
+ }
+ mSuggestions.push(sw);
+ if (!mHighestSuggestedWord || mHighestSuggestedWord->mScore < sw->mScore) {
+ mHighestSuggestedWord = sw;
+ }
+ }
+
+ SuggestedWord* top() {
+ if (mSuggestions.empty()) return 0;
+ SuggestedWord* sw = mSuggestions.top();
+ return sw;
+ }
+
+ int outputSuggestions(int *frequencies, unsigned short *outputChars) {
+ mHighestSuggestedWord = 0;
+ const unsigned int size = min(
+ MAX_WORDS, static_cast<unsigned int>(mSuggestions.size()));
+ int index = size - 1;
+ while (!mSuggestions.empty() && index >= 0) {
+ SuggestedWord* sw = mSuggestions.top();
+ if (DEBUG_WORDS_PRIORITY_QUEUE) {
+ AKLOGI("dump word. %d", sw->mScore);
+ DUMP_WORD(sw->mWord, sw->mWordLength);
+ }
+ const unsigned int wordLength = sw->mWordLength;
+ char* targetAdr = (char*) outputChars
+ + (index) * MAX_WORD_LENGTH * sizeof(short);
+ frequencies[index] = sw->mScore;
+ memcpy(targetAdr, sw->mWord, (wordLength) * sizeof(short));
+ if (wordLength < MAX_WORD_LENGTH) {
+ ((unsigned short*) targetAdr)[wordLength] = 0;
+ }
+ sw->mUsed = false;
+ mSuggestions.pop();
+ --index;
+ }
+ return size;
+ }
+
+ int size() const {
+ return mSuggestions.size();
+ }
+
+ void clear() {
+ mHighestSuggestedWord = 0;
+ while (!mSuggestions.empty()) {
+ SuggestedWord* sw = mSuggestions.top();
+ if (DEBUG_WORDS_PRIORITY_QUEUE) {
+ AKLOGI("Clear word. %d", sw->mScore);
+ DUMP_WORD(sw->mWord, sw->mWordLength);
+ }
+ sw->mUsed = false;
+ mSuggestions.pop();
+ }
+ }
+
+ void dumpTopWord() {
+ if (size() <= 0) {
+ return;
+ }
+ DUMP_WORD(mHighestSuggestedWord->mWord, mHighestSuggestedWord->mWordLength);
+ }
+
+ double getHighestNormalizedScore(const unsigned short* before, const int beforeLength,
+ unsigned short** outWord, int *outScore, int *outLength) {
+ if (!mHighestSuggestedWord) {
+ return 0.0;
+ }
+ SuggestedWord* sw = mHighestSuggestedWord;
+ const int score = sw->mScore;
+ unsigned short* word = sw->mWord;
+ const int wordLength = sw->mWordLength;
+ if (outScore) {
+ *outScore = score;
+ }
+ if (outWord) {
+ *outWord = word;
+ }
+ if (outLength) {
+ *outLength = wordLength;
+ }
+ return Correction::RankingAlgorithm::calcNormalizedScore(
+ before, beforeLength, word, wordLength, score);
+ }
+
+ private:
+ struct wordComparator {
+ bool operator ()(SuggestedWord * left, SuggestedWord * right) {
+ return left->mScore > right->mScore;
+ }
+ };
+
+ SuggestedWord* getFreeSuggestedWord(int score, unsigned short* word,
+ int wordLength) {
+ for (unsigned int i = 0; i < MAX_WORD_LENGTH; ++i) {
+ if (!mSuggestedWords[i].mUsed) {
+ mSuggestedWords[i].setParams(score, word, wordLength);
+ return &mSuggestedWords[i];
+ }
+ }
+ return 0;
+ }
+
+ typedef std::priority_queue<SuggestedWord*, std::vector<SuggestedWord*>,
+ wordComparator> Suggestions;
+ Suggestions mSuggestions;
+ const unsigned int MAX_WORDS;
+ const unsigned int MAX_WORD_LENGTH;
+ SuggestedWord* mSuggestedWords;
+ SuggestedWord* mHighestSuggestedWord;
+};
+}
+
+#endif // LATINIME_WORDS_PRIORITY_QUEUE_H
diff --git a/native/jni/src/words_priority_queue_pool.h b/native/jni/src/words_priority_queue_pool.h
new file mode 100644
index 000000000..5b50e8f4f
--- /dev/null
+++ b/native/jni/src/words_priority_queue_pool.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_WORDS_PRIORITY_QUEUE_POOL_H
+#define LATINIME_WORDS_PRIORITY_QUEUE_POOL_H
+
+#include <assert.h>
+#include <new>
+#include "words_priority_queue.h"
+
+namespace latinime {
+
+class WordsPriorityQueuePool {
+ public:
+ WordsPriorityQueuePool(int mainQueueMaxWords, int subQueueMaxWords, int maxWordLength) {
+ mMasterQueue = new(mMasterQueueBuf) WordsPriorityQueue(mainQueueMaxWords, maxWordLength);
+ for (int i = 0, subQueueBufOffset = 0;
+ i < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS * SUB_QUEUE_MAX_COUNT;
+ ++i, subQueueBufOffset += sizeof(WordsPriorityQueue)) {
+ mSubQueues[i] = new(mSubQueueBuf + subQueueBufOffset)
+ WordsPriorityQueue(subQueueMaxWords, maxWordLength);
+ }
+ }
+
+ virtual ~WordsPriorityQueuePool() {
+ }
+
+ WordsPriorityQueue* getMasterQueue() {
+ return mMasterQueue;
+ }
+
+ WordsPriorityQueue* getSubQueue(const int wordIndex, const int inputWordLength) {
+ if (wordIndex >= MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) {
+ return 0;
+ }
+ if (inputWordLength < 0 || inputWordLength >= SUB_QUEUE_MAX_COUNT) {
+ if (DEBUG_WORDS_PRIORITY_QUEUE) {
+ assert(false);
+ }
+ return 0;
+ }
+ return mSubQueues[wordIndex * SUB_QUEUE_MAX_COUNT + inputWordLength];
+ }
+
+ inline void clearAll() {
+ mMasterQueue->clear();
+ for (int i = 0; i < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS; ++i) {
+ clearSubQueue(i);
+ }
+ }
+
+ inline void clearSubQueue(const int wordIndex) {
+ for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
+ WordsPriorityQueue* queue = getSubQueue(wordIndex, i);
+ if (queue) {
+ queue->clear();
+ }
+ }
+ }
+
+ void dumpSubQueue1TopSuggestions() {
+ AKLOGI("DUMP SUBQUEUE1 TOP SUGGESTIONS");
+ for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
+ getSubQueue(0, i)->dumpTopWord();
+ }
+ }
+
+ private:
+ WordsPriorityQueue* mMasterQueue;
+ WordsPriorityQueue* mSubQueues[SUB_QUEUE_MAX_COUNT * MULTIPLE_WORDS_SUGGESTION_MAX_WORDS];
+ char mMasterQueueBuf[sizeof(WordsPriorityQueue)];
+ char mSubQueueBuf[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS
+ * SUB_QUEUE_MAX_COUNT * sizeof(WordsPriorityQueue)];
+};
+}
+
+#endif // LATINIME_WORDS_PRIORITY_QUEUE_POOL_H
diff --git a/native/src/char_utils.h b/native/src/char_utils.h
deleted file mode 100644
index a69a35e7a..000000000
--- a/native/src/char_utils.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright (C) 2010 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LATINIME_CHAR_UTILS_H
-#define LATINIME_CHAR_UTILS_H
-
-namespace latinime {
-
-unsigned short latin_tolower(unsigned short c);
-
-} // namespace latinime
-
-#endif // LATINIME_CHAR_UTILS_H
diff --git a/native/src/dictionary.h b/native/src/dictionary.h
deleted file mode 100644
index d5de0083a..000000000
--- a/native/src/dictionary.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (C) 2009 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LATINIME_DICTIONARY_H
-#define LATINIME_DICTIONARY_H
-
-#include "basechars.h"
-#include "bigram_dictionary.h"
-#include "char_utils.h"
-#include "defines.h"
-#include "proximity_info.h"
-#include "unigram_dictionary.h"
-
-namespace latinime {
-
-class Dictionary {
-public:
- Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler,
- int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives);
- int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates,
- int *codes, int codesSize, int flags, unsigned short *outWords, int *frequencies) {
- return mUnigramDictionary->getSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
- codesSize, flags, outWords, frequencies);
- }
-
- // TODO: Call mBigramDictionary instead of mUnigramDictionary
- int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
- unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
- int maxAlternatives) {
- return mBigramDictionary->getBigrams(word, length, codes, codesSize, outWords, frequencies,
- maxWordLength, maxBigrams, maxAlternatives);
- }
-
- bool isValidWord(unsigned short *word, int length);
- void *getDict() { return (void *)mDict; }
- int getDictSize() { return mDictSize; }
- int getMmapFd() { return mMmapFd; }
- int getDictBufAdjust() { return mDictBufAdjust; }
- ~Dictionary();
-
- // public static utility methods
- // static inline methods should be defined in the header file
- static unsigned short getChar(const unsigned char *dict, int *pos);
- static int getCount(const unsigned char *dict, int *pos);
- static bool getTerminal(const unsigned char *dict, int *pos);
- static int getAddress(const unsigned char *dict, int *pos);
- static int getFreq(const unsigned char *dict, const bool isLatestDictVersion, int *pos);
- static int wideStrLen(unsigned short *str);
- // returns next sibling's position
- static int setDictionaryValues(const unsigned char *dict, const bool isLatestDictVersion,
- const int pos, unsigned short *c, int *childrenPosition,
- bool *terminal, int *freq);
- static inline unsigned short toBaseLowerCase(unsigned short c);
-
-private:
- bool hasBigram();
-
- const unsigned char *mDict;
-
- // Used only for the mmap version of dictionary loading, but we use these as dummy variables
- // also for the malloc version.
- const int mDictSize;
- const int mMmapFd;
- const int mDictBufAdjust;
-
- const bool IS_LATEST_DICT_VERSION;
- UnigramDictionary *mUnigramDictionary;
- BigramDictionary *mBigramDictionary;
-};
-
-// public static utility methods
-// static inline methods should be defined in the header file
-inline unsigned short Dictionary::getChar(const unsigned char *dict, int *pos) {
- unsigned short ch = (unsigned short) (dict[(*pos)++] & 0xFF);
- // If the code is 255, then actual 16 bit code follows (in big endian)
- if (ch == 0xFF) {
- ch = ((dict[*pos] & 0xFF) << 8) | (dict[*pos + 1] & 0xFF);
- (*pos) += 2;
- }
- return ch;
-}
-
-inline int Dictionary::getCount(const unsigned char *dict, int *pos) {
- return dict[(*pos)++] & 0xFF;
-}
-
-inline bool Dictionary::getTerminal(const unsigned char *dict, int *pos) {
- return (dict[*pos] & FLAG_TERMINAL_MASK) > 0;
-}
-
-inline int Dictionary::getAddress(const unsigned char *dict, int *pos) {
- int address = 0;
- if ((dict[*pos] & FLAG_ADDRESS_MASK) == 0) {
- *pos += 1;
- } else {
- address += (dict[*pos] & (ADDRESS_MASK >> 16)) << 16;
- address += (dict[*pos + 1] & 0xFF) << 8;
- address += (dict[*pos + 2] & 0xFF);
- *pos += 3;
- }
- return address;
-}
-
-inline int Dictionary::getFreq(const unsigned char *dict,
- const bool isLatestDictVersion, int *pos) {
- int freq = dict[(*pos)++] & 0xFF;
- if (isLatestDictVersion) {
- // skipping bigram
- int bigramExist = (dict[*pos] & FLAG_BIGRAM_READ);
- if (bigramExist > 0) {
- int nextBigramExist = 1;
- while (nextBigramExist > 0) {
- (*pos) += 3;
- nextBigramExist = (dict[(*pos)++] & FLAG_BIGRAM_CONTINUED);
- }
- } else {
- (*pos)++;
- }
- }
- return freq;
-}
-
-inline int Dictionary::wideStrLen(unsigned short *str) {
- if (!str) return 0;
- unsigned short *end = str;
- while (*end)
- end++;
- return end - str;
-}
-
-inline int Dictionary::setDictionaryValues(const unsigned char *dict,
- const bool isLatestDictVersion, const int pos, unsigned short *c,int *childrenPosition,
- bool *terminal, int *freq) {
- int position = pos;
- // -- at char
- *c = Dictionary::getChar(dict, &position);
- // -- at flag/add
- *terminal = Dictionary::getTerminal(dict, &position);
- *childrenPosition = Dictionary::getAddress(dict, &position);
- // -- after address or flag
- *freq = (*terminal) ? Dictionary::getFreq(dict, isLatestDictVersion, &position) : 1;
- // returns next sibling's position
- return position;
-}
-
-
-inline unsigned short Dictionary::toBaseLowerCase(unsigned short c) {
- if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
- c = BASE_CHARS[c];
- }
- if (c >='A' && c <= 'Z') {
- c |= 32;
- } else if (c > 127) {
- c = latin_tolower(c);
- }
- return c;
-}
-
-} // namespace latinime
-
-#endif // LATINIME_DICTIONARY_H
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
deleted file mode 100644
index 8eb5a9700..000000000
--- a/native/src/unigram_dictionary.cpp
+++ /dev/null
@@ -1,729 +0,0 @@
-/*
-**
-** Copyright 2010, The Android Open Source Project
-**
-** Licensed under the Apache License, Version 2.0 (the "License");
-** you may not use this file except in compliance with the License.
-** You may obtain a copy of the License at
-**
-** http://www.apache.org/licenses/LICENSE-2.0
-**
-** Unless required by applicable law or agreed to in writing, software
-** distributed under the License is distributed on an "AS IS" BASIS,
-** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-** See the License for the specific language governing permissions and
-** limitations under the License.
-*/
-
-#include <assert.h>
-#include <string.h>
-
-#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
-
-#include "char_utils.h"
-#include "dictionary.h"
-#include "unigram_dictionary.h"
-
-#include "binary_format.h"
-
-namespace latinime {
-
-const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
- { { 'a', 'e' },
- { 'o', 'e' },
- { 'u', 'e' } };
-
-// TODO: check the header
-UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
- int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
- const bool isLatestDictVersion)
- : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
- MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
- MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
- TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
- // TODO : remove this variable.
- ROOT_POS(0),
- BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)),
- MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
- if (DEBUG_DICT) {
- LOGI("UnigramDictionary - constructor");
- }
- mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier);
-}
-
-UnigramDictionary::~UnigramDictionary() {
- delete mCorrection;
-}
-
-static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
- const int MAX_PROXIMITY_CHARS) {
- return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
-}
-
-bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
-
- // There can't be a digraph if we don't have at least 2 characters to examine
- if (i + 2 > codesSize) return false;
-
- // Search for the first char of some digraph
- int lastDigraphIndex = -1;
- const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
- for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
- lastDigraphIndex >= 0; --lastDigraphIndex) {
- if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
- }
- // No match: return early
- if (lastDigraphIndex < 0) return false;
-
- // It's an interesting digraph if the second char matches too.
- return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
-}
-
-// Mostly the same arguments as the non-recursive version, except:
-// codes is the original value. It points to the start of the work buffer, and gets passed as is.
-// codesSize is the size of the user input (thus, it is the size of codesSrc).
-// codesDest is the current point in the work buffer.
-// codesSrc is the current point in the user-input, original, content-unmodified buffer.
-// codesRemain is the remaining size in codesSrc.
-void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
- const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
- const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
- const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
-
- if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
- for (int i = 0; i < codesRemain; ++i) {
- if (isDigraph(codesSrc, i, codesRemain)) {
- // Found a digraph. We will try both spellings. eg. the word is "pruefen"
-
- // Copy the word up to the first char of the digraph, then continue processing
- // on the remaining part of the word, skipping the second char of the digraph.
- // In our example, copy "pru" and continue running on "fen"
- // Make i the index of the second char of the digraph for simplicity. Forgetting
- // to do that results in an infinite recursion so take care!
- ++i;
- memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
- getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
- codesBuffer, codesBufferSize, flags,
- codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
- currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,
- frequencies);
-
- // Copy the second char of the digraph in place, then continue processing on
- // the remaining part of the word.
- // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
- memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
- BYTES_IN_ONE_CHAR);
- getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
- codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,
- codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,
- outWords, frequencies);
- return;
- }
- }
- }
-
- // If we come here, we hit the end of the word: let's check it against the dictionary.
- // In our example, we'll come here once for "prufen" and then once for "pruefen".
- // If the word contains several digraphs, we'll come it for the product of them.
- // eg. if the word is "ueberpruefen" we'll test, in order, against
- // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
- const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
- if (0 != remainingBytes)
- memcpy(codesDest, codesSrc, remainingBytes);
-
- getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
- (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies,
- flags);
-}
-
-int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
- const int *ycoordinates, const int *codes, const int codesSize, const int flags,
- unsigned short *outWords, int *frequencies) {
-
- if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
- { // Incrementally tune the word and try all possibilities
- int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
- getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
- codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);
- } else { // Normal processing
- getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
- outWords, frequencies, flags);
- }
-
- PROF_START(20);
- // Get the word count
- int suggestedWordsCount = 0;
- while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
- suggestedWordsCount++;
- }
-
- if (DEBUG_DICT) {
- LOGI("Returning %d words", suggestedWordsCount);
- /// Print the returned words
- for (int j = 0; j < suggestedWordsCount; ++j) {
-#ifdef FLAG_DBG
- short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;
- char s[MAX_WORD_LENGTH];
- for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
- LOGI("%s %i", s, mFrequencies[j]);
-#endif
- }
- }
- PROF_END(20);
- PROF_CLOSE;
- return suggestedWordsCount;
-}
-
-void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
- const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
- unsigned short *outWords, int *frequencies, const int flags) {
-
- PROF_OPEN;
- PROF_START(0);
- initSuggestions(
- proximityInfo, xcoordinates, ycoordinates, codes, codesSize, outWords, frequencies);
- if (DEBUG_DICT) assert(codesSize == mInputLength);
-
- const int maxDepth = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
- mCorrection->initCorrection(mProximityInfo, mInputLength, maxDepth);
- PROF_END(0);
-
- const bool useFullEditDistance = USE_FULL_EDIT_DISTANCE & flags;
- // TODO: remove
- PROF_START(1);
- getSuggestionCandidates(useFullEditDistance);
- PROF_END(1);
-
- PROF_START(2);
- // Note: This line is intentionally left blank
- PROF_END(2);
-
- PROF_START(3);
- // Note: This line is intentionally left blank
- PROF_END(3);
-
- PROF_START(4);
- // Note: This line is intentionally left blank
- PROF_END(4);
-
- PROF_START(5);
- // Suggestions with missing space
- if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
- && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
- for (int i = 1; i < codesSize; ++i) {
- if (DEBUG_DICT) {
- LOGI("--- Suggest missing space characters %d", i);
- }
- getMissingSpaceWords(mInputLength, i, mCorrection, useFullEditDistance);
- }
- }
- PROF_END(5);
-
- PROF_START(6);
- if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
- // The first and last "mistyped spaces" are taken care of by excessive character handling
- for (int i = 1; i < codesSize - 1; ++i) {
- if (DEBUG_DICT) {
- LOGI("--- Suggest words with proximity space %d", i);
- }
- const int x = xcoordinates[i];
- const int y = ycoordinates[i];
- if (DEBUG_PROXIMITY_INFO) {
- LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
- i, x, y, proximityInfo->hasSpaceProximity(x, y));
- }
- if (proximityInfo->hasSpaceProximity(x, y)) {
- getMistypedSpaceWords(mInputLength, i, mCorrection, useFullEditDistance);
- }
- }
- }
- PROF_END(6);
-}
-
-void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates,
- const int *yCoordinates, const int *codes, const int codesSize,
- unsigned short *outWords, int *frequencies) {
- if (DEBUG_DICT) {
- LOGI("initSuggest");
- }
- mFrequencies = frequencies;
- mOutputChars = outWords;
- mInputLength = codesSize;
- proximityInfo->setInputParams(codes, codesSize, xCoordinates, yCoordinates);
- mProximityInfo = proximityInfo;
-}
-
-static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
- if (c < nextLettersSize) {
- nextLetters[c]++;
- }
-}
-
-// TODO: We need to optimize addWord by using STL or something
-// TODO: This needs to take an const unsigned short* and not tinker with its contents
-bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
- word[length] = 0;
- if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
-#ifdef FLAG_DBG
- char s[length + 1];
- for (int i = 0; i <= length; i++) s[i] = word[i];
- LOGI("Found word = %s, freq = %d", s, frequency);
-#endif
- }
- if (length > MAX_WORD_LENGTH) {
- if (DEBUG_DICT) {
- LOGI("Exceeded max word length.");
- }
- return false;
- }
-
- // Find the right insertion point
- int insertAt = 0;
- while (insertAt < MAX_WORDS) {
- // TODO: How should we sort words with the same frequency?
- if (frequency > mFrequencies[insertAt]) {
- break;
- }
- insertAt++;
- }
- if (insertAt < MAX_WORDS) {
- if (DEBUG_DICT) {
-#ifdef FLAG_DBG
- char s[length + 1];
- for (int i = 0; i <= length; i++) s[i] = word[i];
- LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);
-#endif
- }
- memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
- (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
- (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
- mFrequencies[insertAt] = frequency;
- memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
- (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),
- (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
- unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
- while (length--) {
- *dest++ = *word++;
- }
- *dest = 0; // NULL terminate
- if (DEBUG_DICT) {
- LOGI("Added word at %d", insertAt);
- }
- return true;
- }
- return false;
-}
-
-static const char QUOTE = '\'';
-static const char SPACE = ' ';
-
-void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance) {
- // TODO: Remove setCorrectionParams
- mCorrection->setCorrectionParams(0, 0, 0,
- -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance);
- int rootPosition = ROOT_POS;
- // Get the number of children of root, then increment the position
- int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
- int outputIndex = 0;
-
- mCorrection->initCorrectionState(rootPosition, childCount, (mInputLength <= 0));
-
- // Depth first search
- while (outputIndex >= 0) {
- if (mCorrection->initProcessState(outputIndex)) {
- int siblingPos = mCorrection->getTreeSiblingPos(outputIndex);
- int firstChildPos;
-
- const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos,
- mCorrection, &childCount, &firstChildPos, &siblingPos);
- // Update next sibling pos
- mCorrection->setTreeSiblingPos(outputIndex, siblingPos);
-
- if (needsToTraverseChildrenNodes) {
- // Goes to child node
- outputIndex = mCorrection->goDownTree(outputIndex, childCount, firstChildPos);
- }
- } else {
- // Goes to parent sibling node
- outputIndex = mCorrection->getTreeParentIndex(outputIndex);
- }
- }
-}
-
-void UnigramDictionary::getMissingSpaceWords(
- const int inputLength, const int missingSpacePos, Correction *correction,
- const bool useFullEditDistance) {
- correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
- -1 /* transposedPos */, -1 /* spaceProximityPos */, missingSpacePos,
- useFullEditDistance);
- getSplitTwoWordsSuggestion(inputLength, correction);
-}
-
-void UnigramDictionary::getMistypedSpaceWords(
- const int inputLength, const int spaceProximityPos, Correction *correction,
- const bool useFullEditDistance) {
- correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
- -1 /* transposedPos */, spaceProximityPos, -1 /* missingSpacePos */,
- useFullEditDistance);
- getSplitTwoWordsSuggestion(inputLength, correction);
-}
-
-inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
- const int inputIndex, const int skipPos, const int depth) {
- const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(inputIndex);
- // Skip the ' or other letter and continue deeper
- return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
-}
-
-inline void UnigramDictionary::onTerminal(const int freq, Correction *correction) {
- int wordLength;
- unsigned short* wordPointer;
- const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
- if (finalFreq >= 0) {
- addWord(wordPointer, wordLength, finalFreq);
- }
-}
-
-void UnigramDictionary::getSplitTwoWordsSuggestion(
- const int inputLength, Correction* correction) {
- const int spaceProximityPos = correction->getSpaceProximityPos();
- const int missingSpacePos = correction->getMissingSpacePos();
- if (DEBUG_DICT) {
- int inputCount = 0;
- if (spaceProximityPos >= 0) ++inputCount;
- if (missingSpacePos >= 0) ++inputCount;
- assert(inputCount <= 1);
- }
- const bool isSpaceProximity = spaceProximityPos >= 0;
- const int firstWordStartPos = 0;
- const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos;
- const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
- const int secondWordLength = isSpaceProximity
- ? (inputLength - spaceProximityPos - 1)
- : (inputLength - missingSpacePos);
-
- if (inputLength >= MAX_WORD_LENGTH) return;
- if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
- || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
- return;
-
- const int newWordLength = firstWordLength + secondWordLength + 1;
- // Allocating variable length array on stack
- unsigned short word[newWordLength];
- const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord);
- if (DEBUG_DICT) {
- LOGI("First freq: %d", firstFreq);
- }
- if (firstFreq <= 0) return;
-
- for (int i = 0; i < firstWordLength; ++i) {
- word[i] = mWord[i];
- }
-
- const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord);
- if (DEBUG_DICT) {
- LOGI("Second freq: %d", secondFreq);
- }
- if (secondFreq <= 0) return;
-
- word[firstWordLength] = SPACE;
- for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
- word[i] = mWord[i - firstWordLength - 1];
- }
-
- const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq, word);
- if (DEBUG_DICT) {
- LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
- }
- addWord(word, newWordLength, pairFreq);
- return;
-}
-
-// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
-// interface.
-inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
- const int inputLength, unsigned short *word) {
- uint16_t inWord[inputLength];
-
- for (int i = 0; i < inputLength; ++i) {
- inWord[i] = (uint16_t)mProximityInfo->getPrimaryCharAt(startInputIndex + i);
- }
- return getMostFrequentWordLikeInner(inWord, inputLength, word);
-}
-
-// This function will take the position of a character array within a CharGroup,
-// and check it actually like-matches the word in inWord starting at startInputIndex,
-// that is, it matches it with case and accents squashed.
-// The function returns true if there was a full match, false otherwise.
-// The function will copy on-the-fly the characters in the CharGroup to outNewWord.
-// It will also place the end position of the array in outPos; in outInputIndex,
-// it will place the index of the first char AFTER the match if there was a match,
-// and the initial position if there was not. It makes sense because if there was
-// a match we want to continue searching, but if there was not, we want to go to
-// the next CharGroup.
-// In and out parameters may point to the same location. This function takes care
-// not to use any input parameters after it wrote into its outputs.
-static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
- const uint8_t* const root, const int startPos,
- const uint16_t* const inWord, const int startInputIndex,
- int32_t* outNewWord, int* outInputIndex, int* outPos) {
- const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
- int pos = startPos;
- int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
- int32_t baseChar = Dictionary::toBaseLowerCase(character);
- const uint16_t wChar = Dictionary::toBaseLowerCase(inWord[startInputIndex]);
-
- if (baseChar != wChar) {
- *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
- *outInputIndex = startInputIndex;
- return false;
- }
- int inputIndex = startInputIndex;
- outNewWord[inputIndex] = character;
- if (hasMultipleChars) {
- character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
- while (NOT_A_CHARACTER != character) {
- baseChar = Dictionary::toBaseLowerCase(character);
- if (Dictionary::toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
- *outPos = BinaryFormat::skipOtherCharacters(root, pos);
- *outInputIndex = startInputIndex;
- return false;
- }
- outNewWord[inputIndex] = character;
- character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
- }
- }
- *outInputIndex = inputIndex + 1;
- *outPos = pos;
- return true;
-}
-
-// This function is invoked when a word like the word searched for is found.
-// It will compare the frequency to the max frequency, and if greater, will
-// copy the word into the output buffer. In output value maxFreq, it will
-// write the new maximum frequency if it changed.
-static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length,
- short unsigned int* outWord, int* maxFreq) {
- if (freq > *maxFreq) {
- for (int q = 0; q < length; ++q)
- outWord[q] = newWord[q];
- outWord[length] = 0;
- *maxFreq = freq;
- }
-}
-
-// Will find the highest frequency of the words like the one passed as an argument,
-// that is, everything that only differs by case/accents.
-int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord,
- const int length, short unsigned int* outWord) {
- int32_t newWord[MAX_WORD_LENGTH_INTERNAL];
- int depth = 0;
- int maxFreq = -1;
- const uint8_t* const root = DICT_ROOT;
-
- mStackChildCount[0] = root[0];
- mStackInputIndex[0] = 0;
- mStackSiblingPos[0] = 1;
- while (depth >= 0) {
- const int charGroupCount = mStackChildCount[depth];
- int pos = mStackSiblingPos[depth];
- for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) {
- int inputIndex = mStackInputIndex[depth];
- const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
- // Test whether all chars in this group match with the word we are searching for. If so,
- // we want to traverse its children (or if the length match, evaluate its frequency).
- // Note that this function will output the position regardless, but will only write
- // into inputIndex if there is a match.
- const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord,
- inputIndex, newWord, &inputIndex, &pos);
- if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) {
- const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
- onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq);
- }
- pos = BinaryFormat::skipFrequency(flags, pos);
- const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
- const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos);
- // If we had a match and the word has children, we want to traverse them. We don't have
- // to traverse words longer than the one we are searching for, since they will not match
- // anyway, so don't traverse unless inputIndex < length.
- if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) {
- // Save position for this depth, to get back to this once children are done
- mStackChildCount[depth] = charGroupIndex;
- mStackSiblingPos[depth] = siblingPos;
- // Prepare stack values for next depth
- ++depth;
- int childrenPos = childrenNodePos;
- mStackChildCount[depth] =
- BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos);
- mStackSiblingPos[depth] = childrenPos;
- mStackInputIndex[depth] = inputIndex;
- pos = childrenPos;
- // Go to the next depth level.
- ++depth;
- break;
- } else {
- // No match, or no children, or word too long to ever match: go the next sibling.
- pos = siblingPos;
- }
- }
- --depth;
- }
- return maxFreq;
-}
-
-bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
- return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);
-}
-
-// TODO: remove this function.
-int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
- int length) const {
- return -1;
-}
-
-// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not.
-// If the return value is false, then the caller should read in the output "nextSiblingPosition"
-// to find out the address of the next sibling node and pass it to a new call of processCurrentNode.
-// It is worthy to note that when false is returned, the output values other than
-// nextSiblingPosition are undefined.
-// If the return value is true, then the caller must proceed to traverse the children of this
-// node. processCurrentNode will output the information about the children: their count in
-// newCount, their position in newChildrenPosition, the traverseAllNodes flag in
-// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the
-// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into
-// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when
-// there aren't any more nodes at this level, it merely returns the address of the first byte after
-// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any
-// given level, as output into newCount when traversing this level's parent.
-inline bool UnigramDictionary::processCurrentNode(const int initialPos,
- Correction *correction, int *newCount,
- int *newChildrenPosition, int *nextSiblingPosition) {
- if (DEBUG_DICT) {
- correction->checkState();
- }
- int pos = initialPos;
-
- // Flags contain the following information:
- // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits:
- // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address
- // is on the specified number of bytes.
- // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address.
- // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
- // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
- // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
- const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
- const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
- const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags));
-
- bool needsToInvokeOnTerminal = false;
-
- // This gets only ONE character from the stream. Next there will be:
- // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node
- // else if FLAG_IS_TERMINAL: the frequency
- // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
- // Note that you can't have a node that both is not a terminal and has no children.
- int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);
- assert(NOT_A_CHARACTER != c);
-
- // We are going to loop through each character and make it look like it's a different
- // node each time. To do that, we will process characters in this node in order until
- // we find the character terminator. This is signalled by getCharCode* returning
- // NOT_A_CHARACTER.
- // As a special case, if there is only one character in this node, we must not read the
- // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags.
- // This way, each loop run will look like a "virtual node".
- do {
- // We prefetch the next char. If 'c' is the last char of this node, we will have
- // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node
- // should behave as a terminal or not and whether we have children.
- const int32_t nextc = hasMultipleChars
- ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER;
- const bool isLastChar = (NOT_A_CHARACTER == nextc);
- // If there are more chars in this nodes, then this virtual node is not a terminal.
- // If we are on the last char, this virtual node is a terminal if this node is.
- const bool isTerminal = isLastChar && isTerminalNode;
-
- Correction::CorrectionType stateType = correction->processCharAndCalcState(
- c, isTerminal);
- if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL
- || stateType == Correction::ON_TERMINAL) {
- needsToInvokeOnTerminal = true;
- } else if (stateType == Correction::UNRELATED) {
- // We found that this is an unrelated character, so we should give up traversing
- // this node and its children entirely.
- // However we may not be on the last virtual node yet so we skip the remaining
- // characters in this node, the frequency if it's there, read the next sibling
- // position to output it, then return false.
- // We don't have to output other values because we return false, as in
- // "don't traverse children".
- if (!isLastChar) {
- pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
- }
- pos = BinaryFormat::skipFrequency(flags, pos);
- *nextSiblingPosition =
- BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
- return false;
- }
-
- // Prepare for the next character. Promote the prefetched char to current char - the loop
- // will take care of prefetching the next. If we finally found our last char, nextc will
- // contain NOT_A_CHARACTER.
- c = nextc;
- } while (NOT_A_CHARACTER != c);
-
- if (isTerminalNode) {
- if (needsToInvokeOnTerminal) {
- // The frequency should be here, because we come here only if this is actually
- // a terminal node, and we are on its last char.
- const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
- onTerminal(freq, mCorrection);
- }
-
- // If there are more chars in this node, then this virtual node has children.
- // If we are on the last char, this virtual node has children if this node has.
- const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);
-
- // This character matched the typed character (enough to traverse the node at least)
- // so we just evaluated it. Now we should evaluate this virtual node's children - that
- // is, if it has any. If it has no children, we're done here - so we skip the end of
- // the node, output the siblings position, and return false "don't traverse children".
- // Note that !hasChildren implies isLastChar, so we know we don't have to skip any
- // remaining char in this group for there can't be any.
- if (!hasChildren) {
- pos = BinaryFormat::skipFrequency(flags, pos);
- *nextSiblingPosition =
- BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
- return false;
- }
-
- // Optimization: Prune out words that are too long compared to how much was typed.
- if (correction->needsToPrune()) {
- pos = BinaryFormat::skipFrequency(flags, pos);
- *nextSiblingPosition =
- BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
- if (DEBUG_DICT_FULL) {
- LOGI("Traversing was pruned.");
- }
- return false;
- }
- }
-
- // Now we finished processing this node, and we want to traverse children. If there are no
- // children, we can't come here.
- assert(BinaryFormat::hasChildrenInFlags(flags));
-
- // If this node was a terminal it still has the frequency under the pointer (it may have been
- // read, but not skipped - see readFrequencyWithoutMovingPointer).
- // Next come the children position, then possibly attributes (attributes are bigrams only for
- // now, maybe something related to shortcuts in the future).
- // Once this is read, we still need to output the number of nodes in the immediate children of
- // this node, so we read and output it before returning true, as in "please traverse children".
- pos = BinaryFormat::skipFrequency(flags, pos);
- int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
- *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
- *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
- *newChildrenPosition = childrenPos;
- return true;
-}
-
-} // namespace latinime