diff options
Diffstat (limited to 'native')
26 files changed, 1664 insertions, 790 deletions
diff --git a/native/Android.mk b/native/Android.mk index cf1da4a41..5053e7d64 100644 --- a/native/Android.mk +++ b/native/Android.mk @@ -1,60 +1 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) - -LOCAL_C_INCLUDES += $(LOCAL_PATH)/src - -LOCAL_CFLAGS += -Werror -Wall - -# To suppress compiler warnings for unused variables/functions used for debug features etc. -LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function - -LOCAL_SRC_FILES := \ - jni/com_android_inputmethod_keyboard_ProximityInfo.cpp \ - jni/com_android_inputmethod_latin_BinaryDictionary.cpp \ - jni/jni_common.cpp \ - src/bigram_dictionary.cpp \ - src/char_utils.cpp \ - src/correction.cpp \ - src/dictionary.cpp \ - src/proximity_info.cpp \ - src/unigram_dictionary.cpp - -#FLAG_DBG := true -#FLAG_DO_PROFILE := true - -TARGETING_UNBUNDLED_FROYO := true - -ifeq ($(TARGET_ARCH), x86) - TARGETING_UNBUNDLED_FROYO := false -endif - -ifeq ($(FLAG_DBG), true) - TARGETING_UNBUNDLED_FROYO := false -endif - -ifeq ($(FLAG_DO_PROFILE), true) - TARGETING_UNBUNDLED_FROYO := false -endif - -ifeq ($(TARGETING_UNBUNDLED_FROYO), true) - LOCAL_NDK_VERSION := 4 - LOCAL_SDK_VERSION := 8 -endif - -LOCAL_MODULE := libjni_latinime - -LOCAL_MODULE_TAGS := optional - -ifeq ($(FLAG_DO_PROFILE), true) - $(warning Making profiling version of native library) - LOCAL_CFLAGS += -DFLAG_DO_PROFILE - LOCAL_SHARED_LIBRARIES := libcutils libutils -else # FLAG_DO_PROFILE -ifeq ($(FLAG_DBG), true) - $(warning Making debug version of native library) - LOCAL_CFLAGS += -DFLAG_DBG - LOCAL_SHARED_LIBRARIES := libcutils libutils -endif # FLAG_DBG -endif # FLAG_DO_PROFILE - -include $(BUILD_SHARED_LIBRARY) +include $(call all-subdir-makefiles) diff --git a/native/jni/Android.mk b/native/jni/Android.mk new file mode 100644 index 000000000..53bd21ddc --- /dev/null +++ b/native/jni/Android.mk @@ -0,0 +1,87 @@ +# Copyright (C) 2011 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LOCAL_PATH := $(call my-dir) +include $(CLEAR_VARS) + +LATIN_IME_SRC_DIR := ../src + +LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_SRC_DIR) + +LOCAL_CFLAGS += -Werror -Wall + +# To suppress compiler warnings for unused variables/functions used for debug features etc. +LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function + +LATIN_IME_JNI_SRC_FILES := \ + com_android_inputmethod_keyboard_ProximityInfo.cpp \ + com_android_inputmethod_latin_BinaryDictionary.cpp \ + jni_common.cpp + +LATIN_IME_CORE_SRC_FILES := \ + basechars.cpp \ + bigram_dictionary.cpp \ + char_utils.cpp \ + correction.cpp \ + dictionary.cpp \ + proximity_info.cpp \ + unigram_dictionary.cpp + +LOCAL_SRC_FILES := \ + $(LATIN_IME_JNI_SRC_FILES) \ + $(addprefix $(LATIN_IME_SRC_DIR)/,$(LATIN_IME_CORE_SRC_FILES)) + +#FLAG_DBG := true +#FLAG_DO_PROFILE := true + +TARGETING_UNBUNDLED_FROYO := true + +ifeq ($(TARGET_ARCH), x86) + TARGETING_UNBUNDLED_FROYO := false +endif + +ifeq ($(FLAG_DBG), true) + TARGETING_UNBUNDLED_FROYO := false +endif + +ifeq ($(FLAG_DO_PROFILE), true) + TARGETING_UNBUNDLED_FROYO := false +endif + +ifeq ($(TARGETING_UNBUNDLED_FROYO), true) + LOCAL_NDK_VERSION := 4 + LOCAL_SDK_VERSION := 8 +endif + +LOCAL_MODULE := libjni_latinime + +LOCAL_MODULE_TAGS := optional + +# For STL +LOCAL_C_INCLUDES += external/stlport/stlport bionic +LOCAL_SHARED_LIBRARIES += libstlport + +ifeq ($(FLAG_DO_PROFILE), true) + $(warning Making profiling version of native library) + LOCAL_CFLAGS += -DFLAG_DO_PROFILE + LOCAL_SHARED_LIBRARIES += libcutils libutils +else # FLAG_DO_PROFILE +ifeq ($(FLAG_DBG), true) + $(warning Making debug version of native library) + LOCAL_CFLAGS += -DFLAG_DBG + LOCAL_SHARED_LIBRARIES += libcutils libutils +endif # FLAG_DBG +endif # FLAG_DO_PROFILE + +include $(BUILD_SHARED_LIBRARY) diff --git a/native/jni/Application.mk b/native/jni/Application.mk new file mode 100644 index 000000000..caf3b2622 --- /dev/null +++ b/native/jni/Application.mk @@ -0,0 +1 @@ +APP_STL := stlport_static diff --git a/native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp b/native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp index 595ea2fdc..6e4fefd72 100644 --- a/native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp +++ b/native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp @@ -28,14 +28,14 @@ namespace latinime { -static jint latinime_Keyboard_setProximityInfo(JNIEnv *env, jobject object, +static jlong latinime_Keyboard_setProximityInfo(JNIEnv *env, jobject object, jint maxProximityCharsSize, jint displayWidth, jint displayHeight, jint gridWidth, jint gridHeight, jintArray proximityCharsArray, jint keyCount, jintArray keyXCoordinateArray, jintArray keyYCoordinateArray, jintArray keyWidthArray, jintArray keyHeightArray, jintArray keyCharCodeArray, jfloatArray sweetSpotCenterXArray, jfloatArray sweetSpotCenterYArray, jfloatArray sweetSpotRadiusArray) { - jint *proximityChars = env->GetIntArrayElements(proximityCharsArray, NULL); + jint *proximityChars = env->GetIntArrayElements(proximityCharsArray, 0); jint *keyXCoordinates = safeGetIntArrayElements(env, keyXCoordinateArray); jint *keyYCoordinates = safeGetIntArrayElements(env, keyYCoordinateArray); jint *keyWidths = safeGetIntArrayElements(env, keyWidthArray); @@ -59,19 +59,19 @@ static jint latinime_Keyboard_setProximityInfo(JNIEnv *env, jobject object, safeReleaseIntArrayElements(env, keyYCoordinateArray, keyYCoordinates); safeReleaseIntArrayElements(env, keyXCoordinateArray, keyXCoordinates); env->ReleaseIntArrayElements(proximityCharsArray, proximityChars, 0); - return (jint)proximityInfo; + return (jlong)proximityInfo; } -static void latinime_Keyboard_release(JNIEnv *env, jobject object, jint proximityInfo) { +static void latinime_Keyboard_release(JNIEnv *env, jobject object, jlong proximityInfo) { ProximityInfo *pi = (ProximityInfo*)proximityInfo; if (!pi) return; delete pi; } static JNINativeMethod sKeyboardMethods[] = { - {"setProximityInfoNative", "(IIIII[II[I[I[I[I[I[F[F[F)I", + {"setProximityInfoNative", "(IIIII[II[I[I[I[I[I[F[F[F)J", (void*)latinime_Keyboard_setProximityInfo}, - {"releaseProximityInfoNative", "(I)V", (void*)latinime_Keyboard_release} + {"releaseProximityInfoNative", "(J)V", (void*)latinime_Keyboard_release} }; int register_ProximityInfo(JNIEnv *env) { diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 18c972444..f2878eea5 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -18,6 +18,7 @@ #define LOG_TAG "LatinIME: jni: BinaryDictionary" #include "binary_format.h" +#include "correction.h" #include "com_android_inputmethod_latin_BinaryDictionary.h" #include "dictionary.h" #include "jni.h" @@ -33,6 +34,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> +#include <unistd.h> #else // USE_MMAP_FOR_DICTIONARY #include <stdlib.h> #endif // USE_MMAP_FOR_DICTIONARY @@ -41,75 +43,75 @@ namespace latinime { void releaseDictBuf(void* dictBuf, const size_t length, int fd); -static jint latinime_BinaryDictionary_open(JNIEnv *env, jobject object, +static jlong latinime_BinaryDictionary_open(JNIEnv *env, jobject object, jstring sourceDir, jlong dictOffset, jlong dictSize, jint typedLetterMultiplier, jint fullWordMultiplier, jint maxWordLength, jint maxWords, jint maxAlternatives) { PROF_OPEN; PROF_START(66); - const char *sourceDirChars = env->GetStringUTFChars(sourceDir, NULL); - if (sourceDirChars == NULL) { - LOGE("DICT: Can't get sourceDir string"); + const char *sourceDirChars = env->GetStringUTFChars(sourceDir, 0); + if (sourceDirChars == 0) { + AKLOGE("DICT: Can't get sourceDir string"); return 0; } int fd = 0; - void *dictBuf = NULL; + void *dictBuf = 0; int adjust = 0; #ifdef USE_MMAP_FOR_DICTIONARY /* mmap version */ fd = open(sourceDirChars, O_RDONLY); if (fd < 0) { - LOGE("DICT: Can't open sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno); + AKLOGE("DICT: Can't open sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno); return 0; } int pagesize = getpagesize(); adjust = dictOffset % pagesize; int adjDictOffset = dictOffset - adjust; int adjDictSize = dictSize + adjust; - dictBuf = mmap(NULL, sizeof(char) * adjDictSize, PROT_READ, MAP_PRIVATE, fd, adjDictOffset); + dictBuf = mmap(0, sizeof(char) * adjDictSize, PROT_READ, MAP_PRIVATE, fd, adjDictOffset); if (dictBuf == MAP_FAILED) { - LOGE("DICT: Can't mmap dictionary. errno=%d", errno); + AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno); return 0; } dictBuf = (void *)((char *)dictBuf + adjust); #else // USE_MMAP_FOR_DICTIONARY /* malloc version */ - FILE *file = NULL; + FILE *file = 0; file = fopen(sourceDirChars, "rb"); - if (file == NULL) { - LOGE("DICT: Can't fopen sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno); + if (file == 0) { + AKLOGE("DICT: Can't fopen sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno); return 0; } dictBuf = malloc(sizeof(char) * dictSize); if (!dictBuf) { - LOGE("DICT: Can't allocate memory region for dictionary. errno=%d", errno); + AKLOGE("DICT: Can't allocate memory region for dictionary. errno=%d", errno); return 0; } int ret = fseek(file, (long)dictOffset, SEEK_SET); if (ret != 0) { - LOGE("DICT: Failure in fseek. ret=%d errno=%d", ret, errno); + AKLOGE("DICT: Failure in fseek. ret=%d errno=%d", ret, errno); return 0; } ret = fread(dictBuf, sizeof(char) * dictSize, 1, file); if (ret != 1) { - LOGE("DICT: Failure in fread. ret=%d errno=%d", ret, errno); + AKLOGE("DICT: Failure in fread. ret=%d errno=%d", ret, errno); return 0; } ret = fclose(file); if (ret != 0) { - LOGE("DICT: Failure in fclose. ret=%d errno=%d", ret, errno); + AKLOGE("DICT: Failure in fclose. ret=%d errno=%d", ret, errno); return 0; } #endif // USE_MMAP_FOR_DICTIONARY env->ReleaseStringUTFChars(sourceDir, sourceDirChars); if (!dictBuf) { - LOGE("DICT: dictBuf is null"); + AKLOGE("DICT: dictBuf is null"); return 0; } - Dictionary *dictionary = NULL; + Dictionary *dictionary = 0; if (BinaryFormat::UNKNOWN_FORMAT == BinaryFormat::detectFormat((uint8_t*)dictBuf)) { - LOGE("DICT: dictionary format is unknown, bad magic number"); + AKLOGE("DICT: dictionary format is unknown, bad magic number"); #ifdef USE_MMAP_FOR_DICTIONARY releaseDictBuf(((char*)dictBuf) - adjust, adjDictSize, fd); #else // USE_MMAP_FOR_DICTIONARY @@ -121,23 +123,23 @@ static jint latinime_BinaryDictionary_open(JNIEnv *env, jobject object, } PROF_END(66); PROF_CLOSE; - return (jint)dictionary; + return (jlong)dictionary; } -static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jobject object, jint dict, - jint proximityInfo, jintArray xCoordinatesArray, jintArray yCoordinatesArray, +static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jobject object, jlong dict, + jlong proximityInfo, jintArray xCoordinatesArray, jintArray yCoordinatesArray, jintArray inputArray, jint arraySize, jint flags, jcharArray outputArray, jintArray frequencyArray) { Dictionary *dictionary = (Dictionary*)dict; if (!dictionary) return 0; ProximityInfo *pInfo = (ProximityInfo*)proximityInfo; - int *xCoordinates = env->GetIntArrayElements(xCoordinatesArray, NULL); - int *yCoordinates = env->GetIntArrayElements(yCoordinatesArray, NULL); + int *xCoordinates = env->GetIntArrayElements(xCoordinatesArray, 0); + int *yCoordinates = env->GetIntArrayElements(yCoordinatesArray, 0); - int *frequencies = env->GetIntArrayElements(frequencyArray, NULL); - int *inputCodes = env->GetIntArrayElements(inputArray, NULL); - jchar *outputChars = env->GetCharArrayElements(outputArray, NULL); + int *frequencies = env->GetIntArrayElements(frequencyArray, 0); + int *inputCodes = env->GetIntArrayElements(inputArray, 0); + jchar *outputChars = env->GetCharArrayElements(outputArray, 0); int count = dictionary->getSuggestions(pInfo, xCoordinates, yCoordinates, inputCodes, arraySize, flags, (unsigned short*) outputChars, frequencies); @@ -151,17 +153,17 @@ static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jobject object, return count; } -static int latinime_BinaryDictionary_getBigrams(JNIEnv *env, jobject object, jint dict, +static int latinime_BinaryDictionary_getBigrams(JNIEnv *env, jobject object, jlong dict, jcharArray prevWordArray, jint prevWordLength, jintArray inputArray, jint inputArraySize, jcharArray outputArray, jintArray frequencyArray, jint maxWordLength, jint maxBigrams, jint maxAlternatives) { Dictionary *dictionary = (Dictionary*)dict; if (!dictionary) return 0; - jchar *prevWord = env->GetCharArrayElements(prevWordArray, NULL); - int *inputCodes = env->GetIntArrayElements(inputArray, NULL); - jchar *outputChars = env->GetCharArrayElements(outputArray, NULL); - int *frequencies = env->GetIntArrayElements(frequencyArray, NULL); + jchar *prevWord = env->GetCharArrayElements(prevWordArray, 0); + int *inputCodes = env->GetIntArrayElements(inputArray, 0); + jchar *outputChars = env->GetCharArrayElements(outputArray, 0); + int *frequencies = env->GetIntArrayElements(frequencyArray, 0); int count = dictionary->getBigrams((unsigned short*) prevWord, prevWordLength, inputCodes, inputArraySize, (unsigned short*) outputChars, frequencies, maxWordLength, maxBigrams, @@ -175,19 +177,42 @@ static int latinime_BinaryDictionary_getBigrams(JNIEnv *env, jobject object, jin return count; } -static jboolean latinime_BinaryDictionary_isValidWord(JNIEnv *env, jobject object, jint dict, +static jboolean latinime_BinaryDictionary_isValidWord(JNIEnv *env, jobject object, jlong dict, jcharArray wordArray, jint wordLength) { Dictionary *dictionary = (Dictionary*)dict; if (!dictionary) return (jboolean) false; - jchar *word = env->GetCharArrayElements(wordArray, NULL); + jchar *word = env->GetCharArrayElements(wordArray, 0); jboolean result = dictionary->isValidWord((unsigned short*) word, wordLength); env->ReleaseCharArrayElements(wordArray, word, JNI_ABORT); return result; } -static void latinime_BinaryDictionary_close(JNIEnv *env, jobject object, jint dict) { +static jdouble latinime_BinaryDictionary_calcNormalizedScore(JNIEnv *env, jobject object, + jcharArray before, jint beforeLength, jcharArray after, jint afterLength, jint score) { + jchar *beforeChars = env->GetCharArrayElements(before, 0); + jchar *afterChars = env->GetCharArrayElements(after, 0); + jdouble result = Correction::RankingAlgorithm::calcNormalizedScore( + (unsigned short*)beforeChars, beforeLength, (unsigned short*)afterChars, afterLength, + score); + env->ReleaseCharArrayElements(before, beforeChars, JNI_ABORT); + env->ReleaseCharArrayElements(after, afterChars, JNI_ABORT); + return result; +} + +static jint latinime_BinaryDictionary_editDistance(JNIEnv *env, jobject object, + jcharArray before, jint beforeLength, jcharArray after, jint afterLength) { + jchar *beforeChars = env->GetCharArrayElements(before, 0); + jchar *afterChars = env->GetCharArrayElements(after, 0); + jint result = Correction::RankingAlgorithm::editDistance( + (unsigned short*)beforeChars, beforeLength, (unsigned short*)afterChars, afterLength); + env->ReleaseCharArrayElements(before, beforeChars, JNI_ABORT); + env->ReleaseCharArrayElements(after, afterChars, JNI_ABORT); + return result; +} + +static void latinime_BinaryDictionary_close(JNIEnv *env, jobject object, jlong dict) { Dictionary *dictionary = (Dictionary*)dict; if (!dictionary) return; void *dictBuf = dictionary->getDict(); @@ -205,11 +230,11 @@ void releaseDictBuf(void* dictBuf, const size_t length, int fd) { #ifdef USE_MMAP_FOR_DICTIONARY int ret = munmap(dictBuf, length); if (ret != 0) { - LOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); + AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); } ret = close(fd); if (ret != 0) { - LOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno); + AKLOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno); } #else // USE_MMAP_FOR_DICTIONARY free(dictBuf); @@ -217,11 +242,14 @@ void releaseDictBuf(void* dictBuf, const size_t length, int fd) { } static JNINativeMethod sMethods[] = { - {"openNative", "(Ljava/lang/String;JJIIIII)I", (void*)latinime_BinaryDictionary_open}, - {"closeNative", "(I)V", (void*)latinime_BinaryDictionary_close}, - {"getSuggestionsNative", "(II[I[I[III[C[I)I", (void*)latinime_BinaryDictionary_getSuggestions}, - {"isValidWordNative", "(I[CI)Z", (void*)latinime_BinaryDictionary_isValidWord}, - {"getBigramsNative", "(I[CI[II[C[IIII)I", (void*)latinime_BinaryDictionary_getBigrams} + {"openNative", "(Ljava/lang/String;JJIIIII)J", (void*)latinime_BinaryDictionary_open}, + {"closeNative", "(J)V", (void*)latinime_BinaryDictionary_close}, + {"getSuggestionsNative", "(JJ[I[I[III[C[I)I", (void*)latinime_BinaryDictionary_getSuggestions}, + {"isValidWordNative", "(J[CI)Z", (void*)latinime_BinaryDictionary_isValidWord}, + {"getBigramsNative", "(J[CI[II[C[IIII)I", (void*)latinime_BinaryDictionary_getBigrams}, + {"calcNormalizedScoreNative", "([CI[CII)D", + (void*)latinime_BinaryDictionary_calcNormalizedScore}, + {"editDistanceNative", "([CI[CI)I", (void*)latinime_BinaryDictionary_editDistance} }; int register_BinaryDictionary(JNIEnv *env) { diff --git a/native/jni/jni_common.cpp b/native/jni/jni_common.cpp index 8643f723f..85d26836a 100644 --- a/native/jni/jni_common.cpp +++ b/native/jni/jni_common.cpp @@ -32,22 +32,22 @@ using namespace latinime; * Returns the JNI version on success, -1 on failure. */ jint JNI_OnLoad(JavaVM* vm, void* reserved) { - JNIEnv* env = NULL; + JNIEnv* env = 0; jint result = -1; if (vm->GetEnv((void**) &env, JNI_VERSION_1_4) != JNI_OK) { - LOGE("ERROR: GetEnv failed"); + AKLOGE("ERROR: GetEnv failed"); goto bail; } - assert(env != NULL); + assert(env != 0); if (!register_BinaryDictionary(env)) { - LOGE("ERROR: BinaryDictionary native registration failed"); + AKLOGE("ERROR: BinaryDictionary native registration failed"); goto bail; } if (!register_ProximityInfo(env)) { - LOGE("ERROR: ProximityInfo native registration failed"); + AKLOGE("ERROR: ProximityInfo native registration failed"); goto bail; } @@ -63,12 +63,12 @@ namespace latinime { int registerNativeMethods(JNIEnv* env, const char* className, JNINativeMethod* methods, int numMethods) { jclass clazz = env->FindClass(className); - if (clazz == NULL) { - LOGE("Native registration unable to find class '%s'", className); + if (clazz == 0) { + AKLOGE("Native registration unable to find class '%s'", className); return JNI_FALSE; } if (env->RegisterNatives(clazz, methods, numMethods) < 0) { - LOGE("RegisterNatives failed for '%s'", className); + AKLOGE("RegisterNatives failed for '%s'", className); env->DeleteLocalRef(clazz); return JNI_FALSE; } diff --git a/native/jni/jni_common.h b/native/jni/jni_common.h index 9548e1b3f..6741443ac 100644 --- a/native/jni/jni_common.h +++ b/native/jni/jni_common.h @@ -29,17 +29,17 @@ int registerNativeMethods(JNIEnv *env, const char *className, JNINativeMethod *m inline jint *safeGetIntArrayElements(JNIEnv *env, jintArray jArray) { if (jArray) { - return env->GetIntArrayElements(jArray, NULL); + return env->GetIntArrayElements(jArray, 0); } else { - return NULL; + return 0; } } inline jfloat *safeGetFloatArrayElements(JNIEnv *env, jfloatArray jArray) { if (jArray) { - return env->GetFloatArrayElements(jArray, NULL); + return env->GetFloatArrayElements(jArray, 0); } else { - return NULL; + return 0; } } diff --git a/native/src/basechars.h b/native/src/basechars.cpp index 3843e11c5..31f1e18a8 100644 --- a/native/src/basechars.h +++ b/native/src/basechars.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 The Android Open Source Project + * Copyright (C) 2011 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,9 @@ * limitations under the License. */ -#ifndef LATINIME_BASECHARS_H -#define LATINIME_BASECHARS_H +#include "char_utils.h" + +namespace latinime { /** * Table mapping most combined Latin, Greek, and Cyrillic characters @@ -23,7 +24,7 @@ * if c is not a combined character, or the base character if it * is combined. */ -static unsigned short BASE_CHARS[] = { +const unsigned short BASE_CHARS[BASE_CHARS_SIZE] = { 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, @@ -189,4 +190,5 @@ static unsigned short BASE_CHARS[] = { // generated with: // cat UnicodeData.txt | perl -e 'while (<>) { @foo = split(/;/); $foo[5] =~ s/<.*> //; $base[hex($foo[0])] = hex($foo[5]);} for ($i = 0; $i < 0x500; $i += 8) { for ($j = $i; $j < $i + 8; $j++) { printf("0x%04x, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }' -#endif // LATINIME_BASECHARS_H + +} // namespace latinime diff --git a/native/src/bigram_dictionary.cpp b/native/src/bigram_dictionary.cpp index c340c6c1a..db7734bc7 100644 --- a/native/src/bigram_dictionary.cpp +++ b/native/src/bigram_dictionary.cpp @@ -32,8 +32,8 @@ BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion), HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) { if (DEBUG_DICT) { - LOGI("BigramDictionary - constructor"); - LOGI("Has Bigram : %d", hasBigram); + AKLOGI("BigramDictionary - constructor"); + AKLOGI("Has Bigram : %d", hasBigram); } } @@ -46,7 +46,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ #ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = word[i]; - LOGI("Bigram: Found word = %s, freq = %d :", s, frequency); + AKLOGI("Bigram: Found word = %s, freq = %d :", s, frequency); #endif } @@ -60,7 +60,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ insertAt++; } if (DEBUG_DICT) { - LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); + AKLOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); } if (insertAt < mMaxBigrams) { memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]), @@ -76,7 +76,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ } *dest = 0; // NULL terminate if (DEBUG_DICT) { - LOGI("Bigram: Added word at %d", insertAt); + AKLOGI("Bigram: Added word at %d", insertAt); } return true; } diff --git a/native/src/bigram_dictionary.h b/native/src/bigram_dictionary.h index c07458a38..585a1866a 100644 --- a/native/src/bigram_dictionary.h +++ b/native/src/bigram_dictionary.h @@ -21,14 +21,14 @@ namespace latinime { class Dictionary; class BigramDictionary { -public: + public: BigramDictionary(const unsigned char *dict, int maxWordLength, int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram, Dictionary *parentDictionary); int getBigrams(unsigned short *word, int length, int *codes, int codesSize, unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams, int maxAlternatives); ~BigramDictionary(); -private: + private: bool addWordBigram(unsigned short *word, int length, int frequency); int getBigramAddress(int *pos, bool advance); int getBigramFreq(int *pos); diff --git a/native/src/binary_format.h b/native/src/binary_format.h index 6f65088db..1d74998f6 100644 --- a/native/src/binary_format.h +++ b/native/src/binary_format.h @@ -22,12 +22,12 @@ namespace latinime { class BinaryFormat { -private: + private: const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; -public: + public: const static int UNKNOWN_FORMAT = -1; const static int FORMAT_VERSION_1 = 1; const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1; @@ -61,7 +61,9 @@ inline int BinaryFormat::detectFormat(const uint8_t* const dict) { } inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { - return dict[(*pos)++]; + const int msb = dict[(*pos)++]; + if (msb < 0x80) return msb; + return ((msb & 0x7F) << 8) | dict[(*pos)++]; } inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) { @@ -145,15 +147,15 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos) { - // This function skips all attributes. The format makes provision for future extension - // with other attributes (notably shortcuts) but for the time being, bigrams are the - // only attributes that may be found in a character group, so we only look at bigrams - // in this version. + // This function skips all attributes: shortcuts and bigrams. + int newPos = pos; + if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) { + newPos = skipAttributes(dict, newPos); + } if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { - return skipAttributes(dict, pos); - } else { - return pos; + newPos = skipAttributes(dict, newPos); } + return newPos; } inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict, diff --git a/native/src/char_utils.h b/native/src/char_utils.h index a69a35e7a..607dc5195 100644 --- a/native/src/char_utils.h +++ b/native/src/char_utils.h @@ -19,8 +19,47 @@ namespace latinime { +inline static int isAsciiUpper(unsigned short c) { + return c >= 'A' && c <= 'Z'; +} + +inline static unsigned short toAsciiLower(unsigned short c) { + return c - 'A' + 'a'; +} + +inline static int isAscii(unsigned short c) { + return c <= 127; +} + unsigned short latin_tolower(unsigned short c); +/** + * Table mapping most combined Latin, Greek, and Cyrillic characters + * to their base characters. If c is in range, BASE_CHARS[c] == c + * if c is not a combined character, or the base character if it + * is combined. + */ + +static const int BASE_CHARS_SIZE = 0x0500; +extern const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; + +inline static unsigned short toBaseChar(unsigned short c) { + if (c < BASE_CHARS_SIZE) { + return BASE_CHARS[c]; + } + return c; +} + +inline static unsigned short toBaseLowerCase(unsigned short c) { + c = toBaseChar(c); + if (isAsciiUpper(c)) { + return toAsciiLower(c); + } else if (isAscii(c)) { + return c; + } + return latin_tolower(c); +} + } // namespace latinime #endif // LATINIME_CHAR_UTILS_H diff --git a/native/src/correction.cpp b/native/src/correction.cpp index 27dc40745..087219ed4 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -16,12 +16,15 @@ #include <assert.h> #include <ctype.h> +#include <math.h> #include <stdio.h> #include <string.h> #define LOG_TAG "LatinIME: correction.cpp" +#include "char_utils.h" #include "correction.h" +#include "defines.h" #include "dictionary.h" #include "proximity_info.h" @@ -31,81 +34,60 @@ namespace latinime { // edit distance funcitons // ///////////////////////////// -#if 0 /* no longer used */ -inline static int editDistance( - int* editDistanceTable, const unsigned short* input, - const int inputLength, const unsigned short* output, const int outputLength) { - // dp[li][lo] dp[a][b] = dp[ a * lo + b] - int* dp = editDistanceTable; - const int li = inputLength + 1; - const int lo = outputLength + 1; - for (int i = 0; i < li; ++i) { - dp[lo * i] = i; - } - for (int i = 0; i < lo; ++i) { - dp[i] = i; - } - - for (int i = 0; i < li - 1; ++i) { - for (int j = 0; j < lo - 1; ++j) { - const uint32_t ci = Dictionary::toBaseLowerCase(input[i]); - const uint32_t co = Dictionary::toBaseLowerCase(output[j]); - const uint16_t cost = (ci == co) ? 0 : 1; - dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, - min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); - if (i > 0 && j > 0 && ci == Dictionary::toBaseLowerCase(output[j - 1]) - && co == Dictionary::toBaseLowerCase(input[i - 1])) { - dp[(i + 1) * lo + (j + 1)] = min( - dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); - } - } +inline static void initEditDistance(int *editDistanceTable) { + for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) { + editDistanceTable[i] = i; } +} - if (DEBUG_EDIT_DISTANCE) { - LOGI("IN = %d, OUT = %d", inputLength, outputLength); - for (int i = 0; i < li; ++i) { - for (int j = 0; j < lo; ++j) { - LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); +inline static void dumpEditDistance10ForDebug(int *editDistanceTable, + const int editDistanceTableWidth, const int outputLength) { + if (DEBUG_DICT) { + AKLOGI("EditDistanceTable"); + for (int i = 0; i <= 10; ++i) { + int c[11]; + for (int j = 0; j <= 10; ++j) { + if (j < editDistanceTableWidth + 1 && i < outputLength + 1) { + c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j]; + } else { + c[j] = -1; + } } + AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]", + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]); } } - return dp[li * lo - 1]; -} -#endif - -inline static void initEditDistance(int *editDistanceTable) { - for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) { - editDistanceTable[i] = i; - } } inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigned short *input, const int inputLength, const unsigned short *output, const int outputLength) { + // TODO: Make sure that editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] is not touched. // Let dp[i][j] be editDistanceTable[i * (inputLength + 1) + j]. // Assuming that dp[0][0] ... dp[outputLength - 1][inputLength] are already calculated, // and calculate dp[ouputLength][0] ... dp[outputLength][inputLength]. int *const current = editDistanceTable + outputLength * (inputLength + 1); const int *const prev = editDistanceTable + (outputLength - 1) * (inputLength + 1); const int *const prevprev = - outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : NULL; + outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0; current[0] = outputLength; - const uint32_t co = Dictionary::toBaseLowerCase(output[outputLength - 1]); - const uint32_t prevCO = - outputLength >= 2 ? Dictionary::toBaseLowerCase(output[outputLength - 2]) : 0; + const uint32_t co = toBaseLowerCase(output[outputLength - 1]); + const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0; for (int i = 1; i <= inputLength; ++i) { - const uint32_t ci = Dictionary::toBaseLowerCase(input[i - 1]); + const uint32_t ci = toBaseLowerCase(input[i - 1]); const uint16_t cost = (ci == co) ? 0 : 1; current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost)); - if (i >= 2 && prevprev && ci == prevCO - && co == Dictionary::toBaseLowerCase(input[i - 2])) { + if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) { current[i] = min(current[i], prevprev[i - 2] + 1); } } } -inline static int getCurrentEditDistance( - int *editDistanceTable, const int inputLength, const int outputLength) { - return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1]; +inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth, + const int outputLength, const int inputLength) { + if (DEBUG_EDIT_DISTANCE) { + AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); + } + return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputLength]; } ////////////////////// @@ -133,6 +115,9 @@ void Correction::initCorrection(const ProximityInfo *pi, const int inputLength, mInputLength = inputLength; mMaxDepth = maxDepth; mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; + // TODO: This is not supposed to be required. Check what's going wrong with + // editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] + initEditDistance(mEditDistanceTable); } void Correction::initCorrectionState( @@ -146,7 +131,7 @@ void Correction::initCorrectionState( void Correction::setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, const int spaceProximityPos, const int missingSpacePos, - const bool useFullEditDistance) { + const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) { // TODO: remove mTransposedPos = transposedPos; mExcessivePos = excessivePos; @@ -159,6 +144,8 @@ void Correction::setCorrectionParams(const int skipPos, const int excessivePos, mSpaceProximityPos = spaceProximityPos; mMissingSpacePos = missingSpacePos; mUseFullEditDistance = useFullEditDistance; + mDoAutoCompletion = doAutoCompletion; + mMaxErrors = maxErrors; } void Correction::checkState() { @@ -172,23 +159,34 @@ void Correction::checkState() { } } -int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq, - const unsigned short *word) { - return Correction::RankingAlgorithm::calcFreqForSplitTwoWords( - firstFreq, secondFreq, this, word); +int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray, + const int wordCount, const bool isSpaceProximity, const unsigned short *word) { + return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray, + wordCount, this, isSpaceProximity, word); } int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) { + return getFinalFreqInternal(freq, word, wordLength, mInputLength); +} + +int Correction::getFinalFreqForSubQueue(const int freq, unsigned short **word, int *wordLength, + const int inputLength) { + return getFinalFreqInternal(freq, word, wordLength, inputLength); +} + +int Correction::getFinalFreqInternal(const int freq, unsigned short **word, int *wordLength, + const int inputLength) { const int outputIndex = mTerminalOutputIndex; const int inputIndex = mTerminalInputIndex; *wordLength = outputIndex + 1; - if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) { - return -1; + if (outputIndex < MIN_SUGGEST_DEPTH) { + return NOT_A_FREQUENCY; } *word = mWord; - return Correction::RankingAlgorithm::calculateFinalFreq( - inputIndex, outputIndex, freq, mEditDistanceTable, this); + int finalFreq = Correction::RankingAlgorithm::calculateFinalFreq( + inputIndex, outputIndex, freq, mEditDistanceTable, this, inputLength); + return finalFreq; } bool Correction::initProcessState(const int outputIndex) { @@ -213,6 +211,7 @@ bool Correction::initProcessState(const int outputIndex) { mMatching = false; mProximityMatching = false; + mAdditionalProximityMatching = false; mTransposing = false; mExceeding = false; mSkipping = false; @@ -229,20 +228,10 @@ int Correction::goDownTree( } // TODO: remove -int Correction::getOutputIndex() { - return mOutputIndex; -} - -// TODO: remove int Correction::getInputIndex() { return mInputIndex; } -// TODO: remove -bool Correction::needsToTraverseAllNodes() { - return mNeedsToTraverseAllNodes; -} - void Correction::incrementInputIndex() { ++mInputIndex; } @@ -269,6 +258,7 @@ void Correction::incrementOutputIndex() { mCorrectionStates[mOutputIndex].mMatching = mMatching; mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching; + mCorrectionStates[mOutputIndex].mAdditionalProximityMatching = mAdditionalProximityMatching; mCorrectionStates[mOutputIndex].mTransposing = mTransposing; mCorrectionStates[mOutputIndex].mExceeding = mExceeding; mCorrectionStates[mOutputIndex].mSkipping = mSkipping; @@ -280,7 +270,9 @@ void Correction::startToTraverseAllNodes() { bool Correction::needsToPrune() const { // TODO: use edit distance here - return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance; + return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance + // Allow one char longer word for missing character + || (!mDoAutoCompletion && (mOutputIndex > mInputLength)); } void Correction::addCharToCurrentWord(const int32_t c) { @@ -290,13 +282,12 @@ void Correction::addCharToCurrentWord(const int32_t c) { mWord, mOutputIndex + 1); } -// TODO: inline? Correction::CorrectionType Correction::processSkipChar( const int32_t c, const bool isTerminal, const bool inputIndexIncremented) { addCharToCurrentWord(c); - if (needsToTraverseAllNodes() && isTerminal) { - mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); - mTerminalOutputIndex = mOutputIndex; + mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); + mTerminalOutputIndex = mOutputIndex; + if (mNeedsToTraverseAllNodes && isTerminal) { incrementOutputIndex(); return TRAVERSE_ALL_ON_TERMINAL; } else { @@ -305,19 +296,36 @@ Correction::CorrectionType Correction::processSkipChar( } } +Correction::CorrectionType Correction::processUnrelatedCorrectionType() { + // Needs to set mTerminalInputIndex and mTerminalOutputIndex before returning any CorrectionType + mTerminalInputIndex = mInputIndex; + mTerminalOutputIndex = mOutputIndex; + return UNRELATED; +} + inline bool isEquivalentChar(ProximityInfo::ProximityType type) { return type == ProximityInfo::EQUIVALENT_CHAR; } +inline bool isProximityCharOrEquivalentChar(ProximityInfo::ProximityType type) { + return type == ProximityInfo::EQUIVALENT_CHAR + || type == ProximityInfo::NEAR_PROXIMITY_CHAR; +} + Correction::CorrectionType Correction::processCharAndCalcState( const int32_t c, const bool isTerminal) { const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); + if (correctionCount > mMaxErrors) { + return processUnrelatedCorrectionType(); + } + // TODO: Change the limit if we'll allow two or more corrections const bool noCorrectionsHappenedSoFar = correctionCount == 0; const bool canTryCorrection = noCorrectionsHappenedSoFar; int proximityIndex = 0; mDistances[mOutputIndex] = NOT_A_DISTANCE; + // Skip checking this node if (mNeedsToTraverseAllNodes || isQuote(c)) { bool incremented = false; if (mLastCharExceeded && mInputIndex == mInputLength - 1) { @@ -342,6 +350,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( return processSkipChar(c, isTerminal, incremented); } + // Check possible corrections. if (mExcessivePos >= 0) { if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) { mExcessivePos = mOutputIndex; @@ -382,30 +391,42 @@ Correction::CorrectionType Correction::processCharAndCalcState( incrementInputIndex(); } else { --mTransposedCount; - if (DEBUG_CORRECTION) { + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); - LOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } - return UNRELATED; + return processUnrelatedCorrectionType(); } } // TODO: Change the limit if we'll allow two or more proximity chars with corrections - const bool checkProximityChars = noCorrectionsHappenedSoFar || mProximityCount == 0; + // Work around: When the mMaxErrors is 1, we only allow just one error + // including proximity correction. + const bool checkProximityChars = (mMaxErrors > 1) + ? (noCorrectionsHappenedSoFar || mProximityCount == 0) + : (noCorrectionsHappenedSoFar && mProximityCount == 0); + ProximityInfo::ProximityType matchedProximityCharId = secondTransposing ? ProximityInfo::EQUIVALENT_CHAR : mProximityInfo->getMatchedProximityId( mInputIndex, c, checkProximityChars, &proximityIndex); - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { + if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId + || ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { if (canTryCorrection && mOutputIndex > 0 && mCorrectionStates[mOutputIndex].mProximityMatching && mCorrectionStates[mOutputIndex].mExceeding && isEquivalentChar(mProximityInfo->getMatchedProximityId( mInputIndex, mWord[mOutputIndex - 1], false))) { - if (DEBUG_CORRECTION) { - LOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); } // Conversion p->e // Example: @@ -423,7 +444,11 @@ Correction::CorrectionType Correction::processCharAndCalcState( } } - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { + if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId + || ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + mAdditionalProximityMatching = true; + } // TODO: Optimize // As the current char turned out to be an unrelated char, // we will try other correction-types. Please note that mCorrectionStates[mOutputIndex] @@ -465,6 +490,18 @@ Correction::CorrectionType Correction::processCharAndCalcState( ++mSkippedCount; --mProximityCount; return processSkipChar(c, isTerminal, false); + } else if (mInputIndex - 1 < mInputLength + && mSkippedCount > 0 + && mCorrectionStates[mOutputIndex].mSkipping + && mCorrectionStates[mOutputIndex].mAdditionalProximityMatching + && isProximityCharOrEquivalentChar( + mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { + // Conversion s->a + incrementInputIndex(); + --mSkippedCount; + mProximityMatching = true; + ++mProximityCount; + mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputLength && isEquivalentChar( mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { @@ -475,17 +512,52 @@ Correction::CorrectionType Correction::processCharAndCalcState( ++mExcessiveCount; incrementInputIndex(); } + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + DUMP_WORD(mWord, mOutputIndex); + if (mTransposing) { + AKLOGI("TRANSPOSE: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } else { + AKLOGI("EXCEED: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } + } } else if (mSkipping) { // 3. Skip correction ++mSkippedCount; + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + AKLOGI("SKIP: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } return processSkipChar(c, isTerminal, false); + } else if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + // As a last resort, use additional proximity characters + mProximityMatching = true; + ++mProximityCount; + mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + AKLOGI("ADDITIONALPROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } } else { - if (DEBUG_CORRECTION) { + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); - LOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } - return UNRELATED; + return processUnrelatedCorrectionType(); } } else if (secondTransposing) { // If inputIndex is greater than mInputLength, that means there is no @@ -500,6 +572,13 @@ Correction::CorrectionType Correction::processCharAndCalcState( ++mProximityCount; mDistances[mOutputIndex] = mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex); + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + AKLOGI("PROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } } addCharToCurrentWord(c); @@ -533,13 +612,17 @@ Correction::CorrectionType Correction::processCharAndCalcState( || isSameAsUserTypedLength) && isTerminal) { mTerminalInputIndex = mInputIndex - 1; mTerminalOutputIndex = mOutputIndex - 1; - if (DEBUG_CORRECTION) { + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); - LOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } return ON_TERMINAL; } else { + mTerminalInputIndex = mInputIndex - 1; + mTerminalOutputIndex = mOutputIndex - 1; return NOT_ON_TERMINAL; } } @@ -547,55 +630,6 @@ Correction::CorrectionType Correction::processCharAndCalcState( Correction::~Correction() { } -///////////////////////// -// static inline utils // -///////////////////////// - -static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; -static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { - return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); -} - -static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; -inline static void multiplyIntCapped(const int multiplier, int *base) { - const int temp = *base; - if (temp != S_INT_MAX) { - // Branch if multiplier == 2 for the optimization - if (multiplier == 2) { - *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; - } else { - // TODO: This overflow check gives a wrong answer when, for example, - // temp = 2^16 + 1 and multiplier = 2^17 + 1. - // Fix this behavior. - const int tempRetval = temp * multiplier; - *base = tempRetval >= temp ? tempRetval : S_INT_MAX; - } - } -} - -inline static int powerIntCapped(const int base, const int n) { - if (n <= 0) return 1; - if (base == 2) { - return n < 31 ? 1 << n : S_INT_MAX; - } else { - int ret = base; - for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); - return ret; - } -} - -inline static void multiplyRate(const int rate, int *freq) { - if (*freq != S_INT_MAX) { - if (*freq > 1000000) { - *freq /= 100; - multiplyIntCapped(rate, freq); - } else { - multiplyIntCapped(rate, freq); - *freq /= 100; - } - } -} - inline static int getQuoteCount(const unsigned short* word, const int length) { int quoteCount = 0; for (int i = 0; i < length; ++i) { @@ -607,13 +641,7 @@ inline static int getQuoteCount(const unsigned short* word, const int length) { } inline static bool isUpperCase(unsigned short c) { - if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { - c = BASE_CHARS[c]; - } - if (isupper(c)) { - return true; - } - return false; + return isAsciiUpper(toBaseChar(c)); } ////////////////////// @@ -622,9 +650,9 @@ inline static bool isUpperCase(unsigned short c) { /* static */ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex, - const int freq, int* editDistanceTable, const Correction* correction) { + const int freq, int* editDistanceTable, const Correction* correction, + const int inputLength) { const int excessivePos = correction->getExcessivePos(); - const int inputLength = correction->mInputLength; const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER; const ProximityInfo *proximityInfo = correction->mProximityInfo; @@ -649,45 +677,55 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const const unsigned short* word = correction->mWord; const bool skipped = skippedCount > 0; - const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1) + const int quoteDiffCount = max(0, getQuoteCount(word, outputLength) - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength)); // TODO: Calculate edit distance for transposed and excessive int ed = 0; + if (DEBUG_DICT_FULL) { + dumpEditDistance10ForDebug(editDistanceTable, correction->mInputLength, outputLength); + } int adjustedProximityMatchedCount = proximityMatchedCount; int finalFreq = freq; + if (DEBUG_CORRECTION_FREQ + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) { + AKLOGI("FinalFreq0: %d", finalFreq); + } // TODO: Optimize this. - // TODO: Ignoring edit distance for transposed char, for now - if (transposedCount == 0 && (proximityMatchedCount > 0 || skipped || excessiveCount > 0)) { - ed = getCurrentEditDistance(editDistanceTable, inputLength, outputIndex + 1); + if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) { + ed = getCurrentEditDistance(editDistanceTable, correction->mInputLength, outputLength, + inputLength) - transposedCount; + const int matchWeight = powerIntCapped(typedLetterMultiplier, - max(inputLength, outputIndex + 1) - ed); + max(inputLength, outputLength) - ed); multiplyIntCapped(matchWeight, &finalFreq); // TODO: Demote further if there are two or more excessive chars with longer user input? - if (inputLength > outputIndex + 1) { + if (inputLength > outputLength) { multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq); } ed = max(0, ed - quoteDiffCount); - - if (ed == 1 && (inputLength == outputIndex || inputLength == outputIndex + 2)) { - // Promote a word with just one skipped or excessive char - if (sameLength) { - multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq); - } else { + adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputLength)), + proximityMatchedCount); + if (transposedCount < 1) { + if (ed == 1 && (inputLength == outputLength - 1 || inputLength == outputLength + 1)) { + // Promote a word with just one skipped or excessive char + if (sameLength) { + multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE + + WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER * outputLength, + &finalFreq); + } else { + multiplyIntCapped(typedLetterMultiplier, &finalFreq); + } + } else if (ed == 0) { multiplyIntCapped(typedLetterMultiplier, &finalFreq); + sameLength = true; } - } else if (ed == 0) { - multiplyIntCapped(typedLetterMultiplier, &finalFreq); - sameLength = true; } - adjustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)), - proximityMatchedCount); } else { - // TODO: Calculate the edit distance for transposed char const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); multiplyIntCapped(matchWeight, &finalFreq); } @@ -707,7 +745,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const / (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); if (DEBUG_DICT_FULL) { - LOGI("Demotion rate for missing character is %d.", demotionRate); + AKLOGI("Demotion rate for missing character is %d.", demotionRate); } multiplyRate(demotionRate, &finalFreq); } @@ -720,8 +758,8 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const if (excessiveCount > 0) { multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); if (!lastCharExceeded && !proximityInfo->existsAdjacentProximityChars(excessivePos)) { - if (DEBUG_CORRECTION_FREQ) { - LOGI("Double excessive demotion"); + if (DEBUG_DICT_FULL) { + AKLOGI("Double excessive demotion"); } // If an excessive character is not adjacent to the left char or the right char, // we will demote this word. @@ -729,11 +767,14 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const } } + const bool performTouchPositionCorrection = + CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled() + && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0; // Score calibration by touch coordinates is being done only for pure-fat finger typing error // cases. + int additionalProximityCount = 0; // TODO: Remove this constraint. - if (CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled() - && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0) { + if (performTouchPositionCorrection) { for (int i = 0; i < outputLength; ++i) { const int squaredDistance = correction->mDistances[i]; if (i < adjustedProximityMatchedCount) { @@ -744,40 +785,60 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const static const float A = ZERO_DISTANCE_PROMOTION_RATE / 100.0f; static const float B = 1.0f; static const float C = 0.5f; + static const float MIN = 0.3f; static const float R1 = NEUTRAL_SCORE_SQUARED_RADIUS; static const float R2 = HALF_SCORE_SQUARED_RADIUS; const float x = (float)squaredDistance / ProximityInfo::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR; - const float factor = (x < R1) + const float factor = max((x < R1) ? (A * (R1 - x) + B * x) / R1 - : (B * (R2 - x) + C * (x - R1)) / (R2 - R1); + : (B * (R2 - x) + C * (x - R1)) / (R2 - R1), MIN); // factor is piecewise linear function like: // A -_ . // ^-_ . // B \ . - // \ . - // C \ . - // 0 R1 R2 - if (factor <= 0) { - return -1; - } + // \_ . + // C ------------. + // . + // 0 R1 R2 . multiplyRate((int)(factor * 100), &finalFreq); } else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) { multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); + } else if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) { + ++additionalProximityCount; + multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); } } } else { + // Demote additional proximity characters + for (int i = 0; i < outputLength; ++i) { + const int squaredDistance = correction->mDistances[i]; + if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) { + ++additionalProximityCount; + } + } // Promotion for a word with proximity characters for (int i = 0; i < adjustedProximityMatchedCount; ++i) { // A word with proximity corrections if (DEBUG_DICT_FULL) { - LOGI("Found a proximity correction."); + AKLOGI("Found a proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &finalFreq); - multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); + if (i < additionalProximityCount) { + multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); + } else { + multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); + } } } + // If the user types too many(three or more) proximity characters with additional proximity + // character,do not treat as the same length word. + if (sameLength && additionalProximityCount > 0 && (adjustedProximityMatchedCount >= 3 + || transposedCount > 0 || skipped || excessiveCount > 0)) { + sameLength = false; + } + const int errorCount = adjustedProximityMatchedCount > 0 ? adjustedProximityMatchedCount : (proximityMatchedCount + transposedCount); @@ -787,13 +848,15 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const // Promotion for an exactly matched word if (ed == 0) { // Full exact match - if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0) { + if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0 + && quoteDiffCount == 0 && additionalProximityCount == 0) { finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); } } // Promote a word with no correction - if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0) { + if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0 + && additionalProximityCount == 0) { multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); } @@ -832,71 +895,101 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const } if (DEBUG_DICT_FULL) { - LOGI("calc: %d, %d", outputIndex, sameLength); + AKLOGI("calc: %d, %d", outputLength, sameLength); } - if (DEBUG_CORRECTION_FREQ) { - DUMP_WORD(correction->mWord, outputIndex + 1); - LOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount, - skippedCount, transposedCount, excessiveCount, lastCharExceeded, sameLength, - quoteDiffCount, ed, finalFreq); + if (DEBUG_CORRECTION_FREQ + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) { + DUMP_WORD(proximityInfo->getPrimaryInputWord(), inputLength); + DUMP_WORD(correction->mWord, outputLength); + AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d, A%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount, + skippedCount, transposedCount, excessiveCount, additionalProximityCount, + outputLength, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq); } return finalFreq; } /* static */ -int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( - const int firstFreq, const int secondFreq, const Correction* correction, - const unsigned short *word) { - const int spaceProximityPos = correction->mSpaceProximityPos; - const int missingSpacePos = correction->mMissingSpacePos; - if (DEBUG_DICT) { - int inputCount = 0; - if (spaceProximityPos >= 0) ++inputCount; - if (missingSpacePos >= 0) ++inputCount; - assert(inputCount <= 1); - } - const bool isSpaceProximity = spaceProximityPos >= 0; - const int inputLength = correction->mInputLength; - const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; - const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) - : (inputLength - missingSpacePos); +int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords( + const int *freqArray, const int *wordLengthArray, const int wordCount, + const Correction* correction, const bool isSpaceProximity, const unsigned short *word) { const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; bool firstCapitalizedWordDemotion = false; - if (firstWordLength >= 2) { - firstCapitalizedWordDemotion = isUpperCase(word[0]); - } - bool secondCapitalizedWordDemotion = false; - if (secondWordLength >= 2) { - secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); + + { + // TODO: Handle multiple capitalized word demotion properly + const int firstWordLength = wordLengthArray[0]; + const int secondWordLength = wordLengthArray[1]; + if (firstWordLength >= 2) { + firstCapitalizedWordDemotion = isUpperCase(word[0]); + } + + if (secondWordLength >= 2) { + // FIXME: word[firstWordLength + 1] is incorrect. + secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); + } } + const bool capitalizedWordDemotion = firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; - if (DEBUG_DICT_FULL) { - LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion); + int totalLength = 0; + int totalFreq = 0; + for (int i = 0; i < wordCount; ++i){ + const int wordLength = wordLengthArray[i]; + if (wordLength <= 0) { + return 0; + } + totalLength += wordLength; + const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1); + int tempFirstFreq = freqArray[i]; + multiplyRate(demotionRate, &tempFirstFreq); + totalFreq += tempFirstFreq; } - if (firstWordLength == 0 || secondWordLength == 0) { + if (totalLength <= 0 || totalFreq <= 0) { return 0; } - const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); - int tempFirstFreq = firstFreq; - multiplyRate(firstDemotionRate, &tempFirstFreq); - - const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); - int tempSecondFreq = secondFreq; - multiplyRate(secondDemotionRate, &tempSecondFreq); - - const int totalLength = firstWordLength + secondWordLength; + // TODO: Currently totalFreq is adjusted to two word metrix. // Promote pairFreq with multiplying by 2, because the word length is the same as the typed // length. - int totalFreq = tempFirstFreq + tempSecondFreq; + totalFreq = totalFreq * 2 / wordCount; + if (wordCount > 2) { + // Safety net for 3+ words -- Caveats: many heuristics and workarounds here. + int oneLengthCounter = 0; + int twoLengthCounter = 0; + for (int i = 0; i < wordCount; ++i) { + const int wordLength = wordLengthArray[i]; + // TODO: Use bigram instead of this safety net + if (i < wordCount - 1) { + const int nextWordLength = wordLengthArray[i + 1]; + if (wordLength == 1 && nextWordLength == 2) { + // Safety net to filter 1 length and 2 length sequential words + return 0; + } + } + const int freq = freqArray[i]; + // Demote too short weak words + if (wordLength <= 4 && freq <= MAX_FREQ * 2 / 3 /* heuristic... */) { + multiplyRate(100 * freq / MAX_FREQ, &totalFreq); + } + if (wordLength == 1) { + ++oneLengthCounter; + } else if (wordLength == 2) { + ++twoLengthCounter; + } + if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) { + // Safety net to filter too many short words + return 0; + } + } + multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq); + } // This is a workaround to try offsetting the not-enough-demotion which will be done in // calcNormalizedScore in Utils.java. @@ -923,19 +1016,128 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( if (isSpaceProximity) { // A word pair with one space proximity correction if (DEBUG_DICT) { - LOGI("Found a word pair with space proximity correction."); + AKLOGI("Found a word pair with space proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &totalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); } - multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + if (isSpaceProximity) { + multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq); + } else { + multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + } if (capitalizedWordDemotion) { multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); } + if (DEBUG_CORRECTION_FREQ) { + AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1], + wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq); + DUMP_WORD(word, wordLengthArray[0]); + } + return totalFreq; } +/* Damerau-Levenshtein distance */ +inline static int editDistanceInternal( + int* editDistanceTable, const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength) { + // dp[li][lo] dp[a][b] = dp[ a * lo + b] + int* dp = editDistanceTable; + const int li = beforeLength + 1; + const int lo = afterLength + 1; + for (int i = 0; i < li; ++i) { + dp[lo * i] = i; + } + for (int i = 0; i < lo; ++i) { + dp[i] = i; + } + + for (int i = 0; i < li - 1; ++i) { + for (int j = 0; j < lo - 1; ++j) { + const uint32_t ci = toBaseLowerCase(before[i]); + const uint32_t co = toBaseLowerCase(after[j]); + const uint16_t cost = (ci == co) ? 0 : 1; + dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, + min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); + if (i > 0 && j > 0 && ci == toBaseLowerCase(after[j - 1]) + && co == toBaseLowerCase(before[i - 1])) { + dp[(i + 1) * lo + (j + 1)] = min( + dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); + } + } + } + + if (DEBUG_EDIT_DISTANCE) { + AKLOGI("IN = %d, OUT = %d", beforeLength, afterLength); + for (int i = 0; i < li; ++i) { + for (int j = 0; j < lo; ++j) { + AKLOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); + } + } + } + return dp[li * lo - 1]; +} + +int Correction::RankingAlgorithm::editDistance(const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength) { + int table[(beforeLength + 1) * (afterLength + 1)]; + return editDistanceInternal(table, before, beforeLength, after, afterLength); +} + + +// In dictionary.cpp, getSuggestion() method, +// suggestion scores are computed using the below formula. +// original score +// := pow(mTypedLetterMultiplier (this is defined 2), +// (the number of matched characters between typed word and suggested word)) +// * (individual word's score which defined in the unigram dictionary, +// and this score is defined in range [0, 255].) +// Then, the following processing is applied. +// - If the dictionary word is matched up to the point of the user entry +// (full match up to min(before.length(), after.length()) +// => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2) +// - If the word is a true full match except for differences in accents or +// capitalization, then treat it as if the score was 255. +// - If before.length() == after.length() +// => multiply by mFullWordMultiplier (this is defined 2)) +// So, maximum original score is pow(2, min(before.length(), after.length())) * 255 * 2 * 1.2 +// For historical reasons we ignore the 1.2 modifier (because the measure for a good +// autocorrection threshold was done at a time when it didn't exist). This doesn't change +// the result. +// So, we can normalize original score by dividing pow(2, min(b.l(),a.l())) * 255 * 2. + +/* static */ +double Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength, + const int score) { + if (0 == beforeLength || 0 == afterLength) { + return 0; + } + const int distance = editDistance(before, beforeLength, after, afterLength); + int spaceCount = 0; + for (int i = 0; i < afterLength; ++i) { + if (after[i] == CODE_SPACE) { + ++spaceCount; + } + } + + if (spaceCount == afterLength) { + return 0; + } + + const double maxScore = score >= S_INT_MAX ? S_INT_MAX : MAX_INITIAL_SCORE + * pow((double)TYPED_LETTER_MULTIPLIER, + (double)min(beforeLength, afterLength - spaceCount)) * FULL_WORD_MULTIPLIER; + + // add a weight based on edit distance. + // distance <= max(afterLength, beforeLength) == afterLength, + // so, 0 <= distance / afterLength <= 1 + const double weight = 1.0 - (double) distance / afterLength; + return (score / maxScore) * weight; +} + } // namespace latinime diff --git a/native/src/correction.h b/native/src/correction.h index d4e99f0ce..2114eff4b 100644 --- a/native/src/correction.h +++ b/native/src/correction.h @@ -27,8 +27,7 @@ namespace latinime { class ProximityInfo; class Correction { - -public: + public: typedef enum { TRAVERSE_ALL_ON_TERMINAL, TRAVERSE_ALL_NOT_ON_TERMINAL, @@ -37,6 +36,55 @@ public: NOT_ON_TERMINAL } CorrectionType; + ///////////////////////// + // static inline utils // + ///////////////////////// + + static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; + static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { + return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); + } + + static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; + inline static void multiplyIntCapped(const int multiplier, int *base) { + const int temp = *base; + if (temp != S_INT_MAX) { + // Branch if multiplier == 2 for the optimization + if (multiplier == 2) { + *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; + } else { + // TODO: This overflow check gives a wrong answer when, for example, + // temp = 2^16 + 1 and multiplier = 2^17 + 1. + // Fix this behavior. + const int tempRetval = temp * multiplier; + *base = tempRetval >= temp ? tempRetval : S_INT_MAX; + } + } + } + + inline static int powerIntCapped(const int base, const int n) { + if (n <= 0) return 1; + if (base == 2) { + return n < 31 ? 1 << n : S_INT_MAX; + } else { + int ret = base; + for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); + return ret; + } + } + + inline static void multiplyRate(const int rate, int *freq) { + if (*freq != S_INT_MAX) { + if (*freq > 1000000) { + *freq /= 100; + multiplyIntCapped(rate, freq); + } else { + multiplyIntCapped(rate, freq); + *freq /= 100; + } + } + } + Correction(const int typedLetterMultiplier, const int fullWordMultiplier); void initCorrection( const ProximityInfo *pi, const int inputLength, const int maxWordLength); @@ -44,11 +92,11 @@ public: // TODO: remove void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, - const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance); + const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance, + const bool doAutoCompletion, const int maxErrors); void checkState(); bool initProcessState(const int index); - int getOutputIndex(); int getInputIndex(); virtual ~Correction(); @@ -73,9 +121,12 @@ public: bool needsToPrune() const; - int getFreqForSplitTwoWords( - const int firstFreq, const int secondFreq, const unsigned short *word); + int getFreqForSplitMultipleWords( + const int *freqArray, const int *wordLengthArray, const int wordCount, + const bool isSpaceProximity, const unsigned short *word); int getFinalFreq(const int freq, unsigned short **word, int* wordLength); + int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength, + const int inputLength); CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal); @@ -94,21 +145,44 @@ public: inline int getTreeParentIndex(const int index) const { return mCorrectionStates[index].mParentIndex; } -private: + + class RankingAlgorithm { + public: + static int calculateFinalFreq(const int inputIndex, const int depth, + const int freq, int *editDistanceTable, const Correction* correction, + const int inputLength); + static int calcFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray, + const int wordCount, const Correction* correction, const bool isSpaceProximity, + const unsigned short *word); + static double calcNormalizedScore(const unsigned short* before, const int beforeLength, + const unsigned short* after, const int afterLength, const int score); + static int editDistance(const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength); + private: + static const int CODE_SPACE = ' '; + static const int MAX_INITIAL_SCORE = 255; + static const int TYPED_LETTER_MULTIPLIER = 2; + static const int FULL_WORD_MULTIPLIER = 2; + }; + + private: inline void incrementInputIndex(); inline void incrementOutputIndex(); - inline bool needsToTraverseAllNodes(); inline void startToTraverseAllNodes(); inline bool isQuote(const unsigned short c); inline CorrectionType processSkipChar( const int32_t c, const bool isTerminal, const bool inputIndexIncremented); + inline CorrectionType processUnrelatedCorrectionType(); inline void addCharToCurrentWord(const int32_t c); + inline int getFinalFreqInternal(const int freq, unsigned short **word, int* wordLength, + const int inputLength); const int TYPED_LETTER_MULTIPLIER; const int FULL_WORD_MULTIPLIER; const ProximityInfo *mProximityInfo; bool mUseFullEditDistance; + bool mDoAutoCompletion; int mMaxEditDistance; int mMaxDepth; int mInputLength; @@ -116,6 +190,7 @@ private: int mMissingSpacePos; int mTerminalInputIndex; int mTerminalOutputIndex; + int mMaxErrors; // The following arrays are state buffer. unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; @@ -146,17 +221,11 @@ private: bool mMatching; bool mProximityMatching; + bool mAdditionalProximityMatching; bool mExceeding; bool mTransposing; bool mSkipping; - class RankingAlgorithm { - public: - static int calculateFinalFreq(const int inputIndex, const int depth, - const int freq, int *editDistanceTable, const Correction* correction); - static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, - const Correction* correction, const unsigned short *word); - }; }; } // namespace latinime #endif // LATINIME_CORRECTION_H diff --git a/native/src/correction_state.h b/native/src/correction_state.h index c04146e54..5b2cbd3a2 100644 --- a/native/src/correction_state.h +++ b/native/src/correction_state.h @@ -47,9 +47,9 @@ struct CorrectionState { bool mExceeding; bool mSkipping; bool mProximityMatching; + bool mAdditionalProximityMatching; bool mNeedsToTraverseAllNodes; - }; inline static void initCorrectionState(CorrectionState *state, const int rootPos, @@ -77,7 +77,7 @@ inline static void initCorrectionState(CorrectionState *state, const int rootPos state->mTransposing = false; state->mExceeding = false; state->mSkipping = false; - + state->mAdditionalProximityMatching = false; } } // namespace latinime diff --git a/native/src/debug.h b/native/src/debug.h index 38b2f107a..b13052c95 100644 --- a/native/src/debug.h +++ b/native/src/debug.h @@ -42,7 +42,7 @@ static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned s static inline void LOGI_S16(unsigned short* string, const unsigned int length) { unsigned char tmp_buffer[length]; convertToUnibyteString(string, tmp_buffer, length); - LOGI(">> %s", tmp_buffer); + AKLOGI(">> %s", tmp_buffer); // The log facility is throwing out log that comes too fast. The following // is a dirty way of slowing down processing so that we can see all log. // TODO : refactor this in a blocking log or something. @@ -53,7 +53,7 @@ static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int leng unsigned char c) { unsigned char tmp_buffer[length+1]; convertToUnibyteStringAndReplaceLastChar(string, tmp_buffer, length, c); - LOGI(">> %s", tmp_buffer); + AKLOGI(">> %s", tmp_buffer); // Likewise // usleep(10); } @@ -64,7 +64,7 @@ static inline void printDebug(const char* tag, int* codes, int codesSize, int MA buf[codesSize] = 0; while (--codesSize >= 0) buf[codesSize] = (unsigned char)codes[codesSize * MAX_PROXIMITY_CHARS]; - LOGI("%s, WORD = %s", tag, buf); + AKLOGI("%s, WORD = %s", tag, buf); free(buf); } diff --git a/native/src/defines.h b/native/src/defines.h index ef1beb92f..ffadb11c5 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -20,15 +20,32 @@ #if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG) #include <cutils/log.h> +#define AKLOGE ALOGE +#define AKLOGI ALOGI + +#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0) + +static char charBuf[50]; + +static void dumpWord(const unsigned short* word, const int length) { + for (int i = 0; i < length; ++i) { + charBuf[i] = word[i]; + } + charBuf[length] = 0; + AKLOGI("[ %s ]", charBuf); +} + #else -#define LOGE(fmt, ...) -#define LOGI(fmt, ...) +#define AKLOGE(fmt, ...) +#define AKLOGI(fmt, ...) +#define DUMP_WORD(word, length) #endif #ifdef FLAG_DO_PROFILE // Profiler #include <cutils/log.h> #include <time.h> + #define PROF_BUF_SIZE 100 static double profile_buf[PROF_BUF_SIZE]; static double profile_old[PROF_BUF_SIZE]; @@ -42,8 +59,8 @@ static unsigned int profile_counter[PROF_BUF_SIZE]; #define PROF_CLOSE do { PROF_END(PROF_BUF_SIZE - 1); PROF_OUTALL; } while(0) #define PROF_END(prof_buf_id) profile_buf[prof_buf_id] += ((clock()) - profile_old[prof_buf_id]) #define PROF_CLOCKOUT(prof_buf_id) \ - LOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id])) -#define PROF_OUTALL do { LOGI("--- %s ---", __FUNCTION__); prof_out(); } while(0) + AKLOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id])) +#define PROF_OUTALL do { AKLOGI("--- %s ---", __FUNCTION__); prof_out(); } while(0) static void prof_reset(void) { for (int i = 0; i < PROF_BUF_SIZE; ++i) { @@ -55,9 +72,9 @@ static void prof_reset(void) { static void prof_out(void) { if (profile_counter[PROF_BUF_SIZE - 1] != 1) { - LOGI("Error: You must call PROF_OPEN before PROF_CLOSE."); + AKLOGI("Error: You must call PROF_OPEN before PROF_CLOSE."); } - LOGI("Total time is %6.3f ms.", + AKLOGI("Total time is %6.3f ms.", profile_buf[PROF_BUF_SIZE - 1] * 1000 / (double)CLOCKS_PER_SEC); double all = 0; for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) { @@ -66,7 +83,7 @@ static void prof_out(void) { if (all == 0) all = 1; for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) { if (profile_buf[i] != 0) { - LOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.", + AKLOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.", i, (profile_buf[i] * 100 / all), profile_buf[i] * 1000 / (double)CLOCKS_PER_SEC, profile_counter[i]); } @@ -98,21 +115,10 @@ static void prof_out(void) { #define DEBUG_SHOW_FOUND_WORD false #define DEBUG_NODE DEBUG_DICT_FULL #define DEBUG_TRACE DEBUG_DICT_FULL -#define DEBUG_PROXIMITY_INFO true +#define DEBUG_PROXIMITY_INFO false #define DEBUG_CORRECTION false -#define DEBUG_CORRECTION_FREQ true - -#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0) - -static char charBuf[50]; - -static void dumpWord(const unsigned short* word, const int length) { - for (int i = 0; i < length; ++i) { - charBuf[i] = word[i]; - } - charBuf[length] = 0; - LOGI("[ %s ]", charBuf); -} +#define DEBUG_CORRECTION_FREQ false +#define DEBUG_WORDS_PRIORITY_QUEUE false #else // FLAG_DBG @@ -125,8 +131,8 @@ static void dumpWord(const unsigned short* word, const int length) { #define DEBUG_PROXIMITY_INFO false #define DEBUG_CORRECTION false #define DEBUG_CORRECTION_FREQ false +#define DEBUG_WORDS_PRIORITY_QUEUE false -#define DUMP_WORD(word, length) #endif // FLAG_DBG @@ -163,58 +169,88 @@ static void dumpWord(const unsigned short* word, const int length) { #define NOT_VALID_WORD -99 #define NOT_A_CHARACTER -1 #define NOT_A_DISTANCE -1 +#define NOT_A_COORDINATE -1 #define EQUIVALENT_CHAR_WITHOUT_DISTANCE_INFO -2 #define PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO -3 +#define ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO -4 #define NOT_A_INDEX -1 +#define NOT_A_FREQUENCY -1 #define KEYCODE_SPACE ' ' #define CALIBRATE_SCORE_BY_TOUCH_COORDINATES true #define SUGGEST_WORDS_WITH_MISSING_CHARACTER true -#define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true #define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true #define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true -#define SUGGEST_WORDS_WITH_SPACE_PROXIMITY true +#define SUGGEST_MULTIPLE_WORDS true // The following "rate"s are used as a multiplier before dividing by 100, so they are in percent. #define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80 #define WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X 12 -#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 67 +#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 58 +#define WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE 50 #define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75 #define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75 -#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60 +#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 70 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120 #define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90 +#define WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE 70 #define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105 -#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 160 +#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 148 +#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER 3 #define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 45 #define INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE 70 #define FIRST_CHAR_DIFFERENT_DEMOTION_RATE 96 #define TWO_WORDS_CAPITALIZED_DEMOTION_RATE 50 +#define TWO_WORDS_CORRECTION_DEMOTION_BASE 80 +#define TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER 1 #define ZERO_DISTANCE_PROMOTION_RATE 110 #define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f #define HALF_SCORE_SQUARED_RADIUS 32.0f +#define MAX_FREQ 255 -// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java +// This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java // This is only used for the size of array. Not to be used in c functions. #define MAX_WORD_LENGTH_INTERNAL 48 +// This must be equal to ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE in KeyDetector.java +#define ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE 2 + +// Word limit for sub queues used in WordsPriorityQueuePool. Sub queues are temporary queues used +// for better performance. +// Holds up to 1 candidate for each word +#define SUB_QUEUE_MAX_WORDS 1 +#define SUB_QUEUE_MAX_COUNT 10 +#define SUB_QUEUE_MIN_WORD_LENGTH 4 +#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 10 +#define MULTIPLE_WORDS_DEMOTION_RATE 80 +#define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6 + +#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39 +#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22 + #define MAX_DEPTH_MULTIPLIER 3 +#define FIRST_WORD_INDEX 0 + // TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German // word in the dictionary #define DEFAULT_MAX_UMLAUT_SEARCH_DEPTH 5 // Minimum suggest depth for one word for all cases except for missing space suggestions. #define MIN_SUGGEST_DEPTH 1 -#define MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION 3 +#define MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION 3 #define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3 -#define min(a,b) ((a)<(b)?(a):(b)) -#define max(a,b) ((a)>(b)?(a):(b)) +template<typename T> inline T min(T a, T b) { return a < b ? a : b; } +template<typename T> inline T max(T a, T b) { return a > b ? a : b; } // The ratio of neutral area radius to sweet spot radius. #define NEUTRAL_AREA_RADIUS_RATIO 1.3f +// DEBUG +#define INPUTLENGTH_FOR_DEBUG -1 +#define MIN_OUTPUT_INDEX_FOR_DEBUG -1 + #endif // LATINIME_DEFINES_H diff --git a/native/src/dictionary.cpp b/native/src/dictionary.cpp index a49769bdb..822c2151d 100644 --- a/native/src/dictionary.cpp +++ b/native/src/dictionary.cpp @@ -33,11 +33,14 @@ Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, IS_LATEST_DICT_VERSION((((unsigned char*) dict)[0] & 0xFF) >= DICTIONARY_VERSION_MIN) { if (DEBUG_DICT) { if (MAX_WORD_LENGTH_INTERNAL < maxWordLength) { - LOGI("Max word length (%d) is greater than %d", + AKLOGI("Max word length (%d) is greater than %d", maxWordLength, MAX_WORD_LENGTH_INTERNAL); - LOGI("IN NATIVE SUGGEST Version: %d", (mDict[0] & 0xFF)); + AKLOGI("IN NATIVE SUGGEST Version: %d", (mDict[0] & 0xFF)); } } + mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier); + mWordsPriorityQueuePool = new WordsPriorityQueuePool( + maxWords, SUB_QUEUE_MAX_WORDS, maxWordLength); mUnigramDictionary = new UnigramDictionary(mDict, typedLetterMultiplier, fullWordMultiplier, maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION); mBigramDictionary = new BigramDictionary(mDict, maxWordLength, maxAlternatives, @@ -45,6 +48,8 @@ Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, } Dictionary::~Dictionary() { + delete mCorrection; + delete mWordsPriorityQueuePool; delete mUnigramDictionary; delete mBigramDictionary; } diff --git a/native/src/dictionary.h b/native/src/dictionary.h index d5de0083a..90d7148d5 100644 --- a/native/src/dictionary.h +++ b/native/src/dictionary.h @@ -17,22 +17,25 @@ #ifndef LATINIME_DICTIONARY_H #define LATINIME_DICTIONARY_H -#include "basechars.h" #include "bigram_dictionary.h" #include "char_utils.h" +#include "correction.h" #include "defines.h" #include "proximity_info.h" #include "unigram_dictionary.h" +#include "words_priority_queue_pool.h" namespace latinime { class Dictionary { -public: + public: Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives); + int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates, int *codes, int codesSize, int flags, unsigned short *outWords, int *frequencies) { - return mUnigramDictionary->getSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, + return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool, + mCorrection, xcoordinates, ycoordinates, codes, codesSize, flags, outWords, frequencies); } @@ -53,19 +56,9 @@ public: // public static utility methods // static inline methods should be defined in the header file - static unsigned short getChar(const unsigned char *dict, int *pos); - static int getCount(const unsigned char *dict, int *pos); - static bool getTerminal(const unsigned char *dict, int *pos); - static int getAddress(const unsigned char *dict, int *pos); - static int getFreq(const unsigned char *dict, const bool isLatestDictVersion, int *pos); static int wideStrLen(unsigned short *str); - // returns next sibling's position - static int setDictionaryValues(const unsigned char *dict, const bool isLatestDictVersion, - const int pos, unsigned short *c, int *childrenPosition, - bool *terminal, int *freq); - static inline unsigned short toBaseLowerCase(unsigned short c); -private: + private: bool hasBigram(); const unsigned char *mDict; @@ -79,60 +72,12 @@ private: const bool IS_LATEST_DICT_VERSION; UnigramDictionary *mUnigramDictionary; BigramDictionary *mBigramDictionary; + WordsPriorityQueuePool *mWordsPriorityQueuePool; + Correction *mCorrection; }; // public static utility methods // static inline methods should be defined in the header file -inline unsigned short Dictionary::getChar(const unsigned char *dict, int *pos) { - unsigned short ch = (unsigned short) (dict[(*pos)++] & 0xFF); - // If the code is 255, then actual 16 bit code follows (in big endian) - if (ch == 0xFF) { - ch = ((dict[*pos] & 0xFF) << 8) | (dict[*pos + 1] & 0xFF); - (*pos) += 2; - } - return ch; -} - -inline int Dictionary::getCount(const unsigned char *dict, int *pos) { - return dict[(*pos)++] & 0xFF; -} - -inline bool Dictionary::getTerminal(const unsigned char *dict, int *pos) { - return (dict[*pos] & FLAG_TERMINAL_MASK) > 0; -} - -inline int Dictionary::getAddress(const unsigned char *dict, int *pos) { - int address = 0; - if ((dict[*pos] & FLAG_ADDRESS_MASK) == 0) { - *pos += 1; - } else { - address += (dict[*pos] & (ADDRESS_MASK >> 16)) << 16; - address += (dict[*pos + 1] & 0xFF) << 8; - address += (dict[*pos + 2] & 0xFF); - *pos += 3; - } - return address; -} - -inline int Dictionary::getFreq(const unsigned char *dict, - const bool isLatestDictVersion, int *pos) { - int freq = dict[(*pos)++] & 0xFF; - if (isLatestDictVersion) { - // skipping bigram - int bigramExist = (dict[*pos] & FLAG_BIGRAM_READ); - if (bigramExist > 0) { - int nextBigramExist = 1; - while (nextBigramExist > 0) { - (*pos) += 3; - nextBigramExist = (dict[(*pos)++] & FLAG_BIGRAM_CONTINUED); - } - } else { - (*pos)++; - } - } - return freq; -} - inline int Dictionary::wideStrLen(unsigned short *str) { if (!str) return 0; unsigned short *end = str; @@ -140,35 +85,6 @@ inline int Dictionary::wideStrLen(unsigned short *str) { end++; return end - str; } - -inline int Dictionary::setDictionaryValues(const unsigned char *dict, - const bool isLatestDictVersion, const int pos, unsigned short *c,int *childrenPosition, - bool *terminal, int *freq) { - int position = pos; - // -- at char - *c = Dictionary::getChar(dict, &position); - // -- at flag/add - *terminal = Dictionary::getTerminal(dict, &position); - *childrenPosition = Dictionary::getAddress(dict, &position); - // -- after address or flag - *freq = (*terminal) ? Dictionary::getFreq(dict, isLatestDictVersion, &position) : 1; - // returns next sibling's position - return position; -} - - -inline unsigned short Dictionary::toBaseLowerCase(unsigned short c) { - if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { - c = BASE_CHARS[c]; - } - if (c >='A' && c <= 'Z') { - c |= 32; - } else if (c > 127) { - c = latin_tolower(c); - } - return c; -} - } // namespace latinime #endif // LATINIME_DICTIONARY_H diff --git a/native/src/proximity_info.cpp b/native/src/proximity_info.cpp index 763a3a174..b6bab2274 100644 --- a/native/src/proximity_info.cpp +++ b/native/src/proximity_info.cpp @@ -47,12 +47,12 @@ ProximityInfo::ProximityInfo(const int maxProximityCharsSize, const int keyboard HAS_TOUCH_POSITION_CORRECTION_DATA(keyCount > 0 && keyXCoordinates && keyYCoordinates && keyWidths && keyHeights && keyCharCodes && sweetSpotCenterXs && sweetSpotCenterYs && sweetSpotRadii), - mInputXCoordinates(NULL), mInputYCoordinates(NULL), + mInputXCoordinates(0), mInputYCoordinates(0), mTouchPositionCorrectionEnabled(false) { const int proximityGridLength = GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE; mProximityCharsArray = new uint32_t[proximityGridLength]; if (DEBUG_PROXIMITY_INFO) { - LOGI("Create proximity info array %d", proximityGridLength); + AKLOGI("Create proximity info array %d", proximityGridLength); } memcpy(mProximityCharsArray, proximityCharsArray, proximityGridLength * sizeof(mProximityCharsArray[0])); @@ -100,13 +100,21 @@ inline int ProximityInfo::getStartIndexFromCoordinates(const int x, const int y) } bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { + if (x < 0 || y < 0) { + if (DEBUG_DICT) { + AKLOGI("HasSpaceProximity: Illegal coordinates (%d, %d)", x, y); + assert(false); + } + return false; + } + const int startIndex = getStartIndexFromCoordinates(x, y); if (DEBUG_PROXIMITY_INFO) { - LOGI("hasSpaceProximity: index %d", startIndex); + AKLOGI("hasSpaceProximity: index %d, %d, %d", startIndex, x, y); } for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { if (DEBUG_PROXIMITY_INFO) { - LOGI("Index: %d", mProximityCharsArray[startIndex + i]); + AKLOGI("Index: %d", mProximityCharsArray[startIndex + i]); } if (mProximityCharsArray[startIndex + i] == KEYCODE_SPACE) { return true; @@ -157,6 +165,9 @@ float ProximityInfo::calculateNormalizedSquaredDistance( if (!hasSweetSpotData(keyIndex)) { return NOT_A_DISTANCE_FLOAT; } + if (NOT_A_COORDINATE == mInputXCoordinates[inputIndex]) { + return NOT_A_DISTANCE_FLOAT; + } const float squaredDistance = calculateSquaredDistanceFromSweetSpotCenter(keyIndex, inputIndex); const float squaredRadius = square(mSweetSpotRadii[keyIndex]); return squaredDistance / squaredRadius; @@ -167,7 +178,7 @@ int ProximityInfo::getKeyIndex(const int c) const { // We do not have the coordinate data return NOT_A_INDEX; } - const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); + const unsigned short baseLowerC = toBaseLowerCase(c); if (baseLowerC > MAX_CHAR_CODE) { return NOT_A_INDEX; } @@ -232,7 +243,7 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde const unsigned short c, const bool checkProximityChars, int *proximityIndex) const { const int *currentChars = getProximityCharsAt(index); const int firstChar = currentChars[0]; - const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); + const unsigned short baseLowerC = toBaseLowerCase(c); // The first char in the array is what user typed. If it matches right away, // that means the user typed that same char for this pos. @@ -245,12 +256,13 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde // If the non-accented, lowercased version of that first character matches c, // then we have a non-accented version of the accented character the user // typed. Treat it as a close char. - if (Dictionary::toBaseLowerCase(firstChar) == baseLowerC) + if (toBaseLowerCase(firstChar) == baseLowerC) return NEAR_PROXIMITY_CHAR; // Not an exact nor an accent-alike match: search the list of close keys int j = 1; - while (j < MAX_PROXIMITY_CHARS_SIZE && currentChars[j] > 0) { + while (j < MAX_PROXIMITY_CHARS_SIZE + && currentChars[j] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) { const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c); if (matched) { if (proximityIndex) { @@ -260,6 +272,21 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde } ++j; } + if (j < MAX_PROXIMITY_CHARS_SIZE + && currentChars[j] == ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) { + ++j; + while (j < MAX_PROXIMITY_CHARS_SIZE + && currentChars[j] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) { + const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c); + if (matched) { + if (proximityIndex) { + *proximityIndex = j; + } + return ADDITIONAL_PROXIMITY_CHAR; + } + ++j; + } + } // Was not included, signal this as an unrelated character. return UNRELATED_CHAR; diff --git a/native/src/proximity_info.h b/native/src/proximity_info.h index 35e354c6e..b77c1bb0a 100644 --- a/native/src/proximity_info.h +++ b/native/src/proximity_info.h @@ -26,7 +26,7 @@ namespace latinime { class Correction; class ProximityInfo { -public: + public: static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2 = 10; static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR = 1 << NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2; @@ -38,7 +38,9 @@ public: // It is a char located nearby on the keyboard NEAR_PROXIMITY_CHAR, // It is an unrelated char - UNRELATED_CHAR + UNRELATED_CHAR, + // Additional proximity char which can differ by language. + ADDITIONAL_PROXIMITY_CHAR } ProximityType; ProximityInfo(const int maxProximityCharsSize, const int keyboardWidth, @@ -56,7 +58,7 @@ public: bool existsCharInProximityAt(const int index, const int c) const; bool existsAdjacentProximityChars(const int index) const; ProximityType getMatchedProximityId(const int index, const unsigned short c, - const bool checkProximityChars, int *proximityIndex = NULL) const; + const bool checkProximityChars, int *proximityIndex = 0) const; int getNormalizedSquaredDistance(const int inputIndex, const int proximityIndex) const { return mNormalizedSquaredDistances[inputIndex * MAX_PROXIMITY_CHARS_SIZE + proximityIndex]; } @@ -68,7 +70,7 @@ public: return mTouchPositionCorrectionEnabled; } -private: + private: // The max number of the keys in one keyboard layout static const int MAX_KEY_COUNT_IN_A_KEYBOARD = 64; // The upper limit of the char code in mCodeToKeyIndex diff --git a/native/src/terminal_attributes.h b/native/src/terminal_attributes.h new file mode 100644 index 000000000..1f9815936 --- /dev/null +++ b/native/src/terminal_attributes.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TERMINAL_ATTRIBUTES_H +#define LATINIME_TERMINAL_ATTRIBUTES_H + +#include "unigram_dictionary.h" + +namespace latinime { + +/** + * This class encapsulates information about a terminal that allows to + * retrieve local node attributes like the list of shortcuts without + * exposing the format structure to the client. + */ +class TerminalAttributes { + public: + class ShortcutIterator { + const uint8_t* const mDict; + bool mHasNextShortcutTarget; + int mPos; + + public: + ShortcutIterator(const uint8_t* dict, const int pos, const uint8_t flags) : mDict(dict), + mPos(pos) { + mHasNextShortcutTarget = (0 != (flags & UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS)); + } + + inline bool hasNextShortcutTarget() const { + return mHasNextShortcutTarget; + } + + // Gets the shortcut target itself as a uint16_t string. For parameters and return value + // see BinaryFormat::getWordAtAddress. + inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) { + const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos); + mHasNextShortcutTarget = + 0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT); + int shortcutAddress = + BinaryFormat::getAttributeAddressAndForwardPointer(mDict, shortcutFlags, &mPos); + return BinaryFormat::getWordAtAddress(mDict, shortcutAddress, maxDepth, outWord); + } + }; + + private: + const uint8_t* const mDict; + const uint8_t mFlags; + const int mStartPos; + + public: + TerminalAttributes(const uint8_t* const dict, const uint8_t flags, const int pos) : + mDict(dict), mFlags(flags), mStartPos(pos) { + } + + inline bool isShortcutOnly() const { + return 0 != (mFlags & UnigramDictionary::FLAG_IS_SHORTCUT_ONLY); + } + + inline ShortcutIterator getShortcutIterator() const { + return ShortcutIterator(mDict, mStartPos, mFlags); + } +}; +} // namespace latinime + +#endif // LATINIME_TERMINAL_ATTRIBUTES_H diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 8eb5a9700..155bdcb7a 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -25,6 +25,7 @@ #include "unigram_dictionary.h" #include "binary_format.h" +#include "terminal_attributes.h" namespace latinime { @@ -46,21 +47,25 @@ UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typed BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)), MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) { if (DEBUG_DICT) { - LOGI("UnigramDictionary - constructor"); + AKLOGI("UnigramDictionary - constructor"); } - mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier); } UnigramDictionary::~UnigramDictionary() { - delete mCorrection; } -static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize, +static inline unsigned int getCodesBufferSize(const int *codes, const int codesSize, const int MAX_PROXIMITY_CHARS) { return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize; } -bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const { +// TODO: This needs to take an const unsigned short* and not tinker with its contents +static inline void addWord( + unsigned short *word, int length, int frequency, WordsPriorityQueue *queue) { + queue->push(frequency, word, length); +} + +bool UnigramDictionary::isDigraph(const int *codes, const int i, const int codesSize) const { // There can't be a digraph if we don't have at least 2 characters to examine if (i + 2 > codesSize) return false; @@ -86,9 +91,10 @@ bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codes // codesSrc is the current point in the user-input, original, content-unmodified buffer. // codesRemain is the remaining size in codesSrc. void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, - const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, - const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain, - const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) { + const int *xcoordinates, const int *ycoordinates, const int *codesBuffer, + const int codesBufferSize, const int flags, const int *codesSrc, + const int codesRemain, const int currentDepth, int *codesDest, Correction *correction, + WordsPriorityQueuePool *queuePool) { if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) { for (int i = 0; i < codesRemain; ++i) { @@ -105,8 +111,8 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, codesBufferSize, flags, codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1, - currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords, - frequencies); + currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, correction, + queuePool); // Copy the second char of the digraph in place, then continue processing on // the remaining part of the word. @@ -114,9 +120,9 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS, BYTES_IN_ONE_CHAR); getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, - codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS, - codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, - outWords, frequencies); + codesBuffer, codesBufferSize, flags, + codesSrc + i * MAX_PROXIMITY_CHARS, codesRemain - i, currentDepth + 1, + codesDest + i * MAX_PROXIMITY_CHARS, correction, queuePool); return; } } @@ -132,41 +138,47 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit memcpy(codesDest, codesSrc, remainingBytes); getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer, - (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies, - flags); + (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, flags, correction, + queuePool); } -int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, +int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, + WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, const int flags, unsigned short *outWords, int *frequencies) { + Correction* masterCorrection = correction; if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags) { // Incrementally tune the word and try all possibilities int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)]; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, - codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies); + codesSize, flags, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool); } else { // Normal processing - getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, - outWords, frequencies, flags); + getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, flags, + masterCorrection, queuePool); } PROF_START(20); - // Get the word count - int suggestedWordsCount = 0; - while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) { - suggestedWordsCount++; + if (DEBUG_DICT) { + double ns = queuePool->getMasterQueue()->getHighestNormalizedScore( + proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); + ns += 0; + AKLOGI("Max normalized score = %f", ns); } + const int suggestedWordsCount = + queuePool->getMasterQueue()->outputSuggestions(frequencies, outWords); if (DEBUG_DICT) { - LOGI("Returning %d words", suggestedWordsCount); + double ns = queuePool->getMasterQueue()->getHighestNormalizedScore( + proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); + ns += 0; + AKLOGI("Returning %d words", suggestedWordsCount); /// Print the returned words for (int j = 0; j < suggestedWordsCount; ++j) { -#ifdef FLAG_DBG - short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH; + short unsigned int* w = outWords + j * MAX_WORD_LENGTH; char s[MAX_WORD_LENGTH]; for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; - LOGI("%s %i", s, mFrequencies[j]); -#endif + AKLOGI("%s %i", s, frequencies[j]); } } PROF_END(20); @@ -175,23 +187,19 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *x } void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, - const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, - unsigned short *outWords, int *frequencies, const int flags) { + const int *xcoordinates, const int *ycoordinates, const int *codes, + const int inputLength, const int flags, Correction *correction, + WordsPriorityQueuePool *queuePool) { PROF_OPEN; PROF_START(0); - initSuggestions( - proximityInfo, xcoordinates, ycoordinates, codes, codesSize, outWords, frequencies); - if (DEBUG_DICT) assert(codesSize == mInputLength); - - const int maxDepth = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); - mCorrection->initCorrection(mProximityInfo, mInputLength, maxDepth); + queuePool->clearAll(); PROF_END(0); - const bool useFullEditDistance = USE_FULL_EDIT_DISTANCE & flags; - // TODO: remove PROF_START(1); - getSuggestionCandidates(useFullEditDistance); + const bool useFullEditDistance = USE_FULL_EDIT_DISTANCE & flags; + getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, + inputLength, correction, queuePool); PROF_END(1); PROF_START(2); @@ -203,250 +211,369 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_END(3); PROF_START(4); - // Note: This line is intentionally left blank + bool hasAutoCorrectionCandidate = false; + WordsPriorityQueue* masterQueue = queuePool->getMasterQueue(); + if (masterQueue->size() > 0) { + double nsForMaster = masterQueue->getHighestNormalizedScore( + proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0); + hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD); + } PROF_END(4); PROF_START(5); - // Suggestions with missing space - if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER - && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) { - for (int i = 1; i < codesSize; ++i) { - if (DEBUG_DICT) { - LOGI("--- Suggest missing space characters %d", i); - } - getMissingSpaceWords(mInputLength, i, mCorrection, useFullEditDistance); - } + // Multiple word suggestions + if (SUGGEST_MULTIPLE_WORDS + && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) { + getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, inputLength, correction, queuePool, + hasAutoCorrectionCandidate); } PROF_END(5); PROF_START(6); - if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) { - // The first and last "mistyped spaces" are taken care of by excessive character handling - for (int i = 1; i < codesSize - 1; ++i) { - if (DEBUG_DICT) { - LOGI("--- Suggest words with proximity space %d", i); - } - const int x = xcoordinates[i]; - const int y = ycoordinates[i]; - if (DEBUG_PROXIMITY_INFO) { - LOGI("Input[%d] x = %d, y = %d, has space proximity = %d", - i, x, y, proximityInfo->hasSpaceProximity(x, y)); - } - if (proximityInfo->hasSpaceProximity(x, y)) { - getMistypedSpaceWords(mInputLength, i, mCorrection, useFullEditDistance); + // Note: This line is intentionally left blank + PROF_END(6); + + if (DEBUG_DICT) { + queuePool->dumpSubQueue1TopSuggestions(); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + WordsPriorityQueue* queue = queuePool->getSubQueue(FIRST_WORD_INDEX, i); + if (queue->size() > 0) { + WordsPriorityQueue::SuggestedWord* sw = queue->top(); + const int score = sw->mScore; + const unsigned short* word = sw->mWord; + const int wordLength = sw->mWordLength; + double ns = Correction::RankingAlgorithm::calcNormalizedScore( + proximityInfo->getPrimaryInputWord(), i, word, wordLength, score); + ns += 0; + AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns, + (ns > TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD)); + DUMP_WORD(proximityInfo->getPrimaryInputWord(), i); + DUMP_WORD(word, wordLength); } } } - PROF_END(6); } void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates, - const int *yCoordinates, const int *codes, const int codesSize, - unsigned short *outWords, int *frequencies) { + const int *yCoordinates, const int *codes, const int inputLength, Correction *correction) { if (DEBUG_DICT) { - LOGI("initSuggest"); - } - mFrequencies = frequencies; - mOutputChars = outWords; - mInputLength = codesSize; - proximityInfo->setInputParams(codes, codesSize, xCoordinates, yCoordinates); - mProximityInfo = proximityInfo; -} - -static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) { - if (c < nextLettersSize) { - nextLetters[c]++; + AKLOGI("initSuggest"); } -} - -// TODO: We need to optimize addWord by using STL or something -// TODO: This needs to take an const unsigned short* and not tinker with its contents -bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) { - word[length] = 0; - if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) { -#ifdef FLAG_DBG - char s[length + 1]; - for (int i = 0; i <= length; i++) s[i] = word[i]; - LOGI("Found word = %s, freq = %d", s, frequency); -#endif - } - if (length > MAX_WORD_LENGTH) { - if (DEBUG_DICT) { - LOGI("Exceeded max word length."); - } - return false; - } - - // Find the right insertion point - int insertAt = 0; - while (insertAt < MAX_WORDS) { - // TODO: How should we sort words with the same frequency? - if (frequency > mFrequencies[insertAt]) { - break; - } - insertAt++; - } - if (insertAt < MAX_WORDS) { - if (DEBUG_DICT) { -#ifdef FLAG_DBG - char s[length + 1]; - for (int i = 0; i <= length; i++) s[i] = word[i]; - LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX); -#endif - } - memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]), - (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]), - (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0])); - mFrequencies[insertAt] = frequency; - memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short), - (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short), - (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH); - unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH; - while (length--) { - *dest++ = *word++; - } - *dest = 0; // NULL terminate - if (DEBUG_DICT) { - LOGI("Added word at %d", insertAt); - } - return true; - } - return false; + proximityInfo->setInputParams(codes, inputLength, xCoordinates, yCoordinates); + const int maxDepth = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); + correction->initCorrection(proximityInfo, inputLength, maxDepth); } static const char QUOTE = '\''; static const char SPACE = ' '; -void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance) { +void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, Correction *correction, + WordsPriorityQueuePool *queuePool) { + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); + getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool, + true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); +} + +void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, + const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool, + const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) { // TODO: Remove setCorrectionParams - mCorrection->setCorrectionParams(0, 0, 0, - -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance); + correction->setCorrectionParams(0, 0, 0, + -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, + doAutoCompletion, maxErrors); int rootPosition = ROOT_POS; // Get the number of children of root, then increment the position - int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); + int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition); int outputIndex = 0; - mCorrection->initCorrectionState(rootPosition, childCount, (mInputLength <= 0)); + correction->initCorrectionState(rootPosition, childCount, (inputLength <= 0)); // Depth first search while (outputIndex >= 0) { - if (mCorrection->initProcessState(outputIndex)) { - int siblingPos = mCorrection->getTreeSiblingPos(outputIndex); + if (correction->initProcessState(outputIndex)) { + int siblingPos = correction->getTreeSiblingPos(outputIndex); int firstChildPos; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, - mCorrection, &childCount, &firstChildPos, &siblingPos); + correction, &childCount, &firstChildPos, &siblingPos, queuePool, + currentWordIndex); // Update next sibling pos - mCorrection->setTreeSiblingPos(outputIndex, siblingPos); + correction->setTreeSiblingPos(outputIndex, siblingPos); if (needsToTraverseChildrenNodes) { // Goes to child node - outputIndex = mCorrection->goDownTree(outputIndex, childCount, firstChildPos); + outputIndex = correction->goDownTree(outputIndex, childCount, firstChildPos); } } else { // Goes to parent sibling node - outputIndex = mCorrection->getTreeParentIndex(outputIndex); + outputIndex = correction->getTreeParentIndex(outputIndex); } } } -void UnigramDictionary::getMissingSpaceWords( - const int inputLength, const int missingSpacePos, Correction *correction, - const bool useFullEditDistance) { - correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, - -1 /* transposedPos */, -1 /* spaceProximityPos */, missingSpacePos, - useFullEditDistance); - getSplitTwoWordsSuggestion(inputLength, correction); -} - -void UnigramDictionary::getMistypedSpaceWords( - const int inputLength, const int spaceProximityPos, Correction *correction, - const bool useFullEditDistance) { - correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, - -1 /* transposedPos */, spaceProximityPos, -1 /* missingSpacePos */, - useFullEditDistance); - getSplitTwoWordsSuggestion(inputLength, correction); -} - -inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c, - const int inputIndex, const int skipPos, const int depth) { - const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(inputIndex); - // Skip the ' or other letter and continue deeper - return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth; -} +inline void UnigramDictionary::onTerminal(const int freq, + const TerminalAttributes& terminalAttributes, Correction *correction, + WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, + const int currentWordIndex) { + const int inputIndex = correction->getInputIndex(); + const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT; -inline void UnigramDictionary::onTerminal(const int freq, Correction *correction) { int wordLength; unsigned short* wordPointer; - const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); - if (finalFreq >= 0) { - addWord(wordPointer, wordLength, finalFreq); + + if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) { + WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); + const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); + if (finalFreq != NOT_A_FREQUENCY) { + if (!terminalAttributes.isShortcutOnly()) { + addWord(wordPointer, wordLength, finalFreq, masterQueue); + } + + // Please note that the shortcut candidates will be added to the master queue only. + TerminalAttributes::ShortcutIterator iterator = + terminalAttributes.getShortcutIterator(); + while (iterator.hasNextShortcutTarget()) { + // TODO: addWord only supports weak ordering, meaning we have no means + // to control the order of the shortcuts relative to one another or to the word. + // We need to either modulate the frequency of each shortcut according + // to its own shortcut frequency or to make the queue + // so that the insert order is protected inside the queue for words + // with the same score. + uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; + const int shortcutTargetStringLength = iterator.getNextShortcutTarget( + MAX_WORD_LENGTH_INTERNAL, shortcutTarget); + addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue); + } + } + } + + // We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH + // or more length. + if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) { + WordsPriorityQueue *subQueue; + subQueue = queuePool->getSubQueue(currentWordIndex, inputIndex); + if (!subQueue) { + return; + } + const int finalFreq = correction->getFinalFreqForSubQueue(freq, &wordPointer, &wordLength, + inputIndex); + addWord(wordPointer, wordLength, finalFreq, subQueue); } } -void UnigramDictionary::getSplitTwoWordsSuggestion( - const int inputLength, Correction* correction) { - const int spaceProximityPos = correction->getSpaceProximityPos(); - const int missingSpacePos = correction->getMissingSpacePos(); +bool UnigramDictionary::getSubStringSuggestion( + ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, + const int *codes, const bool useFullEditDistance, Correction *correction, + WordsPriorityQueuePool* queuePool, const int inputLength, + const bool hasAutoCorrectionCandidate, const int currentWordIndex, + const int inputWordStartPos, const int inputWordLength, + const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, + int*wordLengthArray, unsigned short* outputWord, int *outputWordLength) { + unsigned short* tempOutputWord = 0; + int nextWordLength = 0; + // TODO: Optimize init suggestion + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, + inputLength, correction); + + int freq = getMostFrequentWordLike( + inputWordStartPos, inputWordLength, proximityInfo, mWord); + if (freq > 0) { + nextWordLength = inputWordLength; + tempOutputWord = mWord; + } else if (!hasAutoCorrectionCandidate) { + if (inputWordStartPos > 0) { + const int offset = inputWordStartPos; + initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset], + codes + offset * MAX_PROXIMITY_CHARS, inputWordLength, correction); + queuePool->clearSubQueue(currentWordIndex); + getSuggestionCandidates(useFullEditDistance, inputWordLength, correction, + queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); + if (DEBUG_DICT) { + if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) { + AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + queuePool->getSubQueue(currentWordIndex, i)->dumpTopWord(); + } + } + } + } + WordsPriorityQueue* queue = queuePool->getSubQueue(currentWordIndex, inputWordLength); + if (!queue || queue->size() < 1) { + return false; + } + int score = 0; + const double ns = queue->getHighestNormalizedScore( + proximityInfo->getPrimaryInputWord(), inputWordLength, + &tempOutputWord, &score, &nextWordLength); + if (DEBUG_DICT) { + AKLOGI("NS(%d) = %f, Score = %d", currentWordIndex, ns, score); + } + // Two words correction won't be done if the score of the first word doesn't exceed the + // threshold. + if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD + || nextWordLength < SUB_QUEUE_MIN_WORD_LENGTH) { + return false; + } + freq = score >> (nextWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER); + } if (DEBUG_DICT) { - int inputCount = 0; - if (spaceProximityPos >= 0) ++inputCount; - if (missingSpacePos >= 0) ++inputCount; - assert(inputCount <= 1); + AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d (%d)" + , currentWordIndex, freq, nextWordLength, inputWordLength, inputWordStartPos, + wordLengthArray[0]); + } + if (freq <= 0 || nextWordLength <= 0 + || MAX_WORD_LENGTH <= (outputWordStartPos + nextWordLength)) { + return false; + } + for (int i = 0; i < nextWordLength; ++i) { + outputWord[outputWordStartPos + i] = tempOutputWord[i]; } - const bool isSpaceProximity = spaceProximityPos >= 0; - const int firstWordStartPos = 0; - const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos; - const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; - const int secondWordLength = isSpaceProximity - ? (inputLength - spaceProximityPos - 1) - : (inputLength - missingSpacePos); - - if (inputLength >= MAX_WORD_LENGTH) return; - if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos - || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) - return; - const int newWordLength = firstWordLength + secondWordLength + 1; - // Allocating variable length array on stack - unsigned short word[newWordLength]; - const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord); - if (DEBUG_DICT) { - LOGI("First freq: %d", firstFreq); + // Put output values + freqArray[currentWordIndex] = freq; + // TODO: put output length instead of input length + wordLengthArray[currentWordIndex] = inputWordLength; + const int tempOutputWordLength = outputWordStartPos + nextWordLength; + if (outputWordLength) { + *outputWordLength = tempOutputWordLength; } - if (firstFreq <= 0) return; - for (int i = 0; i < firstWordLength; ++i) { - word[i] = mWord[i]; + if ((inputWordStartPos + inputWordLength) < inputLength) { + if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) { + return false; + } + outputWord[tempOutputWordLength] = SPACE; + if (outputWordLength) { + ++*outputWordLength; + } + } else if (currentWordIndex >= 1) { + // TODO: Handle 3 or more words + const int pairFreq = correction->getFreqForSplitMultipleWords( + freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord); + if (DEBUG_DICT) { + DUMP_WORD(outputWord, tempOutputWordLength); + AKLOGI("Split two words: %d, %d, %d, %d, (%d) %d", freqArray[0], freqArray[1], pairFreq, + inputLength, wordLengthArray[0], tempOutputWordLength); + } + addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue()); } + return true; +} - const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord); - if (DEBUG_DICT) { - LOGI("Second freq: %d", secondFreq); +void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, + Correction *correction, WordsPriorityQueuePool* queuePool, + const bool hasAutoCorrectionCandidate, const int startInputPos, const int startWordIndex, + const int outputWordLength, int *freqArray, int* wordLengthArray, + unsigned short* outputWord) { + if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) { + // Return if the last word index + return; } - if (secondFreq <= 0) return; + if (startWordIndex >= 1 + && (hasAutoCorrectionCandidate + || inputLength < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) { + // Do not suggest 3+ words if already has auto correction candidate + return; + } + for (int i = startInputPos + 1; i < inputLength; ++i) { + if (DEBUG_CORRECTION_FREQ) { + AKLOGI("Multi words(%d), start in %d sep %d start out %d", + startWordIndex, startInputPos, i, outputWordLength); + DUMP_WORD(outputWord, outputWordLength); + } + int tempOutputWordLength = 0; + // Current word + int inputWordStartPos = startInputPos; + int inputWordLength = i - startInputPos; + if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, + startWordIndex, inputWordStartPos, inputWordLength, outputWordLength, + true /* not used */, freqArray, wordLengthArray, outputWord, + &tempOutputWordLength)) { + continue; + } + + if (DEBUG_CORRECTION_FREQ) { + AKLOGI("Do missing space correction"); + } + // Next word + // Missing space + inputWordStartPos = i; + inputWordLength = inputLength - i; + if(!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, + startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength, + false /* missing space */, freqArray, wordLengthArray, outputWord, 0)) { + getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, inputLength, correction, queuePool, + hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1, + tempOutputWordLength, freqArray, wordLengthArray, outputWord); + } - word[firstWordLength] = SPACE; - for (int i = (firstWordLength + 1); i < newWordLength; ++i) { - word[i] = mWord[i - firstWordLength - 1]; + // Mistyped space + ++inputWordStartPos; + --inputWordLength; + + if (inputWordLength <= 0) { + continue; + } + + const int x = xcoordinates[inputWordStartPos - 1]; + const int y = ycoordinates[inputWordStartPos - 1]; + if (!proximityInfo->hasSpaceProximity(x, y)) { + continue; + } + + if (DEBUG_CORRECTION_FREQ) { + AKLOGI("Do mistyped space correction"); + } + getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, + startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength, + true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0); } +} - const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq, word); +void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, + Correction *correction, WordsPriorityQueuePool* queuePool, + const bool hasAutoCorrectionCandidate) { + if (inputLength >= MAX_WORD_LENGTH) return; + if (DEBUG_DICT) { + // MAX_PROXIMITY_CHARS_SIZE in ProximityInfo.java should be 16 + assert(MAX_PROXIMITY_CHARS == 16); + } if (DEBUG_DICT) { - LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); + AKLOGI("--- Suggest multiple words"); } - addWord(word, newWordLength, pairFreq); - return; + + // Allocating fixed length array on stack + unsigned short outputWord[MAX_WORD_LENGTH]; + int freqArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; + int wordLengthArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; + const int outputWordLength = 0; + const int startInputPos = 0; + const int startWordIndex = 0; + getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, inputLength, correction, queuePool, hasAutoCorrectionCandidate, + startInputPos, startWordIndex, outputWordLength, freqArray, wordLengthArray, + outputWord); } // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous // interface. inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, - const int inputLength, unsigned short *word) { + const int inputLength, ProximityInfo *proximityInfo, unsigned short *word) { uint16_t inWord[inputLength]; for (int i = 0; i < inputLength; ++i) { - inWord[i] = (uint16_t)mProximityInfo->getPrimaryCharAt(startInputIndex + i); + inWord[i] = (uint16_t)proximityInfo->getPrimaryCharAt(startInputIndex + i); } return getMostFrequentWordLikeInner(inWord, inputLength, word); } @@ -470,8 +597,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags)); int pos = startPos; int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - int32_t baseChar = Dictionary::toBaseLowerCase(character); - const uint16_t wChar = Dictionary::toBaseLowerCase(inWord[startInputIndex]); + int32_t baseChar = toBaseLowerCase(character); + const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]); if (baseChar != wChar) { *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos; @@ -483,8 +610,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, if (hasMultipleChars) { character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); while (NOT_A_CHARACTER != character) { - baseChar = Dictionary::toBaseLowerCase(character); - if (Dictionary::toBaseLowerCase(inWord[++inputIndex]) != baseChar) { + baseChar = toBaseLowerCase(character); + if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) { *outPos = BinaryFormat::skipOtherCharacters(root, pos); *outInputIndex = startInputIndex; return false; @@ -521,9 +648,10 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor int maxFreq = -1; const uint8_t* const root = DICT_ROOT; - mStackChildCount[0] = root[0]; + int startPos = 0; + mStackChildCount[0] = BinaryFormat::getGroupCountAndForwardPointer(root, &startPos); mStackInputIndex[0] = 0; - mStackSiblingPos[0] = 1; + mStackSiblingPos[0] = startPos; while (depth >= 0) { const int charGroupCount = mStackChildCount[depth]; int pos = mStackSiblingPos[depth]; @@ -597,7 +725,8 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // given level, as output into newCount when traversing this level's parent. inline bool UnigramDictionary::processCurrentNode(const int initialPos, Correction *correction, int *newCount, - int *newChildrenPosition, int *nextSiblingPosition) { + int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, + const int currentWordIndex) { if (DEBUG_DICT) { correction->checkState(); } @@ -648,7 +777,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL || stateType == Correction::ON_TERMINAL) { needsToInvokeOnTerminal = true; - } else if (stateType == Correction::UNRELATED) { + } else if (stateType == Correction::UNRELATED || correction->needsToPrune()) { // We found that this is an unrelated character, so we should give up traversing // this node and its children entirely. // However we may not be on the last virtual node yet so we skip the remaining @@ -672,12 +801,14 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, } while (NOT_A_CHARACTER != c); if (isTerminalNode) { - if (needsToInvokeOnTerminal) { - // The frequency should be here, because we come here only if this is actually - // a terminal node, and we are on its last char. - const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); - onTerminal(freq, mCorrection); - } + // The frequency should be here, because we come here only if this is actually + // a terminal node, and we are on its last char. + const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); + const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); + const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); + TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); + onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, + currentWordIndex); // If there are more chars in this node, then this virtual node has children. // If we are on the last char, this virtual node has children if this node has. @@ -702,7 +833,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); if (DEBUG_DICT_FULL) { - LOGI("Traversing was pruned."); + AKLOGI("Traversing was pruned."); } return false; } diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index ef9709a89..396a81149 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -22,17 +22,14 @@ #include "correction_state.h" #include "defines.h" #include "proximity_info.h" - -#ifndef NULL -#define NULL 0 -#endif +#include "words_priority_queue.h" +#include "words_priority_queue_pool.h" namespace latinime { +class TerminalAttributes; class UnigramDictionary { - -public: - + public: // Mask and flags for children address type selection. static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; @@ -46,8 +43,14 @@ public: // Flag for terminal groups static const int FLAG_IS_TERMINAL = 0x10; + // Flag for shortcut targets presence + static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08; // Flag for bigram presence static const int FLAG_HAS_BIGRAMS = 0x04; + // Flag for shortcut-only words. Some words are shortcut-only, which means they match when + // the user types them but they don't pop in the suggestion strip, only the words they are + // shortcuts for do. + static const int FLAG_IS_SHORTCUT_ONLY = 0x02; // Attribute (bigram/shortcut) related flags: // Flag for presence of more attributes @@ -64,47 +67,73 @@ public: static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; + // Error tolerances + static const int DEFAULT_MAX_ERRORS = 2; + static const int MAX_ERRORS_FOR_TWO_WORDS = 1; + UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion); bool isValidWord(const uint16_t* const inWord, const int length) const; int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; - int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, + int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, + Correction *correction, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, const int flags, unsigned short *outWords, int *frequencies); virtual ~UnigramDictionary(); -private: - + private: void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const int codesSize, - unsigned short *outWords, int *frequencies, const int flags); - bool isDigraph(const int* codes, const int i, const int codesSize) const; + const int *ycoordinates, const int *codes, const int inputLength, + const int flags, Correction *correction, WordsPriorityQueuePool *queuePool); + bool isDigraph(const int *codes, const int i, const int codesSize) const; void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, - const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain, - const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies); + const int codesBufferSize, const int flags, const int* codesSrc, + const int codesRemain, const int currentDepth, int* codesDest, Correction *correction, + WordsPriorityQueuePool* queuePool); void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const int codesSize, - unsigned short *outWords, int *frequencies); - void getSuggestionCandidates(const bool useFullEditDistance); - bool addWord(unsigned short *word, int length, int frequency); - void getSplitTwoWordsSuggestion(const int inputLength, Correction *correction); - void getMissingSpaceWords(const int inputLength, const int missingSpacePos, - Correction *correction, const bool useFullEditDistance); - void getMistypedSpaceWords(const int inputLength, const int spaceProximityPos, - Correction *correction, const bool useFullEditDistance); - void onTerminal(const int freq, Correction *correction); + const int *ycoordinates, const int *codes, const int codesSize, Correction *correction); + void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, + const int *ycoordinates, const int *codes, const bool useFullEditDistance, + const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool); + void getSuggestionCandidates( + const bool useFullEditDistance, const int inputLength, Correction *correction, + WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors, + const int currentWordIndex); + void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, + Correction *correction, WordsPriorityQueuePool* queuePool, + const bool hasAutoCorrectionCandidate); + void onTerminal(const int freq, const TerminalAttributes& terminalAttributes, + Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, + const int currentWordIndex); bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); // Process a node by considering proximity, missing and excessive character - bool processCurrentNode(const int initialPos, - Correction *correction, int *newCount, - int *newChildPosition, int *nextSiblingPosition); + bool processCurrentNode(const int initialPos, Correction *correction, int *newCount, + int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, + const int currentWordIndex); int getMostFrequentWordLike(const int startInputIndex, const int inputLength, - unsigned short *word); + ProximityInfo *proximityInfo, unsigned short *word); int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, - short unsigned int* outWord); + short unsigned int *outWord); + bool getSubStringSuggestion( + ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, + const int *codes, const bool useFullEditDistance, Correction *correction, + WordsPriorityQueuePool* queuePool, const int inputLength, + const bool hasAutoCorrectionCandidate, const int currentWordIndex, + const int inputWordStartPos, const int inputWordLength, + const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, + int *wordLengthArray, unsigned short* outputWord, int *outputWordLength); + void getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, + Correction *correction, WordsPriorityQueuePool* queuePool, + const bool hasAutoCorrectionCandidate, const int startPos, const int startWordIndex, + const int outputWordLength, int *freqArray, int* wordLengthArray, + unsigned short* outputWord); const uint8_t* const DICT_ROOT; const int MAX_WORD_LENGTH; @@ -127,14 +156,8 @@ private: }; static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[]; - int *mFrequencies; - unsigned short *mOutputChars; - ProximityInfo *mProximityInfo; - Correction *mCorrection; - int mInputLength; - // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH - unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; - + // Still bundled members + unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];// TODO: remove int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove diff --git a/native/src/words_priority_queue.h b/native/src/words_priority_queue.h new file mode 100644 index 000000000..249962eec --- /dev/null +++ b/native/src/words_priority_queue.h @@ -0,0 +1,195 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_WORDS_PRIORITY_QUEUE_H +#define LATINIME_WORDS_PRIORITY_QUEUE_H + +#include <cstring> // for memcpy() +#include <iostream> +#include <queue> +#include "defines.h" + +namespace latinime { + +class WordsPriorityQueue { + public: + class SuggestedWord { + public: + int mScore; + unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; + int mWordLength; + bool mUsed; + + void setParams(int score, unsigned short* word, int wordLength) { + mScore = score; + mWordLength = wordLength; + memcpy(mWord, word, sizeof(unsigned short) * wordLength); + mUsed = true; + } + }; + + WordsPriorityQueue(int maxWords, int maxWordLength) : + MAX_WORDS((unsigned int) maxWords), MAX_WORD_LENGTH( + (unsigned int) maxWordLength) { + mSuggestedWords = new SuggestedWord[maxWordLength]; + for (int i = 0; i < maxWordLength; ++i) { + mSuggestedWords[i].mUsed = false; + } + mHighestSuggestedWord = 0; + } + + ~WordsPriorityQueue() { + delete[] mSuggestedWords; + } + + void push(int score, unsigned short* word, int wordLength) { + SuggestedWord* sw = 0; + if (mSuggestions.size() >= MAX_WORDS) { + sw = mSuggestions.top(); + const int minScore = sw->mScore; + if (minScore >= score) { + return; + } else { + sw->mUsed = false; + mSuggestions.pop(); + } + } + if (sw == 0) { + sw = getFreeSuggestedWord(score, word, wordLength); + } else { + sw->setParams(score, word, wordLength); + } + if (sw == 0) { + AKLOGE("SuggestedWord is accidentally null."); + return; + } + if (DEBUG_WORDS_PRIORITY_QUEUE) { + AKLOGI("Push word. %d, %d", score, wordLength); + DUMP_WORD(word, wordLength); + } + mSuggestions.push(sw); + if (!mHighestSuggestedWord || mHighestSuggestedWord->mScore < sw->mScore) { + mHighestSuggestedWord = sw; + } + } + + SuggestedWord* top() { + if (mSuggestions.empty()) return 0; + SuggestedWord* sw = mSuggestions.top(); + return sw; + } + + int outputSuggestions(int *frequencies, unsigned short *outputChars) { + mHighestSuggestedWord = 0; + const unsigned int size = min( + MAX_WORDS, static_cast<unsigned int>(mSuggestions.size())); + int index = size - 1; + while (!mSuggestions.empty() && index >= 0) { + SuggestedWord* sw = mSuggestions.top(); + if (DEBUG_WORDS_PRIORITY_QUEUE) { + AKLOGI("dump word. %d", sw->mScore); + DUMP_WORD(sw->mWord, sw->mWordLength); + } + const unsigned int wordLength = sw->mWordLength; + char* targetAdr = (char*) outputChars + + (index) * MAX_WORD_LENGTH * sizeof(short); + frequencies[index] = sw->mScore; + memcpy(targetAdr, sw->mWord, (wordLength) * sizeof(short)); + if (wordLength < MAX_WORD_LENGTH) { + ((unsigned short*) targetAdr)[wordLength] = 0; + } + sw->mUsed = false; + mSuggestions.pop(); + --index; + } + return size; + } + + int size() const { + return mSuggestions.size(); + } + + void clear() { + mHighestSuggestedWord = 0; + while (!mSuggestions.empty()) { + SuggestedWord* sw = mSuggestions.top(); + if (DEBUG_WORDS_PRIORITY_QUEUE) { + AKLOGI("Clear word. %d", sw->mScore); + DUMP_WORD(sw->mWord, sw->mWordLength); + } + sw->mUsed = false; + mSuggestions.pop(); + } + } + + void dumpTopWord() { + if (size() <= 0) { + return; + } + DUMP_WORD(mHighestSuggestedWord->mWord, mHighestSuggestedWord->mWordLength); + } + + double getHighestNormalizedScore(const unsigned short* before, const int beforeLength, + unsigned short** outWord, int *outScore, int *outLength) { + if (!mHighestSuggestedWord) { + return 0.0; + } + SuggestedWord* sw = mHighestSuggestedWord; + const int score = sw->mScore; + unsigned short* word = sw->mWord; + const int wordLength = sw->mWordLength; + if (outScore) { + *outScore = score; + } + if (outWord) { + *outWord = word; + } + if (outLength) { + *outLength = wordLength; + } + return Correction::RankingAlgorithm::calcNormalizedScore( + before, beforeLength, word, wordLength, score); + } + + private: + struct wordComparator { + bool operator ()(SuggestedWord * left, SuggestedWord * right) { + return left->mScore > right->mScore; + } + }; + + SuggestedWord* getFreeSuggestedWord(int score, unsigned short* word, + int wordLength) { + for (unsigned int i = 0; i < MAX_WORD_LENGTH; ++i) { + if (!mSuggestedWords[i].mUsed) { + mSuggestedWords[i].setParams(score, word, wordLength); + return &mSuggestedWords[i]; + } + } + return 0; + } + + typedef std::priority_queue<SuggestedWord*, std::vector<SuggestedWord*>, + wordComparator> Suggestions; + Suggestions mSuggestions; + const unsigned int MAX_WORDS; + const unsigned int MAX_WORD_LENGTH; + SuggestedWord* mSuggestedWords; + SuggestedWord* mHighestSuggestedWord; +}; +} + +#endif // LATINIME_WORDS_PRIORITY_QUEUE_H diff --git a/native/src/words_priority_queue_pool.h b/native/src/words_priority_queue_pool.h new file mode 100644 index 000000000..5b50e8f4f --- /dev/null +++ b/native/src/words_priority_queue_pool.h @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_WORDS_PRIORITY_QUEUE_POOL_H +#define LATINIME_WORDS_PRIORITY_QUEUE_POOL_H + +#include <assert.h> +#include <new> +#include "words_priority_queue.h" + +namespace latinime { + +class WordsPriorityQueuePool { + public: + WordsPriorityQueuePool(int mainQueueMaxWords, int subQueueMaxWords, int maxWordLength) { + mMasterQueue = new(mMasterQueueBuf) WordsPriorityQueue(mainQueueMaxWords, maxWordLength); + for (int i = 0, subQueueBufOffset = 0; + i < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS * SUB_QUEUE_MAX_COUNT; + ++i, subQueueBufOffset += sizeof(WordsPriorityQueue)) { + mSubQueues[i] = new(mSubQueueBuf + subQueueBufOffset) + WordsPriorityQueue(subQueueMaxWords, maxWordLength); + } + } + + virtual ~WordsPriorityQueuePool() { + } + + WordsPriorityQueue* getMasterQueue() { + return mMasterQueue; + } + + WordsPriorityQueue* getSubQueue(const int wordIndex, const int inputWordLength) { + if (wordIndex >= MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) { + return 0; + } + if (inputWordLength < 0 || inputWordLength >= SUB_QUEUE_MAX_COUNT) { + if (DEBUG_WORDS_PRIORITY_QUEUE) { + assert(false); + } + return 0; + } + return mSubQueues[wordIndex * SUB_QUEUE_MAX_COUNT + inputWordLength]; + } + + inline void clearAll() { + mMasterQueue->clear(); + for (int i = 0; i < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS; ++i) { + clearSubQueue(i); + } + } + + inline void clearSubQueue(const int wordIndex) { + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + WordsPriorityQueue* queue = getSubQueue(wordIndex, i); + if (queue) { + queue->clear(); + } + } + } + + void dumpSubQueue1TopSuggestions() { + AKLOGI("DUMP SUBQUEUE1 TOP SUGGESTIONS"); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + getSubQueue(0, i)->dumpTopWord(); + } + } + + private: + WordsPriorityQueue* mMasterQueue; + WordsPriorityQueue* mSubQueues[SUB_QUEUE_MAX_COUNT * MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; + char mMasterQueueBuf[sizeof(WordsPriorityQueue)]; + char mSubQueueBuf[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS + * SUB_QUEUE_MAX_COUNT * sizeof(WordsPriorityQueue)]; +}; +} + +#endif // LATINIME_WORDS_PRIORITY_QUEUE_POOL_H |