diff options
Diffstat (limited to 'native')
32 files changed, 2781 insertions, 1476 deletions
diff --git a/native/Android.mk b/native/Android.mk index 24c0037c9..5053e7d64 100644 --- a/native/Android.mk +++ b/native/Android.mk @@ -1,64 +1 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) - -LOCAL_C_INCLUDES += $(LOCAL_PATH)/src - -LOCAL_CFLAGS += -Werror -Wall - -# To suppress compiler warnings for unused variables/functions used for debug features etc. -LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function - -LOCAL_SRC_FILES := \ - jni/com_android_inputmethod_keyboard_ProximityInfo.cpp \ - jni/com_android_inputmethod_latin_BinaryDictionary.cpp \ - jni/jni_common.cpp \ - src/bigram_dictionary.cpp \ - src/char_utils.cpp \ - src/correction.cpp \ - src/dictionary.cpp \ - src/proximity_info.cpp \ - src/unigram_dictionary.cpp - -#FLAG_DBG := true -#FLAG_DO_PROFILE := true - -TARGETING_UNBUNDLED_FROYO := true - -ifeq ($(TARGET_ARCH), x86) - TARGETING_UNBUNDLED_FROYO := false -endif - -ifeq ($(TARGET_ARCH),mips) - TARGETING_UNBUNDLED_FROYO := false -endif - -ifeq ($(FLAG_DBG), true) - TARGETING_UNBUNDLED_FROYO := false -endif - -ifeq ($(FLAG_DO_PROFILE), true) - TARGETING_UNBUNDLED_FROYO := false -endif - -ifeq ($(TARGETING_UNBUNDLED_FROYO), true) - LOCAL_NDK_VERSION := 4 - LOCAL_SDK_VERSION := 8 -endif - -LOCAL_MODULE := libjni_latinime - -LOCAL_MODULE_TAGS := optional - -ifeq ($(FLAG_DO_PROFILE), true) - $(warning Making profiling version of native library) - LOCAL_CFLAGS += -DFLAG_DO_PROFILE - LOCAL_SHARED_LIBRARIES := libcutils libutils -else # FLAG_DO_PROFILE -ifeq ($(FLAG_DBG), true) - $(warning Making debug version of native library) - LOCAL_CFLAGS += -DFLAG_DBG - LOCAL_SHARED_LIBRARIES := libcutils libutils -endif # FLAG_DBG -endif # FLAG_DO_PROFILE - -include $(BUILD_SHARED_LIBRARY) +include $(call all-subdir-makefiles) diff --git a/native/jni/Android.mk b/native/jni/Android.mk new file mode 100644 index 000000000..5e0d3518d --- /dev/null +++ b/native/jni/Android.mk @@ -0,0 +1,138 @@ +# Copyright (C) 2011 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LOCAL_PATH := $(call my-dir) + +############ some local flags +# If you change any of those flags, you need to rebuild both libjni_latinime_static +# and the shared library. +#FLAG_DBG := true +#FLAG_DO_PROFILE := true + +TARGETING_UNBUNDLED_FROYO := true + +ifeq ($(TARGET_ARCH), x86) + TARGETING_UNBUNDLED_FROYO := false +endif + +ifeq ($(TARGET_ARCH), mips) + TARGETING_UNBUNDLED_FROYO := false +endif + +ifeq ($(FLAG_DBG), true) + TARGETING_UNBUNDLED_FROYO := false +endif + +ifeq ($(FLAG_DO_PROFILE), true) + TARGETING_UNBUNDLED_FROYO := false +endif + +###################################### +include $(CLEAR_VARS) + +LATIN_IME_SRC_DIR := src + +LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_SRC_DIR) + +LOCAL_CFLAGS += -Werror -Wall + +# To suppress compiler warnings for unused variables/functions used for debug features etc. +LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function + +LATIN_IME_JNI_SRC_FILES := \ + com_android_inputmethod_keyboard_ProximityInfo.cpp \ + com_android_inputmethod_latin_BinaryDictionary.cpp \ + jni_common.cpp + +LATIN_IME_CORE_SRC_FILES := \ + additional_proximity_chars.cpp \ + basechars.cpp \ + bigram_dictionary.cpp \ + char_utils.cpp \ + correction.cpp \ + dictionary.cpp \ + proximity_info.cpp \ + unigram_dictionary.cpp + +LOCAL_SRC_FILES := \ + $(LATIN_IME_JNI_SRC_FILES) \ + $(addprefix $(LATIN_IME_SRC_DIR)/,$(LATIN_IME_CORE_SRC_FILES)) + +ifeq ($(TARGETING_UNBUNDLED_FROYO), true) + LOCAL_NDK_VERSION := 4 + LOCAL_SDK_VERSION := 8 +endif + +ifeq ($(FLAG_DO_PROFILE), true) + $(warning Making profiling version of native library) + LOCAL_CFLAGS += -DFLAG_DO_PROFILE +else # FLAG_DO_PROFILE +ifeq ($(FLAG_DBG), true) + $(warning Making debug version of native library) + LOCAL_CFLAGS += -DFLAG_DBG +endif # FLAG_DBG +endif # FLAG_DO_PROFILE + +LOCAL_MODULE := libjni_latinime_static +LOCAL_MODULE_TAGS := optional + +ifdef HISTORICAL_NDK_VERSIONS_ROOT # In the platform build system +include external/stlport/libstlport.mk +else # In the NDK build system +LOCAL_C_INCLUDES += external/stlport/stlport bionic +endif + +include $(BUILD_STATIC_LIBRARY) + +###################################### +include $(CLEAR_VARS) + +# All code in LOCAL_WHOLE_STATIC_LIBRARIES will be built into this shared library. +LOCAL_WHOLE_STATIC_LIBRARIES := libjni_latinime_static + +ifdef HISTORICAL_NDK_VERSIONS_ROOT # In the platform build system +LOCAL_SHARED_LIBRARIES := libstlport +else # In the NDK build system +LOCAL_SHARED_LIBRARIES := libstlport_static +endif + +ifeq ($(FLAG_DO_PROFILE), true) + $(warning Making profiling version of native library) + LOCAL_SHARED_LIBRARIES += libcutils libutils +else # FLAG_DO_PROFILE +ifeq ($(FLAG_DBG), true) + $(warning Making debug version of native library) + LOCAL_SHARED_LIBRARIES += libcutils libutils +endif # FLAG_DBG +endif # FLAG_DO_PROFILE + +ifeq ($(TARGETING_UNBUNDLED_FROYO), true) + LOCAL_NDK_VERSION := 4 + LOCAL_SDK_VERSION := 8 +endif + +LOCAL_MODULE := libjni_latinime +LOCAL_MODULE_TAGS := optional + +ifdef HISTORICAL_NDK_VERSIONS_ROOT # In the platform build system +include external/stlport/libstlport.mk +endif + +include $(BUILD_SHARED_LIBRARY) + +#################### Clean up the tmp vars +LATIN_IME_CORE_SRC_FILES := +LATIN_IME_JNI_SRC_FILES := +LATIN_IME_SRC_DIR := +TARGETING_UNBUNDLED_FROYO := diff --git a/native/jni/Application.mk b/native/jni/Application.mk new file mode 100644 index 000000000..caf3b2622 --- /dev/null +++ b/native/jni/Application.mk @@ -0,0 +1 @@ +APP_STL := stlport_static diff --git a/native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp b/native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp index 595ea2fdc..9eb437c06 100644 --- a/native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp +++ b/native/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp @@ -25,17 +25,20 @@ #include <assert.h> #include <errno.h> #include <stdio.h> +#include <string> namespace latinime { -static jint latinime_Keyboard_setProximityInfo(JNIEnv *env, jobject object, - jint maxProximityCharsSize, jint displayWidth, jint displayHeight, jint gridWidth, - jint gridHeight, jintArray proximityCharsArray, jint keyCount, - jintArray keyXCoordinateArray, jintArray keyYCoordinateArray, jintArray keyWidthArray, - jintArray keyHeightArray, jintArray keyCharCodeArray, +static jlong latinime_Keyboard_setProximityInfo(JNIEnv *env, jobject object, + jstring localejStr, jint maxProximityCharsSize, jint displayWidth, jint displayHeight, + jint gridWidth, jint gridHeight, jint mostCommonkeyWidth, jintArray proximityCharsArray, + jint keyCount, jintArray keyXCoordinateArray, jintArray keyYCoordinateArray, + jintArray keyWidthArray, jintArray keyHeightArray, jintArray keyCharCodeArray, jfloatArray sweetSpotCenterXArray, jfloatArray sweetSpotCenterYArray, jfloatArray sweetSpotRadiusArray) { - jint *proximityChars = env->GetIntArrayElements(proximityCharsArray, NULL); + const char *localeStrPtr = env->GetStringUTFChars(localejStr, 0); + const std::string localeStr(localeStrPtr); + jint *proximityChars = env->GetIntArrayElements(proximityCharsArray, 0); jint *keyXCoordinates = safeGetIntArrayElements(env, keyXCoordinateArray); jint *keyYCoordinates = safeGetIntArrayElements(env, keyYCoordinateArray); jint *keyWidths = safeGetIntArrayElements(env, keyWidthArray); @@ -44,8 +47,10 @@ static jint latinime_Keyboard_setProximityInfo(JNIEnv *env, jobject object, jfloat *sweetSpotCenterXs = safeGetFloatArrayElements(env, sweetSpotCenterXArray); jfloat *sweetSpotCenterYs = safeGetFloatArrayElements(env, sweetSpotCenterYArray); jfloat *sweetSpotRadii = safeGetFloatArrayElements(env, sweetSpotRadiusArray); - ProximityInfo *proximityInfo = new ProximityInfo(maxProximityCharsSize, displayWidth, - displayHeight, gridWidth, gridHeight, (const uint32_t*)proximityChars, + ProximityInfo *proximityInfo = new ProximityInfo( + localeStr, maxProximityCharsSize, displayWidth, + displayHeight, gridWidth, gridHeight, mostCommonkeyWidth, + (const int32_t*)proximityChars, keyCount, (const int32_t*)keyXCoordinates, (const int32_t*)keyYCoordinates, (const int32_t*)keyWidths, (const int32_t*)keyHeights, (const int32_t*)keyCharCodes, (const float*)sweetSpotCenterXs, (const float*)sweetSpotCenterYs, @@ -59,19 +64,20 @@ static jint latinime_Keyboard_setProximityInfo(JNIEnv *env, jobject object, safeReleaseIntArrayElements(env, keyYCoordinateArray, keyYCoordinates); safeReleaseIntArrayElements(env, keyXCoordinateArray, keyXCoordinates); env->ReleaseIntArrayElements(proximityCharsArray, proximityChars, 0); - return (jint)proximityInfo; + env->ReleaseStringUTFChars(localejStr, localeStrPtr); + return (jlong)proximityInfo; } -static void latinime_Keyboard_release(JNIEnv *env, jobject object, jint proximityInfo) { +static void latinime_Keyboard_release(JNIEnv *env, jobject object, jlong proximityInfo) { ProximityInfo *pi = (ProximityInfo*)proximityInfo; if (!pi) return; delete pi; } static JNINativeMethod sKeyboardMethods[] = { - {"setProximityInfoNative", "(IIIII[II[I[I[I[I[I[F[F[F)I", + {"setProximityInfoNative", "(Ljava/lang/String;IIIIII[II[I[I[I[I[I[F[F[F)J", (void*)latinime_Keyboard_setProximityInfo}, - {"releaseProximityInfoNative", "(I)V", (void*)latinime_Keyboard_release} + {"releaseProximityInfoNative", "(J)V", (void*)latinime_Keyboard_release} }; int register_ProximityInfo(JNIEnv *env) { diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 18c972444..20e44c2b0 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -18,6 +18,7 @@ #define LOG_TAG "LatinIME: jni: BinaryDictionary" #include "binary_format.h" +#include "correction.h" #include "com_android_inputmethod_latin_BinaryDictionary.h" #include "dictionary.h" #include "jni.h" @@ -33,6 +34,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> +#include <unistd.h> #else // USE_MMAP_FOR_DICTIONARY #include <stdlib.h> #endif // USE_MMAP_FOR_DICTIONARY @@ -41,75 +43,74 @@ namespace latinime { void releaseDictBuf(void* dictBuf, const size_t length, int fd); -static jint latinime_BinaryDictionary_open(JNIEnv *env, jobject object, +static jlong latinime_BinaryDictionary_open(JNIEnv *env, jobject object, jstring sourceDir, jlong dictOffset, jlong dictSize, - jint typedLetterMultiplier, jint fullWordMultiplier, jint maxWordLength, jint maxWords, - jint maxAlternatives) { + jint typedLetterMultiplier, jint fullWordMultiplier, jint maxWordLength, jint maxWords) { PROF_OPEN; PROF_START(66); - const char *sourceDirChars = env->GetStringUTFChars(sourceDir, NULL); - if (sourceDirChars == NULL) { - LOGE("DICT: Can't get sourceDir string"); + const char *sourceDirChars = env->GetStringUTFChars(sourceDir, 0); + if (sourceDirChars == 0) { + AKLOGE("DICT: Can't get sourceDir string"); return 0; } int fd = 0; - void *dictBuf = NULL; + void *dictBuf = 0; int adjust = 0; #ifdef USE_MMAP_FOR_DICTIONARY /* mmap version */ fd = open(sourceDirChars, O_RDONLY); if (fd < 0) { - LOGE("DICT: Can't open sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno); + AKLOGE("DICT: Can't open sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno); return 0; } int pagesize = getpagesize(); adjust = dictOffset % pagesize; int adjDictOffset = dictOffset - adjust; int adjDictSize = dictSize + adjust; - dictBuf = mmap(NULL, sizeof(char) * adjDictSize, PROT_READ, MAP_PRIVATE, fd, adjDictOffset); + dictBuf = mmap(0, sizeof(char) * adjDictSize, PROT_READ, MAP_PRIVATE, fd, adjDictOffset); if (dictBuf == MAP_FAILED) { - LOGE("DICT: Can't mmap dictionary. errno=%d", errno); + AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno); return 0; } dictBuf = (void *)((char *)dictBuf + adjust); #else // USE_MMAP_FOR_DICTIONARY /* malloc version */ - FILE *file = NULL; + FILE *file = 0; file = fopen(sourceDirChars, "rb"); - if (file == NULL) { - LOGE("DICT: Can't fopen sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno); + if (file == 0) { + AKLOGE("DICT: Can't fopen sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno); return 0; } dictBuf = malloc(sizeof(char) * dictSize); if (!dictBuf) { - LOGE("DICT: Can't allocate memory region for dictionary. errno=%d", errno); + AKLOGE("DICT: Can't allocate memory region for dictionary. errno=%d", errno); return 0; } int ret = fseek(file, (long)dictOffset, SEEK_SET); if (ret != 0) { - LOGE("DICT: Failure in fseek. ret=%d errno=%d", ret, errno); + AKLOGE("DICT: Failure in fseek. ret=%d errno=%d", ret, errno); return 0; } ret = fread(dictBuf, sizeof(char) * dictSize, 1, file); if (ret != 1) { - LOGE("DICT: Failure in fread. ret=%d errno=%d", ret, errno); + AKLOGE("DICT: Failure in fread. ret=%d errno=%d", ret, errno); return 0; } ret = fclose(file); if (ret != 0) { - LOGE("DICT: Failure in fclose. ret=%d errno=%d", ret, errno); + AKLOGE("DICT: Failure in fclose. ret=%d errno=%d", ret, errno); return 0; } #endif // USE_MMAP_FOR_DICTIONARY env->ReleaseStringUTFChars(sourceDir, sourceDirChars); if (!dictBuf) { - LOGE("DICT: dictBuf is null"); + AKLOGE("DICT: dictBuf is null"); return 0; } - Dictionary *dictionary = NULL; + Dictionary *dictionary = 0; if (BinaryFormat::UNKNOWN_FORMAT == BinaryFormat::detectFormat((uint8_t*)dictBuf)) { - LOGE("DICT: dictionary format is unknown, bad magic number"); + AKLOGE("DICT: dictionary format is unknown, bad magic number"); #ifdef USE_MMAP_FOR_DICTIONARY releaseDictBuf(((char*)dictBuf) - adjust, adjDictSize, fd); #else // USE_MMAP_FOR_DICTIONARY @@ -117,27 +118,27 @@ static jint latinime_BinaryDictionary_open(JNIEnv *env, jobject object, #endif // USE_MMAP_FOR_DICTIONARY } else { dictionary = new Dictionary(dictBuf, dictSize, fd, adjust, typedLetterMultiplier, - fullWordMultiplier, maxWordLength, maxWords, maxAlternatives); + fullWordMultiplier, maxWordLength, maxWords); } PROF_END(66); PROF_CLOSE; - return (jint)dictionary; + return (jlong)dictionary; } -static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jobject object, jint dict, - jint proximityInfo, jintArray xCoordinatesArray, jintArray yCoordinatesArray, +static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jobject object, jlong dict, + jlong proximityInfo, jintArray xCoordinatesArray, jintArray yCoordinatesArray, jintArray inputArray, jint arraySize, jint flags, jcharArray outputArray, jintArray frequencyArray) { Dictionary *dictionary = (Dictionary*)dict; if (!dictionary) return 0; ProximityInfo *pInfo = (ProximityInfo*)proximityInfo; - int *xCoordinates = env->GetIntArrayElements(xCoordinatesArray, NULL); - int *yCoordinates = env->GetIntArrayElements(yCoordinatesArray, NULL); + int *xCoordinates = env->GetIntArrayElements(xCoordinatesArray, 0); + int *yCoordinates = env->GetIntArrayElements(yCoordinatesArray, 0); - int *frequencies = env->GetIntArrayElements(frequencyArray, NULL); - int *inputCodes = env->GetIntArrayElements(inputArray, NULL); - jchar *outputChars = env->GetCharArrayElements(outputArray, NULL); + int *frequencies = env->GetIntArrayElements(frequencyArray, 0); + int *inputCodes = env->GetIntArrayElements(inputArray, 0); + jchar *outputChars = env->GetCharArrayElements(outputArray, 0); int count = dictionary->getSuggestions(pInfo, xCoordinates, yCoordinates, inputCodes, arraySize, flags, (unsigned short*) outputChars, frequencies); @@ -151,21 +152,19 @@ static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jobject object, return count; } -static int latinime_BinaryDictionary_getBigrams(JNIEnv *env, jobject object, jint dict, +static int latinime_BinaryDictionary_getBigrams(JNIEnv *env, jobject object, jlong dict, jcharArray prevWordArray, jint prevWordLength, jintArray inputArray, jint inputArraySize, - jcharArray outputArray, jintArray frequencyArray, jint maxWordLength, jint maxBigrams, - jint maxAlternatives) { + jcharArray outputArray, jintArray frequencyArray, jint maxWordLength, jint maxBigrams) { Dictionary *dictionary = (Dictionary*)dict; if (!dictionary) return 0; - jchar *prevWord = env->GetCharArrayElements(prevWordArray, NULL); - int *inputCodes = env->GetIntArrayElements(inputArray, NULL); - jchar *outputChars = env->GetCharArrayElements(outputArray, NULL); - int *frequencies = env->GetIntArrayElements(frequencyArray, NULL); + jchar *prevWord = env->GetCharArrayElements(prevWordArray, 0); + int *inputCodes = env->GetIntArrayElements(inputArray, 0); + jchar *outputChars = env->GetCharArrayElements(outputArray, 0); + int *frequencies = env->GetIntArrayElements(frequencyArray, 0); int count = dictionary->getBigrams((unsigned short*) prevWord, prevWordLength, inputCodes, - inputArraySize, (unsigned short*) outputChars, frequencies, maxWordLength, maxBigrams, - maxAlternatives); + inputArraySize, (unsigned short*) outputChars, frequencies, maxWordLength, maxBigrams); env->ReleaseCharArrayElements(prevWordArray, prevWord, JNI_ABORT); env->ReleaseIntArrayElements(inputArray, inputCodes, JNI_ABORT); @@ -175,19 +174,42 @@ static int latinime_BinaryDictionary_getBigrams(JNIEnv *env, jobject object, jin return count; } -static jboolean latinime_BinaryDictionary_isValidWord(JNIEnv *env, jobject object, jint dict, +static jboolean latinime_BinaryDictionary_isValidWord(JNIEnv *env, jobject object, jlong dict, jcharArray wordArray, jint wordLength) { Dictionary *dictionary = (Dictionary*)dict; if (!dictionary) return (jboolean) false; - jchar *word = env->GetCharArrayElements(wordArray, NULL); + jchar *word = env->GetCharArrayElements(wordArray, 0); jboolean result = dictionary->isValidWord((unsigned short*) word, wordLength); env->ReleaseCharArrayElements(wordArray, word, JNI_ABORT); return result; } -static void latinime_BinaryDictionary_close(JNIEnv *env, jobject object, jint dict) { +static jdouble latinime_BinaryDictionary_calcNormalizedScore(JNIEnv *env, jobject object, + jcharArray before, jint beforeLength, jcharArray after, jint afterLength, jint score) { + jchar *beforeChars = env->GetCharArrayElements(before, 0); + jchar *afterChars = env->GetCharArrayElements(after, 0); + jdouble result = Correction::RankingAlgorithm::calcNormalizedScore( + (unsigned short*)beforeChars, beforeLength, (unsigned short*)afterChars, afterLength, + score); + env->ReleaseCharArrayElements(before, beforeChars, JNI_ABORT); + env->ReleaseCharArrayElements(after, afterChars, JNI_ABORT); + return result; +} + +static jint latinime_BinaryDictionary_editDistance(JNIEnv *env, jobject object, + jcharArray before, jint beforeLength, jcharArray after, jint afterLength) { + jchar *beforeChars = env->GetCharArrayElements(before, 0); + jchar *afterChars = env->GetCharArrayElements(after, 0); + jint result = Correction::RankingAlgorithm::editDistance( + (unsigned short*)beforeChars, beforeLength, (unsigned short*)afterChars, afterLength); + env->ReleaseCharArrayElements(before, beforeChars, JNI_ABORT); + env->ReleaseCharArrayElements(after, afterChars, JNI_ABORT); + return result; +} + +static void latinime_BinaryDictionary_close(JNIEnv *env, jobject object, jlong dict) { Dictionary *dictionary = (Dictionary*)dict; if (!dictionary) return; void *dictBuf = dictionary->getDict(); @@ -205,11 +227,11 @@ void releaseDictBuf(void* dictBuf, const size_t length, int fd) { #ifdef USE_MMAP_FOR_DICTIONARY int ret = munmap(dictBuf, length); if (ret != 0) { - LOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); + AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); } ret = close(fd); if (ret != 0) { - LOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno); + AKLOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno); } #else // USE_MMAP_FOR_DICTIONARY free(dictBuf); @@ -217,11 +239,14 @@ void releaseDictBuf(void* dictBuf, const size_t length, int fd) { } static JNINativeMethod sMethods[] = { - {"openNative", "(Ljava/lang/String;JJIIIII)I", (void*)latinime_BinaryDictionary_open}, - {"closeNative", "(I)V", (void*)latinime_BinaryDictionary_close}, - {"getSuggestionsNative", "(II[I[I[III[C[I)I", (void*)latinime_BinaryDictionary_getSuggestions}, - {"isValidWordNative", "(I[CI)Z", (void*)latinime_BinaryDictionary_isValidWord}, - {"getBigramsNative", "(I[CI[II[C[IIII)I", (void*)latinime_BinaryDictionary_getBigrams} + {"openNative", "(Ljava/lang/String;JJIIII)J", (void*)latinime_BinaryDictionary_open}, + {"closeNative", "(J)V", (void*)latinime_BinaryDictionary_close}, + {"getSuggestionsNative", "(JJ[I[I[III[C[I)I", (void*)latinime_BinaryDictionary_getSuggestions}, + {"isValidWordNative", "(J[CI)Z", (void*)latinime_BinaryDictionary_isValidWord}, + {"getBigramsNative", "(J[CI[II[C[III)I", (void*)latinime_BinaryDictionary_getBigrams}, + {"calcNormalizedScoreNative", "([CI[CII)D", + (void*)latinime_BinaryDictionary_calcNormalizedScore}, + {"editDistanceNative", "([CI[CI)I", (void*)latinime_BinaryDictionary_editDistance} }; int register_BinaryDictionary(JNIEnv *env) { diff --git a/native/jni/jni_common.cpp b/native/jni/jni_common.cpp index 8643f723f..85d26836a 100644 --- a/native/jni/jni_common.cpp +++ b/native/jni/jni_common.cpp @@ -32,22 +32,22 @@ using namespace latinime; * Returns the JNI version on success, -1 on failure. */ jint JNI_OnLoad(JavaVM* vm, void* reserved) { - JNIEnv* env = NULL; + JNIEnv* env = 0; jint result = -1; if (vm->GetEnv((void**) &env, JNI_VERSION_1_4) != JNI_OK) { - LOGE("ERROR: GetEnv failed"); + AKLOGE("ERROR: GetEnv failed"); goto bail; } - assert(env != NULL); + assert(env != 0); if (!register_BinaryDictionary(env)) { - LOGE("ERROR: BinaryDictionary native registration failed"); + AKLOGE("ERROR: BinaryDictionary native registration failed"); goto bail; } if (!register_ProximityInfo(env)) { - LOGE("ERROR: ProximityInfo native registration failed"); + AKLOGE("ERROR: ProximityInfo native registration failed"); goto bail; } @@ -63,12 +63,12 @@ namespace latinime { int registerNativeMethods(JNIEnv* env, const char* className, JNINativeMethod* methods, int numMethods) { jclass clazz = env->FindClass(className); - if (clazz == NULL) { - LOGE("Native registration unable to find class '%s'", className); + if (clazz == 0) { + AKLOGE("Native registration unable to find class '%s'", className); return JNI_FALSE; } if (env->RegisterNatives(clazz, methods, numMethods) < 0) { - LOGE("RegisterNatives failed for '%s'", className); + AKLOGE("RegisterNatives failed for '%s'", className); env->DeleteLocalRef(clazz); return JNI_FALSE; } diff --git a/native/jni/jni_common.h b/native/jni/jni_common.h index 9548e1b3f..6741443ac 100644 --- a/native/jni/jni_common.h +++ b/native/jni/jni_common.h @@ -29,17 +29,17 @@ int registerNativeMethods(JNIEnv *env, const char *className, JNINativeMethod *m inline jint *safeGetIntArrayElements(JNIEnv *env, jintArray jArray) { if (jArray) { - return env->GetIntArrayElements(jArray, NULL); + return env->GetIntArrayElements(jArray, 0); } else { - return NULL; + return 0; } } inline jfloat *safeGetFloatArrayElements(JNIEnv *env, jfloatArray jArray) { if (jArray) { - return env->GetFloatArrayElements(jArray, NULL); + return env->GetFloatArrayElements(jArray, 0); } else { - return NULL; + return 0; } } diff --git a/native/jni/src/additional_proximity_chars.cpp b/native/jni/src/additional_proximity_chars.cpp new file mode 100644 index 000000000..224f020f2 --- /dev/null +++ b/native/jni/src/additional_proximity_chars.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "additional_proximity_chars.h" + +namespace latinime { +const std::string AdditionalProximityChars::LOCALE_EN_US("en"); + +const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_A[EN_US_ADDITIONAL_A_SIZE] = { + 'e', 'i', 'o', 'u' +}; + +const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_E[EN_US_ADDITIONAL_E_SIZE] = { + 'a', 'i', 'o', 'u' +}; + +const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_I[EN_US_ADDITIONAL_I_SIZE] = { + 'a', 'e', 'o', 'u' +}; + +const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_O[EN_US_ADDITIONAL_O_SIZE] = { + 'a', 'e', 'i', 'u' +}; + +const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_U[EN_US_ADDITIONAL_U_SIZE] = { + 'a', 'e', 'i', 'o' +}; +} diff --git a/native/jni/src/additional_proximity_chars.h b/native/jni/src/additional_proximity_chars.h new file mode 100644 index 000000000..e0ecc0e1d --- /dev/null +++ b/native/jni/src/additional_proximity_chars.h @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_ADDITIONAL_PROXIMITY_CHARS_H +#define LATINIME_ADDITIONAL_PROXIMITY_CHARS_H + +#include <stdint.h> +#include <string> + +#include "defines.h" + +namespace latinime { + +class AdditionalProximityChars { + private: + static const std::string LOCALE_EN_US; + static const int EN_US_ADDITIONAL_A_SIZE = 4; + static const int32_t EN_US_ADDITIONAL_A[]; + static const int EN_US_ADDITIONAL_E_SIZE = 4; + static const int32_t EN_US_ADDITIONAL_E[]; + static const int EN_US_ADDITIONAL_I_SIZE = 4; + static const int32_t EN_US_ADDITIONAL_I[]; + static const int EN_US_ADDITIONAL_O_SIZE = 4; + static const int32_t EN_US_ADDITIONAL_O[]; + static const int EN_US_ADDITIONAL_U_SIZE = 4; + static const int32_t EN_US_ADDITIONAL_U[]; + + static bool isEnLocale(const std::string *locale_str) { + return locale_str && locale_str->size() >= LOCALE_EN_US.size() + && LOCALE_EN_US.compare(0, LOCALE_EN_US.size(), *locale_str); + } + + public: + static int getAdditionalCharsSize(const std::string* locale_str, const int32_t c) { + if (!isEnLocale(locale_str)) { + return 0; + } + switch(c) { + case 'a': + return EN_US_ADDITIONAL_A_SIZE; + case 'e': + return EN_US_ADDITIONAL_E_SIZE; + case 'i': + return EN_US_ADDITIONAL_I_SIZE; + case 'o': + return EN_US_ADDITIONAL_O_SIZE; + case 'u': + return EN_US_ADDITIONAL_U_SIZE; + default: + return 0; + } + } + + static const int32_t* getAdditionalChars(const std::string *locale_str, const int32_t c) { + if (!isEnLocale(locale_str)) { + return 0; + } + switch(c) { + case 'a': + return EN_US_ADDITIONAL_A; + case 'e': + return EN_US_ADDITIONAL_E; + case 'i': + return EN_US_ADDITIONAL_I; + case 'o': + return EN_US_ADDITIONAL_O; + case 'u': + return EN_US_ADDITIONAL_U; + default: + return 0; + } + } + + static bool hasAdditionalChars(const std::string *locale_str, const int32_t c) { + return getAdditionalCharsSize(locale_str, c) > 0; + } +}; + +} + +#endif // LATINIME_ADDITIONAL_PROXIMITY_CHARS_H diff --git a/native/src/basechars.h b/native/jni/src/basechars.cpp index 3843e11c5..31f1e18a8 100644 --- a/native/src/basechars.h +++ b/native/jni/src/basechars.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 The Android Open Source Project + * Copyright (C) 2011 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,9 @@ * limitations under the License. */ -#ifndef LATINIME_BASECHARS_H -#define LATINIME_BASECHARS_H +#include "char_utils.h" + +namespace latinime { /** * Table mapping most combined Latin, Greek, and Cyrillic characters @@ -23,7 +24,7 @@ * if c is not a combined character, or the base character if it * is combined. */ -static unsigned short BASE_CHARS[] = { +const unsigned short BASE_CHARS[BASE_CHARS_SIZE] = { 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, @@ -189,4 +190,5 @@ static unsigned short BASE_CHARS[] = { // generated with: // cat UnicodeData.txt | perl -e 'while (<>) { @foo = split(/;/); $foo[5] =~ s/<.*> //; $base[hex($foo[0])] = hex($foo[5]);} for ($i = 0; $i < 0x500; $i += 8) { for ($j = $i; $j < $i + 8; $j++) { printf("0x%04x, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }' -#endif // LATINIME_BASECHARS_H + +} // namespace latinime diff --git a/native/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index c340c6c1a..f7a3d3e60 100644 --- a/native/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -26,14 +26,14 @@ namespace latinime { BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, - int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram, + const bool isLatestDictVersion, const bool hasBigram, Dictionary *parentDictionary) - : DICT(dict + NEW_DICTIONARY_HEADER_SIZE), MAX_WORD_LENGTH(maxWordLength), - MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion), + : DICT(dict), MAX_WORD_LENGTH(maxWordLength), + IS_LATEST_DICT_VERSION(isLatestDictVersion), HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) { if (DEBUG_DICT) { - LOGI("BigramDictionary - constructor"); - LOGI("Has Bigram : %d", hasBigram); + AKLOGI("BigramDictionary - constructor"); + AKLOGI("Has Bigram : %d", hasBigram); } } @@ -46,7 +46,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ #ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = word[i]; - LOGI("Bigram: Found word = %s, freq = %d :", s, frequency); + AKLOGI("Bigram: Found word = %s, freq = %d :", s, frequency); #endif } @@ -60,7 +60,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ insertAt++; } if (DEBUG_DICT) { - LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); + AKLOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); } if (insertAt < mMaxBigrams) { memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]), @@ -76,7 +76,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ } *dest = 0; // NULL terminate if (DEBUG_DICT) { - LOGI("Bigram: Added word at %d", insertAt); + AKLOGI("Bigram: Added word at %d", insertAt); } return true; } @@ -92,7 +92,6 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ * bigramFreq: an array to output frequencies. * maxWordLength: the maximum size of a word. * maxBigrams: the maximum number of bigrams fitting in the bigramChars array. - * maxAlteratives: unused. * This method returns the number of bigrams this word has, for backward compatibility. * Note: this is not the number of bigrams output in the array, which is the number of * bigrams this word has WHOSE first letter also matches the letter the user typed. @@ -103,7 +102,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ */ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, int codesSize, unsigned short *bigramChars, int *bigramFreq, int maxWordLength, - int maxBigrams, int maxAlternatives) { + int maxBigrams) { // TODO: remove unused arguments, and refrain from storing stuff in members of this class // TODO: have "in" arguments before "out" ones, and make out args explicit in the name mBigramFreq = bigramFreq; @@ -134,11 +133,13 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, bigramBuffer); - if (checkFirstCharacter(bigramBuffer)) { + // codesSize == 0 means we are trying to find bigram predictions. + if (codesSize < 1 || checkFirstCharacter(bigramBuffer)) { const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; - addWordBigram(bigramBuffer, length, frequency); + if (addWordBigram(bigramBuffer, length, frequency)) { + ++bigramCount; + } } - ++bigramCount; } while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); return bigramCount; } @@ -149,8 +150,9 @@ bool BigramDictionary::checkFirstCharacter(unsigned short *word) { int *inputCodes = mInputCodes; int maxAlt = MAX_ALTERNATIVES; + const unsigned short firstBaseChar = toBaseLowerCase(*word); while (maxAlt > 0) { - if ((unsigned int) *inputCodes == (unsigned int) *word) { + if (toBaseLowerCase(*inputCodes) == firstBaseChar) { return true; } inputCodes++; diff --git a/native/src/bigram_dictionary.h b/native/jni/src/bigram_dictionary.h index c07458a38..8132fbc59 100644 --- a/native/src/bigram_dictionary.h +++ b/native/jni/src/bigram_dictionary.h @@ -21,14 +21,13 @@ namespace latinime { class Dictionary; class BigramDictionary { -public: - BigramDictionary(const unsigned char *dict, int maxWordLength, int maxAlternatives, + public: + BigramDictionary(const unsigned char *dict, int maxWordLength, const bool isLatestDictVersion, const bool hasBigram, Dictionary *parentDictionary); int getBigrams(unsigned short *word, int length, int *codes, int codesSize, - unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams, - int maxAlternatives); + unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams); ~BigramDictionary(); -private: + private: bool addWordBigram(unsigned short *word, int length, int frequency); int getBigramAddress(int *pos, bool advance); int getBigramFreq(int *pos); @@ -39,7 +38,8 @@ private: const unsigned char *DICT; const int MAX_WORD_LENGTH; - const int MAX_ALTERNATIVES; + // TODO: Re-implement proximity correction for bigram correction + static const int MAX_ALTERNATIVES = 1; const bool IS_LATEST_DICT_VERSION; const bool HAS_BIGRAM; diff --git a/native/src/binary_format.h b/native/jni/src/binary_format.h index 6f65088db..ab033ad90 100644 --- a/native/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -17,22 +17,31 @@ #ifndef LATINIME_BINARY_FORMAT_H #define LATINIME_BINARY_FORMAT_H +#include <limits> #include "unigram_dictionary.h" namespace latinime { class BinaryFormat { -private: + private: const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; -public: + public: const static int UNKNOWN_FORMAT = -1; - const static int FORMAT_VERSION_1 = 1; - const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1; + // Originally, format version 1 had a 16-bit magic number, then the version number `01' + // then options that must be 0. Hence the first 32-bits of the format are always as follow + // and it's okay to consider them a magic number as a whole. + const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100; + const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5; + // The versions of Latin IME that only handle format version 1 only test for the magic + // number, so we had to change it so that version 2 files would be rejected by older + // implementations. On this occasion, we made the magic number 32 bits long. + const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; static int detectFormat(const uint8_t* const dict); + static unsigned int getHeaderSize(const uint8_t* const dict); static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); @@ -55,13 +64,43 @@ public: }; inline int BinaryFormat::detectFormat(const uint8_t* const dict) { - const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian - if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1; - return UNKNOWN_FORMAT; + // The magic number is stored big-endian. + const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3]; + switch (magicNumber) { + case FORMAT_VERSION_1_MAGIC_NUMBER: + // Format 1 header is exactly 5 bytes long and looks like: + // Magic number (2 bytes) 0x78 0xB1 + // Version number (1 byte) 0x01 + // Options (2 bytes) must be 0x00 0x00 + return 1; + case FORMAT_VERSION_2_MAGIC_NUMBER: + // Format 2 header is as follows: + // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE + // Version number (2 bytes) 0x00 0x02 + // Options (2 bytes) must be 0x00 0x00 + // Header size (4 bytes) : integer, big endian + return (dict[4] << 8) + dict[5]; + default: + return UNKNOWN_FORMAT; + } +} + +inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) { + switch (detectFormat(dict)) { + case 1: + return FORMAT_VERSION_1_HEADER_SIZE; + case 2: + // See the format of the header in the comment in detectFormat() above + return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11]; + default: + return std::numeric_limits<unsigned int>::max(); + } } inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { - return dict[(*pos)++]; + const int msb = dict[(*pos)++]; + if (msb < 0x80) return msb; + return ((msb & 0x7F) << 8) | dict[(*pos)++]; } inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) { @@ -145,15 +184,15 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos) { - // This function skips all attributes. The format makes provision for future extension - // with other attributes (notably shortcuts) but for the time being, bigrams are the - // only attributes that may be found in a character group, so we only look at bigrams - // in this version. + // This function skips all attributes: shortcuts and bigrams. + int newPos = pos; + if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) { + newPos = skipAttributes(dict, newPos); + } if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { - return skipAttributes(dict, pos); - } else { - return pos; + newPos = skipAttributes(dict, newPos); } + return newPos; } inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict, diff --git a/native/src/char_utils.cpp b/native/jni/src/char_utils.cpp index a31a0632c..a31a0632c 100644 --- a/native/src/char_utils.cpp +++ b/native/jni/src/char_utils.cpp diff --git a/native/jni/src/char_utils.h b/native/jni/src/char_utils.h new file mode 100644 index 000000000..607dc5195 --- /dev/null +++ b/native/jni/src/char_utils.h @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_CHAR_UTILS_H +#define LATINIME_CHAR_UTILS_H + +namespace latinime { + +inline static int isAsciiUpper(unsigned short c) { + return c >= 'A' && c <= 'Z'; +} + +inline static unsigned short toAsciiLower(unsigned short c) { + return c - 'A' + 'a'; +} + +inline static int isAscii(unsigned short c) { + return c <= 127; +} + +unsigned short latin_tolower(unsigned short c); + +/** + * Table mapping most combined Latin, Greek, and Cyrillic characters + * to their base characters. If c is in range, BASE_CHARS[c] == c + * if c is not a combined character, or the base character if it + * is combined. + */ + +static const int BASE_CHARS_SIZE = 0x0500; +extern const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; + +inline static unsigned short toBaseChar(unsigned short c) { + if (c < BASE_CHARS_SIZE) { + return BASE_CHARS[c]; + } + return c; +} + +inline static unsigned short toBaseLowerCase(unsigned short c) { + c = toBaseChar(c); + if (isAsciiUpper(c)) { + return toAsciiLower(c); + } else if (isAscii(c)) { + return c; + } + return latin_tolower(c); +} + +} // namespace latinime + +#endif // LATINIME_CHAR_UTILS_H diff --git a/native/src/correction.cpp b/native/jni/src/correction.cpp index 27dc40745..087219ed4 100644 --- a/native/src/correction.cpp +++ b/native/jni/src/correction.cpp @@ -16,12 +16,15 @@ #include <assert.h> #include <ctype.h> +#include <math.h> #include <stdio.h> #include <string.h> #define LOG_TAG "LatinIME: correction.cpp" +#include "char_utils.h" #include "correction.h" +#include "defines.h" #include "dictionary.h" #include "proximity_info.h" @@ -31,81 +34,60 @@ namespace latinime { // edit distance funcitons // ///////////////////////////// -#if 0 /* no longer used */ -inline static int editDistance( - int* editDistanceTable, const unsigned short* input, - const int inputLength, const unsigned short* output, const int outputLength) { - // dp[li][lo] dp[a][b] = dp[ a * lo + b] - int* dp = editDistanceTable; - const int li = inputLength + 1; - const int lo = outputLength + 1; - for (int i = 0; i < li; ++i) { - dp[lo * i] = i; - } - for (int i = 0; i < lo; ++i) { - dp[i] = i; - } - - for (int i = 0; i < li - 1; ++i) { - for (int j = 0; j < lo - 1; ++j) { - const uint32_t ci = Dictionary::toBaseLowerCase(input[i]); - const uint32_t co = Dictionary::toBaseLowerCase(output[j]); - const uint16_t cost = (ci == co) ? 0 : 1; - dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, - min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); - if (i > 0 && j > 0 && ci == Dictionary::toBaseLowerCase(output[j - 1]) - && co == Dictionary::toBaseLowerCase(input[i - 1])) { - dp[(i + 1) * lo + (j + 1)] = min( - dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); - } - } +inline static void initEditDistance(int *editDistanceTable) { + for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) { + editDistanceTable[i] = i; } +} - if (DEBUG_EDIT_DISTANCE) { - LOGI("IN = %d, OUT = %d", inputLength, outputLength); - for (int i = 0; i < li; ++i) { - for (int j = 0; j < lo; ++j) { - LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); +inline static void dumpEditDistance10ForDebug(int *editDistanceTable, + const int editDistanceTableWidth, const int outputLength) { + if (DEBUG_DICT) { + AKLOGI("EditDistanceTable"); + for (int i = 0; i <= 10; ++i) { + int c[11]; + for (int j = 0; j <= 10; ++j) { + if (j < editDistanceTableWidth + 1 && i < outputLength + 1) { + c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j]; + } else { + c[j] = -1; + } } + AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]", + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]); } } - return dp[li * lo - 1]; -} -#endif - -inline static void initEditDistance(int *editDistanceTable) { - for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) { - editDistanceTable[i] = i; - } } inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigned short *input, const int inputLength, const unsigned short *output, const int outputLength) { + // TODO: Make sure that editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] is not touched. // Let dp[i][j] be editDistanceTable[i * (inputLength + 1) + j]. // Assuming that dp[0][0] ... dp[outputLength - 1][inputLength] are already calculated, // and calculate dp[ouputLength][0] ... dp[outputLength][inputLength]. int *const current = editDistanceTable + outputLength * (inputLength + 1); const int *const prev = editDistanceTable + (outputLength - 1) * (inputLength + 1); const int *const prevprev = - outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : NULL; + outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0; current[0] = outputLength; - const uint32_t co = Dictionary::toBaseLowerCase(output[outputLength - 1]); - const uint32_t prevCO = - outputLength >= 2 ? Dictionary::toBaseLowerCase(output[outputLength - 2]) : 0; + const uint32_t co = toBaseLowerCase(output[outputLength - 1]); + const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0; for (int i = 1; i <= inputLength; ++i) { - const uint32_t ci = Dictionary::toBaseLowerCase(input[i - 1]); + const uint32_t ci = toBaseLowerCase(input[i - 1]); const uint16_t cost = (ci == co) ? 0 : 1; current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost)); - if (i >= 2 && prevprev && ci == prevCO - && co == Dictionary::toBaseLowerCase(input[i - 2])) { + if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) { current[i] = min(current[i], prevprev[i - 2] + 1); } } } -inline static int getCurrentEditDistance( - int *editDistanceTable, const int inputLength, const int outputLength) { - return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1]; +inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth, + const int outputLength, const int inputLength) { + if (DEBUG_EDIT_DISTANCE) { + AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); + } + return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputLength]; } ////////////////////// @@ -133,6 +115,9 @@ void Correction::initCorrection(const ProximityInfo *pi, const int inputLength, mInputLength = inputLength; mMaxDepth = maxDepth; mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; + // TODO: This is not supposed to be required. Check what's going wrong with + // editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] + initEditDistance(mEditDistanceTable); } void Correction::initCorrectionState( @@ -146,7 +131,7 @@ void Correction::initCorrectionState( void Correction::setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, const int spaceProximityPos, const int missingSpacePos, - const bool useFullEditDistance) { + const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) { // TODO: remove mTransposedPos = transposedPos; mExcessivePos = excessivePos; @@ -159,6 +144,8 @@ void Correction::setCorrectionParams(const int skipPos, const int excessivePos, mSpaceProximityPos = spaceProximityPos; mMissingSpacePos = missingSpacePos; mUseFullEditDistance = useFullEditDistance; + mDoAutoCompletion = doAutoCompletion; + mMaxErrors = maxErrors; } void Correction::checkState() { @@ -172,23 +159,34 @@ void Correction::checkState() { } } -int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq, - const unsigned short *word) { - return Correction::RankingAlgorithm::calcFreqForSplitTwoWords( - firstFreq, secondFreq, this, word); +int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray, + const int wordCount, const bool isSpaceProximity, const unsigned short *word) { + return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray, + wordCount, this, isSpaceProximity, word); } int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) { + return getFinalFreqInternal(freq, word, wordLength, mInputLength); +} + +int Correction::getFinalFreqForSubQueue(const int freq, unsigned short **word, int *wordLength, + const int inputLength) { + return getFinalFreqInternal(freq, word, wordLength, inputLength); +} + +int Correction::getFinalFreqInternal(const int freq, unsigned short **word, int *wordLength, + const int inputLength) { const int outputIndex = mTerminalOutputIndex; const int inputIndex = mTerminalInputIndex; *wordLength = outputIndex + 1; - if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) { - return -1; + if (outputIndex < MIN_SUGGEST_DEPTH) { + return NOT_A_FREQUENCY; } *word = mWord; - return Correction::RankingAlgorithm::calculateFinalFreq( - inputIndex, outputIndex, freq, mEditDistanceTable, this); + int finalFreq = Correction::RankingAlgorithm::calculateFinalFreq( + inputIndex, outputIndex, freq, mEditDistanceTable, this, inputLength); + return finalFreq; } bool Correction::initProcessState(const int outputIndex) { @@ -213,6 +211,7 @@ bool Correction::initProcessState(const int outputIndex) { mMatching = false; mProximityMatching = false; + mAdditionalProximityMatching = false; mTransposing = false; mExceeding = false; mSkipping = false; @@ -229,20 +228,10 @@ int Correction::goDownTree( } // TODO: remove -int Correction::getOutputIndex() { - return mOutputIndex; -} - -// TODO: remove int Correction::getInputIndex() { return mInputIndex; } -// TODO: remove -bool Correction::needsToTraverseAllNodes() { - return mNeedsToTraverseAllNodes; -} - void Correction::incrementInputIndex() { ++mInputIndex; } @@ -269,6 +258,7 @@ void Correction::incrementOutputIndex() { mCorrectionStates[mOutputIndex].mMatching = mMatching; mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching; + mCorrectionStates[mOutputIndex].mAdditionalProximityMatching = mAdditionalProximityMatching; mCorrectionStates[mOutputIndex].mTransposing = mTransposing; mCorrectionStates[mOutputIndex].mExceeding = mExceeding; mCorrectionStates[mOutputIndex].mSkipping = mSkipping; @@ -280,7 +270,9 @@ void Correction::startToTraverseAllNodes() { bool Correction::needsToPrune() const { // TODO: use edit distance here - return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance; + return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance + // Allow one char longer word for missing character + || (!mDoAutoCompletion && (mOutputIndex > mInputLength)); } void Correction::addCharToCurrentWord(const int32_t c) { @@ -290,13 +282,12 @@ void Correction::addCharToCurrentWord(const int32_t c) { mWord, mOutputIndex + 1); } -// TODO: inline? Correction::CorrectionType Correction::processSkipChar( const int32_t c, const bool isTerminal, const bool inputIndexIncremented) { addCharToCurrentWord(c); - if (needsToTraverseAllNodes() && isTerminal) { - mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); - mTerminalOutputIndex = mOutputIndex; + mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); + mTerminalOutputIndex = mOutputIndex; + if (mNeedsToTraverseAllNodes && isTerminal) { incrementOutputIndex(); return TRAVERSE_ALL_ON_TERMINAL; } else { @@ -305,19 +296,36 @@ Correction::CorrectionType Correction::processSkipChar( } } +Correction::CorrectionType Correction::processUnrelatedCorrectionType() { + // Needs to set mTerminalInputIndex and mTerminalOutputIndex before returning any CorrectionType + mTerminalInputIndex = mInputIndex; + mTerminalOutputIndex = mOutputIndex; + return UNRELATED; +} + inline bool isEquivalentChar(ProximityInfo::ProximityType type) { return type == ProximityInfo::EQUIVALENT_CHAR; } +inline bool isProximityCharOrEquivalentChar(ProximityInfo::ProximityType type) { + return type == ProximityInfo::EQUIVALENT_CHAR + || type == ProximityInfo::NEAR_PROXIMITY_CHAR; +} + Correction::CorrectionType Correction::processCharAndCalcState( const int32_t c, const bool isTerminal) { const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); + if (correctionCount > mMaxErrors) { + return processUnrelatedCorrectionType(); + } + // TODO: Change the limit if we'll allow two or more corrections const bool noCorrectionsHappenedSoFar = correctionCount == 0; const bool canTryCorrection = noCorrectionsHappenedSoFar; int proximityIndex = 0; mDistances[mOutputIndex] = NOT_A_DISTANCE; + // Skip checking this node if (mNeedsToTraverseAllNodes || isQuote(c)) { bool incremented = false; if (mLastCharExceeded && mInputIndex == mInputLength - 1) { @@ -342,6 +350,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( return processSkipChar(c, isTerminal, incremented); } + // Check possible corrections. if (mExcessivePos >= 0) { if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) { mExcessivePos = mOutputIndex; @@ -382,30 +391,42 @@ Correction::CorrectionType Correction::processCharAndCalcState( incrementInputIndex(); } else { --mTransposedCount; - if (DEBUG_CORRECTION) { + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); - LOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } - return UNRELATED; + return processUnrelatedCorrectionType(); } } // TODO: Change the limit if we'll allow two or more proximity chars with corrections - const bool checkProximityChars = noCorrectionsHappenedSoFar || mProximityCount == 0; + // Work around: When the mMaxErrors is 1, we only allow just one error + // including proximity correction. + const bool checkProximityChars = (mMaxErrors > 1) + ? (noCorrectionsHappenedSoFar || mProximityCount == 0) + : (noCorrectionsHappenedSoFar && mProximityCount == 0); + ProximityInfo::ProximityType matchedProximityCharId = secondTransposing ? ProximityInfo::EQUIVALENT_CHAR : mProximityInfo->getMatchedProximityId( mInputIndex, c, checkProximityChars, &proximityIndex); - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { + if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId + || ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { if (canTryCorrection && mOutputIndex > 0 && mCorrectionStates[mOutputIndex].mProximityMatching && mCorrectionStates[mOutputIndex].mExceeding && isEquivalentChar(mProximityInfo->getMatchedProximityId( mInputIndex, mWord[mOutputIndex - 1], false))) { - if (DEBUG_CORRECTION) { - LOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); } // Conversion p->e // Example: @@ -423,7 +444,11 @@ Correction::CorrectionType Correction::processCharAndCalcState( } } - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { + if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId + || ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + mAdditionalProximityMatching = true; + } // TODO: Optimize // As the current char turned out to be an unrelated char, // we will try other correction-types. Please note that mCorrectionStates[mOutputIndex] @@ -465,6 +490,18 @@ Correction::CorrectionType Correction::processCharAndCalcState( ++mSkippedCount; --mProximityCount; return processSkipChar(c, isTerminal, false); + } else if (mInputIndex - 1 < mInputLength + && mSkippedCount > 0 + && mCorrectionStates[mOutputIndex].mSkipping + && mCorrectionStates[mOutputIndex].mAdditionalProximityMatching + && isProximityCharOrEquivalentChar( + mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { + // Conversion s->a + incrementInputIndex(); + --mSkippedCount; + mProximityMatching = true; + ++mProximityCount; + mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputLength && isEquivalentChar( mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { @@ -475,17 +512,52 @@ Correction::CorrectionType Correction::processCharAndCalcState( ++mExcessiveCount; incrementInputIndex(); } + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + DUMP_WORD(mWord, mOutputIndex); + if (mTransposing) { + AKLOGI("TRANSPOSE: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } else { + AKLOGI("EXCEED: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } + } } else if (mSkipping) { // 3. Skip correction ++mSkippedCount; + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + AKLOGI("SKIP: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } return processSkipChar(c, isTerminal, false); + } else if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + // As a last resort, use additional proximity characters + mProximityMatching = true; + ++mProximityCount; + mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + AKLOGI("ADDITIONALPROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } } else { - if (DEBUG_CORRECTION) { + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); - LOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } - return UNRELATED; + return processUnrelatedCorrectionType(); } } else if (secondTransposing) { // If inputIndex is greater than mInputLength, that means there is no @@ -500,6 +572,13 @@ Correction::CorrectionType Correction::processCharAndCalcState( ++mProximityCount; mDistances[mOutputIndex] = mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex); + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + AKLOGI("PROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } } addCharToCurrentWord(c); @@ -533,13 +612,17 @@ Correction::CorrectionType Correction::processCharAndCalcState( || isSameAsUserTypedLength) && isTerminal) { mTerminalInputIndex = mInputIndex - 1; mTerminalOutputIndex = mOutputIndex - 1; - if (DEBUG_CORRECTION) { + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); - LOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } return ON_TERMINAL; } else { + mTerminalInputIndex = mInputIndex - 1; + mTerminalOutputIndex = mOutputIndex - 1; return NOT_ON_TERMINAL; } } @@ -547,55 +630,6 @@ Correction::CorrectionType Correction::processCharAndCalcState( Correction::~Correction() { } -///////////////////////// -// static inline utils // -///////////////////////// - -static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; -static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { - return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); -} - -static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; -inline static void multiplyIntCapped(const int multiplier, int *base) { - const int temp = *base; - if (temp != S_INT_MAX) { - // Branch if multiplier == 2 for the optimization - if (multiplier == 2) { - *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; - } else { - // TODO: This overflow check gives a wrong answer when, for example, - // temp = 2^16 + 1 and multiplier = 2^17 + 1. - // Fix this behavior. - const int tempRetval = temp * multiplier; - *base = tempRetval >= temp ? tempRetval : S_INT_MAX; - } - } -} - -inline static int powerIntCapped(const int base, const int n) { - if (n <= 0) return 1; - if (base == 2) { - return n < 31 ? 1 << n : S_INT_MAX; - } else { - int ret = base; - for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); - return ret; - } -} - -inline static void multiplyRate(const int rate, int *freq) { - if (*freq != S_INT_MAX) { - if (*freq > 1000000) { - *freq /= 100; - multiplyIntCapped(rate, freq); - } else { - multiplyIntCapped(rate, freq); - *freq /= 100; - } - } -} - inline static int getQuoteCount(const unsigned short* word, const int length) { int quoteCount = 0; for (int i = 0; i < length; ++i) { @@ -607,13 +641,7 @@ inline static int getQuoteCount(const unsigned short* word, const int length) { } inline static bool isUpperCase(unsigned short c) { - if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { - c = BASE_CHARS[c]; - } - if (isupper(c)) { - return true; - } - return false; + return isAsciiUpper(toBaseChar(c)); } ////////////////////// @@ -622,9 +650,9 @@ inline static bool isUpperCase(unsigned short c) { /* static */ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex, - const int freq, int* editDistanceTable, const Correction* correction) { + const int freq, int* editDistanceTable, const Correction* correction, + const int inputLength) { const int excessivePos = correction->getExcessivePos(); - const int inputLength = correction->mInputLength; const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER; const ProximityInfo *proximityInfo = correction->mProximityInfo; @@ -649,45 +677,55 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const const unsigned short* word = correction->mWord; const bool skipped = skippedCount > 0; - const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1) + const int quoteDiffCount = max(0, getQuoteCount(word, outputLength) - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength)); // TODO: Calculate edit distance for transposed and excessive int ed = 0; + if (DEBUG_DICT_FULL) { + dumpEditDistance10ForDebug(editDistanceTable, correction->mInputLength, outputLength); + } int adjustedProximityMatchedCount = proximityMatchedCount; int finalFreq = freq; + if (DEBUG_CORRECTION_FREQ + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) { + AKLOGI("FinalFreq0: %d", finalFreq); + } // TODO: Optimize this. - // TODO: Ignoring edit distance for transposed char, for now - if (transposedCount == 0 && (proximityMatchedCount > 0 || skipped || excessiveCount > 0)) { - ed = getCurrentEditDistance(editDistanceTable, inputLength, outputIndex + 1); + if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) { + ed = getCurrentEditDistance(editDistanceTable, correction->mInputLength, outputLength, + inputLength) - transposedCount; + const int matchWeight = powerIntCapped(typedLetterMultiplier, - max(inputLength, outputIndex + 1) - ed); + max(inputLength, outputLength) - ed); multiplyIntCapped(matchWeight, &finalFreq); // TODO: Demote further if there are two or more excessive chars with longer user input? - if (inputLength > outputIndex + 1) { + if (inputLength > outputLength) { multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq); } ed = max(0, ed - quoteDiffCount); - - if (ed == 1 && (inputLength == outputIndex || inputLength == outputIndex + 2)) { - // Promote a word with just one skipped or excessive char - if (sameLength) { - multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq); - } else { + adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputLength)), + proximityMatchedCount); + if (transposedCount < 1) { + if (ed == 1 && (inputLength == outputLength - 1 || inputLength == outputLength + 1)) { + // Promote a word with just one skipped or excessive char + if (sameLength) { + multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE + + WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER * outputLength, + &finalFreq); + } else { + multiplyIntCapped(typedLetterMultiplier, &finalFreq); + } + } else if (ed == 0) { multiplyIntCapped(typedLetterMultiplier, &finalFreq); + sameLength = true; } - } else if (ed == 0) { - multiplyIntCapped(typedLetterMultiplier, &finalFreq); - sameLength = true; } - adjustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)), - proximityMatchedCount); } else { - // TODO: Calculate the edit distance for transposed char const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); multiplyIntCapped(matchWeight, &finalFreq); } @@ -707,7 +745,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const / (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); if (DEBUG_DICT_FULL) { - LOGI("Demotion rate for missing character is %d.", demotionRate); + AKLOGI("Demotion rate for missing character is %d.", demotionRate); } multiplyRate(demotionRate, &finalFreq); } @@ -720,8 +758,8 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const if (excessiveCount > 0) { multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); if (!lastCharExceeded && !proximityInfo->existsAdjacentProximityChars(excessivePos)) { - if (DEBUG_CORRECTION_FREQ) { - LOGI("Double excessive demotion"); + if (DEBUG_DICT_FULL) { + AKLOGI("Double excessive demotion"); } // If an excessive character is not adjacent to the left char or the right char, // we will demote this word. @@ -729,11 +767,14 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const } } + const bool performTouchPositionCorrection = + CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled() + && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0; // Score calibration by touch coordinates is being done only for pure-fat finger typing error // cases. + int additionalProximityCount = 0; // TODO: Remove this constraint. - if (CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled() - && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0) { + if (performTouchPositionCorrection) { for (int i = 0; i < outputLength; ++i) { const int squaredDistance = correction->mDistances[i]; if (i < adjustedProximityMatchedCount) { @@ -744,40 +785,60 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const static const float A = ZERO_DISTANCE_PROMOTION_RATE / 100.0f; static const float B = 1.0f; static const float C = 0.5f; + static const float MIN = 0.3f; static const float R1 = NEUTRAL_SCORE_SQUARED_RADIUS; static const float R2 = HALF_SCORE_SQUARED_RADIUS; const float x = (float)squaredDistance / ProximityInfo::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR; - const float factor = (x < R1) + const float factor = max((x < R1) ? (A * (R1 - x) + B * x) / R1 - : (B * (R2 - x) + C * (x - R1)) / (R2 - R1); + : (B * (R2 - x) + C * (x - R1)) / (R2 - R1), MIN); // factor is piecewise linear function like: // A -_ . // ^-_ . // B \ . - // \ . - // C \ . - // 0 R1 R2 - if (factor <= 0) { - return -1; - } + // \_ . + // C ------------. + // . + // 0 R1 R2 . multiplyRate((int)(factor * 100), &finalFreq); } else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) { multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); + } else if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) { + ++additionalProximityCount; + multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); } } } else { + // Demote additional proximity characters + for (int i = 0; i < outputLength; ++i) { + const int squaredDistance = correction->mDistances[i]; + if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) { + ++additionalProximityCount; + } + } // Promotion for a word with proximity characters for (int i = 0; i < adjustedProximityMatchedCount; ++i) { // A word with proximity corrections if (DEBUG_DICT_FULL) { - LOGI("Found a proximity correction."); + AKLOGI("Found a proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &finalFreq); - multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); + if (i < additionalProximityCount) { + multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); + } else { + multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); + } } } + // If the user types too many(three or more) proximity characters with additional proximity + // character,do not treat as the same length word. + if (sameLength && additionalProximityCount > 0 && (adjustedProximityMatchedCount >= 3 + || transposedCount > 0 || skipped || excessiveCount > 0)) { + sameLength = false; + } + const int errorCount = adjustedProximityMatchedCount > 0 ? adjustedProximityMatchedCount : (proximityMatchedCount + transposedCount); @@ -787,13 +848,15 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const // Promotion for an exactly matched word if (ed == 0) { // Full exact match - if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0) { + if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0 + && quoteDiffCount == 0 && additionalProximityCount == 0) { finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); } } // Promote a word with no correction - if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0) { + if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0 + && additionalProximityCount == 0) { multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); } @@ -832,71 +895,101 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const } if (DEBUG_DICT_FULL) { - LOGI("calc: %d, %d", outputIndex, sameLength); + AKLOGI("calc: %d, %d", outputLength, sameLength); } - if (DEBUG_CORRECTION_FREQ) { - DUMP_WORD(correction->mWord, outputIndex + 1); - LOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount, - skippedCount, transposedCount, excessiveCount, lastCharExceeded, sameLength, - quoteDiffCount, ed, finalFreq); + if (DEBUG_CORRECTION_FREQ + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) { + DUMP_WORD(proximityInfo->getPrimaryInputWord(), inputLength); + DUMP_WORD(correction->mWord, outputLength); + AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d, A%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount, + skippedCount, transposedCount, excessiveCount, additionalProximityCount, + outputLength, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq); } return finalFreq; } /* static */ -int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( - const int firstFreq, const int secondFreq, const Correction* correction, - const unsigned short *word) { - const int spaceProximityPos = correction->mSpaceProximityPos; - const int missingSpacePos = correction->mMissingSpacePos; - if (DEBUG_DICT) { - int inputCount = 0; - if (spaceProximityPos >= 0) ++inputCount; - if (missingSpacePos >= 0) ++inputCount; - assert(inputCount <= 1); - } - const bool isSpaceProximity = spaceProximityPos >= 0; - const int inputLength = correction->mInputLength; - const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; - const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) - : (inputLength - missingSpacePos); +int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords( + const int *freqArray, const int *wordLengthArray, const int wordCount, + const Correction* correction, const bool isSpaceProximity, const unsigned short *word) { const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; bool firstCapitalizedWordDemotion = false; - if (firstWordLength >= 2) { - firstCapitalizedWordDemotion = isUpperCase(word[0]); - } - bool secondCapitalizedWordDemotion = false; - if (secondWordLength >= 2) { - secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); + + { + // TODO: Handle multiple capitalized word demotion properly + const int firstWordLength = wordLengthArray[0]; + const int secondWordLength = wordLengthArray[1]; + if (firstWordLength >= 2) { + firstCapitalizedWordDemotion = isUpperCase(word[0]); + } + + if (secondWordLength >= 2) { + // FIXME: word[firstWordLength + 1] is incorrect. + secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); + } } + const bool capitalizedWordDemotion = firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; - if (DEBUG_DICT_FULL) { - LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion); + int totalLength = 0; + int totalFreq = 0; + for (int i = 0; i < wordCount; ++i){ + const int wordLength = wordLengthArray[i]; + if (wordLength <= 0) { + return 0; + } + totalLength += wordLength; + const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1); + int tempFirstFreq = freqArray[i]; + multiplyRate(demotionRate, &tempFirstFreq); + totalFreq += tempFirstFreq; } - if (firstWordLength == 0 || secondWordLength == 0) { + if (totalLength <= 0 || totalFreq <= 0) { return 0; } - const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); - int tempFirstFreq = firstFreq; - multiplyRate(firstDemotionRate, &tempFirstFreq); - - const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); - int tempSecondFreq = secondFreq; - multiplyRate(secondDemotionRate, &tempSecondFreq); - - const int totalLength = firstWordLength + secondWordLength; + // TODO: Currently totalFreq is adjusted to two word metrix. // Promote pairFreq with multiplying by 2, because the word length is the same as the typed // length. - int totalFreq = tempFirstFreq + tempSecondFreq; + totalFreq = totalFreq * 2 / wordCount; + if (wordCount > 2) { + // Safety net for 3+ words -- Caveats: many heuristics and workarounds here. + int oneLengthCounter = 0; + int twoLengthCounter = 0; + for (int i = 0; i < wordCount; ++i) { + const int wordLength = wordLengthArray[i]; + // TODO: Use bigram instead of this safety net + if (i < wordCount - 1) { + const int nextWordLength = wordLengthArray[i + 1]; + if (wordLength == 1 && nextWordLength == 2) { + // Safety net to filter 1 length and 2 length sequential words + return 0; + } + } + const int freq = freqArray[i]; + // Demote too short weak words + if (wordLength <= 4 && freq <= MAX_FREQ * 2 / 3 /* heuristic... */) { + multiplyRate(100 * freq / MAX_FREQ, &totalFreq); + } + if (wordLength == 1) { + ++oneLengthCounter; + } else if (wordLength == 2) { + ++twoLengthCounter; + } + if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) { + // Safety net to filter too many short words + return 0; + } + } + multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq); + } // This is a workaround to try offsetting the not-enough-demotion which will be done in // calcNormalizedScore in Utils.java. @@ -923,19 +1016,128 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( if (isSpaceProximity) { // A word pair with one space proximity correction if (DEBUG_DICT) { - LOGI("Found a word pair with space proximity correction."); + AKLOGI("Found a word pair with space proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &totalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); } - multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + if (isSpaceProximity) { + multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq); + } else { + multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + } if (capitalizedWordDemotion) { multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); } + if (DEBUG_CORRECTION_FREQ) { + AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1], + wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq); + DUMP_WORD(word, wordLengthArray[0]); + } + return totalFreq; } +/* Damerau-Levenshtein distance */ +inline static int editDistanceInternal( + int* editDistanceTable, const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength) { + // dp[li][lo] dp[a][b] = dp[ a * lo + b] + int* dp = editDistanceTable; + const int li = beforeLength + 1; + const int lo = afterLength + 1; + for (int i = 0; i < li; ++i) { + dp[lo * i] = i; + } + for (int i = 0; i < lo; ++i) { + dp[i] = i; + } + + for (int i = 0; i < li - 1; ++i) { + for (int j = 0; j < lo - 1; ++j) { + const uint32_t ci = toBaseLowerCase(before[i]); + const uint32_t co = toBaseLowerCase(after[j]); + const uint16_t cost = (ci == co) ? 0 : 1; + dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, + min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); + if (i > 0 && j > 0 && ci == toBaseLowerCase(after[j - 1]) + && co == toBaseLowerCase(before[i - 1])) { + dp[(i + 1) * lo + (j + 1)] = min( + dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); + } + } + } + + if (DEBUG_EDIT_DISTANCE) { + AKLOGI("IN = %d, OUT = %d", beforeLength, afterLength); + for (int i = 0; i < li; ++i) { + for (int j = 0; j < lo; ++j) { + AKLOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); + } + } + } + return dp[li * lo - 1]; +} + +int Correction::RankingAlgorithm::editDistance(const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength) { + int table[(beforeLength + 1) * (afterLength + 1)]; + return editDistanceInternal(table, before, beforeLength, after, afterLength); +} + + +// In dictionary.cpp, getSuggestion() method, +// suggestion scores are computed using the below formula. +// original score +// := pow(mTypedLetterMultiplier (this is defined 2), +// (the number of matched characters between typed word and suggested word)) +// * (individual word's score which defined in the unigram dictionary, +// and this score is defined in range [0, 255].) +// Then, the following processing is applied. +// - If the dictionary word is matched up to the point of the user entry +// (full match up to min(before.length(), after.length()) +// => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2) +// - If the word is a true full match except for differences in accents or +// capitalization, then treat it as if the score was 255. +// - If before.length() == after.length() +// => multiply by mFullWordMultiplier (this is defined 2)) +// So, maximum original score is pow(2, min(before.length(), after.length())) * 255 * 2 * 1.2 +// For historical reasons we ignore the 1.2 modifier (because the measure for a good +// autocorrection threshold was done at a time when it didn't exist). This doesn't change +// the result. +// So, we can normalize original score by dividing pow(2, min(b.l(),a.l())) * 255 * 2. + +/* static */ +double Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength, + const int score) { + if (0 == beforeLength || 0 == afterLength) { + return 0; + } + const int distance = editDistance(before, beforeLength, after, afterLength); + int spaceCount = 0; + for (int i = 0; i < afterLength; ++i) { + if (after[i] == CODE_SPACE) { + ++spaceCount; + } + } + + if (spaceCount == afterLength) { + return 0; + } + + const double maxScore = score >= S_INT_MAX ? S_INT_MAX : MAX_INITIAL_SCORE + * pow((double)TYPED_LETTER_MULTIPLIER, + (double)min(beforeLength, afterLength - spaceCount)) * FULL_WORD_MULTIPLIER; + + // add a weight based on edit distance. + // distance <= max(afterLength, beforeLength) == afterLength, + // so, 0 <= distance / afterLength <= 1 + const double weight = 1.0 - (double) distance / afterLength; + return (score / maxScore) * weight; +} + } // namespace latinime diff --git a/native/src/correction.h b/native/jni/src/correction.h index d4e99f0ce..ee55c9604 100644 --- a/native/src/correction.h +++ b/native/jni/src/correction.h @@ -17,6 +17,7 @@ #ifndef LATINIME_CORRECTION_H #define LATINIME_CORRECTION_H +#include <assert.h> #include <stdint.h> #include "correction_state.h" @@ -27,8 +28,7 @@ namespace latinime { class ProximityInfo; class Correction { - -public: + public: typedef enum { TRAVERSE_ALL_ON_TERMINAL, TRAVERSE_ALL_NOT_ON_TERMINAL, @@ -37,6 +37,62 @@ public: NOT_ON_TERMINAL } CorrectionType; + ///////////////////////// + // static inline utils // + ///////////////////////// + + static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; + static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { + return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); + } + + static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; + inline static void multiplyIntCapped(const int multiplier, int *base) { + const int temp = *base; + if (temp != S_INT_MAX) { + // Branch if multiplier == 2 for the optimization + if (multiplier < 0) { + if (DEBUG_DICT) { + assert(false); + } + AKLOGI("--- Invalid multiplier: %d", multiplier); + } else if (multiplier == 0) { + *base = 0; + } else if (multiplier == 2) { + *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; + } else { + // TODO: This overflow check gives a wrong answer when, for example, + // temp = 2^16 + 1 and multiplier = 2^17 + 1. + // Fix this behavior. + const int tempRetval = temp * multiplier; + *base = tempRetval >= temp ? tempRetval : S_INT_MAX; + } + } + } + + inline static int powerIntCapped(const int base, const int n) { + if (n <= 0) return 1; + if (base == 2) { + return n < 31 ? 1 << n : S_INT_MAX; + } else { + int ret = base; + for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); + return ret; + } + } + + inline static void multiplyRate(const int rate, int *freq) { + if (*freq != S_INT_MAX) { + if (*freq > 1000000) { + *freq /= 100; + multiplyIntCapped(rate, freq); + } else { + multiplyIntCapped(rate, freq); + *freq /= 100; + } + } + } + Correction(const int typedLetterMultiplier, const int fullWordMultiplier); void initCorrection( const ProximityInfo *pi, const int inputLength, const int maxWordLength); @@ -44,11 +100,11 @@ public: // TODO: remove void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, - const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance); + const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance, + const bool doAutoCompletion, const int maxErrors); void checkState(); bool initProcessState(const int index); - int getOutputIndex(); int getInputIndex(); virtual ~Correction(); @@ -73,9 +129,12 @@ public: bool needsToPrune() const; - int getFreqForSplitTwoWords( - const int firstFreq, const int secondFreq, const unsigned short *word); + int getFreqForSplitMultipleWords( + const int *freqArray, const int *wordLengthArray, const int wordCount, + const bool isSpaceProximity, const unsigned short *word); int getFinalFreq(const int freq, unsigned short **word, int* wordLength); + int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength, + const int inputLength); CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal); @@ -94,21 +153,44 @@ public: inline int getTreeParentIndex(const int index) const { return mCorrectionStates[index].mParentIndex; } -private: + + class RankingAlgorithm { + public: + static int calculateFinalFreq(const int inputIndex, const int depth, + const int freq, int *editDistanceTable, const Correction* correction, + const int inputLength); + static int calcFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray, + const int wordCount, const Correction* correction, const bool isSpaceProximity, + const unsigned short *word); + static double calcNormalizedScore(const unsigned short* before, const int beforeLength, + const unsigned short* after, const int afterLength, const int score); + static int editDistance(const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength); + private: + static const int CODE_SPACE = ' '; + static const int MAX_INITIAL_SCORE = 255; + static const int TYPED_LETTER_MULTIPLIER = 2; + static const int FULL_WORD_MULTIPLIER = 2; + }; + + private: inline void incrementInputIndex(); inline void incrementOutputIndex(); - inline bool needsToTraverseAllNodes(); inline void startToTraverseAllNodes(); inline bool isQuote(const unsigned short c); inline CorrectionType processSkipChar( const int32_t c, const bool isTerminal, const bool inputIndexIncremented); + inline CorrectionType processUnrelatedCorrectionType(); inline void addCharToCurrentWord(const int32_t c); + inline int getFinalFreqInternal(const int freq, unsigned short **word, int* wordLength, + const int inputLength); const int TYPED_LETTER_MULTIPLIER; const int FULL_WORD_MULTIPLIER; const ProximityInfo *mProximityInfo; bool mUseFullEditDistance; + bool mDoAutoCompletion; int mMaxEditDistance; int mMaxDepth; int mInputLength; @@ -116,6 +198,7 @@ private: int mMissingSpacePos; int mTerminalInputIndex; int mTerminalOutputIndex; + int mMaxErrors; // The following arrays are state buffer. unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; @@ -146,17 +229,11 @@ private: bool mMatching; bool mProximityMatching; + bool mAdditionalProximityMatching; bool mExceeding; bool mTransposing; bool mSkipping; - class RankingAlgorithm { - public: - static int calculateFinalFreq(const int inputIndex, const int depth, - const int freq, int *editDistanceTable, const Correction* correction); - static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, - const Correction* correction, const unsigned short *word); - }; }; } // namespace latinime #endif // LATINIME_CORRECTION_H diff --git a/native/src/correction_state.h b/native/jni/src/correction_state.h index c04146e54..5b2cbd3a2 100644 --- a/native/src/correction_state.h +++ b/native/jni/src/correction_state.h @@ -47,9 +47,9 @@ struct CorrectionState { bool mExceeding; bool mSkipping; bool mProximityMatching; + bool mAdditionalProximityMatching; bool mNeedsToTraverseAllNodes; - }; inline static void initCorrectionState(CorrectionState *state, const int rootPos, @@ -77,7 +77,7 @@ inline static void initCorrectionState(CorrectionState *state, const int rootPos state->mTransposing = false; state->mExceeding = false; state->mSkipping = false; - + state->mAdditionalProximityMatching = false; } } // namespace latinime diff --git a/native/src/debug.h b/native/jni/src/debug.h index 38b2f107a..b13052c95 100644 --- a/native/src/debug.h +++ b/native/jni/src/debug.h @@ -42,7 +42,7 @@ static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned s static inline void LOGI_S16(unsigned short* string, const unsigned int length) { unsigned char tmp_buffer[length]; convertToUnibyteString(string, tmp_buffer, length); - LOGI(">> %s", tmp_buffer); + AKLOGI(">> %s", tmp_buffer); // The log facility is throwing out log that comes too fast. The following // is a dirty way of slowing down processing so that we can see all log. // TODO : refactor this in a blocking log or something. @@ -53,7 +53,7 @@ static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int leng unsigned char c) { unsigned char tmp_buffer[length+1]; convertToUnibyteStringAndReplaceLastChar(string, tmp_buffer, length, c); - LOGI(">> %s", tmp_buffer); + AKLOGI(">> %s", tmp_buffer); // Likewise // usleep(10); } @@ -64,7 +64,7 @@ static inline void printDebug(const char* tag, int* codes, int codesSize, int MA buf[codesSize] = 0; while (--codesSize >= 0) buf[codesSize] = (unsigned char)codes[codesSize * MAX_PROXIMITY_CHARS]; - LOGI("%s, WORD = %s", tag, buf); + AKLOGI("%s, WORD = %s", tag, buf); free(buf); } diff --git a/native/src/defines.h b/native/jni/src/defines.h index ef1beb92f..e882c3714 100644 --- a/native/src/defines.h +++ b/native/jni/src/defines.h @@ -20,15 +20,31 @@ #if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG) #include <cutils/log.h> +#define AKLOGE ALOGE +#define AKLOGI ALOGI + +#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0) + +static inline void dumpWord(const unsigned short* word, const int length) { + static char charBuf[50]; + + for (int i = 0; i < length; ++i) { + charBuf[i] = word[i]; + } + charBuf[length] = 0; + AKLOGI("[ %s ]", charBuf); +} + #else -#define LOGE(fmt, ...) -#define LOGI(fmt, ...) +#define AKLOGE(fmt, ...) +#define AKLOGI(fmt, ...) +#define DUMP_WORD(word, length) #endif #ifdef FLAG_DO_PROFILE // Profiler -#include <cutils/log.h> #include <time.h> + #define PROF_BUF_SIZE 100 static double profile_buf[PROF_BUF_SIZE]; static double profile_old[PROF_BUF_SIZE]; @@ -42,10 +58,10 @@ static unsigned int profile_counter[PROF_BUF_SIZE]; #define PROF_CLOSE do { PROF_END(PROF_BUF_SIZE - 1); PROF_OUTALL; } while(0) #define PROF_END(prof_buf_id) profile_buf[prof_buf_id] += ((clock()) - profile_old[prof_buf_id]) #define PROF_CLOCKOUT(prof_buf_id) \ - LOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id])) -#define PROF_OUTALL do { LOGI("--- %s ---", __FUNCTION__); prof_out(); } while(0) + AKLOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id])) +#define PROF_OUTALL do { AKLOGI("--- %s ---", __FUNCTION__); prof_out(); } while(0) -static void prof_reset(void) { +static inline void prof_reset(void) { for (int i = 0; i < PROF_BUF_SIZE; ++i) { profile_buf[i] = 0; profile_old[i] = 0; @@ -53,11 +69,11 @@ static void prof_reset(void) { } } -static void prof_out(void) { +static inline void prof_out(void) { if (profile_counter[PROF_BUF_SIZE - 1] != 1) { - LOGI("Error: You must call PROF_OPEN before PROF_CLOSE."); + AKLOGI("Error: You must call PROF_OPEN before PROF_CLOSE."); } - LOGI("Total time is %6.3f ms.", + AKLOGI("Total time is %6.3f ms.", profile_buf[PROF_BUF_SIZE - 1] * 1000 / (double)CLOCKS_PER_SEC); double all = 0; for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) { @@ -66,7 +82,7 @@ static void prof_out(void) { if (all == 0) all = 1; for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) { if (profile_buf[i] != 0) { - LOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.", + AKLOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.", i, (profile_buf[i] * 100 / all), profile_buf[i] * 1000 / (double)CLOCKS_PER_SEC, profile_counter[i]); } @@ -98,21 +114,11 @@ static void prof_out(void) { #define DEBUG_SHOW_FOUND_WORD false #define DEBUG_NODE DEBUG_DICT_FULL #define DEBUG_TRACE DEBUG_DICT_FULL -#define DEBUG_PROXIMITY_INFO true +#define DEBUG_PROXIMITY_INFO false +#define DEBUG_PROXIMITY_CHARS false #define DEBUG_CORRECTION false -#define DEBUG_CORRECTION_FREQ true - -#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0) - -static char charBuf[50]; - -static void dumpWord(const unsigned short* word, const int length) { - for (int i = 0; i < length; ++i) { - charBuf[i] = word[i]; - } - charBuf[length] = 0; - LOGI("[ %s ]", charBuf); -} +#define DEBUG_CORRECTION_FREQ false +#define DEBUG_WORDS_PRIORITY_QUEUE false #else // FLAG_DBG @@ -123,10 +129,11 @@ static void dumpWord(const unsigned short* word, const int length) { #define DEBUG_NODE false #define DEBUG_TRACE false #define DEBUG_PROXIMITY_INFO false +#define DEBUG_PROXIMITY_CHARS false #define DEBUG_CORRECTION false #define DEBUG_CORRECTION_FREQ false +#define DEBUG_WORDS_PRIORITY_QUEUE false -#define DUMP_WORD(word, length) #endif // FLAG_DBG @@ -157,64 +164,91 @@ static void dumpWord(const unsigned short* word, const int length) { #define FLAG_BIGRAM_FREQ 0x7F #define DICTIONARY_VERSION_MIN 200 -// TODO: remove this constant when the switch to the new dict format is over -#define DICTIONARY_HEADER_SIZE 2 -#define NEW_DICTIONARY_HEADER_SIZE 5 #define NOT_VALID_WORD -99 #define NOT_A_CHARACTER -1 #define NOT_A_DISTANCE -1 +#define NOT_A_COORDINATE -1 #define EQUIVALENT_CHAR_WITHOUT_DISTANCE_INFO -2 #define PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO -3 -#define NOT_A_INDEX -1 +#define ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO -4 +#define NOT_AN_INDEX -1 +#define NOT_A_FREQUENCY -1 #define KEYCODE_SPACE ' ' #define CALIBRATE_SCORE_BY_TOUCH_COORDINATES true #define SUGGEST_WORDS_WITH_MISSING_CHARACTER true -#define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true #define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true #define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true -#define SUGGEST_WORDS_WITH_SPACE_PROXIMITY true +#define SUGGEST_MULTIPLE_WORDS true // The following "rate"s are used as a multiplier before dividing by 100, so they are in percent. #define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80 #define WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X 12 -#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 67 +#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 58 +#define WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE 50 #define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75 #define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75 -#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60 +#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 70 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120 #define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90 +#define WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE 70 #define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105 -#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 160 +#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 148 +#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER 3 #define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 45 #define INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE 70 #define FIRST_CHAR_DIFFERENT_DEMOTION_RATE 96 #define TWO_WORDS_CAPITALIZED_DEMOTION_RATE 50 +#define TWO_WORDS_CORRECTION_DEMOTION_BASE 80 +#define TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER 1 #define ZERO_DISTANCE_PROMOTION_RATE 110 #define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f #define HALF_SCORE_SQUARED_RADIUS 32.0f +#define MAX_FREQ 255 -// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java +// This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java // This is only used for the size of array. Not to be used in c functions. #define MAX_WORD_LENGTH_INTERNAL 48 +// This must be equal to ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE in KeyDetector.java +#define ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE 2 + +// Word limit for sub queues used in WordsPriorityQueuePool. Sub queues are temporary queues used +// for better performance. +// Holds up to 1 candidate for each word +#define SUB_QUEUE_MAX_WORDS 1 +#define SUB_QUEUE_MAX_COUNT 10 +#define SUB_QUEUE_MIN_WORD_LENGTH 4 +#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 10 +#define MULTIPLE_WORDS_DEMOTION_RATE 80 +#define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6 + +#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39 +#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22 + #define MAX_DEPTH_MULTIPLIER 3 -// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German -// word in the dictionary -#define DEFAULT_MAX_UMLAUT_SEARCH_DEPTH 5 +#define FIRST_WORD_INDEX 0 + +// TODO: Reduce this constant if possible; check the maximum number of digraphs in the same +// word in the dictionary for languages with digraphs, like German and French +#define DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH 5 // Minimum suggest depth for one word for all cases except for missing space suggestions. #define MIN_SUGGEST_DEPTH 1 -#define MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION 3 +#define MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION 3 #define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3 -#define min(a,b) ((a)<(b)?(a):(b)) -#define max(a,b) ((a)>(b)?(a):(b)) +template<typename T> inline T min(T a, T b) { return a < b ? a : b; } +template<typename T> inline T max(T a, T b) { return a > b ? a : b; } // The ratio of neutral area radius to sweet spot radius. #define NEUTRAL_AREA_RADIUS_RATIO 1.3f +// DEBUG +#define INPUTLENGTH_FOR_DEBUG -1 +#define MIN_OUTPUT_INDEX_FOR_DEBUG -1 + #endif // LATINIME_DEFINES_H diff --git a/native/src/dictionary.cpp b/native/jni/src/dictionary.cpp index a49769bdb..981a983ee 100644 --- a/native/src/dictionary.cpp +++ b/native/jni/src/dictionary.cpp @@ -19,6 +19,7 @@ #define LOG_TAG "LatinIME: dictionary.cpp" +#include "binary_format.h" #include "dictionary.h" namespace latinime { @@ -26,33 +27,35 @@ namespace latinime { // TODO: Change the type of all keyCodes to uint32_t Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultiplier, int fullWordMultiplier, - int maxWordLength, int maxWords, int maxAlternatives) + int maxWordLength, int maxWords) : mDict((unsigned char*) dict), mDictSize(dictSize), mMmapFd(mmapFd), mDictBufAdjust(dictBufAdjust), // Checks whether it has the latest dictionary or the old dictionary IS_LATEST_DICT_VERSION((((unsigned char*) dict)[0] & 0xFF) >= DICTIONARY_VERSION_MIN) { if (DEBUG_DICT) { if (MAX_WORD_LENGTH_INTERNAL < maxWordLength) { - LOGI("Max word length (%d) is greater than %d", + AKLOGI("Max word length (%d) is greater than %d", maxWordLength, MAX_WORD_LENGTH_INTERNAL); - LOGI("IN NATIVE SUGGEST Version: %d", (mDict[0] & 0xFF)); + AKLOGI("IN NATIVE SUGGEST Version: %d", (mDict[0] & 0xFF)); } } - mUnigramDictionary = new UnigramDictionary(mDict, typedLetterMultiplier, fullWordMultiplier, - maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION); - mBigramDictionary = new BigramDictionary(mDict, maxWordLength, maxAlternatives, - IS_LATEST_DICT_VERSION, hasBigram(), this); + mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier); + mWordsPriorityQueuePool = new WordsPriorityQueuePool( + maxWords, SUB_QUEUE_MAX_WORDS, maxWordLength); + const unsigned int headerSize = BinaryFormat::getHeaderSize(mDict); + mUnigramDictionary = new UnigramDictionary(mDict + headerSize, typedLetterMultiplier, + fullWordMultiplier, maxWordLength, maxWords, IS_LATEST_DICT_VERSION); + mBigramDictionary = new BigramDictionary(mDict + headerSize, maxWordLength, + IS_LATEST_DICT_VERSION, true /* hasBigram */, this); } Dictionary::~Dictionary() { + delete mCorrection; + delete mWordsPriorityQueuePool; delete mUnigramDictionary; delete mBigramDictionary; } -bool Dictionary::hasBigram() { - return ((mDict[1] & 0xFF) == 1); -} - bool Dictionary::isValidWord(unsigned short *word, int length) { return mUnigramDictionary->isValidWord(word, length); } diff --git a/native/jni/src/dictionary.h b/native/jni/src/dictionary.h new file mode 100644 index 000000000..139d3f0d7 --- /dev/null +++ b/native/jni/src/dictionary.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_H +#define LATINIME_DICTIONARY_H + +#include "bigram_dictionary.h" +#include "char_utils.h" +#include "correction.h" +#include "defines.h" +#include "proximity_info.h" +#include "unigram_dictionary.h" +#include "words_priority_queue_pool.h" + +namespace latinime { + +class Dictionary { + public: + Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler, + int fullWordMultiplier, int maxWordLength, int maxWords); + + int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates, + int *codes, int codesSize, int flags, unsigned short *outWords, int *frequencies) { + return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool, + mCorrection, xcoordinates, ycoordinates, codes, + codesSize, flags, outWords, frequencies); + } + + // TODO: Call mBigramDictionary instead of mUnigramDictionary + int getBigrams(unsigned short *word, int length, int *codes, int codesSize, + unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams) { + return mBigramDictionary->getBigrams(word, length, codes, codesSize, outWords, frequencies, + maxWordLength, maxBigrams); + } + + bool isValidWord(unsigned short *word, int length); + void *getDict() { return (void *)mDict; } + int getDictSize() { return mDictSize; } + int getMmapFd() { return mMmapFd; } + int getDictBufAdjust() { return mDictBufAdjust; } + ~Dictionary(); + + // public static utility methods + // static inline methods should be defined in the header file + static int wideStrLen(unsigned short *str); + + private: + bool hasBigram(); + + const unsigned char *mDict; + + // Used only for the mmap version of dictionary loading, but we use these as dummy variables + // also for the malloc version. + const int mDictSize; + const int mMmapFd; + const int mDictBufAdjust; + + const bool IS_LATEST_DICT_VERSION; + UnigramDictionary *mUnigramDictionary; + BigramDictionary *mBigramDictionary; + WordsPriorityQueuePool *mWordsPriorityQueuePool; + Correction *mCorrection; +}; + +// public static utility methods +// static inline methods should be defined in the header file +inline int Dictionary::wideStrLen(unsigned short *str) { + if (!str) return 0; + unsigned short *end = str; + while (*end) + end++; + return end - str; +} +} // namespace latinime + +#endif // LATINIME_DICTIONARY_H diff --git a/native/src/proximity_info.cpp b/native/jni/src/proximity_info.cpp index 763a3a174..c00c4c20f 100644 --- a/native/src/proximity_info.cpp +++ b/native/jni/src/proximity_info.cpp @@ -16,10 +16,11 @@ #include <assert.h> #include <stdio.h> -#include <string.h> +#include <string> #define LOG_TAG "LatinIME: proximity_info.cpp" +#include "additional_proximity_chars.h" #include "dictionary.h" #include "proximity_info.h" @@ -33,26 +34,30 @@ inline void copyOrFillZero(void *to, const void *from, size_t size) { } } -ProximityInfo::ProximityInfo(const int maxProximityCharsSize, const int keyboardWidth, - const int keyboardHeight, const int gridWidth, const int gridHeight, - const uint32_t *proximityCharsArray, const int keyCount, const int32_t *keyXCoordinates, +ProximityInfo::ProximityInfo(const std::string localeStr, const int maxProximityCharsSize, + const int keyboardWidth, const int keyboardHeight, const int gridWidth, + const int gridHeight, const int mostCommonKeyWidth, + const int32_t *proximityCharsArray, const int keyCount, const int32_t *keyXCoordinates, const int32_t *keyYCoordinates, const int32_t *keyWidths, const int32_t *keyHeights, const int32_t *keyCharCodes, const float *sweetSpotCenterXs, const float *sweetSpotCenterYs, const float *sweetSpotRadii) : MAX_PROXIMITY_CHARS_SIZE(maxProximityCharsSize), KEYBOARD_WIDTH(keyboardWidth), KEYBOARD_HEIGHT(keyboardHeight), GRID_WIDTH(gridWidth), GRID_HEIGHT(gridHeight), + MOST_COMMON_KEY_WIDTH_SQUARE(mostCommonKeyWidth * mostCommonKeyWidth), CELL_WIDTH((keyboardWidth + gridWidth - 1) / gridWidth), CELL_HEIGHT((keyboardHeight + gridHeight - 1) / gridHeight), KEY_COUNT(min(keyCount, MAX_KEY_COUNT_IN_A_KEYBOARD)), HAS_TOUCH_POSITION_CORRECTION_DATA(keyCount > 0 && keyXCoordinates && keyYCoordinates && keyWidths && keyHeights && keyCharCodes && sweetSpotCenterXs && sweetSpotCenterYs && sweetSpotRadii), - mInputXCoordinates(NULL), mInputYCoordinates(NULL), + mLocaleStr(localeStr), + mInputXCoordinates(0), mInputYCoordinates(0), mTouchPositionCorrectionEnabled(false) { const int proximityGridLength = GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE; - mProximityCharsArray = new uint32_t[proximityGridLength]; + mProximityCharsArray = new int32_t[proximityGridLength]; + mInputCodes = new int32_t[MAX_PROXIMITY_CHARS_SIZE * MAX_WORD_LENGTH_INTERNAL]; if (DEBUG_PROXIMITY_INFO) { - LOGI("Create proximity info array %d", proximityGridLength); + AKLOGI("Create proximity info array %d", proximityGridLength); } memcpy(mProximityCharsArray, proximityCharsArray, proximityGridLength * sizeof(mProximityCharsArray[0])); @@ -92,6 +97,7 @@ void ProximityInfo::initializeCodeToKeyIndex() { ProximityInfo::~ProximityInfo() { delete[] mNormalizedSquaredDistances; delete[] mProximityCharsArray; + delete[] mInputCodes; } inline int ProximityInfo::getStartIndexFromCoordinates(const int x, const int y) const { @@ -100,13 +106,21 @@ inline int ProximityInfo::getStartIndexFromCoordinates(const int x, const int y) } bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { + if (x < 0 || y < 0) { + if (DEBUG_DICT) { + AKLOGI("HasSpaceProximity: Illegal coordinates (%d, %d)", x, y); + assert(false); + } + return false; + } + const int startIndex = getStartIndexFromCoordinates(x, y); if (DEBUG_PROXIMITY_INFO) { - LOGI("hasSpaceProximity: index %d", startIndex); + AKLOGI("hasSpaceProximity: index %d, %d, %d", startIndex, x, y); } for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { if (DEBUG_PROXIMITY_INFO) { - LOGI("Index: %d", mProximityCharsArray[startIndex + i]); + AKLOGI("Index: %d", mProximityCharsArray[startIndex + i]); } if (mProximityCharsArray[startIndex + i] == KEYCODE_SPACE) { return true; @@ -115,10 +129,126 @@ bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { return false; } -// TODO: Calculate nearby codes here. -void ProximityInfo::setInputParams(const int* inputCodes, const int inputLength, +bool ProximityInfo::isOnKey(const int keyId, const int x, const int y) const { + if (keyId < 0) return true; // NOT_A_ID is -1, but return whenever < 0 just in case + const int left = mKeyXCoordinates[keyId]; + const int top = mKeyYCoordinates[keyId]; + const int right = left + mKeyWidths[keyId] + 1; + const int bottom = top + mKeyHeights[keyId]; + return left < right && top < bottom && x >= left && x < right && y >= top && y < bottom; +} + +int ProximityInfo::squaredDistanceToEdge(const int keyId, const int x, const int y) const { + if (keyId < 0) return true; // NOT_A_ID is -1, but return whenever < 0 just in case + const int left = mKeyXCoordinates[keyId]; + const int top = mKeyYCoordinates[keyId]; + const int right = left + mKeyWidths[keyId]; + const int bottom = top + mKeyHeights[keyId]; + const int edgeX = x < left ? left : (x > right ? right : x); + const int edgeY = y < top ? top : (y > bottom ? bottom : y); + const int dx = x - edgeX; + const int dy = y - edgeY; + return dx * dx + dy * dy; +} + +void ProximityInfo::calculateNearbyKeyCodes( + const int x, const int y, const int32_t primaryKey, int *inputCodes) const { + int insertPos = 0; + inputCodes[insertPos++] = primaryKey; + const int startIndex = getStartIndexFromCoordinates(x, y); + if (startIndex >= 0) { + for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { + const int32_t c = mProximityCharsArray[startIndex + i]; + if (c < KEYCODE_SPACE || c == primaryKey) { + continue; + } + const int keyIndex = getKeyIndex(c); + const bool onKey = isOnKey(keyIndex, x, y); + const int distance = squaredDistanceToEdge(keyIndex, x, y); + if (onKey || distance < MOST_COMMON_KEY_WIDTH_SQUARE) { + inputCodes[insertPos++] = c; + if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) { + if (DEBUG_DICT) { + assert(false); + } + return; + } + } + } + const int additionalProximitySize = + AdditionalProximityChars::getAdditionalCharsSize(&mLocaleStr, primaryKey); + if (additionalProximitySize > 0) { + inputCodes[insertPos++] = ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE; + if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) { + if (DEBUG_DICT) { + assert(false); + } + return; + } + + const int32_t* additionalProximityChars = + AdditionalProximityChars::getAdditionalChars(&mLocaleStr, primaryKey); + for (int j = 0; j < additionalProximitySize; ++j) { + const int32_t ac = additionalProximityChars[j]; + int k = 0; + for (; k < insertPos; ++k) { + if ((int)ac == inputCodes[k]) { + break; + } + } + if (k < insertPos) { + continue; + } + inputCodes[insertPos++] = ac; + if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) { + if (DEBUG_DICT) { + assert(false); + } + return; + } + } + } + } + // Add a delimiter for the proximity characters + for (int i = insertPos; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { + inputCodes[i] = NOT_A_CODE; + } +} + +void ProximityInfo::setInputParams(const int32_t* inputCodes, const int inputLength, const int* xCoordinates, const int* yCoordinates) { - mInputCodes = inputCodes; + memset(mInputCodes, 0, + MAX_WORD_LENGTH_INTERNAL * MAX_PROXIMITY_CHARS_SIZE * sizeof(mInputCodes[0])); + + for (int i = 0; i < inputLength; ++i) { + const int32_t primaryKey = inputCodes[i]; + const int x = xCoordinates[i]; + const int y = yCoordinates[i]; + int *proximities = &mInputCodes[i * MAX_PROXIMITY_CHARS_SIZE]; + calculateNearbyKeyCodes(x, y, primaryKey, proximities); + } + + if (DEBUG_PROXIMITY_CHARS) { + for (int i = 0; i < inputLength; ++i) { + AKLOGI("---"); + for (int j = 0; j < MAX_PROXIMITY_CHARS_SIZE; ++j) { + int icc = mInputCodes[i * MAX_PROXIMITY_CHARS_SIZE + j]; + int icfjc = inputCodes[i * MAX_PROXIMITY_CHARS_SIZE + j]; + icc+= 0; + icfjc += 0; + AKLOGI("--- (%d)%c,%c", i, icc, icfjc); + AKLOGI("--- A<%d>,B<%d>", icc, icfjc); + } + } + } + //Keep for debug, sorry + //for (int i = 0; i < MAX_WORD_LENGTH_INTERNAL * MAX_PROXIMITY_CHARS_SIZE; ++i) { + //if (i < inputLength * MAX_PROXIMITY_CHARS_SIZE) { + //mInputCodes[i] = mInputCodesFromJava[i]; + //} else { + // mInputCodes[i] = 0; + // } + //} mInputXCoordinates = xCoordinates; mInputYCoordinates = yCoordinates; mTouchPositionCorrectionEnabled = @@ -128,12 +258,35 @@ void ProximityInfo::setInputParams(const int* inputCodes, const int inputLength, mPrimaryInputWord[i] = getPrimaryCharAt(i); } mPrimaryInputWord[inputLength] = 0; + if (DEBUG_PROXIMITY_CHARS) { + AKLOGI("--- setInputParams"); + } for (int i = 0; i < mInputLength; ++i) { const int *proximityChars = getProximityCharsAt(i); + const int primaryKey = proximityChars[0]; + const int x = xCoordinates[i]; + const int y = yCoordinates[i]; + if (DEBUG_PROXIMITY_CHARS) { + int a = x + y + primaryKey; + a += 0; + AKLOGI("--- Primary = %c, x = %d, y = %d", primaryKey, x, y); + // Keep debug code just in case + //int proximities[50]; + //for (int m = 0; m < 50; ++m) { + //proximities[m] = 0; + //} + //calculateNearbyKeyCodes(x, y, primaryKey, proximities); + //for (int l = 0; l < 50 && proximities[l] > 0; ++l) { + //if (DEBUG_PROXIMITY_CHARS) { + //AKLOGI("--- native Proximity (%d) = %c", l, proximities[l]); + //} + //} + } for (int j = 0; j < MAX_PROXIMITY_CHARS_SIZE && proximityChars[j] > 0; ++j) { const int currentChar = proximityChars[j]; - const int keyIndex = getKeyIndex(currentChar); - const float squaredDistance = calculateNormalizedSquaredDistance(keyIndex, i); + const float squaredDistance = hasInputCoordinates() + ? calculateNormalizedSquaredDistance(getKeyIndex(currentChar), i) + : NOT_A_DISTANCE_FLOAT; if (squaredDistance >= 0.0f) { mNormalizedSquaredDistances[i * MAX_PROXIMITY_CHARS_SIZE + j] = (int)(squaredDistance * NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR); @@ -142,6 +295,9 @@ void ProximityInfo::setInputParams(const int* inputCodes, const int inputLength, ? EQUIVALENT_CHAR_WITHOUT_DISTANCE_INFO : PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO; } + if (DEBUG_PROXIMITY_CHARS) { + AKLOGI("--- Proximity (%d) = %c", j, currentChar); + } } } } @@ -150,26 +306,32 @@ inline float square(const float x) { return x * x; } float ProximityInfo::calculateNormalizedSquaredDistance( const int keyIndex, const int inputIndex) const { - static const float NOT_A_DISTANCE_FLOAT = -1.0f; - if (keyIndex == NOT_A_INDEX) { + if (keyIndex == NOT_AN_INDEX) { return NOT_A_DISTANCE_FLOAT; } if (!hasSweetSpotData(keyIndex)) { return NOT_A_DISTANCE_FLOAT; } + if (NOT_A_COORDINATE == mInputXCoordinates[inputIndex]) { + return NOT_A_DISTANCE_FLOAT; + } const float squaredDistance = calculateSquaredDistanceFromSweetSpotCenter(keyIndex, inputIndex); const float squaredRadius = square(mSweetSpotRadii[keyIndex]); return squaredDistance / squaredRadius; } +bool ProximityInfo::hasInputCoordinates() const { + return mInputXCoordinates && mInputYCoordinates; +} + int ProximityInfo::getKeyIndex(const int c) const { - if (KEY_COUNT == 0 || !mInputXCoordinates || !mInputYCoordinates) { + if (KEY_COUNT == 0) { // We do not have the coordinate data - return NOT_A_INDEX; + return NOT_AN_INDEX; } - const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); + const unsigned short baseLowerC = toBaseLowerCase(c); if (baseLowerC > MAX_CHAR_CODE) { - return NOT_A_INDEX; + return NOT_AN_INDEX; } return mCodeToKeyIndex[baseLowerC]; } @@ -232,7 +394,7 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde const unsigned short c, const bool checkProximityChars, int *proximityIndex) const { const int *currentChars = getProximityCharsAt(index); const int firstChar = currentChars[0]; - const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); + const unsigned short baseLowerC = toBaseLowerCase(c); // The first char in the array is what user typed. If it matches right away, // that means the user typed that same char for this pos. @@ -245,12 +407,13 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde // If the non-accented, lowercased version of that first character matches c, // then we have a non-accented version of the accented character the user // typed. Treat it as a close char. - if (Dictionary::toBaseLowerCase(firstChar) == baseLowerC) + if (toBaseLowerCase(firstChar) == baseLowerC) return NEAR_PROXIMITY_CHAR; // Not an exact nor an accent-alike match: search the list of close keys int j = 1; - while (j < MAX_PROXIMITY_CHARS_SIZE && currentChars[j] > 0) { + while (j < MAX_PROXIMITY_CHARS_SIZE + && currentChars[j] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) { const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c); if (matched) { if (proximityIndex) { @@ -260,6 +423,21 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int inde } ++j; } + if (j < MAX_PROXIMITY_CHARS_SIZE + && currentChars[j] == ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) { + ++j; + while (j < MAX_PROXIMITY_CHARS_SIZE + && currentChars[j] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) { + const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c); + if (matched) { + if (proximityIndex) { + *proximityIndex = j; + } + return ADDITIONAL_PROXIMITY_CHAR; + } + ++j; + } + } // Was not included, signal this as an unrelated character. return UNRELATED_CHAR; diff --git a/native/src/proximity_info.h b/native/jni/src/proximity_info.h index 35e354c6e..c1eefeacc 100644 --- a/native/src/proximity_info.h +++ b/native/jni/src/proximity_info.h @@ -18,6 +18,7 @@ #define LATINIME_PROXIMITY_INFO_H #include <stdint.h> +#include <string> #include "defines.h" @@ -26,7 +27,7 @@ namespace latinime { class Correction; class ProximityInfo { -public: + public: static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2 = 10; static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR = 1 << NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2; @@ -38,25 +39,28 @@ public: // It is a char located nearby on the keyboard NEAR_PROXIMITY_CHAR, // It is an unrelated char - UNRELATED_CHAR + UNRELATED_CHAR, + // Additional proximity char which can differ by language. + ADDITIONAL_PROXIMITY_CHAR } ProximityType; - ProximityInfo(const int maxProximityCharsSize, const int keyboardWidth, - const int keybaordHeight, const int gridWidth, const int gridHeight, - const uint32_t *proximityCharsArray, const int keyCount, const int32_t *keyXCoordinates, + ProximityInfo(const std::string localeStr, const int maxProximityCharsSize, + const int keyboardWidth, const int keybaordHeight, const int gridWidth, + const int gridHeight, const int mostCommonkeyWidth, + const int32_t *proximityCharsArray, const int keyCount, const int32_t *keyXCoordinates, const int32_t *keyYCoordinates, const int32_t *keyWidths, const int32_t *keyHeights, const int32_t *keyCharCodes, const float *sweetSpotCenterXs, const float *sweetSpotCenterYs, const float *sweetSpotRadii); ~ProximityInfo(); bool hasSpaceProximity(const int x, const int y) const; - void setInputParams(const int* inputCodes, const int inputLength, + void setInputParams(const int32_t *inputCodes, const int inputLength, const int *xCoordinates, const int *yCoordinates); const int* getProximityCharsAt(const int index) const; unsigned short getPrimaryCharAt(const int index) const; bool existsCharInProximityAt(const int index, const int c) const; bool existsAdjacentProximityChars(const int index) const; ProximityType getMatchedProximityId(const int index, const unsigned short c, - const bool checkProximityChars, int *proximityIndex = NULL) const; + const bool checkProximityChars, int *proximityIndex = 0) const; int getNormalizedSquaredDistance(const int inputIndex, const int proximityIndex) const { return mNormalizedSquaredDistances[inputIndex * MAX_PROXIMITY_CHARS_SIZE + proximityIndex]; } @@ -68,38 +72,49 @@ public: return mTouchPositionCorrectionEnabled; } -private: + private: // The max number of the keys in one keyboard layout static const int MAX_KEY_COUNT_IN_A_KEYBOARD = 64; // The upper limit of the char code in mCodeToKeyIndex static const int MAX_CHAR_CODE = 127; + static const float NOT_A_DISTANCE_FLOAT = -1.0f; + static const int NOT_A_CODE = -1; int getStartIndexFromCoordinates(const int x, const int y) const; void initializeCodeToKeyIndex(); float calculateNormalizedSquaredDistance(const int keyIndex, const int inputIndex) const; float calculateSquaredDistanceFromSweetSpotCenter( const int keyIndex, const int inputIndex) const; + bool hasInputCoordinates() const; int getKeyIndex(const int c) const; bool hasSweetSpotData(const int keyIndex) const { // When there are no calibration data for a key, // the radius of the key is assigned to zero. return mSweetSpotRadii[keyIndex] > 0.0; } + bool isOnKey(const int keyId, const int x, const int y) const; + int squaredDistanceToEdge(const int keyId, const int x, const int y) const; + void calculateNearbyKeyCodes( + const int x, const int y, const int32_t primaryKey, int *inputCodes) const; const int MAX_PROXIMITY_CHARS_SIZE; const int KEYBOARD_WIDTH; const int KEYBOARD_HEIGHT; const int GRID_WIDTH; const int GRID_HEIGHT; + const int MOST_COMMON_KEY_WIDTH_SQUARE; const int CELL_WIDTH; const int CELL_HEIGHT; const int KEY_COUNT; const bool HAS_TOUCH_POSITION_CORRECTION_DATA; - const int *mInputCodes; + const std::string mLocaleStr; + // TODO: remove this + const int *mInputCodesFromJava; + int32_t *mInputCodes; const int *mInputXCoordinates; const int *mInputYCoordinates; bool mTouchPositionCorrectionEnabled; - uint32_t *mProximityCharsArray; + int32_t *mProximityCharsArray; int *mNormalizedSquaredDistances; int32_t mKeyXCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD]; int32_t mKeyYCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD]; diff --git a/native/jni/src/terminal_attributes.h b/native/jni/src/terminal_attributes.h new file mode 100644 index 000000000..1f9815936 --- /dev/null +++ b/native/jni/src/terminal_attributes.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TERMINAL_ATTRIBUTES_H +#define LATINIME_TERMINAL_ATTRIBUTES_H + +#include "unigram_dictionary.h" + +namespace latinime { + +/** + * This class encapsulates information about a terminal that allows to + * retrieve local node attributes like the list of shortcuts without + * exposing the format structure to the client. + */ +class TerminalAttributes { + public: + class ShortcutIterator { + const uint8_t* const mDict; + bool mHasNextShortcutTarget; + int mPos; + + public: + ShortcutIterator(const uint8_t* dict, const int pos, const uint8_t flags) : mDict(dict), + mPos(pos) { + mHasNextShortcutTarget = (0 != (flags & UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS)); + } + + inline bool hasNextShortcutTarget() const { + return mHasNextShortcutTarget; + } + + // Gets the shortcut target itself as a uint16_t string. For parameters and return value + // see BinaryFormat::getWordAtAddress. + inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) { + const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos); + mHasNextShortcutTarget = + 0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT); + int shortcutAddress = + BinaryFormat::getAttributeAddressAndForwardPointer(mDict, shortcutFlags, &mPos); + return BinaryFormat::getWordAtAddress(mDict, shortcutAddress, maxDepth, outWord); + } + }; + + private: + const uint8_t* const mDict; + const uint8_t mFlags; + const int mStartPos; + + public: + TerminalAttributes(const uint8_t* const dict, const uint8_t flags, const int pos) : + mDict(dict), mFlags(flags), mStartPos(pos) { + } + + inline bool isShortcutOnly() const { + return 0 != (mFlags & UnigramDictionary::FLAG_IS_SHORTCUT_ONLY); + } + + inline ShortcutIterator getShortcutIterator() const { + return ShortcutIterator(mDict, mStartPos, mFlags); + } +}; +} // namespace latinime + +#endif // LATINIME_TERMINAL_ATTRIBUTES_H diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp new file mode 100644 index 000000000..ed4c066f3 --- /dev/null +++ b/native/jni/src/unigram_dictionary.cpp @@ -0,0 +1,894 @@ +/* +** +** Copyright 2010, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +*/ + +#include <assert.h> +#include <string.h> + +#define LOG_TAG "LatinIME: unigram_dictionary.cpp" + +#include "char_utils.h" +#include "dictionary.h" +#include "unigram_dictionary.h" + +#include "binary_format.h" +#include "terminal_attributes.h" + +namespace latinime { + +const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = + { { 'a', 'e', 0x00E4 }, // U+00E4 : LATIN SMALL LETTER A WITH DIAERESIS + { 'o', 'e', 0x00F6 }, // U+00F6 : LATIN SMALL LETTER O WITH DIAERESIS + { 'u', 'e', 0x00FC } }; // U+00FC : LATIN SMALL LETTER U WITH DIAERESIS + +const UnigramDictionary::digraph_t UnigramDictionary::FRENCH_LIGATURES_DIGRAPHS[] = + { { 'a', 'e', 0x00E6 }, // U+00E6 : LATIN SMALL LETTER AE + { 'o', 'e', 0x0153 } }; // U+0153 : LATIN SMALL LIGATURE OE + +// TODO: check the header +UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier, + int fullWordMultiplier, int maxWordLength, int maxWords, + const bool isLatestDictVersion) + : DICT_ROOT(streamStart), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), + IS_LATEST_DICT_VERSION(isLatestDictVersion), + TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), + // TODO : remove this variable. + ROOT_POS(0), + BYTES_IN_ONE_CHAR(sizeof(int)), + MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH) { + if (DEBUG_DICT) { + AKLOGI("UnigramDictionary - constructor"); + } +} + +UnigramDictionary::~UnigramDictionary() { +} + +static inline unsigned int getCodesBufferSize(const int *codes, const int codesSize) { + return sizeof(*codes) * codesSize; +} + +// TODO: This needs to take a const unsigned short* and not tinker with its contents +static inline void addWord( + unsigned short *word, int length, int frequency, WordsPriorityQueue *queue) { + queue->push(frequency, word, length); +} + +// Return the replacement code point for a digraph, or 0 if none. +int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, const int codesSize, + const digraph_t* const digraphs, const unsigned int digraphsSize) const { + + // There can't be a digraph if we don't have at least 2 characters to examine + if (i + 2 > codesSize) return false; + + // Search for the first char of some digraph + int lastDigraphIndex = -1; + const int thisChar = codes[i]; + for (lastDigraphIndex = digraphsSize - 1; lastDigraphIndex >= 0; --lastDigraphIndex) { + if (thisChar == digraphs[lastDigraphIndex].first) break; + } + // No match: return early + if (lastDigraphIndex < 0) return 0; + + // It's an interesting digraph if the second char matches too. + if (digraphs[lastDigraphIndex].second == codes[i + 1]) { + return digraphs[lastDigraphIndex].replacement; + } else { + return 0; + } +} + +// Mostly the same arguments as the non-recursive version, except: +// codes is the original value. It points to the start of the work buffer, and gets passed as is. +// codesSize is the size of the user input (thus, it is the size of codesSrc). +// codesDest is the current point in the work buffer. +// codesSrc is the current point in the user-input, original, content-unmodified buffer. +// codesRemain is the remaining size in codesSrc. +void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codesBuffer, + int *xCoordinatesBuffer, int *yCoordinatesBuffer, + const int codesBufferSize, const int flags, const int *codesSrc, + const int codesRemain, const int currentDepth, int *codesDest, Correction *correction, + WordsPriorityQueuePool *queuePool, + const digraph_t* const digraphs, const unsigned int digraphsSize) { + + const int startIndex = codesDest - codesBuffer; + if (currentDepth < MAX_DIGRAPH_SEARCH_DEPTH) { + for (int i = 0; i < codesRemain; ++i) { + xCoordinatesBuffer[startIndex + i] = xcoordinates[codesBufferSize - codesRemain + i]; + yCoordinatesBuffer[startIndex + i] = ycoordinates[codesBufferSize - codesRemain + i]; + const int replacementCodePoint = + getDigraphReplacement(codesSrc, i, codesRemain, digraphs, digraphsSize); + if (0 != replacementCodePoint) { + // Found a digraph. We will try both spellings. eg. the word is "pruefen" + + // Copy the word up to the first char of the digraph, including proximity chars, + // and overwrite the primary code with the replacement code point. Then, continue + // processing on the remaining part of the word, skipping the second char of the + // digraph. + // In our example, copy "pru", replace "u" with the version with the diaeresis and + // continue running on "fen". + // Make i the index of the second char of the digraph for simplicity. Forgetting + // to do that results in an infinite recursion so take care! + ++i; + memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR); + codesDest[(i - 1) * (BYTES_IN_ONE_CHAR / sizeof(codesDest[0]))] = + replacementCodePoint; + getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, + codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, flags, + codesSrc + i + 1, codesRemain - i - 1, + currentDepth + 1, codesDest + i, correction, + queuePool, digraphs, digraphsSize); + + // Copy the second char of the digraph in place, then continue processing on + // the remaining part of the word. + // In our example, after "pru" in the buffer copy the "e", and continue on "fen" + memcpy(codesDest + i, codesSrc + i, BYTES_IN_ONE_CHAR); + getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, + codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, flags, + codesSrc + i, codesRemain - i, currentDepth + 1, + codesDest + i, correction, queuePool, + digraphs, digraphsSize); + return; + } + } + } + + // If we come here, we hit the end of the word: let's check it against the dictionary. + // In our example, we'll come here once for "prufen" and then once for "pruefen". + // If the word contains several digraphs, we'll come it for the product of them. + // eg. if the word is "ueberpruefen" we'll test, in order, against + // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen". + const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain; + if (0 != remainingBytes) { + memcpy(codesDest, codesSrc, remainingBytes); + memcpy(&xCoordinatesBuffer[startIndex], &xcoordinates[codesBufferSize - codesRemain], + sizeof(int) * codesRemain); + memcpy(&yCoordinatesBuffer[startIndex], &ycoordinates[codesBufferSize - codesRemain], + sizeof(int) * codesRemain); + } + + getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer, + startIndex + codesRemain, flags, correction, + queuePool); +} + +int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, + WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates, + const int *ycoordinates, const int *codes, const int codesSize, const int flags, + unsigned short *outWords, int *frequencies) { + + queuePool->clearAll(); + Correction* masterCorrection = correction; + if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags) + { // Incrementally tune the word and try all possibilities + int codesBuffer[getCodesBufferSize(codes, codesSize)]; + int xCoordinatesBuffer[codesSize]; + int yCoordinatesBuffer[codesSize]; + getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, + xCoordinatesBuffer, yCoordinatesBuffer, + codesSize, flags, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool, + GERMAN_UMLAUT_DIGRAPHS, + sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0])); + } else if (REQUIRES_FRENCH_LIGATURES_PROCESSING & flags) { + int codesBuffer[getCodesBufferSize(codes, codesSize)]; + int xCoordinatesBuffer[codesSize]; + int yCoordinatesBuffer[codesSize]; + getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, + xCoordinatesBuffer, yCoordinatesBuffer, + codesSize, flags, codes, codesSize, 0, codesBuffer, masterCorrection, queuePool, + FRENCH_LIGATURES_DIGRAPHS, + sizeof(FRENCH_LIGATURES_DIGRAPHS) / sizeof(FRENCH_LIGATURES_DIGRAPHS[0])); + } else { // Normal processing + getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, flags, + masterCorrection, queuePool); + } + + PROF_START(20); + if (DEBUG_DICT) { + double ns = queuePool->getMasterQueue()->getHighestNormalizedScore( + proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); + ns += 0; + AKLOGI("Max normalized score = %f", ns); + } + const int suggestedWordsCount = + queuePool->getMasterQueue()->outputSuggestions(frequencies, outWords); + + if (DEBUG_DICT) { + double ns = queuePool->getMasterQueue()->getHighestNormalizedScore( + proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); + ns += 0; + AKLOGI("Returning %d words", suggestedWordsCount); + /// Print the returned words + for (int j = 0; j < suggestedWordsCount; ++j) { + short unsigned int* w = outWords + j * MAX_WORD_LENGTH; + char s[MAX_WORD_LENGTH]; + for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; + AKLOGI("%s %i", s, frequencies[j]); + } + } + PROF_END(20); + PROF_CLOSE; + return suggestedWordsCount; +} + +void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const int inputLength, const int flags, Correction *correction, + WordsPriorityQueuePool *queuePool) { + + PROF_OPEN; + PROF_START(0); + PROF_END(0); + + PROF_START(1); + const bool useFullEditDistance = USE_FULL_EDIT_DISTANCE & flags; + getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, + inputLength, correction, queuePool); + PROF_END(1); + + PROF_START(2); + // Note: This line is intentionally left blank + PROF_END(2); + + PROF_START(3); + // Note: This line is intentionally left blank + PROF_END(3); + + PROF_START(4); + bool hasAutoCorrectionCandidate = false; + WordsPriorityQueue* masterQueue = queuePool->getMasterQueue(); + if (masterQueue->size() > 0) { + double nsForMaster = masterQueue->getHighestNormalizedScore( + proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0); + hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD); + } + PROF_END(4); + + PROF_START(5); + // Multiple word suggestions + if (SUGGEST_MULTIPLE_WORDS + && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) { + getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, inputLength, correction, queuePool, + hasAutoCorrectionCandidate); + } + PROF_END(5); + + PROF_START(6); + // Note: This line is intentionally left blank + PROF_END(6); + + if (DEBUG_DICT) { + queuePool->dumpSubQueue1TopSuggestions(); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + WordsPriorityQueue* queue = queuePool->getSubQueue(FIRST_WORD_INDEX, i); + if (queue->size() > 0) { + WordsPriorityQueue::SuggestedWord* sw = queue->top(); + const int score = sw->mScore; + const unsigned short* word = sw->mWord; + const int wordLength = sw->mWordLength; + double ns = Correction::RankingAlgorithm::calcNormalizedScore( + proximityInfo->getPrimaryInputWord(), i, word, wordLength, score); + ns += 0; + AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns, + (ns > TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD)); + DUMP_WORD(proximityInfo->getPrimaryInputWord(), i); + DUMP_WORD(word, wordLength); + } + } + } +} + +void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates, + const int *yCoordinates, const int *codes, const int inputLength, Correction *correction) { + if (DEBUG_DICT) { + AKLOGI("initSuggest"); + } + proximityInfo->setInputParams(codes, inputLength, xCoordinates, yCoordinates); + const int maxDepth = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); + correction->initCorrection(proximityInfo, inputLength, maxDepth); +} + +static const char QUOTE = '\''; +static const char SPACE = ' '; + +void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, Correction *correction, + WordsPriorityQueuePool *queuePool) { + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); + getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool, + true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); +} + +void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, + const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool, + const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) { + // TODO: Remove setCorrectionParams + correction->setCorrectionParams(0, 0, 0, + -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, + doAutoCompletion, maxErrors); + int rootPosition = ROOT_POS; + // Get the number of children of root, then increment the position + int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition); + int outputIndex = 0; + + correction->initCorrectionState(rootPosition, childCount, (inputLength <= 0)); + + // Depth first search + while (outputIndex >= 0) { + if (correction->initProcessState(outputIndex)) { + int siblingPos = correction->getTreeSiblingPos(outputIndex); + int firstChildPos; + + const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, + correction, &childCount, &firstChildPos, &siblingPos, queuePool, + currentWordIndex); + // Update next sibling pos + correction->setTreeSiblingPos(outputIndex, siblingPos); + + if (needsToTraverseChildrenNodes) { + // Goes to child node + outputIndex = correction->goDownTree(outputIndex, childCount, firstChildPos); + } + } else { + // Goes to parent sibling node + outputIndex = correction->getTreeParentIndex(outputIndex); + } + } +} + +inline void UnigramDictionary::onTerminal(const int freq, + const TerminalAttributes& terminalAttributes, Correction *correction, + WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, + const int currentWordIndex) { + const int inputIndex = correction->getInputIndex(); + const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT; + + int wordLength; + unsigned short* wordPointer; + + if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) { + WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); + const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); + if (finalFreq != NOT_A_FREQUENCY) { + if (!terminalAttributes.isShortcutOnly()) { + addWord(wordPointer, wordLength, finalFreq, masterQueue); + } + + // Please note that the shortcut candidates will be added to the master queue only. + TerminalAttributes::ShortcutIterator iterator = + terminalAttributes.getShortcutIterator(); + while (iterator.hasNextShortcutTarget()) { + // TODO: addWord only supports weak ordering, meaning we have no means + // to control the order of the shortcuts relative to one another or to the word. + // We need to either modulate the frequency of each shortcut according + // to its own shortcut frequency or to make the queue + // so that the insert order is protected inside the queue for words + // with the same score. + uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; + const int shortcutTargetStringLength = iterator.getNextShortcutTarget( + MAX_WORD_LENGTH_INTERNAL, shortcutTarget); + addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue); + } + } + } + + // We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH + // or more length. + if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) { + WordsPriorityQueue *subQueue; + subQueue = queuePool->getSubQueue(currentWordIndex, inputIndex); + if (!subQueue) { + return; + } + const int finalFreq = correction->getFinalFreqForSubQueue(freq, &wordPointer, &wordLength, + inputIndex); + addWord(wordPointer, wordLength, finalFreq, subQueue); + } +} + +bool UnigramDictionary::getSubStringSuggestion( + ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, + const int *codes, const bool useFullEditDistance, Correction *correction, + WordsPriorityQueuePool* queuePool, const int inputLength, + const bool hasAutoCorrectionCandidate, const int currentWordIndex, + const int inputWordStartPos, const int inputWordLength, + const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, + int*wordLengthArray, unsigned short* outputWord, int *outputWordLength) { + unsigned short* tempOutputWord = 0; + int nextWordLength = 0; + // TODO: Optimize init suggestion + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, + inputLength, correction); + + int freq = getMostFrequentWordLike( + inputWordStartPos, inputWordLength, proximityInfo, mWord); + if (freq > 0) { + nextWordLength = inputWordLength; + tempOutputWord = mWord; + } else if (!hasAutoCorrectionCandidate) { + if (inputWordStartPos > 0) { + const int offset = inputWordStartPos; + initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset], + codes + offset, inputWordLength, correction); + queuePool->clearSubQueue(currentWordIndex); + getSuggestionCandidates(useFullEditDistance, inputWordLength, correction, + queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); + if (DEBUG_DICT) { + if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) { + AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + queuePool->getSubQueue(currentWordIndex, i)->dumpTopWord(); + } + } + } + } + WordsPriorityQueue* queue = queuePool->getSubQueue(currentWordIndex, inputWordLength); + if (!queue || queue->size() < 1) { + return false; + } + int score = 0; + const double ns = queue->getHighestNormalizedScore( + proximityInfo->getPrimaryInputWord(), inputWordLength, + &tempOutputWord, &score, &nextWordLength); + if (DEBUG_DICT) { + AKLOGI("NS(%d) = %f, Score = %d", currentWordIndex, ns, score); + } + // Two words correction won't be done if the score of the first word doesn't exceed the + // threshold. + if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD + || nextWordLength < SUB_QUEUE_MIN_WORD_LENGTH) { + return false; + } + freq = score >> (nextWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER); + } + if (DEBUG_DICT) { + AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d (%d)" + , currentWordIndex, freq, nextWordLength, inputWordLength, inputWordStartPos, + wordLengthArray[0]); + } + if (freq <= 0 || nextWordLength <= 0 + || MAX_WORD_LENGTH <= (outputWordStartPos + nextWordLength)) { + return false; + } + for (int i = 0; i < nextWordLength; ++i) { + outputWord[outputWordStartPos + i] = tempOutputWord[i]; + } + + // Put output values + freqArray[currentWordIndex] = freq; + // TODO: put output length instead of input length + wordLengthArray[currentWordIndex] = inputWordLength; + const int tempOutputWordLength = outputWordStartPos + nextWordLength; + if (outputWordLength) { + *outputWordLength = tempOutputWordLength; + } + + if ((inputWordStartPos + inputWordLength) < inputLength) { + if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) { + return false; + } + outputWord[tempOutputWordLength] = SPACE; + if (outputWordLength) { + ++*outputWordLength; + } + } else if (currentWordIndex >= 1) { + // TODO: Handle 3 or more words + const int pairFreq = correction->getFreqForSplitMultipleWords( + freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord); + if (DEBUG_DICT) { + DUMP_WORD(outputWord, tempOutputWordLength); + AKLOGI("Split two words: %d, %d, %d, %d, (%d) %d", freqArray[0], freqArray[1], pairFreq, + inputLength, wordLengthArray[0], tempOutputWordLength); + } + addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue()); + } + return true; +} + +void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, + Correction *correction, WordsPriorityQueuePool* queuePool, + const bool hasAutoCorrectionCandidate, const int startInputPos, const int startWordIndex, + const int outputWordLength, int *freqArray, int* wordLengthArray, + unsigned short* outputWord) { + if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) { + // Return if the last word index + return; + } + if (startWordIndex >= 1 + && (hasAutoCorrectionCandidate + || inputLength < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) { + // Do not suggest 3+ words if already has auto correction candidate + return; + } + for (int i = startInputPos + 1; i < inputLength; ++i) { + if (DEBUG_CORRECTION_FREQ) { + AKLOGI("Multi words(%d), start in %d sep %d start out %d", + startWordIndex, startInputPos, i, outputWordLength); + DUMP_WORD(outputWord, outputWordLength); + } + int tempOutputWordLength = 0; + // Current word + int inputWordStartPos = startInputPos; + int inputWordLength = i - startInputPos; + if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, + startWordIndex, inputWordStartPos, inputWordLength, outputWordLength, + true /* not used */, freqArray, wordLengthArray, outputWord, + &tempOutputWordLength)) { + continue; + } + + if (DEBUG_CORRECTION_FREQ) { + AKLOGI("Do missing space correction"); + } + // Next word + // Missing space + inputWordStartPos = i; + inputWordLength = inputLength - i; + if(!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, + startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength, + false /* missing space */, freqArray, wordLengthArray, outputWord, 0)) { + getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, inputLength, correction, queuePool, + hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1, + tempOutputWordLength, freqArray, wordLengthArray, outputWord); + } + + // Mistyped space + ++inputWordStartPos; + --inputWordLength; + + if (inputWordLength <= 0) { + continue; + } + + const int x = xcoordinates[inputWordStartPos - 1]; + const int y = ycoordinates[inputWordStartPos - 1]; + if (!proximityInfo->hasSpaceProximity(x, y)) { + continue; + } + + if (DEBUG_CORRECTION_FREQ) { + AKLOGI("Do mistyped space correction"); + } + getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, + startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength, + true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0); + } +} + +void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, + Correction *correction, WordsPriorityQueuePool* queuePool, + const bool hasAutoCorrectionCandidate) { + if (inputLength >= MAX_WORD_LENGTH) return; + if (DEBUG_DICT) { + AKLOGI("--- Suggest multiple words"); + } + + // Allocating fixed length array on stack + unsigned short outputWord[MAX_WORD_LENGTH]; + int freqArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; + int wordLengthArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; + const int outputWordLength = 0; + const int startInputPos = 0; + const int startWordIndex = 0; + getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, inputLength, correction, queuePool, hasAutoCorrectionCandidate, + startInputPos, startWordIndex, outputWordLength, freqArray, wordLengthArray, + outputWord); +} + +// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous +// interface. +inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, + const int inputLength, ProximityInfo *proximityInfo, unsigned short *word) { + uint16_t inWord[inputLength]; + + for (int i = 0; i < inputLength; ++i) { + inWord[i] = (uint16_t)proximityInfo->getPrimaryCharAt(startInputIndex + i); + } + return getMostFrequentWordLikeInner(inWord, inputLength, word); +} + +// This function will take the position of a character array within a CharGroup, +// and check it actually like-matches the word in inWord starting at startInputIndex, +// that is, it matches it with case and accents squashed. +// The function returns true if there was a full match, false otherwise. +// The function will copy on-the-fly the characters in the CharGroup to outNewWord. +// It will also place the end position of the array in outPos; in outInputIndex, +// it will place the index of the first char AFTER the match if there was a match, +// and the initial position if there was not. It makes sense because if there was +// a match we want to continue searching, but if there was not, we want to go to +// the next CharGroup. +// In and out parameters may point to the same location. This function takes care +// not to use any input parameters after it wrote into its outputs. +static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, + const uint8_t* const root, const int startPos, + const uint16_t* const inWord, const int startInputIndex, + int32_t* outNewWord, int* outInputIndex, int* outPos) { + const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags)); + int pos = startPos; + int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + int32_t baseChar = toBaseLowerCase(character); + const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]); + + if (baseChar != wChar) { + *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos; + *outInputIndex = startInputIndex; + return false; + } + int inputIndex = startInputIndex; + outNewWord[inputIndex] = character; + if (hasMultipleChars) { + character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + while (NOT_A_CHARACTER != character) { + baseChar = toBaseLowerCase(character); + if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) { + *outPos = BinaryFormat::skipOtherCharacters(root, pos); + *outInputIndex = startInputIndex; + return false; + } + outNewWord[inputIndex] = character; + character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + } + } + *outInputIndex = inputIndex + 1; + *outPos = pos; + return true; +} + +// This function is invoked when a word like the word searched for is found. +// It will compare the frequency to the max frequency, and if greater, will +// copy the word into the output buffer. In output value maxFreq, it will +// write the new maximum frequency if it changed. +static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length, + short unsigned int* outWord, int* maxFreq) { + if (freq > *maxFreq) { + for (int q = 0; q < length; ++q) + outWord[q] = newWord[q]; + outWord[length] = 0; + *maxFreq = freq; + } +} + +// Will find the highest frequency of the words like the one passed as an argument, +// that is, everything that only differs by case/accents. +int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord, + const int length, short unsigned int* outWord) { + int32_t newWord[MAX_WORD_LENGTH_INTERNAL]; + int depth = 0; + int maxFreq = -1; + const uint8_t* const root = DICT_ROOT; + + int startPos = 0; + mStackChildCount[0] = BinaryFormat::getGroupCountAndForwardPointer(root, &startPos); + mStackInputIndex[0] = 0; + mStackSiblingPos[0] = startPos; + while (depth >= 0) { + const int charGroupCount = mStackChildCount[depth]; + int pos = mStackSiblingPos[depth]; + for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) { + int inputIndex = mStackInputIndex[depth]; + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); + // Test whether all chars in this group match with the word we are searching for. If so, + // we want to traverse its children (or if the length match, evaluate its frequency). + // Note that this function will output the position regardless, but will only write + // into inputIndex if there is a match. + const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord, + inputIndex, newWord, &inputIndex, &pos); + if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) { + const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); + onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq); + } + pos = BinaryFormat::skipFrequency(flags, pos); + const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); + const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos); + // If we had a match and the word has children, we want to traverse them. We don't have + // to traverse words longer than the one we are searching for, since they will not match + // anyway, so don't traverse unless inputIndex < length. + if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) { + // Save position for this depth, to get back to this once children are done + mStackChildCount[depth] = charGroupIndex; + mStackSiblingPos[depth] = siblingPos; + // Prepare stack values for next depth + ++depth; + int childrenPos = childrenNodePos; + mStackChildCount[depth] = + BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos); + mStackSiblingPos[depth] = childrenPos; + mStackInputIndex[depth] = inputIndex; + pos = childrenPos; + // Go to the next depth level. + ++depth; + break; + } else { + // No match, or no children, or word too long to ever match: go the next sibling. + pos = siblingPos; + } + } + --depth; + } + return maxFreq; +} + +bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const { + return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length); +} + +// TODO: remove this function. +int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset, + int length) const { + return -1; +} + +// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not. +// If the return value is false, then the caller should read in the output "nextSiblingPosition" +// to find out the address of the next sibling node and pass it to a new call of processCurrentNode. +// It is worthy to note that when false is returned, the output values other than +// nextSiblingPosition are undefined. +// If the return value is true, then the caller must proceed to traverse the children of this +// node. processCurrentNode will output the information about the children: their count in +// newCount, their position in newChildrenPosition, the traverseAllNodes flag in +// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the +// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into +// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when +// there aren't any more nodes at this level, it merely returns the address of the first byte after +// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any +// given level, as output into newCount when traversing this level's parent. +inline bool UnigramDictionary::processCurrentNode(const int initialPos, + Correction *correction, int *newCount, + int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, + const int currentWordIndex) { + if (DEBUG_DICT) { + correction->checkState(); + } + int pos = initialPos; + + // Flags contain the following information: + // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits: + // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address + // is on the specified number of bytes. + // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address. + // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not. + // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children) + // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos); + const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); + const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags)); + + bool needsToInvokeOnTerminal = false; + + // This gets only ONE character from the stream. Next there will be: + // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node + // else if FLAG_IS_TERMINAL: the frequency + // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address + // Note that you can't have a node that both is not a terminal and has no children. + int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); + assert(NOT_A_CHARACTER != c); + + // We are going to loop through each character and make it look like it's a different + // node each time. To do that, we will process characters in this node in order until + // we find the character terminator. This is signalled by getCharCode* returning + // NOT_A_CHARACTER. + // As a special case, if there is only one character in this node, we must not read the + // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags. + // This way, each loop run will look like a "virtual node". + do { + // We prefetch the next char. If 'c' is the last char of this node, we will have + // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node + // should behave as a terminal or not and whether we have children. + const int32_t nextc = hasMultipleChars + ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER; + const bool isLastChar = (NOT_A_CHARACTER == nextc); + // If there are more chars in this nodes, then this virtual node is not a terminal. + // If we are on the last char, this virtual node is a terminal if this node is. + const bool isTerminal = isLastChar && isTerminalNode; + + Correction::CorrectionType stateType = correction->processCharAndCalcState( + c, isTerminal); + if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL + || stateType == Correction::ON_TERMINAL) { + needsToInvokeOnTerminal = true; + } else if (stateType == Correction::UNRELATED || correction->needsToPrune()) { + // We found that this is an unrelated character, so we should give up traversing + // this node and its children entirely. + // However we may not be on the last virtual node yet so we skip the remaining + // characters in this node, the frequency if it's there, read the next sibling + // position to output it, then return false. + // We don't have to output other values because we return false, as in + // "don't traverse children". + if (!isLastChar) { + pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); + } + pos = BinaryFormat::skipFrequency(flags, pos); + *nextSiblingPosition = + BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + return false; + } + + // Prepare for the next character. Promote the prefetched char to current char - the loop + // will take care of prefetching the next. If we finally found our last char, nextc will + // contain NOT_A_CHARACTER. + c = nextc; + } while (NOT_A_CHARACTER != c); + + if (isTerminalNode) { + // The frequency should be here, because we come here only if this is actually + // a terminal node, and we are on its last char. + const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); + const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); + const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); + TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); + onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, + currentWordIndex); + + // If there are more chars in this node, then this virtual node has children. + // If we are on the last char, this virtual node has children if this node has. + const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); + + // This character matched the typed character (enough to traverse the node at least) + // so we just evaluated it. Now we should evaluate this virtual node's children - that + // is, if it has any. If it has no children, we're done here - so we skip the end of + // the node, output the siblings position, and return false "don't traverse children". + // Note that !hasChildren implies isLastChar, so we know we don't have to skip any + // remaining char in this group for there can't be any. + if (!hasChildren) { + pos = BinaryFormat::skipFrequency(flags, pos); + *nextSiblingPosition = + BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + return false; + } + + // Optimization: Prune out words that are too long compared to how much was typed. + if (correction->needsToPrune()) { + pos = BinaryFormat::skipFrequency(flags, pos); + *nextSiblingPosition = + BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + if (DEBUG_DICT_FULL) { + AKLOGI("Traversing was pruned."); + } + return false; + } + } + + // Now we finished processing this node, and we want to traverse children. If there are no + // children, we can't come here. + assert(BinaryFormat::hasChildrenInFlags(flags)); + + // If this node was a terminal it still has the frequency under the pointer (it may have been + // read, but not skipped - see readFrequencyWithoutMovingPointer). + // Next come the children position, then possibly attributes (attributes are bigrams only for + // now, maybe something related to shortcuts in the future). + // Once this is read, we still need to output the number of nodes in the immediate children of + // this node, so we read and output it before returning true, as in "please traverse children". + pos = BinaryFormat::skipFrequency(flags, pos); + int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos); + *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos); + *newChildrenPosition = childrenPos; + return true; +} + +} // namespace latinime diff --git a/native/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h index ef9709a89..c8f15566c 100644 --- a/native/src/unigram_dictionary.h +++ b/native/jni/src/unigram_dictionary.h @@ -22,17 +22,16 @@ #include "correction_state.h" #include "defines.h" #include "proximity_info.h" - -#ifndef NULL -#define NULL 0 -#endif +#include "words_priority_queue.h" +#include "words_priority_queue_pool.h" namespace latinime { +class TerminalAttributes; class UnigramDictionary { + typedef struct { int first; int second; int replacement; } digraph_t; -public: - + public: // Mask and flags for children address type selection. static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; @@ -46,8 +45,14 @@ public: // Flag for terminal groups static const int FLAG_IS_TERMINAL = 0x10; + // Flag for shortcut targets presence + static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08; // Flag for bigram presence static const int FLAG_HAS_BIGRAMS = 0x04; + // Flag for shortcut-only words. Some words are shortcut-only, which means they match when + // the user types them but they don't pop in the suggestion strip, only the words they are + // shortcuts for do. + static const int FLAG_IS_SHORTCUT_ONLY = 0x02; // Attribute (bigram/shortcut) related flags: // Flag for presence of more attributes @@ -64,58 +69,86 @@ public: static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; + // Error tolerances + static const int DEFAULT_MAX_ERRORS = 2; + static const int MAX_ERRORS_FOR_TWO_WORDS = 1; + UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, - int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, + int fullWordMultiplier, int maxWordLength, int maxWords, const bool isLatestDictVersion); bool isValidWord(const uint16_t* const inWord, const int length) const; int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; - int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, + int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, + Correction *correction, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, const int flags, unsigned short *outWords, int *frequencies); virtual ~UnigramDictionary(); -private: - + private: void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const int codesSize, - unsigned short *outWords, int *frequencies, const int flags); - bool isDigraph(const int* codes, const int i, const int codesSize) const; + const int *ycoordinates, const int *codes, const int inputLength, + const int flags, Correction *correction, WordsPriorityQueuePool *queuePool); + int getDigraphReplacement(const int *codes, const int i, const int codesSize, + const digraph_t* const digraphs, const unsigned int digraphsSize) const; void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, - const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain, - const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies); + int *xCoordinatesBuffer, int *yCoordinatesBuffer, + const int codesBufferSize, const int flags, const int* codesSrc, + const int codesRemain, const int currentDepth, int* codesDest, Correction *correction, + WordsPriorityQueuePool* queuePool, const digraph_t* const digraphs, + const unsigned int digraphsSize); void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const int codesSize, - unsigned short *outWords, int *frequencies); - void getSuggestionCandidates(const bool useFullEditDistance); - bool addWord(unsigned short *word, int length, int frequency); - void getSplitTwoWordsSuggestion(const int inputLength, Correction *correction); - void getMissingSpaceWords(const int inputLength, const int missingSpacePos, - Correction *correction, const bool useFullEditDistance); - void getMistypedSpaceWords(const int inputLength, const int spaceProximityPos, - Correction *correction, const bool useFullEditDistance); - void onTerminal(const int freq, Correction *correction); + const int *ycoordinates, const int *codes, const int codesSize, Correction *correction); + void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, + const int *ycoordinates, const int *codes, const bool useFullEditDistance, + const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool); + void getSuggestionCandidates( + const bool useFullEditDistance, const int inputLength, Correction *correction, + WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors, + const int currentWordIndex); + void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, + Correction *correction, WordsPriorityQueuePool* queuePool, + const bool hasAutoCorrectionCandidate); + void onTerminal(const int freq, const TerminalAttributes& terminalAttributes, + Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, + const int currentWordIndex); bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); // Process a node by considering proximity, missing and excessive character - bool processCurrentNode(const int initialPos, - Correction *correction, int *newCount, - int *newChildPosition, int *nextSiblingPosition); + bool processCurrentNode(const int initialPos, Correction *correction, int *newCount, + int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, + const int currentWordIndex); int getMostFrequentWordLike(const int startInputIndex, const int inputLength, - unsigned short *word); + ProximityInfo *proximityInfo, unsigned short *word); int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, - short unsigned int* outWord); + short unsigned int *outWord); + bool getSubStringSuggestion( + ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, + const int *codes, const bool useFullEditDistance, Correction *correction, + WordsPriorityQueuePool* queuePool, const int inputLength, + const bool hasAutoCorrectionCandidate, const int currentWordIndex, + const int inputWordStartPos, const int inputWordLength, + const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, + int *wordLengthArray, unsigned short* outputWord, int *outputWordLength); + void getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, + Correction *correction, WordsPriorityQueuePool* queuePool, + const bool hasAutoCorrectionCandidate, const int startPos, const int startWordIndex, + const int outputWordLength, int *freqArray, int* wordLengthArray, + unsigned short* outputWord); const uint8_t* const DICT_ROOT; const int MAX_WORD_LENGTH; const int MAX_WORDS; - const int MAX_PROXIMITY_CHARS; const bool IS_LATEST_DICT_VERSION; const int TYPED_LETTER_MULTIPLIER; const int FULL_WORD_MULTIPLIER; const int ROOT_POS; const unsigned int BYTES_IN_ONE_CHAR; - const int MAX_UMLAUT_SEARCH_DEPTH; + const int MAX_DIGRAPH_SEARCH_DEPTH; // Flags for special processing // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java @@ -123,18 +156,14 @@ private: // Please update both at the same time. enum { REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1, - USE_FULL_EDIT_DISTANCE = 0x2 + USE_FULL_EDIT_DISTANCE = 0x2, + REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4 }; - static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[]; - - int *mFrequencies; - unsigned short *mOutputChars; - ProximityInfo *mProximityInfo; - Correction *mCorrection; - int mInputLength; - // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH - unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; + static const digraph_t GERMAN_UMLAUT_DIGRAPHS[]; + static const digraph_t FRENCH_LIGATURES_DIGRAPHS[]; + // Still bundled members + unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];// TODO: remove int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove diff --git a/native/jni/src/words_priority_queue.h b/native/jni/src/words_priority_queue.h new file mode 100644 index 000000000..249962eec --- /dev/null +++ b/native/jni/src/words_priority_queue.h @@ -0,0 +1,195 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_WORDS_PRIORITY_QUEUE_H +#define LATINIME_WORDS_PRIORITY_QUEUE_H + +#include <cstring> // for memcpy() +#include <iostream> +#include <queue> +#include "defines.h" + +namespace latinime { + +class WordsPriorityQueue { + public: + class SuggestedWord { + public: + int mScore; + unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; + int mWordLength; + bool mUsed; + + void setParams(int score, unsigned short* word, int wordLength) { + mScore = score; + mWordLength = wordLength; + memcpy(mWord, word, sizeof(unsigned short) * wordLength); + mUsed = true; + } + }; + + WordsPriorityQueue(int maxWords, int maxWordLength) : + MAX_WORDS((unsigned int) maxWords), MAX_WORD_LENGTH( + (unsigned int) maxWordLength) { + mSuggestedWords = new SuggestedWord[maxWordLength]; + for (int i = 0; i < maxWordLength; ++i) { + mSuggestedWords[i].mUsed = false; + } + mHighestSuggestedWord = 0; + } + + ~WordsPriorityQueue() { + delete[] mSuggestedWords; + } + + void push(int score, unsigned short* word, int wordLength) { + SuggestedWord* sw = 0; + if (mSuggestions.size() >= MAX_WORDS) { + sw = mSuggestions.top(); + const int minScore = sw->mScore; + if (minScore >= score) { + return; + } else { + sw->mUsed = false; + mSuggestions.pop(); + } + } + if (sw == 0) { + sw = getFreeSuggestedWord(score, word, wordLength); + } else { + sw->setParams(score, word, wordLength); + } + if (sw == 0) { + AKLOGE("SuggestedWord is accidentally null."); + return; + } + if (DEBUG_WORDS_PRIORITY_QUEUE) { + AKLOGI("Push word. %d, %d", score, wordLength); + DUMP_WORD(word, wordLength); + } + mSuggestions.push(sw); + if (!mHighestSuggestedWord || mHighestSuggestedWord->mScore < sw->mScore) { + mHighestSuggestedWord = sw; + } + } + + SuggestedWord* top() { + if (mSuggestions.empty()) return 0; + SuggestedWord* sw = mSuggestions.top(); + return sw; + } + + int outputSuggestions(int *frequencies, unsigned short *outputChars) { + mHighestSuggestedWord = 0; + const unsigned int size = min( + MAX_WORDS, static_cast<unsigned int>(mSuggestions.size())); + int index = size - 1; + while (!mSuggestions.empty() && index >= 0) { + SuggestedWord* sw = mSuggestions.top(); + if (DEBUG_WORDS_PRIORITY_QUEUE) { + AKLOGI("dump word. %d", sw->mScore); + DUMP_WORD(sw->mWord, sw->mWordLength); + } + const unsigned int wordLength = sw->mWordLength; + char* targetAdr = (char*) outputChars + + (index) * MAX_WORD_LENGTH * sizeof(short); + frequencies[index] = sw->mScore; + memcpy(targetAdr, sw->mWord, (wordLength) * sizeof(short)); + if (wordLength < MAX_WORD_LENGTH) { + ((unsigned short*) targetAdr)[wordLength] = 0; + } + sw->mUsed = false; + mSuggestions.pop(); + --index; + } + return size; + } + + int size() const { + return mSuggestions.size(); + } + + void clear() { + mHighestSuggestedWord = 0; + while (!mSuggestions.empty()) { + SuggestedWord* sw = mSuggestions.top(); + if (DEBUG_WORDS_PRIORITY_QUEUE) { + AKLOGI("Clear word. %d", sw->mScore); + DUMP_WORD(sw->mWord, sw->mWordLength); + } + sw->mUsed = false; + mSuggestions.pop(); + } + } + + void dumpTopWord() { + if (size() <= 0) { + return; + } + DUMP_WORD(mHighestSuggestedWord->mWord, mHighestSuggestedWord->mWordLength); + } + + double getHighestNormalizedScore(const unsigned short* before, const int beforeLength, + unsigned short** outWord, int *outScore, int *outLength) { + if (!mHighestSuggestedWord) { + return 0.0; + } + SuggestedWord* sw = mHighestSuggestedWord; + const int score = sw->mScore; + unsigned short* word = sw->mWord; + const int wordLength = sw->mWordLength; + if (outScore) { + *outScore = score; + } + if (outWord) { + *outWord = word; + } + if (outLength) { + *outLength = wordLength; + } + return Correction::RankingAlgorithm::calcNormalizedScore( + before, beforeLength, word, wordLength, score); + } + + private: + struct wordComparator { + bool operator ()(SuggestedWord * left, SuggestedWord * right) { + return left->mScore > right->mScore; + } + }; + + SuggestedWord* getFreeSuggestedWord(int score, unsigned short* word, + int wordLength) { + for (unsigned int i = 0; i < MAX_WORD_LENGTH; ++i) { + if (!mSuggestedWords[i].mUsed) { + mSuggestedWords[i].setParams(score, word, wordLength); + return &mSuggestedWords[i]; + } + } + return 0; + } + + typedef std::priority_queue<SuggestedWord*, std::vector<SuggestedWord*>, + wordComparator> Suggestions; + Suggestions mSuggestions; + const unsigned int MAX_WORDS; + const unsigned int MAX_WORD_LENGTH; + SuggestedWord* mSuggestedWords; + SuggestedWord* mHighestSuggestedWord; +}; +} + +#endif // LATINIME_WORDS_PRIORITY_QUEUE_H diff --git a/native/jni/src/words_priority_queue_pool.h b/native/jni/src/words_priority_queue_pool.h new file mode 100644 index 000000000..5b50e8f4f --- /dev/null +++ b/native/jni/src/words_priority_queue_pool.h @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_WORDS_PRIORITY_QUEUE_POOL_H +#define LATINIME_WORDS_PRIORITY_QUEUE_POOL_H + +#include <assert.h> +#include <new> +#include "words_priority_queue.h" + +namespace latinime { + +class WordsPriorityQueuePool { + public: + WordsPriorityQueuePool(int mainQueueMaxWords, int subQueueMaxWords, int maxWordLength) { + mMasterQueue = new(mMasterQueueBuf) WordsPriorityQueue(mainQueueMaxWords, maxWordLength); + for (int i = 0, subQueueBufOffset = 0; + i < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS * SUB_QUEUE_MAX_COUNT; + ++i, subQueueBufOffset += sizeof(WordsPriorityQueue)) { + mSubQueues[i] = new(mSubQueueBuf + subQueueBufOffset) + WordsPriorityQueue(subQueueMaxWords, maxWordLength); + } + } + + virtual ~WordsPriorityQueuePool() { + } + + WordsPriorityQueue* getMasterQueue() { + return mMasterQueue; + } + + WordsPriorityQueue* getSubQueue(const int wordIndex, const int inputWordLength) { + if (wordIndex >= MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) { + return 0; + } + if (inputWordLength < 0 || inputWordLength >= SUB_QUEUE_MAX_COUNT) { + if (DEBUG_WORDS_PRIORITY_QUEUE) { + assert(false); + } + return 0; + } + return mSubQueues[wordIndex * SUB_QUEUE_MAX_COUNT + inputWordLength]; + } + + inline void clearAll() { + mMasterQueue->clear(); + for (int i = 0; i < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS; ++i) { + clearSubQueue(i); + } + } + + inline void clearSubQueue(const int wordIndex) { + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + WordsPriorityQueue* queue = getSubQueue(wordIndex, i); + if (queue) { + queue->clear(); + } + } + } + + void dumpSubQueue1TopSuggestions() { + AKLOGI("DUMP SUBQUEUE1 TOP SUGGESTIONS"); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + getSubQueue(0, i)->dumpTopWord(); + } + } + + private: + WordsPriorityQueue* mMasterQueue; + WordsPriorityQueue* mSubQueues[SUB_QUEUE_MAX_COUNT * MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; + char mMasterQueueBuf[sizeof(WordsPriorityQueue)]; + char mSubQueueBuf[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS + * SUB_QUEUE_MAX_COUNT * sizeof(WordsPriorityQueue)]; +}; +} + +#endif // LATINIME_WORDS_PRIORITY_QUEUE_POOL_H diff --git a/native/src/char_utils.h b/native/src/char_utils.h deleted file mode 100644 index a69a35e7a..000000000 --- a/native/src/char_utils.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (C) 2010 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_CHAR_UTILS_H -#define LATINIME_CHAR_UTILS_H - -namespace latinime { - -unsigned short latin_tolower(unsigned short c); - -} // namespace latinime - -#endif // LATINIME_CHAR_UTILS_H diff --git a/native/src/dictionary.h b/native/src/dictionary.h deleted file mode 100644 index d5de0083a..000000000 --- a/native/src/dictionary.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (C) 2009 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DICTIONARY_H -#define LATINIME_DICTIONARY_H - -#include "basechars.h" -#include "bigram_dictionary.h" -#include "char_utils.h" -#include "defines.h" -#include "proximity_info.h" -#include "unigram_dictionary.h" - -namespace latinime { - -class Dictionary { -public: - Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler, - int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives); - int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates, - int *codes, int codesSize, int flags, unsigned short *outWords, int *frequencies) { - return mUnigramDictionary->getSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, - codesSize, flags, outWords, frequencies); - } - - // TODO: Call mBigramDictionary instead of mUnigramDictionary - int getBigrams(unsigned short *word, int length, int *codes, int codesSize, - unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams, - int maxAlternatives) { - return mBigramDictionary->getBigrams(word, length, codes, codesSize, outWords, frequencies, - maxWordLength, maxBigrams, maxAlternatives); - } - - bool isValidWord(unsigned short *word, int length); - void *getDict() { return (void *)mDict; } - int getDictSize() { return mDictSize; } - int getMmapFd() { return mMmapFd; } - int getDictBufAdjust() { return mDictBufAdjust; } - ~Dictionary(); - - // public static utility methods - // static inline methods should be defined in the header file - static unsigned short getChar(const unsigned char *dict, int *pos); - static int getCount(const unsigned char *dict, int *pos); - static bool getTerminal(const unsigned char *dict, int *pos); - static int getAddress(const unsigned char *dict, int *pos); - static int getFreq(const unsigned char *dict, const bool isLatestDictVersion, int *pos); - static int wideStrLen(unsigned short *str); - // returns next sibling's position - static int setDictionaryValues(const unsigned char *dict, const bool isLatestDictVersion, - const int pos, unsigned short *c, int *childrenPosition, - bool *terminal, int *freq); - static inline unsigned short toBaseLowerCase(unsigned short c); - -private: - bool hasBigram(); - - const unsigned char *mDict; - - // Used only for the mmap version of dictionary loading, but we use these as dummy variables - // also for the malloc version. - const int mDictSize; - const int mMmapFd; - const int mDictBufAdjust; - - const bool IS_LATEST_DICT_VERSION; - UnigramDictionary *mUnigramDictionary; - BigramDictionary *mBigramDictionary; -}; - -// public static utility methods -// static inline methods should be defined in the header file -inline unsigned short Dictionary::getChar(const unsigned char *dict, int *pos) { - unsigned short ch = (unsigned short) (dict[(*pos)++] & 0xFF); - // If the code is 255, then actual 16 bit code follows (in big endian) - if (ch == 0xFF) { - ch = ((dict[*pos] & 0xFF) << 8) | (dict[*pos + 1] & 0xFF); - (*pos) += 2; - } - return ch; -} - -inline int Dictionary::getCount(const unsigned char *dict, int *pos) { - return dict[(*pos)++] & 0xFF; -} - -inline bool Dictionary::getTerminal(const unsigned char *dict, int *pos) { - return (dict[*pos] & FLAG_TERMINAL_MASK) > 0; -} - -inline int Dictionary::getAddress(const unsigned char *dict, int *pos) { - int address = 0; - if ((dict[*pos] & FLAG_ADDRESS_MASK) == 0) { - *pos += 1; - } else { - address += (dict[*pos] & (ADDRESS_MASK >> 16)) << 16; - address += (dict[*pos + 1] & 0xFF) << 8; - address += (dict[*pos + 2] & 0xFF); - *pos += 3; - } - return address; -} - -inline int Dictionary::getFreq(const unsigned char *dict, - const bool isLatestDictVersion, int *pos) { - int freq = dict[(*pos)++] & 0xFF; - if (isLatestDictVersion) { - // skipping bigram - int bigramExist = (dict[*pos] & FLAG_BIGRAM_READ); - if (bigramExist > 0) { - int nextBigramExist = 1; - while (nextBigramExist > 0) { - (*pos) += 3; - nextBigramExist = (dict[(*pos)++] & FLAG_BIGRAM_CONTINUED); - } - } else { - (*pos)++; - } - } - return freq; -} - -inline int Dictionary::wideStrLen(unsigned short *str) { - if (!str) return 0; - unsigned short *end = str; - while (*end) - end++; - return end - str; -} - -inline int Dictionary::setDictionaryValues(const unsigned char *dict, - const bool isLatestDictVersion, const int pos, unsigned short *c,int *childrenPosition, - bool *terminal, int *freq) { - int position = pos; - // -- at char - *c = Dictionary::getChar(dict, &position); - // -- at flag/add - *terminal = Dictionary::getTerminal(dict, &position); - *childrenPosition = Dictionary::getAddress(dict, &position); - // -- after address or flag - *freq = (*terminal) ? Dictionary::getFreq(dict, isLatestDictVersion, &position) : 1; - // returns next sibling's position - return position; -} - - -inline unsigned short Dictionary::toBaseLowerCase(unsigned short c) { - if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { - c = BASE_CHARS[c]; - } - if (c >='A' && c <= 'Z') { - c |= 32; - } else if (c > 127) { - c = latin_tolower(c); - } - return c; -} - -} // namespace latinime - -#endif // LATINIME_DICTIONARY_H diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp deleted file mode 100644 index 8eb5a9700..000000000 --- a/native/src/unigram_dictionary.cpp +++ /dev/null @@ -1,729 +0,0 @@ -/* -** -** Copyright 2010, The Android Open Source Project -** -** Licensed under the Apache License, Version 2.0 (the "License"); -** you may not use this file except in compliance with the License. -** You may obtain a copy of the License at -** -** http://www.apache.org/licenses/LICENSE-2.0 -** -** Unless required by applicable law or agreed to in writing, software -** distributed under the License is distributed on an "AS IS" BASIS, -** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -** See the License for the specific language governing permissions and -** limitations under the License. -*/ - -#include <assert.h> -#include <string.h> - -#define LOG_TAG "LatinIME: unigram_dictionary.cpp" - -#include "char_utils.h" -#include "dictionary.h" -#include "unigram_dictionary.h" - -#include "binary_format.h" - -namespace latinime { - -const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = - { { 'a', 'e' }, - { 'o', 'e' }, - { 'u', 'e' } }; - -// TODO: check the header -UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier, - int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, - const bool isLatestDictVersion) - : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE), - MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), - MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion), - TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), - // TODO : remove this variable. - ROOT_POS(0), - BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)), - MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) { - if (DEBUG_DICT) { - LOGI("UnigramDictionary - constructor"); - } - mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier); -} - -UnigramDictionary::~UnigramDictionary() { - delete mCorrection; -} - -static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize, - const int MAX_PROXIMITY_CHARS) { - return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize; -} - -bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const { - - // There can't be a digraph if we don't have at least 2 characters to examine - if (i + 2 > codesSize) return false; - - // Search for the first char of some digraph - int lastDigraphIndex = -1; - const int thisChar = codes[i * MAX_PROXIMITY_CHARS]; - for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1; - lastDigraphIndex >= 0; --lastDigraphIndex) { - if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break; - } - // No match: return early - if (lastDigraphIndex < 0) return false; - - // It's an interesting digraph if the second char matches too. - return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS]; -} - -// Mostly the same arguments as the non-recursive version, except: -// codes is the original value. It points to the start of the work buffer, and gets passed as is. -// codesSize is the size of the user input (thus, it is the size of codesSrc). -// codesDest is the current point in the work buffer. -// codesSrc is the current point in the user-input, original, content-unmodified buffer. -// codesRemain is the remaining size in codesSrc. -void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, - const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, - const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain, - const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) { - - if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) { - for (int i = 0; i < codesRemain; ++i) { - if (isDigraph(codesSrc, i, codesRemain)) { - // Found a digraph. We will try both spellings. eg. the word is "pruefen" - - // Copy the word up to the first char of the digraph, then continue processing - // on the remaining part of the word, skipping the second char of the digraph. - // In our example, copy "pru" and continue running on "fen" - // Make i the index of the second char of the digraph for simplicity. Forgetting - // to do that results in an infinite recursion so take care! - ++i; - memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR); - getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, - codesBuffer, codesBufferSize, flags, - codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1, - currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords, - frequencies); - - // Copy the second char of the digraph in place, then continue processing on - // the remaining part of the word. - // In our example, after "pru" in the buffer copy the "e", and continue on "fen" - memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS, - BYTES_IN_ONE_CHAR); - getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, - codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS, - codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, - outWords, frequencies); - return; - } - } - } - - // If we come here, we hit the end of the word: let's check it against the dictionary. - // In our example, we'll come here once for "prufen" and then once for "pruefen". - // If the word contains several digraphs, we'll come it for the product of them. - // eg. if the word is "ueberpruefen" we'll test, in order, against - // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen". - const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain; - if (0 != remainingBytes) - memcpy(codesDest, codesSrc, remainingBytes); - - getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer, - (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies, - flags); -} - -int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const int codesSize, const int flags, - unsigned short *outWords, int *frequencies) { - - if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags) - { // Incrementally tune the word and try all possibilities - int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)]; - getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, - codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies); - } else { // Normal processing - getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, - outWords, frequencies, flags); - } - - PROF_START(20); - // Get the word count - int suggestedWordsCount = 0; - while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) { - suggestedWordsCount++; - } - - if (DEBUG_DICT) { - LOGI("Returning %d words", suggestedWordsCount); - /// Print the returned words - for (int j = 0; j < suggestedWordsCount; ++j) { -#ifdef FLAG_DBG - short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH; - char s[MAX_WORD_LENGTH]; - for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; - LOGI("%s %i", s, mFrequencies[j]); -#endif - } - } - PROF_END(20); - PROF_CLOSE; - return suggestedWordsCount; -} - -void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, - const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, - unsigned short *outWords, int *frequencies, const int flags) { - - PROF_OPEN; - PROF_START(0); - initSuggestions( - proximityInfo, xcoordinates, ycoordinates, codes, codesSize, outWords, frequencies); - if (DEBUG_DICT) assert(codesSize == mInputLength); - - const int maxDepth = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); - mCorrection->initCorrection(mProximityInfo, mInputLength, maxDepth); - PROF_END(0); - - const bool useFullEditDistance = USE_FULL_EDIT_DISTANCE & flags; - // TODO: remove - PROF_START(1); - getSuggestionCandidates(useFullEditDistance); - PROF_END(1); - - PROF_START(2); - // Note: This line is intentionally left blank - PROF_END(2); - - PROF_START(3); - // Note: This line is intentionally left blank - PROF_END(3); - - PROF_START(4); - // Note: This line is intentionally left blank - PROF_END(4); - - PROF_START(5); - // Suggestions with missing space - if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER - && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) { - for (int i = 1; i < codesSize; ++i) { - if (DEBUG_DICT) { - LOGI("--- Suggest missing space characters %d", i); - } - getMissingSpaceWords(mInputLength, i, mCorrection, useFullEditDistance); - } - } - PROF_END(5); - - PROF_START(6); - if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) { - // The first and last "mistyped spaces" are taken care of by excessive character handling - for (int i = 1; i < codesSize - 1; ++i) { - if (DEBUG_DICT) { - LOGI("--- Suggest words with proximity space %d", i); - } - const int x = xcoordinates[i]; - const int y = ycoordinates[i]; - if (DEBUG_PROXIMITY_INFO) { - LOGI("Input[%d] x = %d, y = %d, has space proximity = %d", - i, x, y, proximityInfo->hasSpaceProximity(x, y)); - } - if (proximityInfo->hasSpaceProximity(x, y)) { - getMistypedSpaceWords(mInputLength, i, mCorrection, useFullEditDistance); - } - } - } - PROF_END(6); -} - -void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates, - const int *yCoordinates, const int *codes, const int codesSize, - unsigned short *outWords, int *frequencies) { - if (DEBUG_DICT) { - LOGI("initSuggest"); - } - mFrequencies = frequencies; - mOutputChars = outWords; - mInputLength = codesSize; - proximityInfo->setInputParams(codes, codesSize, xCoordinates, yCoordinates); - mProximityInfo = proximityInfo; -} - -static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) { - if (c < nextLettersSize) { - nextLetters[c]++; - } -} - -// TODO: We need to optimize addWord by using STL or something -// TODO: This needs to take an const unsigned short* and not tinker with its contents -bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) { - word[length] = 0; - if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) { -#ifdef FLAG_DBG - char s[length + 1]; - for (int i = 0; i <= length; i++) s[i] = word[i]; - LOGI("Found word = %s, freq = %d", s, frequency); -#endif - } - if (length > MAX_WORD_LENGTH) { - if (DEBUG_DICT) { - LOGI("Exceeded max word length."); - } - return false; - } - - // Find the right insertion point - int insertAt = 0; - while (insertAt < MAX_WORDS) { - // TODO: How should we sort words with the same frequency? - if (frequency > mFrequencies[insertAt]) { - break; - } - insertAt++; - } - if (insertAt < MAX_WORDS) { - if (DEBUG_DICT) { -#ifdef FLAG_DBG - char s[length + 1]; - for (int i = 0; i <= length; i++) s[i] = word[i]; - LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX); -#endif - } - memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]), - (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]), - (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0])); - mFrequencies[insertAt] = frequency; - memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short), - (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short), - (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH); - unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH; - while (length--) { - *dest++ = *word++; - } - *dest = 0; // NULL terminate - if (DEBUG_DICT) { - LOGI("Added word at %d", insertAt); - } - return true; - } - return false; -} - -static const char QUOTE = '\''; -static const char SPACE = ' '; - -void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance) { - // TODO: Remove setCorrectionParams - mCorrection->setCorrectionParams(0, 0, 0, - -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance); - int rootPosition = ROOT_POS; - // Get the number of children of root, then increment the position - int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); - int outputIndex = 0; - - mCorrection->initCorrectionState(rootPosition, childCount, (mInputLength <= 0)); - - // Depth first search - while (outputIndex >= 0) { - if (mCorrection->initProcessState(outputIndex)) { - int siblingPos = mCorrection->getTreeSiblingPos(outputIndex); - int firstChildPos; - - const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, - mCorrection, &childCount, &firstChildPos, &siblingPos); - // Update next sibling pos - mCorrection->setTreeSiblingPos(outputIndex, siblingPos); - - if (needsToTraverseChildrenNodes) { - // Goes to child node - outputIndex = mCorrection->goDownTree(outputIndex, childCount, firstChildPos); - } - } else { - // Goes to parent sibling node - outputIndex = mCorrection->getTreeParentIndex(outputIndex); - } - } -} - -void UnigramDictionary::getMissingSpaceWords( - const int inputLength, const int missingSpacePos, Correction *correction, - const bool useFullEditDistance) { - correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, - -1 /* transposedPos */, -1 /* spaceProximityPos */, missingSpacePos, - useFullEditDistance); - getSplitTwoWordsSuggestion(inputLength, correction); -} - -void UnigramDictionary::getMistypedSpaceWords( - const int inputLength, const int spaceProximityPos, Correction *correction, - const bool useFullEditDistance) { - correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, - -1 /* transposedPos */, spaceProximityPos, -1 /* missingSpacePos */, - useFullEditDistance); - getSplitTwoWordsSuggestion(inputLength, correction); -} - -inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c, - const int inputIndex, const int skipPos, const int depth) { - const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(inputIndex); - // Skip the ' or other letter and continue deeper - return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth; -} - -inline void UnigramDictionary::onTerminal(const int freq, Correction *correction) { - int wordLength; - unsigned short* wordPointer; - const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); - if (finalFreq >= 0) { - addWord(wordPointer, wordLength, finalFreq); - } -} - -void UnigramDictionary::getSplitTwoWordsSuggestion( - const int inputLength, Correction* correction) { - const int spaceProximityPos = correction->getSpaceProximityPos(); - const int missingSpacePos = correction->getMissingSpacePos(); - if (DEBUG_DICT) { - int inputCount = 0; - if (spaceProximityPos >= 0) ++inputCount; - if (missingSpacePos >= 0) ++inputCount; - assert(inputCount <= 1); - } - const bool isSpaceProximity = spaceProximityPos >= 0; - const int firstWordStartPos = 0; - const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos; - const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; - const int secondWordLength = isSpaceProximity - ? (inputLength - spaceProximityPos - 1) - : (inputLength - missingSpacePos); - - if (inputLength >= MAX_WORD_LENGTH) return; - if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos - || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) - return; - - const int newWordLength = firstWordLength + secondWordLength + 1; - // Allocating variable length array on stack - unsigned short word[newWordLength]; - const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord); - if (DEBUG_DICT) { - LOGI("First freq: %d", firstFreq); - } - if (firstFreq <= 0) return; - - for (int i = 0; i < firstWordLength; ++i) { - word[i] = mWord[i]; - } - - const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord); - if (DEBUG_DICT) { - LOGI("Second freq: %d", secondFreq); - } - if (secondFreq <= 0) return; - - word[firstWordLength] = SPACE; - for (int i = (firstWordLength + 1); i < newWordLength; ++i) { - word[i] = mWord[i - firstWordLength - 1]; - } - - const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq, word); - if (DEBUG_DICT) { - LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); - } - addWord(word, newWordLength, pairFreq); - return; -} - -// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous -// interface. -inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, - const int inputLength, unsigned short *word) { - uint16_t inWord[inputLength]; - - for (int i = 0; i < inputLength; ++i) { - inWord[i] = (uint16_t)mProximityInfo->getPrimaryCharAt(startInputIndex + i); - } - return getMostFrequentWordLikeInner(inWord, inputLength, word); -} - -// This function will take the position of a character array within a CharGroup, -// and check it actually like-matches the word in inWord starting at startInputIndex, -// that is, it matches it with case and accents squashed. -// The function returns true if there was a full match, false otherwise. -// The function will copy on-the-fly the characters in the CharGroup to outNewWord. -// It will also place the end position of the array in outPos; in outInputIndex, -// it will place the index of the first char AFTER the match if there was a match, -// and the initial position if there was not. It makes sense because if there was -// a match we want to continue searching, but if there was not, we want to go to -// the next CharGroup. -// In and out parameters may point to the same location. This function takes care -// not to use any input parameters after it wrote into its outputs. -static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, - const uint8_t* const root, const int startPos, - const uint16_t* const inWord, const int startInputIndex, - int32_t* outNewWord, int* outInputIndex, int* outPos) { - const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags)); - int pos = startPos; - int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - int32_t baseChar = Dictionary::toBaseLowerCase(character); - const uint16_t wChar = Dictionary::toBaseLowerCase(inWord[startInputIndex]); - - if (baseChar != wChar) { - *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos; - *outInputIndex = startInputIndex; - return false; - } - int inputIndex = startInputIndex; - outNewWord[inputIndex] = character; - if (hasMultipleChars) { - character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - while (NOT_A_CHARACTER != character) { - baseChar = Dictionary::toBaseLowerCase(character); - if (Dictionary::toBaseLowerCase(inWord[++inputIndex]) != baseChar) { - *outPos = BinaryFormat::skipOtherCharacters(root, pos); - *outInputIndex = startInputIndex; - return false; - } - outNewWord[inputIndex] = character; - character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - } - } - *outInputIndex = inputIndex + 1; - *outPos = pos; - return true; -} - -// This function is invoked when a word like the word searched for is found. -// It will compare the frequency to the max frequency, and if greater, will -// copy the word into the output buffer. In output value maxFreq, it will -// write the new maximum frequency if it changed. -static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length, - short unsigned int* outWord, int* maxFreq) { - if (freq > *maxFreq) { - for (int q = 0; q < length; ++q) - outWord[q] = newWord[q]; - outWord[length] = 0; - *maxFreq = freq; - } -} - -// Will find the highest frequency of the words like the one passed as an argument, -// that is, everything that only differs by case/accents. -int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord, - const int length, short unsigned int* outWord) { - int32_t newWord[MAX_WORD_LENGTH_INTERNAL]; - int depth = 0; - int maxFreq = -1; - const uint8_t* const root = DICT_ROOT; - - mStackChildCount[0] = root[0]; - mStackInputIndex[0] = 0; - mStackSiblingPos[0] = 1; - while (depth >= 0) { - const int charGroupCount = mStackChildCount[depth]; - int pos = mStackSiblingPos[depth]; - for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) { - int inputIndex = mStackInputIndex[depth]; - const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - // Test whether all chars in this group match with the word we are searching for. If so, - // we want to traverse its children (or if the length match, evaluate its frequency). - // Note that this function will output the position regardless, but will only write - // into inputIndex if there is a match. - const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord, - inputIndex, newWord, &inputIndex, &pos); - if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) { - const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); - onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq); - } - pos = BinaryFormat::skipFrequency(flags, pos); - const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); - const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos); - // If we had a match and the word has children, we want to traverse them. We don't have - // to traverse words longer than the one we are searching for, since they will not match - // anyway, so don't traverse unless inputIndex < length. - if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) { - // Save position for this depth, to get back to this once children are done - mStackChildCount[depth] = charGroupIndex; - mStackSiblingPos[depth] = siblingPos; - // Prepare stack values for next depth - ++depth; - int childrenPos = childrenNodePos; - mStackChildCount[depth] = - BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos); - mStackSiblingPos[depth] = childrenPos; - mStackInputIndex[depth] = inputIndex; - pos = childrenPos; - // Go to the next depth level. - ++depth; - break; - } else { - // No match, or no children, or word too long to ever match: go the next sibling. - pos = siblingPos; - } - } - --depth; - } - return maxFreq; -} - -bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const { - return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length); -} - -// TODO: remove this function. -int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset, - int length) const { - return -1; -} - -// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not. -// If the return value is false, then the caller should read in the output "nextSiblingPosition" -// to find out the address of the next sibling node and pass it to a new call of processCurrentNode. -// It is worthy to note that when false is returned, the output values other than -// nextSiblingPosition are undefined. -// If the return value is true, then the caller must proceed to traverse the children of this -// node. processCurrentNode will output the information about the children: their count in -// newCount, their position in newChildrenPosition, the traverseAllNodes flag in -// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the -// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into -// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when -// there aren't any more nodes at this level, it merely returns the address of the first byte after -// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any -// given level, as output into newCount when traversing this level's parent. -inline bool UnigramDictionary::processCurrentNode(const int initialPos, - Correction *correction, int *newCount, - int *newChildrenPosition, int *nextSiblingPosition) { - if (DEBUG_DICT) { - correction->checkState(); - } - int pos = initialPos; - - // Flags contain the following information: - // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits: - // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address - // is on the specified number of bytes. - // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address. - // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not. - // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children) - // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not - const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos); - const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); - const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags)); - - bool needsToInvokeOnTerminal = false; - - // This gets only ONE character from the stream. Next there will be: - // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node - // else if FLAG_IS_TERMINAL: the frequency - // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address - // Note that you can't have a node that both is not a terminal and has no children. - int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); - assert(NOT_A_CHARACTER != c); - - // We are going to loop through each character and make it look like it's a different - // node each time. To do that, we will process characters in this node in order until - // we find the character terminator. This is signalled by getCharCode* returning - // NOT_A_CHARACTER. - // As a special case, if there is only one character in this node, we must not read the - // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags. - // This way, each loop run will look like a "virtual node". - do { - // We prefetch the next char. If 'c' is the last char of this node, we will have - // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node - // should behave as a terminal or not and whether we have children. - const int32_t nextc = hasMultipleChars - ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER; - const bool isLastChar = (NOT_A_CHARACTER == nextc); - // If there are more chars in this nodes, then this virtual node is not a terminal. - // If we are on the last char, this virtual node is a terminal if this node is. - const bool isTerminal = isLastChar && isTerminalNode; - - Correction::CorrectionType stateType = correction->processCharAndCalcState( - c, isTerminal); - if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL - || stateType == Correction::ON_TERMINAL) { - needsToInvokeOnTerminal = true; - } else if (stateType == Correction::UNRELATED) { - // We found that this is an unrelated character, so we should give up traversing - // this node and its children entirely. - // However we may not be on the last virtual node yet so we skip the remaining - // characters in this node, the frequency if it's there, read the next sibling - // position to output it, then return false. - // We don't have to output other values because we return false, as in - // "don't traverse children". - if (!isLastChar) { - pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); - } - pos = BinaryFormat::skipFrequency(flags, pos); - *nextSiblingPosition = - BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); - return false; - } - - // Prepare for the next character. Promote the prefetched char to current char - the loop - // will take care of prefetching the next. If we finally found our last char, nextc will - // contain NOT_A_CHARACTER. - c = nextc; - } while (NOT_A_CHARACTER != c); - - if (isTerminalNode) { - if (needsToInvokeOnTerminal) { - // The frequency should be here, because we come here only if this is actually - // a terminal node, and we are on its last char. - const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); - onTerminal(freq, mCorrection); - } - - // If there are more chars in this node, then this virtual node has children. - // If we are on the last char, this virtual node has children if this node has. - const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); - - // This character matched the typed character (enough to traverse the node at least) - // so we just evaluated it. Now we should evaluate this virtual node's children - that - // is, if it has any. If it has no children, we're done here - so we skip the end of - // the node, output the siblings position, and return false "don't traverse children". - // Note that !hasChildren implies isLastChar, so we know we don't have to skip any - // remaining char in this group for there can't be any. - if (!hasChildren) { - pos = BinaryFormat::skipFrequency(flags, pos); - *nextSiblingPosition = - BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); - return false; - } - - // Optimization: Prune out words that are too long compared to how much was typed. - if (correction->needsToPrune()) { - pos = BinaryFormat::skipFrequency(flags, pos); - *nextSiblingPosition = - BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); - if (DEBUG_DICT_FULL) { - LOGI("Traversing was pruned."); - } - return false; - } - } - - // Now we finished processing this node, and we want to traverse children. If there are no - // children, we can't come here. - assert(BinaryFormat::hasChildrenInFlags(flags)); - - // If this node was a terminal it still has the frequency under the pointer (it may have been - // read, but not skipped - see readFrequencyWithoutMovingPointer). - // Next come the children position, then possibly attributes (attributes are bigrams only for - // now, maybe something related to shortcuts in the future). - // Once this is read, we still need to output the number of nodes in the immediate children of - // this node, so we read and output it before returning true, as in "please traverse children". - pos = BinaryFormat::skipFrequency(flags, pos); - int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos); - *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); - *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos); - *newChildrenPosition = childrenPos; - return true; -} - -} // namespace latinime |