diff options
50 files changed, 902 insertions, 397 deletions
diff --git a/java/res/values-ru/strings.xml b/java/res/values-ru/strings.xml index 3ee986930..b00fa65a0 100644 --- a/java/res/values-ru/strings.xml +++ b/java/res/values-ru/strings.xml @@ -33,19 +33,19 @@ <string name="misc_category" msgid="6894192814868233453">"Другие варианты"</string> <string name="advanced_settings" msgid="362895144495591463">"Расширенные настройки"</string> <string name="advanced_settings_summary" msgid="4487980456152830271">"Для опытных пользователей"</string> - <string name="include_other_imes_in_language_switch_list" msgid="4533689960308565519">"Другой способ ввода"</string> + <string name="include_other_imes_in_language_switch_list" msgid="4533689960308565519">"Смена способов ввода"</string> <string name="include_other_imes_in_language_switch_list_summary" msgid="840637129103317635">"Клавиша переключения языков также служит для смены способа ввода"</string> <string name="show_language_switch_key" msgid="5915478828318774384">"Клавиша смены языка"</string> <string name="show_language_switch_key_summary" msgid="7343403647474265713">"Показывать, когда включено несколько раскладок"</string> - <string name="sliding_key_input_preview" msgid="6604262359510068370">"Показывать индикатор перехода"</string> - <string name="sliding_key_input_preview_summary" msgid="6340524345729093886">"Индикатор перехода между регистрами или цифр. и букв. режимами"</string> + <string name="sliding_key_input_preview" msgid="6604262359510068370">"След от переключателя режима"</string> + <string name="sliding_key_input_preview_summary" msgid="6340524345729093886">"Показывать след при проведении пальцем от кнопок Shift и \"Символы\""</string> <string name="key_preview_popup_dismiss_delay" msgid="6213164897443068248">"Задержка закрытия"</string> <string name="key_preview_popup_dismiss_no_delay" msgid="2096123151571458064">"Без задержки"</string> <string name="key_preview_popup_dismiss_default_delay" msgid="2166964333903906734">"По умолчанию"</string> <string name="abbreviation_unit_milliseconds" msgid="8700286094028323363">"<xliff:g id="MILLISECONDS">%s</xliff:g> мс"</string> <string name="use_contacts_dict" msgid="4435317977804180815">"Подсказывать имена"</string> <string name="use_contacts_dict_summary" msgid="6599983334507879959">"Подсказывать исправления на основе имен из списка контактов"</string> - <string name="use_double_space_period" msgid="8781529969425082860">"Точка с пробелом"</string> + <string name="use_double_space_period" msgid="8781529969425082860">"Ставить точки автоматически"</string> <string name="use_double_space_period_summary" msgid="6532892187247952799">"Вводить точку с пробелом двойным нажатием кнопки \"Пробел\"."</string> <string name="auto_cap" msgid="1719746674854628252">"Заглавные автоматически"</string> <string name="auto_cap_summary" msgid="7934452761022946874">"Писать первое слово предложения с прописной буквы"</string> @@ -164,7 +164,7 @@ <string name="prefs_key_longpress_timeout_settings" msgid="6102240298932897873">"Долгое нажатие"</string> <string name="prefs_keypress_vibration_duration_settings" msgid="7918341459947439226">"Вибросигнал при нажатии клавиш"</string> <string name="prefs_keypress_sound_volume_settings" msgid="6027007337036891623">"Звук при нажатии клавиш"</string> - <string name="prefs_read_external_dictionary" msgid="2588931418575013067">"Считывать данные из внешнего словаря"</string> + <string name="prefs_read_external_dictionary" msgid="2588931418575013067">"Загрузить словарь из файла"</string> <string name="read_external_dictionary_no_files_message" msgid="4947420942224623792">"В папке \"Загрузки\" нет словарей"</string> <string name="read_external_dictionary_multiple_files_title" msgid="7637749044265808628">"Выберите файл словаря"</string> <string name="read_external_dictionary_confirm_install_message" msgid="6898610163768980870">"Установить этот файл для следующего языка: <xliff:g id="LOCALE_NAME">%s</xliff:g>?"</string> diff --git a/java/res/values-zh-rCN/strings.xml b/java/res/values-zh-rCN/strings.xml index 4bb2ce438..a5bbab2a8 100644 --- a/java/res/values-zh-rCN/strings.xml +++ b/java/res/values-zh-rCN/strings.xml @@ -212,9 +212,9 @@ <string name="install_dict" msgid="180852772562189365">"安装"</string> <string name="cancel_download_dict" msgid="7843340278507019303">"取消"</string> <string name="delete_dict" msgid="756853268088330054">"删除"</string> - <string name="should_download_over_metered_prompt" msgid="2878629598667658845">"支持您移动设备上所选语言的词典现已可供下载啦!<br/>建议您<b>下载</b>这部<xliff:g id="LANGUAGE">%1$s</xliff:g>词典,以享受更好的输入体验。<br/><br/>通过 3G 进行下载可能需要 1 到 2 分钟的时间。如果您使用的不是<b>无流量限制的套餐</b>,则可能需要支付一定的费用。<br/>如果您不确定自己使用的是哪种流量套餐,建议您使用 Wi-Fi 连接自动开始下载。<br/><br/>提示:您可以访问移动设备的<b>设置</b>菜单中的<b>语言和输入法</b>,来下载和删除词典。"</string> + <string name="should_download_over_metered_prompt" msgid="2878629598667658845">"支持您移动设备上所选语言的词典现已可供下载啦!<br/>建议您<b>下载</b>这部<xliff:g id="LANGUAGE">%1$s</xliff:g>词典,以享受更好的输入体验。<br/><br/>通过 3G 进行下载可能需要 1 到 2 分钟的时间。如果您使用的不是<b>无流量限制的套餐</b>,则可能需要支付一定的费用。<br/>如果您不确定自己使用的是哪种流量套餐,建议您使用 WLAN 连接自动开始下载。<br/><br/>提示:您可以访问移动设备的<b>设置</b>菜单中的<b>语言和输入法</b>,来下载和删除词典。"</string> <string name="download_over_metered" msgid="1643065851159409546">"立即下载 (<xliff:g id="SIZE_IN_MEGABYTES">%1$.1f</xliff:g>MB)"</string> - <string name="do_not_download_over_metered" msgid="2176209579313941583">"通过 Wi-Fi 下载"</string> + <string name="do_not_download_over_metered" msgid="2176209579313941583">"通过 WLAN 下载"</string> <string name="dict_available_notification_title" msgid="6514288591959117288">"<xliff:g id="LANGUAGE">%1$s</xliff:g>词典可供下载"</string> <string name="dict_available_notification_description" msgid="1075194169443163487">"按此通知即可查看和下载"</string> <string name="toast_downloading_suggestions" msgid="1313027353588566660">"下载中:很快就能启用<xliff:g id="LANGUAGE">%1$s</xliff:g>的词典建议服务了!"</string> diff --git a/java/src/com/android/inputmethod/dictionarypack/DictionarySettingsFragment.java b/java/src/com/android/inputmethod/dictionarypack/DictionarySettingsFragment.java index 1e93e7e7a..4b89d20bb 100644 --- a/java/src/com/android/inputmethod/dictionarypack/DictionarySettingsFragment.java +++ b/java/src/com/android/inputmethod/dictionarypack/DictionarySettingsFragment.java @@ -304,7 +304,7 @@ public final class DictionarySettingsFragment extends PreferenceFragment // the description. final String key = matchLevelString + "." + description + "." + wordlistId; final WordListPreference existingPref = prefMap.get(key); - if (null == existingPref || hasPriority(status, existingPref.mStatus)) { + if (null == existingPref || existingPref.hasPriorityOver(status)) { final WordListPreference oldPreference = mCurrentPreferenceMap.get(key); final WordListPreference pref; if (null != oldPreference @@ -315,7 +315,7 @@ public final class DictionarySettingsFragment extends PreferenceFragment // need to be the same, others have been tested through the key of the // map. Also, status may differ so we don't want to use #equals() here. pref = oldPreference; - pref.mStatus = status; + pref.setStatus(status); } else { // Otherwise, discard it and create a new one instead. pref = new WordListPreference(activity, mDictionaryListInterfaceState, @@ -331,18 +331,6 @@ public final class DictionarySettingsFragment extends PreferenceFragment } } - /** - * Finds out if a given status has priority over another for display order. - * - * @param newStatus - * @param oldStatus - * @return whether newStatus has priority over oldStatus. - */ - private static boolean hasPriority(final int newStatus, final int oldStatus) { - // Both of these should be one of MetadataDbHelper.STATUS_* - return newStatus > oldStatus; - } - @Override public boolean onOptionsItemSelected(final MenuItem item) { switch (item.getItemId()) { diff --git a/java/src/com/android/inputmethod/dictionarypack/WordListPreference.java b/java/src/com/android/inputmethod/dictionarypack/WordListPreference.java index a1031c2ca..7ec7e9c13 100644 --- a/java/src/com/android/inputmethod/dictionarypack/WordListPreference.java +++ b/java/src/com/android/inputmethod/dictionarypack/WordListPreference.java @@ -61,7 +61,7 @@ public final class WordListPreference extends Preference { public final Locale mLocale; public final String mDescription; // The status - public int mStatus; + private int mStatus; // The size of the dictionary file private final int mFilesize; @@ -92,7 +92,7 @@ public final class WordListPreference extends Preference { setKey(wordlistId); } - private void setStatus(final int status) { + public void setStatus(final int status) { if (status == mStatus) return; mStatus = status; setSummary(getSummary(status)); @@ -106,6 +106,11 @@ public final class WordListPreference extends Preference { return mInterfaceState.addToCacheAndReturnView(newView); } + public boolean hasPriorityOver(final int otherPrefStatus) { + // Both of these should be one of MetadataDbHelper.STATUS_* + return mStatus > otherPrefStatus; + } + private String getSummary(final int status) { switch (status) { // If we are deleting the word list, for the user it's like it's already deleted. diff --git a/java/src/com/android/inputmethod/latin/AssetFileAddress.java b/java/src/com/android/inputmethod/latin/AssetFileAddress.java index 47c750f54..875192554 100644 --- a/java/src/com/android/inputmethod/latin/AssetFileAddress.java +++ b/java/src/com/android/inputmethod/latin/AssetFileAddress.java @@ -24,7 +24,7 @@ import java.io.File; * the package file. Open it correctly thus requires the name of the package it is in, but * also the offset in the file and the length of this data. This class encapsulates these three. */ -final class AssetFileAddress { +public final class AssetFileAddress { public final String mFilename; public final long mOffset; public final long mLength; diff --git a/java/src/com/android/inputmethod/latin/DictionaryFactory.java b/java/src/com/android/inputmethod/latin/DictionaryFactory.java index 40e51672a..4514ec2ec 100644 --- a/java/src/com/android/inputmethod/latin/DictionaryFactory.java +++ b/java/src/com/android/inputmethod/latin/DictionaryFactory.java @@ -21,6 +21,8 @@ import android.content.res.AssetFileDescriptor; import android.content.res.Resources; import android.util.Log; +import com.android.inputmethod.annotations.UsedForTesting; + import java.io.File; import java.util.ArrayList; import java.util.LinkedList; @@ -126,21 +128,22 @@ public final class DictionaryFactory { /** * Create a dictionary from passed data. This is intended for unit tests only. - * @param dictionary the file to read - * @param startOffset the offset in the file where the data starts - * @param length the length of the data + * @param dictionaryList the list of files to read, with their offsets and lengths * @param useFullEditDistance whether to use the full edit distance in suggestions * @return the created dictionary, or null. */ - public static Dictionary createDictionaryForTest(File dictionary, long startOffset, long length, + @UsedForTesting + public static Dictionary createDictionaryForTest(final AssetFileAddress[] dictionaryList, final boolean useFullEditDistance, Locale locale) { - if (dictionary.isFile()) { - return new BinaryDictionary(dictionary.getAbsolutePath(), startOffset, length, - useFullEditDistance, locale, Dictionary.TYPE_MAIN); - } else { - Log.e(TAG, "Could not find the file. path=" + dictionary.getAbsolutePath()); - return null; + final DictionaryCollection dictionaryCollection = + new DictionaryCollection(Dictionary.TYPE_MAIN); + for (final AssetFileAddress address : dictionaryList) { + final BinaryDictionary binaryDictionary = new BinaryDictionary(address.mFilename, + address.mOffset, address.mLength, useFullEditDistance, locale, + Dictionary.TYPE_MAIN); + dictionaryCollection.addDictionary(binaryDictionary); } + return dictionaryCollection; } /** diff --git a/java/src/com/android/inputmethod/latin/LatinIME.java b/java/src/com/android/inputmethod/latin/LatinIME.java index c9a42a3a4..70f8d0de8 100644 --- a/java/src/com/android/inputmethod/latin/LatinIME.java +++ b/java/src/com/android/inputmethod/latin/LatinIME.java @@ -1797,8 +1797,6 @@ public class LatinIME extends InputMethodService implements KeyboardActionListen if (ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS) { final String word = mWordComposer.getTypedWord(); ResearchLogger.latinIME_handleBackspace_batch(word, 1); - ResearchLogger.getInstance().uncommitCurrentLogUnit( - word, false /* dumpCurrentLogUnit */); } final String rejectedSuggestion = mWordComposer.getTypedWord(); mWordComposer.reset(); @@ -1825,6 +1823,9 @@ public class LatinIME extends InputMethodService implements KeyboardActionListen // like the smiley key or the .com key. final int length = mEnteredText.length(); mConnection.deleteSurroundingText(length, 0); + if (ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS) { + ResearchLogger.latinIME_handleBackspace_cancelTextInput(mEnteredText); + } mEnteredText = null; // If we have mEnteredText, then we know that mHasUncommittedTypedChars == false. // In addition we know that spaceState is false, and that we should not be @@ -1858,7 +1859,8 @@ public class LatinIME extends InputMethodService implements KeyboardActionListen mLastSelectionEnd = mLastSelectionStart; mConnection.deleteSurroundingText(numCharsDeleted, 0); if (ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS) { - ResearchLogger.latinIME_handleBackspace(numCharsDeleted); + ResearchLogger.latinIME_handleBackspace(numCharsDeleted, + false /* shouldUncommitLogUnit */); } } else { // There is no selection, just delete one character. @@ -1876,12 +1878,13 @@ public class LatinIME extends InputMethodService implements KeyboardActionListen mConnection.deleteSurroundingText(1, 0); } if (ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS) { - ResearchLogger.latinIME_handleBackspace(1); + ResearchLogger.latinIME_handleBackspace(1, true /* shouldUncommitLogUnit */); } if (mDeleteCount > DELETE_ACCELERATE_AT) { mConnection.deleteSurroundingText(1, 0); if (ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS) { - ResearchLogger.latinIME_handleBackspace(1); + ResearchLogger.latinIME_handleBackspace(1, + true /* shouldUncommitLogUnit */); } } } diff --git a/java/src/com/android/inputmethod/latin/Suggest.java b/java/src/com/android/inputmethod/latin/Suggest.java index 5d580f29b..e783e6d51 100644 --- a/java/src/com/android/inputmethod/latin/Suggest.java +++ b/java/src/com/android/inputmethod/latin/Suggest.java @@ -23,7 +23,6 @@ import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.keyboard.ProximityInfo; import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; -import java.io.File; import java.util.ArrayList; import java.util.Comparator; import java.util.HashSet; @@ -77,9 +76,9 @@ public final class Suggest { } @UsedForTesting - Suggest(final File dictionary, final long startOffset, final long length, final Locale locale) { - final Dictionary mainDict = DictionaryFactory.createDictionaryForTest(dictionary, - startOffset, length /* useFullEditDistance */, false, locale); + Suggest(final AssetFileAddress[] dictionaryList, final Locale locale) { + final Dictionary mainDict = DictionaryFactory.createDictionaryForTest(dictionaryList, + false /* useFullEditDistance */, locale); mLocale = locale; mMainDictionary = mainDict; addOrReplaceDictionary(mDictionaries, Dictionary.TYPE_MAIN, mainDict); diff --git a/java/src/com/android/inputmethod/research/ResearchLogger.java b/java/src/com/android/inputmethod/research/ResearchLogger.java index d84f69659..ec54616b7 100644 --- a/java/src/com/android/inputmethod/research/ResearchLogger.java +++ b/java/src/com/android/inputmethod/research/ResearchLogger.java @@ -863,7 +863,10 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang // Check that expected word matches. if (oldLogUnit != null) { final String oldLogUnitWords = oldLogUnit.getWordsAsString(); - if (oldLogUnitWords != null && !oldLogUnitWords.equals(expectedWord)) { + // Because the word is stored in the LogUnit with digits scrubbed, the comparison must + // be made on a scrubbed version of the expectedWord as well. + if (oldLogUnitWords != null && !oldLogUnitWords.equals( + scrubDigitsFromString(expectedWord))) { return; } } @@ -1274,6 +1277,16 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang } /** + * Log a revert of onTextInput() (known in the IME as "EnteredText"). + * + * SystemResponse: Remove the LogUnit recording the textInput + */ + public static void latinIME_handleBackspace_cancelTextInput(final String text) { + final ResearchLogger researchLogger = getInstance(); + researchLogger.uncommitCurrentLogUnit(text, true /* dumpCurrentLogUnit */); + } + + /** * Log a call to LatinIME.pickSuggestionManually(). * * UserAction: The user has chosen a specific word from the suggestion strip. @@ -1811,17 +1824,26 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang SystemClock.uptimeMillis()); } + private static final LogStatement LOGSTATEMENT_LATINIME_HANDLEBACKSPACE = + new LogStatement("LatinIMEHandleBackspace", true, false, "numCharacters"); /** * Log a call to LatinIME.handleBackspace() that is not a batch delete. * * UserInput: The user is deleting one or more characters by hitting the backspace key once. * The covers single character deletes as well as deleting selections. + * + * @param numCharacters how many characters the backspace operation deleted + * @param shouldUncommitLogUnit whether to uncommit the last {@code LogUnit} in the + * {@code LogBuffer} */ - private static final LogStatement LOGSTATEMENT_LATINIME_HANDLEBACKSPACE = - new LogStatement("LatinIMEHandleBackspace", true, false, "numCharacters"); - public static void latinIME_handleBackspace(final int numCharacters) { + public static void latinIME_handleBackspace(final int numCharacters, + final boolean shouldUncommitLogUnit) { final ResearchLogger researchLogger = getInstance(); researchLogger.enqueueEvent(LOGSTATEMENT_LATINIME_HANDLEBACKSPACE, numCharacters); + if (shouldUncommitLogUnit) { + ResearchLogger.getInstance().uncommitCurrentLogUnit( + null, true /* dumpCurrentLogUnit */); + } } /** @@ -1839,6 +1861,8 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang numCharacters); researchLogger.mStatistics.recordGestureDelete(deletedText.length(), SystemClock.uptimeMillis()); + researchLogger.uncommitCurrentLogUnit(deletedText.toString(), + false /* dumpCurrentLogUnit */); } /** diff --git a/native/jni/Android.mk b/native/jni/Android.mk index 34b352433..1518dad17 100644 --- a/native/jni/Android.mk +++ b/native/jni/Android.mk @@ -46,8 +46,6 @@ LATIN_IME_JNI_SRC_FILES := \ jni_common.cpp LATIN_IME_CORE_SRC_FILES := \ - bigram_dictionary.cpp \ - char_utils.cpp \ correction.cpp \ dic_traverse_wrapper.cpp \ unigram_dictionary.cpp \ @@ -58,6 +56,9 @@ LATIN_IME_CORE_SRC_FILES := \ dic_node_utils.cpp \ dic_nodes_cache.cpp) \ $(addprefix suggest/core/dictionary/, \ + bigram_dictionary.cpp \ + binary_dictionary_format.cpp \ + byte_array_utils.cpp \ dictionary.cpp \ digraph_utils.cpp) \ $(addprefix suggest/core/layout/, \ @@ -74,7 +75,8 @@ LATIN_IME_CORE_SRC_FILES := \ typing_scoring.cpp \ typing_suggest_policy.cpp \ typing_traversal.cpp \ - typing_weighting.cpp) + typing_weighting.cpp) \ + utils/char_utils.cpp LOCAL_SRC_FILES := \ $(LATIN_IME_JNI_SRC_FILES) \ diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 9f5e2ae73..e94120587 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -34,9 +34,10 @@ #include "correction.h" #include "jni.h" #include "jni_common.h" -#include "suggest_options.h" -#include "suggest/core/dictionary/binary_format.h" +#include "suggest/core/dictionary/binary_dictionary_format.h" +#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/dictionary.h" +#include "suggest/core/suggest_options.h" namespace latinime { @@ -110,8 +111,8 @@ static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring s return 0; } Dictionary *dictionary = 0; - if (BinaryFormat::UNKNOWN_FORMAT - == BinaryFormat::detectFormat(static_cast<uint8_t *>(dictBuf), + if (BinaryDictionaryFormat::UNKNOWN_VERSION + == BinaryDictionaryFormat::detectFormatVersion(static_cast<uint8_t *>(dictBuf), static_cast<int>(dictSize))) { AKLOGE("DICT: dictionary format is unknown, bad magic number"); #ifdef USE_MMAP_FOR_DICTIONARY @@ -260,7 +261,7 @@ static jint latinime_BinaryDictionary_editDistance(JNIEnv *env, jclass clazz, ji static void latinime_BinaryDictionary_close(JNIEnv *env, jclass clazz, jlong dict) { Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); if (!dictionary) return; - const void *dictBuf = dictionary->getDict(); + const void *dictBuf = dictionary->getBinaryDictionaryInfo()->getDictBuf(); if (!dictBuf) return; #ifdef USE_MMAP_FOR_DICTIONARY releaseDictBuf(static_cast<const char *>(dictBuf) - dictionary->getDictBufAdjust(), diff --git a/native/jni/src/char_utils.h b/native/jni/src/char_utils.h deleted file mode 100644 index b429f40b2..000000000 --- a/native/jni/src/char_utils.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (C) 2010 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_CHAR_UTILS_H -#define LATINIME_CHAR_UTILS_H - -#include <cctype> - -#include "defines.h" - -namespace latinime { - -inline static bool isAsciiUpper(int c) { - // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to - // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...). - return (c >= 'A' && c <= 'Z'); -} - -inline static int toAsciiLower(int c) { - return c - 'A' + 'a'; -} - -inline static bool isAscii(int c) { - return isascii(c) != 0; -} - -unsigned short latin_tolower(const unsigned short c); - -/** - * Table mapping most combined Latin, Greek, and Cyrillic characters - * to their base characters. If c is in range, BASE_CHARS[c] == c - * if c is not a combined character, or the base character if it - * is combined. - */ -static const int BASE_CHARS_SIZE = 0x0500; -extern const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; - -inline static int toBaseCodePoint(int c) { - if (c < BASE_CHARS_SIZE) { - return static_cast<int>(BASE_CHARS[c]); - } - return c; -} - -AK_FORCE_INLINE static int toLowerCase(const int c) { - if (isAsciiUpper(c)) { - return toAsciiLower(c); - } - if (isAscii(c)) { - return c; - } - return static_cast<int>(latin_tolower(static_cast<unsigned short>(c))); -} - -AK_FORCE_INLINE static int toBaseLowerCase(const int c) { - return toLowerCase(toBaseCodePoint(c)); -} - -inline static bool isIntentionalOmissionCodePoint(const int codePoint) { - // TODO: Do not hardcode here - return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS; -} - -inline static int getCodePointCount(const int arraySize, const int *const codePoints) { - int size = 0; - for (; size < arraySize; ++size) { - if (codePoints[size] == '\0') { - break; - } - } - return size; -} - -} // namespace latinime -#endif // LATINIME_CHAR_UTILS_H diff --git a/native/jni/src/correction.cpp b/native/jni/src/correction.cpp index e2ad557c5..feed5622b 100644 --- a/native/jni/src/correction.cpp +++ b/native/jni/src/correction.cpp @@ -18,13 +18,13 @@ #include <cmath> -#include "char_utils.h" #include "correction.h" #include "defines.h" #include "suggest/core/layout/proximity_info_state.h" #include "suggest/core/layout/touch_position_correction_utils.h" #include "suggest/policyimpl/utils/edit_distance.h" #include "suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h" +#include "utils/char_utils.h" namespace latinime { @@ -528,7 +528,7 @@ inline static int getQuoteCount(const int *word, const int length) { } inline static bool isUpperCase(unsigned short c) { - return isAsciiUpper(toBaseCodePoint(c)); + return CharUtils::isAsciiUpper(CharUtils::toBaseCodePoint(c)); } ////////////////////// diff --git a/native/jni/src/correction.h b/native/jni/src/correction.h index 75b49952c..84d6429ba 100644 --- a/native/jni/src/correction.h +++ b/native/jni/src/correction.h @@ -22,6 +22,7 @@ #include "correction_state.h" #include "defines.h" #include "suggest/core/layout/proximity_info_state.h" +#include "utils/char_utils.h" namespace latinime { @@ -342,13 +343,13 @@ AK_FORCE_INLINE static void calcEditDistanceOneStep(int *editDistanceTable, cons const int *const prevprev = outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputSize + 1) : 0; current[0] = outputLength; - const int co = toBaseLowerCase(output[outputLength - 1]); - const int prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0; + const int co = CharUtils::toBaseLowerCase(output[outputLength - 1]); + const int prevCO = outputLength >= 2 ? CharUtils::toBaseLowerCase(output[outputLength - 2]) : 0; for (int i = 1; i <= inputSize; ++i) { - const int ci = toBaseLowerCase(input[i - 1]); + const int ci = CharUtils::toBaseLowerCase(input[i - 1]); const int cost = (ci == co) ? 0 : 1; current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost)); - if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) { + if (i >= 2 && prevprev && ci == prevCO && co == CharUtils::toBaseLowerCase(input[i - 2])) { current[i] = min(current[i], prevprev[i - 2] + 1); } } diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h index 1510e3d5e..3f64d07b2 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node.h +++ b/native/jni/src/suggest/core/dicnode/dic_node.h @@ -17,13 +17,13 @@ #ifndef LATINIME_DIC_NODE_H #define LATINIME_DIC_NODE_H -#include "char_utils.h" #include "defines.h" #include "suggest/core/dicnode/dic_node_state.h" #include "suggest/core/dicnode/dic_node_profiler.h" #include "suggest/core/dicnode/dic_node_properties.h" #include "suggest/core/dicnode/dic_node_release_listener.h" #include "suggest/core/dictionary/digraph_utils.h" +#include "utils/char_utils.h" #if DEBUG_DICT #define LOGI_SHOW_ADD_COST_PROP \ @@ -221,7 +221,7 @@ class DicNode { bool isFirstCharUppercase() const { const int c = getOutputWordBuf()[0]; - return isAsciiUpper(c); + return CharUtils::isAsciiUpper(c); } bool isFirstWord() const { @@ -375,7 +375,7 @@ class DicNode { // Whether the current codepoint can be an intentional omission, in which case the traversal // algorithm will always check for a possible omission here. bool canBeIntentionalOmission() const { - return isIntentionalOmissionCodePoint(getNodeCodePoint()); + return CharUtils::isIntentionalOmissionCodePoint(getNodeCodePoint()); } // Whether the omission is so frequent that it should incur zero cost. diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index 7f0d0ed0e..3deee1a42 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -20,10 +20,13 @@ #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_utils.h" #include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/multi_bigram_map.h" +#include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/layout/proximity_info.h" #include "suggest/core/layout/proximity_info_state.h" +#include "utils/char_utils.h" namespace latinime { @@ -31,20 +34,23 @@ namespace latinime { // Node initialization utils // /////////////////////////////// -/* static */ void DicNodeUtils::initAsRoot(const int rootPos, const uint8_t *const dicRoot, - const int prevWordNodePos, DicNode *newRootNode) { - int curPos = rootPos; +/* static */ void DicNodeUtils::initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo, + const int prevWordNodePos, DicNode *const newRootNode) { + int curPos = binaryDictionaryInfo->getRootPosition(); const int pos = curPos; - const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos); + const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer( + binaryDictionaryInfo->getDictRoot(), &curPos); const int childrenPos = curPos; newRootNode->initAsRoot(pos, childrenPos, childrenCount, prevWordNodePos); } -/*static */ void DicNodeUtils::initAsRootWithPreviousWord(const int rootPos, - const uint8_t *const dicRoot, DicNode *prevWordLastNode, DicNode *newRootNode) { - int curPos = rootPos; +/*static */ void DicNodeUtils::initAsRootWithPreviousWord( + const BinaryDictionaryInfo *const binaryDictionaryInfo, + DicNode *const prevWordLastNode, DicNode *const newRootNode) { + int curPos = binaryDictionaryInfo->getRootPosition(); const int pos = curPos; - const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos); + const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer( + binaryDictionaryInfo->getDictRoot(), &curPos); const int childrenPos = curPos; newRootNode->initAsRootWithPreviousWord(prevWordLastNode, pos, childrenPos, childrenCount); } @@ -62,24 +68,27 @@ namespace latinime { DicNodeVector *childDicNodes) { // Passing multiple chars node. No need to traverse child const int codePoint = dicNode->getNodeTypedCodePoint(); - const int baseLowerCaseCodePoint = toBaseLowerCase(codePoint); + const int baseLowerCaseCodePoint = CharUtils::toBaseLowerCase(codePoint); const bool isMatch = isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, codePoint); - if (isMatch || isIntentionalOmissionCodePoint(baseLowerCaseCodePoint)) { + if (isMatch || CharUtils::isIntentionalOmissionCodePoint(baseLowerCaseCodePoint)) { childDicNodes->pushPassingChild(dicNode); } } /* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos, - const uint8_t *const dicRoot, const int terminalDepth, const ProximityInfoState *pInfoState, - const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, - const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { + const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalDepth, + const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, + const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, + DicNodeVector *childDicNodes) { int nextPos = pos; - const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos); + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer( + binaryDictionaryInfo->getDictRoot(), &pos); const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags)); const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); - int codePoint = BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos); + int codePoint = BinaryFormat::getCodePointAndForwardPointer( + binaryDictionaryInfo->getDictRoot(), &pos); ASSERT(NOT_A_CODE_POINT != codePoint); const int nodeCodePoint = codePoint; // TODO: optimize this @@ -89,7 +98,8 @@ namespace latinime { do { const int nextCodePoint = hasMultipleChars - ? BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos) : NOT_A_CODE_POINT; + ? BinaryFormat::getCodePointAndForwardPointer( + binaryDictionaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT; const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint); if (!isLastChar) { additionalWordBuf[additionalSubwordLength++] = nextCodePoint; @@ -97,12 +107,14 @@ namespace latinime { codePoint = nextCodePoint; } while (NOT_A_CODE_POINT != codePoint); - const int probability = - isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(dicRoot, pos) : -1; + const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer( + binaryDictionaryInfo->getDictRoot(), pos) : -1; pos = BinaryFormat::skipProbability(flags, pos); - int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(dicRoot, flags, pos) : 0; + int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition( + binaryDictionaryInfo->getDictRoot(), flags, pos) : 0; const int attributesPos = BinaryFormat::skipChildrenPosition(flags, pos); - const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(dicRoot, flags, pos); + const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes( + binaryDictionaryInfo->getDictRoot(), flags, pos); if (isDicNodeFilteredOut(nodeCodePoint, pInfo, codePointsFilter)) { return siblingPos; @@ -110,8 +122,8 @@ namespace latinime { if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, nodeCodePoint)) { return siblingPos; } - const int childrenCount = hasChildren - ? BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &childrenPos) : 0; + const int childrenCount = hasChildren ? BinaryFormat::getGroupCountAndForwardPointer( + binaryDictionaryInfo->getDictRoot(), &childrenPos) : 0; childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, siblingPos, nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal, hasMultipleChars, hasChildren, additionalSubwordLength, additionalWordBuf); @@ -125,13 +137,13 @@ namespace latinime { return false; } if (pInfo && (pInfo->getKeyIndexOf(nodeCodePoint) == NOT_AN_INDEX - || isIntentionalOmissionCodePoint(nodeCodePoint))) { + || CharUtils::isIntentionalOmissionCodePoint(nodeCodePoint))) { // If normalized nodeCodePoint is not on the keyboard or skippable, this child is never // filtered. return false; } - const int lowerCodePoint = toLowerCase(nodeCodePoint); - const int baseLowerCodePoint = toBaseCodePoint(lowerCodePoint); + const int lowerCodePoint = CharUtils::toLowerCase(nodeCodePoint); + const int baseLowerCodePoint = CharUtils::toBaseCodePoint(lowerCodePoint); // TODO: Avoid linear search for (int i = 0; i < filterSize; ++i) { // Checking if a normalized code point is in filter characters when pInfo is not @@ -147,16 +159,18 @@ namespace latinime { } /* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode, - const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex, - const bool exactOnly, const std::vector<int> *const codePointsFilter, - const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { + const BinaryDictionaryInfo *const binaryDictionaryInfo, + const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, + const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, + DicNodeVector *childDicNodes) { const int terminalDepth = dicNode->getLeavingDepth(); const int childCount = dicNode->getChildrenCount(); int nextPos = dicNode->getChildrenPos(); for (int i = 0; i < childCount; i++) { const int filterSize = codePointsFilter ? codePointsFilter->size() : 0; - nextPos = createAndGetLeavingChildNode(dicNode, nextPos, dicRoot, terminalDepth, pInfoState, - pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes); + nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo, + terminalDepth, pInfoState, pointIndex, exactOnly, codePointsFilter, pInfo, + childDicNodes); if (!pInfo && filterSize > 0 && childDicNodes->exceeds(filterSize)) { // All code points have been found. break; @@ -164,14 +178,15 @@ namespace latinime { } } -/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot, - DicNodeVector *childDicNodes) { - getProximityChildDicNodes(dicNode, dicRoot, 0, 0, false, childDicNodes); +/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, + const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes) { + getProximityChildDicNodes(dicNode, binaryDictionaryInfo, 0, 0, false, childDicNodes); } /* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode, - const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex, - bool exactOnly, DicNodeVector *childDicNodes) { + const BinaryDictionaryInfo *const binaryDictionaryInfo, + const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly, + DicNodeVector *childDicNodes) { if (dicNode->isTotalInputSizeExceedingLimit()) { return; } @@ -179,9 +194,9 @@ namespace latinime { DicNodeUtils::createAndGetPassingChildNode(dicNode, pInfoState, pointIndex, exactOnly, childDicNodes); } else { - DicNodeUtils::createAndGetAllLeavingChildNodes(dicNode, dicRoot, pInfoState, pointIndex, - exactOnly, 0 /* codePointsFilter */, 0 /* pInfo */, - childDicNodes); + DicNodeUtils::createAndGetAllLeavingChildNodes( + dicNode, binaryDictionaryInfo, pInfoState, pointIndex, exactOnly, + 0 /* codePointsFilter */, 0 /* pInfo */, childDicNodes); } } @@ -191,32 +206,35 @@ namespace latinime { /** * Computes the combined bigram / unigram cost for the given dicNode. */ -/* static */ float DicNodeUtils::getBigramNodeImprobability(const uint8_t *const dicRoot, +/* static */ float DicNodeUtils::getBigramNodeImprobability( + const BinaryDictionaryInfo *const binaryDictionaryInfo, const DicNode *const node, MultiBigramMap *multiBigramMap) { if (node->isImpossibleBigramWord()) { return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); } - const int probability = getBigramNodeProbability(dicRoot, node, multiBigramMap); + const int probability = getBigramNodeProbability(binaryDictionaryInfo, node, multiBigramMap); // TODO: This equation to calculate the improbability looks unreasonable. Investigate this. const float cost = static_cast<float>(MAX_PROBABILITY - probability) / static_cast<float>(MAX_PROBABILITY); return cost; } -/* static */ int DicNodeUtils::getBigramNodeProbability(const uint8_t *const dicRoot, +/* static */ int DicNodeUtils::getBigramNodeProbability( + const BinaryDictionaryInfo *const binaryDictionaryInfo, const DicNode *const node, MultiBigramMap *multiBigramMap) { const int unigramProbability = node->getProbability(); const int wordPos = node->getPos(); const int prevWordPos = node->getPrevWordPos(); if (NOT_VALID_WORD == wordPos || NOT_VALID_WORD == prevWordPos) { // Note: Normally wordPos comes from the dictionary and should never equal NOT_VALID_WORD. - return backoff(unigramProbability); + return ProbabilityUtils::backoff(unigramProbability); } if (multiBigramMap) { return multiBigramMap->getBigramProbability( - dicRoot, prevWordPos, wordPos, unigramProbability); + binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability); } - return BinaryFormat::getBigramProbability(dicRoot, prevWordPos, wordPos, unigramProbability); + return BinaryFormat::getBigramProbability( + binaryDictionaryInfo->getDictRoot(), prevWordPos, wordPos, unigramProbability); } /////////////////////////////////////// diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.h b/native/jni/src/suggest/core/dicnode/dic_node_utils.h index 5bc542d05..e198d6181 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.h @@ -24,6 +24,7 @@ namespace latinime { +class BinaryDictionaryInfo; class DicNode; class DicNodeVector; class ProximityInfo; @@ -34,19 +35,20 @@ class DicNodeUtils { public: static int appendTwoWords(const int *src0, const int16_t length0, const int *src1, const int16_t length1, int *dest); - static void initAsRoot(const int rootPos, const uint8_t *const dicRoot, + static void initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int prevWordNodePos, DicNode *newRootNode); - static void initAsRootWithPreviousWord(const int rootPos, const uint8_t *const dicRoot, + static void initAsRootWithPreviousWord(const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNode *prevWordLastNode, DicNode *newRootNode); static void initByCopy(DicNode *srcNode, DicNode *destNode); - static void getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot, - DicNodeVector *childDicNodes); - static float getBigramNodeImprobability(const uint8_t *const dicRoot, + static void getAllChildDicNodes(DicNode *dicNode, + const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes); + static float getBigramNodeImprobability(const BinaryDictionaryInfo *const binaryDictionaryInfo, const DicNode *const node, MultiBigramMap *const multiBigramMap); static bool isDicNodeFilteredOut(const int nodeCodePoint, const ProximityInfo *const pInfo, const std::vector<int> *const codePointsFilter); // TODO: Move to private - static void getProximityChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot, + static void getProximityChildDicNodes(DicNode *dicNode, + const BinaryDictionaryInfo *const binaryDictionaryInfo, const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly, DicNodeVector *childDicNodes); @@ -60,16 +62,18 @@ class DicNodeUtils { // Max number of bigrams to look up static const int MAX_BIGRAMS_CONSIDERED_PER_CONTEXT = 500; - static int getBigramNodeProbability(const uint8_t *const dicRoot, const DicNode *const node, - MultiBigramMap *multiBigramMap); + static int getBigramNodeProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo, + const DicNode *const node, MultiBigramMap *multiBigramMap); static void createAndGetPassingChildNode(DicNode *dicNode, const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, DicNodeVector *childDicNodes); - static void createAndGetAllLeavingChildNodes(DicNode *dicNode, const uint8_t *const dicRoot, + static void createAndGetAllLeavingChildNodes(DicNode *dicNode, + const BinaryDictionaryInfo *const binaryDictionaryInfo, const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes); - static int createAndGetLeavingChildNode(DicNode *dicNode, int pos, const uint8_t *const dicRoot, - const int terminalDepth, const ProximityInfoState *pInfoState, const int pointIndex, + static int createAndGetLeavingChildNode(DicNode *dicNode, int pos, + const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalDepth, + const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes); diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp index c592542bd..59d1b19b6 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp @@ -20,15 +20,18 @@ #include "bigram_dictionary.h" -#include "char_utils.h" #include "defines.h" +#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/bloom_filter.h" #include "suggest/core/dictionary/dictionary.h" +#include "suggest/core/dictionary/probability_utils.h" +#include "utils/char_utils.h" namespace latinime { -BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT(streamStart) { +BigramDictionary::BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo) + : mBinaryDictionaryInfo(binaryDictionaryInfo) { if (DEBUG_DICT) { AKLOGI("BigramDictionary - constructor"); } @@ -52,7 +55,7 @@ void BigramDictionary::addWordBigram(int *word, int length, int probability, int int insertAt = 0; while (insertAt < MAX_RESULTS) { if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability - && length < getCodePointCount(MAX_WORD_LENGTH, + && length < CharUtils::getCodePointCount(MAX_WORD_LENGTH, bigramCodePoints + insertAt * MAX_WORD_LENGTH))) { break; } @@ -103,7 +106,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i // TODO: remove unused arguments, and refrain from storing stuff in members of this class // TODO: have "in" arguments before "out" ones, and make out args explicit in the name - const uint8_t *const root = DICT_ROOT; + const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = getBigramListPositionForWord(prevWord, prevWordLength, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams @@ -134,7 +137,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 // in very bad cases. This means that sometimes, we'll see some bigrams interverted // here, but it can't get too bad. - const int probability = BinaryFormat::computeProbabilityForBigram( + const int probability = ProbabilityUtils::computeProbabilityForBigram( unigramProbability, bigramProbabilityTemp); addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints, outputTypes); @@ -149,7 +152,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const { if (0 >= prevWordLength) return 0; - const uint8_t *const root = DICT_ROOT; + const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength, forceLowerCaseSearch); @@ -170,7 +173,7 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); - const uint8_t *const root = DICT_ROOT; + const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = getBigramListPositionForWord(prevWord, prevWordLength, false /* forceLowerCaseSearch */); if (0 == pos) { @@ -196,9 +199,9 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons // what user typed. int maxAlt = MAX_ALTERNATIVES; - const int firstBaseLowerCodePoint = toBaseLowerCase(*word); + const int firstBaseLowerCodePoint = CharUtils::toBaseLowerCase(*word); while (maxAlt > 0) { - if (toBaseLowerCase(*inputCodePoints) == firstBaseLowerCodePoint) { + if (CharUtils::toBaseLowerCase(*inputCodePoints) == firstBaseLowerCodePoint) { return true; } inputCodePoints++; @@ -209,7 +212,7 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2, int length2) const { - const uint8_t *const root = DICT_ROOT; + const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams if (0 == pos) return false; diff --git a/native/jni/src/bigram_dictionary.h b/native/jni/src/suggest/core/dictionary/bigram_dictionary.h index b86e564c3..8b7a253a2 100644 --- a/native/jni/src/bigram_dictionary.h +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.h @@ -24,9 +24,12 @@ namespace latinime { +class BinaryDictionaryInfo; + class BigramDictionary { public: - BigramDictionary(const uint8_t *const streamStart); + BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo); + int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords, int *frequencies, int *outputTypes) const; void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength, @@ -35,13 +38,14 @@ class BigramDictionary { ~BigramDictionary(); private: DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary); + void addWordBigram(int *word, int length, int probability, int *bigramProbability, int *bigramCodePoints, int *outputTypes) const; bool checkFirstCharacter(int *word, int *inputCodePoints) const; int getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const; - const uint8_t *const DICT_ROOT; + const BinaryDictionaryInfo *const mBinaryDictionaryInfo; // TODO: Re-implement proximity correction for bigram correction static const int MAX_ALTERNATIVES = 1; }; diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_format.cpp b/native/jni/src/suggest/core/dictionary/binary_dictionary_format.cpp new file mode 100644 index 000000000..50e0211d7 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_format.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/binary_dictionary_format.h" + +namespace latinime { + +/** + * Dictionary size + */ +// Any file smaller than this is not a dictionary. +const int BinaryDictionaryFormat::DICTIONARY_MINIMUM_SIZE = 4; + +/** + * Format versions + */ +// Originally, format version 1 had a 16-bit magic number, then the version number `01' +// then options that must be 0. Hence the first 32-bits of the format are always as follow +// and it's okay to consider them a magic number as a whole. +const uint32_t BinaryDictionaryFormat::FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100; +const int BinaryDictionaryFormat::FORMAT_VERSION_1_HEADER_SIZE = 5; + +// The versions of Latin IME that only handle format version 1 only test for the magic +// number, so we had to change it so that version 2 files would be rejected by older +// implementations. On this occasion, we made the magic number 32 bits long. +const uint32_t BinaryDictionaryFormat::FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; +// Magic number (4 bytes), version (2 bytes), options (2 bytes), header size (4 bytes) = 12 +const int BinaryDictionaryFormat::FORMAT_VERSION_2_MINIMUM_SIZE = 12; +const int BinaryDictionaryFormat::VERSION_2_MAGIC_NUMBER_SIZE = 4; +const int BinaryDictionaryFormat::VERSION_2_DICTIONARY_VERSION_SIZE = 2; +const int BinaryDictionaryFormat::VERSION_2_DICTIONARY_FLAG_SIZE = 2; + +/* static */ BinaryDictionaryFormat::FORMAT_VERSION BinaryDictionaryFormat::detectFormatVersion( + const uint8_t *const dict, const int dictSize) { + // The magic number is stored big-endian. + // If the dictionary is less than 4 bytes, we can't even read the magic number, so we don't + // understand this format. + if (dictSize < DICTIONARY_MINIMUM_SIZE) { + return UNKNOWN_VERSION; + } + const uint32_t magicNumber = ByteArrayUtils::readUint32(dict, 0); + switch (magicNumber) { + case FORMAT_VERSION_1_MAGIC_NUMBER: + // Format 1 header is exactly 5 bytes long and looks like: + // Magic number (2 bytes) 0x78 0xB1 + // Version number (1 byte) 0x01 + // Options (2 bytes) must be 0x00 0x00 + return VERSION_1; + case FORMAT_VERSION_2_MAGIC_NUMBER: + // Version 2 dictionaries are at least 12 bytes long. + // If this dictionary has the version 2 magic number but is less than 12 bytes long, + // then it's an unknown format and we need to avoid confidently reading the next bytes. + if (dictSize < FORMAT_VERSION_2_MINIMUM_SIZE) { + return UNKNOWN_VERSION; + } + // Format 2 header is as follows: + // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE + // Version number (2 bytes) 0x00 0x02 + // Options (2 bytes) + // Header size (4 bytes) : integer, big endian + if (ByteArrayUtils::readUint16(dict, 4) == 2) { + return VERSION_2; + } else { + return UNKNOWN_VERSION; + } + default: + return UNKNOWN_VERSION; + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_format.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_format.h new file mode 100644 index 000000000..3aa1662da --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_format.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BINARY_DICTIONARY_FORMAT_H +#define LATINIME_BINARY_DICTIONARY_FORMAT_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/dictionary/byte_array_utils.h" + +namespace latinime { + +/** + * Methods to handle binary dictionary format version. + * + * Currently, we have a file with a similar name, binary_format.h. binary_format.h contains binary + * reading methods and utility methods for various purposes. + * On the other hand, this file deals with only about dictionary format version. + */ +class BinaryDictionaryFormat { + public: + // TODO: Remove obsolete version logic + enum FORMAT_VERSION { + VERSION_1, + VERSION_2, + UNKNOWN_VERSION + }; + + static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize); + + static AK_FORCE_INLINE int getHeaderSize( + const uint8_t *const dict, const FORMAT_VERSION format) { + switch (format) { + case VERSION_1: + return FORMAT_VERSION_1_HEADER_SIZE; + case VERSION_2: + // See the format of the header in the comment in detectFormat() above + return ByteArrayUtils::readUint32(dict, 8); + default: + return S_INT_MAX; + } + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryFormat); + + static const int DICTIONARY_MINIMUM_SIZE; + static const uint32_t FORMAT_VERSION_1_MAGIC_NUMBER; + static const int FORMAT_VERSION_1_HEADER_SIZE; + static const uint32_t FORMAT_VERSION_2_MAGIC_NUMBER; + static const int FORMAT_VERSION_2_MINIMUM_SIZE; + static const int VERSION_2_MAGIC_NUMBER_SIZE; + static const int VERSION_2_DICTIONARY_VERSION_SIZE ; + static const int VERSION_2_DICTIONARY_FLAG_SIZE; +}; +} // namespace latinime +#endif /* LATINIME_BINARY_DICTIONARY_FORMAT_H */ diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h new file mode 100644 index 000000000..8508c6786 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BINARY_DICTIONARY_INFO_H +#define LATINIME_BINARY_DICTIONARY_INFO_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/dictionary/binary_dictionary_format.h" + +namespace latinime { + +class BinaryDictionaryInfo { + public: + BinaryDictionaryInfo(const uint8_t *const dictBuf, const int dictSize) + : mDictBuf(dictBuf), + mFormat(BinaryDictionaryFormat::detectFormatVersion(mDictBuf, dictSize)), + mDictRoot(mDictBuf + BinaryDictionaryFormat::getHeaderSize(mDictBuf, mFormat)) {} + + AK_FORCE_INLINE const uint8_t *getDictBuf() const { + return mDictBuf; + } + + AK_FORCE_INLINE const uint8_t *getDictRoot() const { + return mDictRoot; + } + + AK_FORCE_INLINE BinaryDictionaryFormat::FORMAT_VERSION getFormat() const { + return mFormat; + } + + AK_FORCE_INLINE int getRootPosition() const { + return 0; + } + + private: + DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryInfo); + + const uint8_t *const mDictBuf; + const BinaryDictionaryFormat::FORMAT_VERSION mFormat; + const uint8_t *const mDictRoot; +}; +} +#endif /* LATINIME_BINARY_DICTIONARY_INFO_H */ diff --git a/native/jni/src/suggest/core/dictionary/binary_format.h b/native/jni/src/suggest/core/dictionary/binary_format.h index 65c2e9115..1b57793fa 100644 --- a/native/jni/src/suggest/core/dictionary/binary_format.h +++ b/native/jni/src/suggest/core/dictionary/binary_format.h @@ -18,12 +18,12 @@ #define LATINIME_BINARY_FORMAT_H #include <cstdlib> -#include <map> #include <stdint.h> -#include "char_utils.h" -#include "hash_map_compat.h" #include "suggest/core/dictionary/bloom_filter.h" +#include "suggest/core/dictionary/probability_utils.h" +#include "utils/char_utils.h" +#include "utils/hash_map_compat.h" namespace latinime { @@ -91,10 +91,6 @@ class BinaryFormat { const int length, const bool forceLowerCaseSearch); static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, int *outWord, int *outUnigramProbability); - static int computeProbabilityForBigram( - const int unigramProbability, const int bigramProbability); - static int getProbability(const int position, const std::map<int, int> *bigramMap, - const uint8_t *bigramFilter, const int unigramProbability); static int getBigramProbabilityFromHashMap(const int position, const hash_map_compat<int, int> *bigramMap, const int unigramProbability); static float getMultiWordCostMultiplier(const uint8_t *const dict, const int dictSize); @@ -473,7 +469,8 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, // there was no match (or we would have found it). if (wordPos >= length) return NOT_VALID_WORD; int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); - const int wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos]; + const int wChar = forceLowerCaseSearch + ? CharUtils::toLowerCase(inWord[wordPos]) : inWord[wordPos]; while (true) { // If there are no more character groups in this node, it means we could not // find a matching character for this depth, therefore there is no match. @@ -677,51 +674,18 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co return 0; } -static inline int backoff(const int unigramProbability) { - return unigramProbability; - // For some reason, applying the backoff weight gives bad results in tests. To apply the - // backoff weight, we divide the probability by 2, which in our storing format means - // decreasing the score by 8. - // TODO: figure out what's wrong with this. - // return unigramProbability > 8 ? unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); -} - -inline int BinaryFormat::computeProbabilityForBigram( - const int unigramProbability, const int bigramProbability) { - // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want the - // unigram probability to be the median value of the 17th step from the top. A value of - // 0 for the bigram probability represents the middle of the 16th step from the top, - // while a value of 15 represents the middle of the top step. - // See makedict.BinaryDictInputOutput for details. - const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) - / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); - return unigramProbability - + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); -} - -// This returns a probability in log space. -inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, - const uint8_t *bigramFilter, const int unigramProbability) { - if (!bigramMap || !bigramFilter) return backoff(unigramProbability); - if (!isInFilter(bigramFilter, position)) return backoff(unigramProbability); - const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); - if (bigramProbabilityIt != bigramMap->end()) { - const int bigramProbability = bigramProbabilityIt->second; - return computeProbabilityForBigram(unigramProbability, bigramProbability); - } - return backoff(unigramProbability); -} - // This returns a probability in log space. inline int BinaryFormat::getBigramProbabilityFromHashMap(const int position, const hash_map_compat<int, int> *bigramMap, const int unigramProbability) { - if (!bigramMap) return backoff(unigramProbability); + if (!bigramMap) { + return ProbabilityUtils::backoff(unigramProbability); + } const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); if (bigramProbabilityIt != bigramMap->end()) { const int bigramProbability = bigramProbabilityIt->second; - return computeProbabilityForBigram(unigramProbability, bigramProbability); + return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, bigramProbability); } - return backoff(unigramProbability); + return ProbabilityUtils::backoff(unigramProbability); } AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap( @@ -742,7 +706,9 @@ AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap( AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root, int position, const int nextPosition, const int unigramProbability) { position = getBigramListPositionForWordPosition(root, position); - if (0 == position) return backoff(unigramProbability); + if (0 == position) { + return ProbabilityUtils::backoff(unigramProbability); + } uint8_t bigramFlags; do { @@ -751,10 +717,11 @@ AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root root, bigramFlags, &position); if (bigramPos == nextPosition) { const int bigramProbability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags; - return computeProbabilityForBigram(unigramProbability, bigramProbability); + return ProbabilityUtils::computeProbabilityForBigram( + unigramProbability, bigramProbability); } } while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); - return backoff(unigramProbability); + return ProbabilityUtils::backoff(unigramProbability); } // Returns a pointer to the start of the bigram list. diff --git a/native/jni/src/suggest/core/dictionary/byte_array_utils.cpp b/native/jni/src/suggest/core/dictionary/byte_array_utils.cpp new file mode 100644 index 000000000..68b1d5d15 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/byte_array_utils.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/byte_array_utils.h" + +namespace latinime { + +const uint8_t ByteArrayUtils::MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; +const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F; + +} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/byte_array_utils.h b/native/jni/src/suggest/core/dictionary/byte_array_utils.h new file mode 100644 index 000000000..832b74725 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/byte_array_utils.h @@ -0,0 +1,148 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BYTE_ARRAY_UTILS_H +#define LATINIME_BYTE_ARRAY_UTILS_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +/** + * Utility methods for reading byte arrays. + */ +class ByteArrayUtils { + public: + /** + * Integer + * + * Each method read a corresponding size integer in a big endian manner. + */ + static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16) + ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3]; + } + + static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2]; + } + + static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 8) ^ buffer[pos + 1]; + } + + static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) { + return buffer[pos]; + } + + static AK_FORCE_INLINE uint32_t readUint32andAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint32_t value = readUint32(buffer, *pos); + *pos += 4; + return value; + } + + static AK_FORCE_INLINE uint32_t readUint24andAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint32_t value = readUint24(buffer, *pos); + *pos += 3; + return value; + } + + static AK_FORCE_INLINE uint16_t readUint16andAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint16_t value = readUint16(buffer, *pos); + *pos += 2; + return value; + } + + static AK_FORCE_INLINE uint8_t readUint8andAdvancePosition( + const uint8_t *const buffer, int *const pos) { + return buffer[(*pos)++]; + } + + /** + * Code Point + * + * 1 byte = bbbbbbbb match + * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte + * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because + * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with + * 00011111 would be outside unicode. + * else: iso-latin-1 code + * This allows for the whole unicode range to be encoded, including chars outside of + * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control + * characters which should never happen anyway (and still work, but take 3 bytes). + */ + static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) { + int p = pos; + return readCodePointAndAdvancePosition(buffer, &p); + } + + static AK_FORCE_INLINE int readCodePointAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint8_t firstByte = readUint8(buffer, *pos); + if (firstByte < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { + if (firstByte == CHARACTER_ARRAY_TERMINATOR) { + *pos += 1; + return NOT_A_CODE_POINT; + } else { + return readUint24andAdvancePosition(buffer, pos); + } + } else { + *pos += 1; + return firstByte; + } + } + + /** + * String (array of code points) + * + * Reads code points until the terminator is found. + */ + // Returns the length of the string. + static int readStringAndAdvancePosition(const uint8_t *const buffer, int *const pos, + int *const outBuffer, const int maxLength) { + int length = 0; + int codePoint = readCodePointAndAdvancePosition(buffer, pos); + while (NOT_A_CODE_POINT != codePoint && length < maxLength) { + outBuffer[length++] = codePoint; + codePoint = readCodePointAndAdvancePosition(buffer, pos); + } + return length; + } + + // Advances the position and returns the length of the string. + static int advancePositionToBehindString( + const uint8_t *const buffer, int *const pos, const int maxLength) { + int length = 0; + int codePoint = readCodePointAndAdvancePosition(buffer, pos); + while (NOT_A_CODE_POINT != codePoint && length < maxLength) { + codePoint = readCodePointAndAdvancePosition(buffer, pos); + } + return length; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils); + + static const uint8_t MINIMAL_ONE_BYTE_CHARACTER_VALUE; + static const uint8_t CHARACTER_ARRAY_TERMINATOR; +}; +} // namespace latinime +#endif /* LATINIME_BYTE_ARRAY_UTILS_H */ diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index 1939c7420..6fd755dfe 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -21,12 +21,12 @@ #include <map> // TODO: remove #include <stdint.h> -#include "bigram_dictionary.h" #include "defines.h" #include "dic_traverse_wrapper.h" -#include "suggest_options.h" -#include "suggest/core/suggest.h" +#include "suggest/core/dictionary/bigram_dictionary.h" #include "suggest/core/dictionary/binary_format.h" +#include "suggest/core/suggest.h" +#include "suggest/core/suggest_options.h" #include "suggest/policyimpl/gesture/gesture_suggest_policy_factory.h" #include "suggest/policyimpl/typing/typing_suggest_policy_factory.h" #include "unigram_dictionary.h" @@ -34,13 +34,11 @@ namespace latinime { Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust) - : mDict(static_cast<unsigned char *>(dict)), - mOffsetDict((static_cast<unsigned char *>(dict)) - + BinaryFormat::getHeaderSize(mDict, dictSize)), + : mBinaryDicitonaryInfo(static_cast<const uint8_t *>(dict), dictSize), mDictSize(dictSize), mMmapFd(mmapFd), mDictBufAdjust(dictBufAdjust), - mUnigramDictionary(new UnigramDictionary(mOffsetDict, - BinaryFormat::getFlags(mDict, dictSize))), - mBigramDictionary(new BigramDictionary(mOffsetDict)), + mUnigramDictionary(new UnigramDictionary(&mBinaryDicitonaryInfo, + BinaryFormat::getFlags(mBinaryDicitonaryInfo.getDictBuf(), dictSize))), + mBigramDictionary(new BigramDictionary(&mBinaryDicitonaryInfo)), mGestureSuggest(new Suggest(GestureSuggestPolicyFactory::getGestureSuggestPolicy())), mTypingSuggest(new Suggest(TypingSuggestPolicyFactory::getTypingSuggestPolicy())) { } diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index e6861a3dd..771837bc6 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -20,6 +20,7 @@ #include <stdint.h> #include "defines.h" +#include "suggest/core/dictionary/binary_dictionary_info.h" namespace latinime { @@ -64,11 +65,8 @@ class Dictionary { int getProbability(const int *word, int length) const; bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const; - const uint8_t *getDict() const { // required to release dictionary buffer - return mDict; - } - const uint8_t *getOffsetDict() const { - return mOffsetDict; + const BinaryDictionaryInfo *getBinaryDictionaryInfo() const { + return &mBinaryDicitonaryInfo; } int getDictSize() const { return mDictSize; } int getMmapFd() const { return mMmapFd; } @@ -78,9 +76,8 @@ class Dictionary { private: DISALLOW_IMPLICIT_CONSTRUCTORS(Dictionary); - const uint8_t *mDict; - const uint8_t *mOffsetDict; + const BinaryDictionaryInfo mBinaryDicitonaryInfo; // Used only for the mmap version of dictionary loading, but we use these as dummy variables // also for the malloc version. const int mDictSize; diff --git a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp index 7a0f755e5..f53e56ef1 100644 --- a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp @@ -16,9 +16,9 @@ #include "suggest/core/dictionary/digraph_utils.h" -#include "char_utils.h" #include "defines.h" #include "suggest/core/dictionary/binary_format.h" +#include "utils/char_utils.h" namespace latinime { @@ -122,7 +122,7 @@ const DigraphUtils::DigraphType DigraphUtils::USED_DIGRAPH_TYPES[] = /* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForDigraphTypeAndCodePoint( const DigraphUtils::DigraphType digraphType, const int compositeGlyphCodePoint) { const DigraphUtils::digraph_t *digraphs = 0; - const int compositeGlyphLowerCodePoint = toLowerCase(compositeGlyphCodePoint); + const int compositeGlyphLowerCodePoint = CharUtils::toLowerCase(compositeGlyphCodePoint); const int digraphsSize = DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(digraphType, &digraphs); for (int i = 0; i < digraphsSize; i++) { diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h index fcac98f35..ba97e5842 100644 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h +++ b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h @@ -17,11 +17,10 @@ #ifndef LATINIME_MULTI_BIGRAM_MAP_H #define LATINIME_MULTI_BIGRAM_MAP_H -#include <stdint.h> - #include "defines.h" -#include "hash_map_compat.h" +#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" +#include "utils/hash_map_compat.h" namespace latinime { @@ -35,20 +34,20 @@ class MultiBigramMap { // Look up the bigram probability for the given word pair from the cached bigram maps. // Also caches the bigrams if there is space remaining and they have not been cached already. - int getBigramProbability(const uint8_t *const dicRoot, const int wordPosition, - const int nextWordPosition, const int unigramProbability) { + int getBigramProbability(const BinaryDictionaryInfo *const binaryDicitonaryInfo, + const int wordPosition, const int nextWordPosition, const int unigramProbability) { hash_map_compat<int, BigramMap>::const_iterator mapPosition = mBigramMaps.find(wordPosition); if (mapPosition != mBigramMaps.end()) { return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability); } if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { - addBigramsForWordPosition(dicRoot, wordPosition); + addBigramsForWordPosition(binaryDicitonaryInfo, wordPosition); return mBigramMaps[wordPosition].getBigramProbability( nextWordPosition, unigramProbability); } - return BinaryFormat::getBigramProbability( - dicRoot, wordPosition, nextWordPosition, unigramProbability); + return BinaryFormat::getBigramProbability(binaryDicitonaryInfo->getDictRoot(), + wordPosition, nextWordPosition, unigramProbability); } void clear() { @@ -63,8 +62,9 @@ class MultiBigramMap { BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP) {} ~BigramMap() {} - void init(const uint8_t *const dicRoot, int position) { - BinaryFormat::fillBigramProbabilityToHashMap(dicRoot, position, &mBigramMap); + void init(const BinaryDictionaryInfo *const binaryDicitonaryInfo, const int position) { + BinaryFormat::fillBigramProbabilityToHashMap( + binaryDicitonaryInfo->getDictRoot(), position, &mBigramMap); } inline int getBigramProbability(const int nextWordPosition, const int unigramProbability) @@ -78,8 +78,9 @@ class MultiBigramMap { hash_map_compat<int, int> mBigramMap; }; - void addBigramsForWordPosition(const uint8_t *const dicRoot, const int position) { - mBigramMaps[position].init(dicRoot, position); + void addBigramsForWordPosition(const BinaryDictionaryInfo *const binaryDicitonaryInfo, + const int position) { + mBigramMaps[position].init(binaryDicitonaryInfo, position); } hash_map_compat<int, BigramMap> mBigramMaps; diff --git a/native/jni/src/suggest/core/dictionary/probability_utils.h b/native/jni/src/suggest/core/dictionary/probability_utils.h new file mode 100644 index 000000000..14d2f8436 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/probability_utils.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROBABILITY_UTILS_H +#define LATINIME_PROBABILITY_UTILS_H + +#include <map> +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class ProbabilityUtils { + public: + static AK_FORCE_INLINE int backoff(const int unigramProbability) { + return unigramProbability; + // For some reason, applying the backoff weight gives bad results in tests. To apply the + // backoff weight, we divide the probability by 2, which in our storing format means + // decreasing the score by 8. + // TODO: figure out what's wrong with this. + // return unigramProbability > 8 ? + // unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); + } + + static AK_FORCE_INLINE int computeProbabilityForBigram( + const int unigramProbability, const int bigramProbability) { + // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want + // the unigram probability to be the median value of the 17th step from the top. A value of + // 0 for the bigram probability represents the middle of the 16th step from the top, + // while a value of 15 represents the middle of the top step. + // See makedict.BinaryDictInputOutput for details. + const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) + / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); + return unigramProbability + + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); + } + + // This returns a probability in log space. + static AK_FORCE_INLINE int getProbability(const int position, + const std::map<int, int> *const bigramMap, + const uint8_t *bigramFilter, const int unigramProbability) { + if (!bigramMap || !bigramFilter) { + return backoff(unigramProbability); + } + if (!isInFilter(bigramFilter, position)){ + return backoff(unigramProbability); + } + const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); + if (bigramProbabilityIt != bigramMap->end()) { + const int bigramProbability = bigramProbabilityIt->second; + return computeProbabilityForBigram(unigramProbability, bigramProbability); + } + return backoff(unigramProbability); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils); +}; +} +#endif /* LATINIME_PROBABILITY_UTILS_H */ diff --git a/native/jni/src/suggest/core/dictionary/terminal_attributes.h b/native/jni/src/suggest/core/dictionary/terminal_attributes.h index 8377c603d..bbd9af090 100644 --- a/native/jni/src/suggest/core/dictionary/terminal_attributes.h +++ b/native/jni/src/suggest/core/dictionary/terminal_attributes.h @@ -19,6 +19,7 @@ #include <stdint.h> +#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" namespace latinime { @@ -32,8 +33,9 @@ class TerminalAttributes { public: class ShortcutIterator { public: - ShortcutIterator(const uint8_t *dict, const int pos, const uint8_t flags) - : mDict(dict), mPos(pos), + ShortcutIterator(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int pos, + const uint8_t flags) + : mBinaryDicitionaryInfo(binaryDictionaryInfo), mPos(pos), mHasNextShortcutTarget(0 != (flags & BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS)) { } @@ -44,11 +46,13 @@ class TerminalAttributes { // Gets the shortcut target itself as an int string. For parameters and return value // see BinaryFormat::getWordAtAddress. inline int getNextShortcutTarget(const int maxDepth, int *outWord, int *outFreq) { - const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos); + const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer( + mBinaryDicitionaryInfo->getDictRoot(), &mPos); mHasNextShortcutTarget = 0 != (shortcutFlags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT); unsigned int i; for (i = 0; i < MAX_WORD_LENGTH; ++i) { - const int codePoint = BinaryFormat::getCodePointAndForwardPointer(mDict, &mPos); + const int codePoint = BinaryFormat::getCodePointAndForwardPointer( + mBinaryDicitionaryInfo->getDictRoot(), &mPos); if (NOT_A_CODE_POINT == codePoint) break; outWord[i] = codePoint; } @@ -57,19 +61,21 @@ class TerminalAttributes { } private: - const uint8_t *const mDict; + const BinaryDictionaryInfo *const mBinaryDicitionaryInfo; int mPos; bool mHasNextShortcutTarget; }; - TerminalAttributes(const uint8_t *const dict, const uint8_t flags, const int pos) - : mDict(dict), mFlags(flags), mStartPos(pos) { + TerminalAttributes(const BinaryDictionaryInfo *const binaryDicitonaryInfo, + const uint8_t flags, const int pos) + : mBinaryDicitionaryInfo(binaryDicitonaryInfo), mFlags(flags), mStartPos(pos) { } inline ShortcutIterator getShortcutIterator() const { // The size of the shortcuts is stored here so that the whole shortcut chunk can be // skipped quickly, so we ignore it. - return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags); + return ShortcutIterator( + mBinaryDicitionaryInfo, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags); } bool isBlacklistedOrNotAWord() const { @@ -78,7 +84,7 @@ class TerminalAttributes { private: DISALLOW_IMPLICIT_CONSTRUCTORS(TerminalAttributes); - const uint8_t *const mDict; + const BinaryDictionaryInfo *const mBinaryDicitionaryInfo; const uint8_t mFlags; const int mStartPos; }; diff --git a/native/jni/src/suggest/core/layout/proximity_info.cpp b/native/jni/src/suggest/core/layout/proximity_info.cpp index 6dd88051c..80355c148 100644 --- a/native/jni/src/suggest/core/layout/proximity_info.cpp +++ b/native/jni/src/suggest/core/layout/proximity_info.cpp @@ -21,12 +21,12 @@ #include <cstring> #include <cmath> -#include "char_utils.h" #include "defines.h" #include "jni.h" #include "suggest/core/layout/additional_proximity_chars.h" #include "suggest/core/layout/geometry_utils.h" #include "suggest/core/layout/proximity_info_params.h" +#include "utils/char_utils.h" namespace latinime { @@ -165,7 +165,7 @@ void ProximityInfo::initializeG() { // TODO: Optimize for (int i = 0; i < KEY_COUNT; ++i) { const int code = mKeyCodePoints[i]; - const int lowerCode = toLowerCase(code); + const int lowerCode = CharUtils::toLowerCase(code); mCenterXsG[i] = mKeyXCoordinates[i] + mKeyWidths[i] / 2; mCenterYsG[i] = mKeyYCoordinates[i] + mKeyHeights[i] / 2; mCodeToKeyMap[lowerCode] = i; diff --git a/native/jni/src/suggest/core/layout/proximity_info.h b/native/jni/src/suggest/core/layout/proximity_info.h index 6d2ddd4bc..6ca2fdd7b 100644 --- a/native/jni/src/suggest/core/layout/proximity_info.h +++ b/native/jni/src/suggest/core/layout/proximity_info.h @@ -18,9 +18,9 @@ #define LATINIME_PROXIMITY_INFO_H #include "defines.h" -#include "hash_map_compat.h" #include "jni.h" #include "suggest/core/layout/proximity_info_utils.h" +#include "utils/hash_map_compat.h" namespace latinime { diff --git a/native/jni/src/suggest/core/layout/proximity_info_state.cpp b/native/jni/src/suggest/core/layout/proximity_info_state.cpp index 2bd3ceb7e..4e53992d4 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_state.cpp +++ b/native/jni/src/suggest/core/layout/proximity_info_state.cpp @@ -26,6 +26,7 @@ #include "suggest/core/layout/geometry_utils.h" #include "suggest/core/layout/proximity_info.h" #include "suggest/core/layout/proximity_info_state_utils.h" +#include "utils/char_utils.h" namespace latinime { @@ -175,7 +176,7 @@ float ProximityInfoState::getPointToKeyLength( const int index = inputIndex * mProximityInfo->getKeyCount() + keyId; return min(mSampledNormalizedSquaredLengthCache[index], mMaxPointToKeyLength); } - if (isIntentionalOmissionCodePoint(codePoint)) { + if (CharUtils::isIntentionalOmissionCodePoint(codePoint)) { return 0.0f; } // If the char is not a key on the keyboard then return the max length. @@ -203,7 +204,7 @@ ProximityType ProximityInfoState::getProximityType(const int index, const int co const bool checkProximityChars, int *proximityIndex) const { const int *currentCodePoints = getProximityCodePointsAt(index); const int firstCodePoint = currentCodePoints[0]; - const int baseLowerC = toBaseLowerCase(codePoint); + const int baseLowerC = CharUtils::toBaseLowerCase(codePoint); // The first char in the array is what user typed. If it matches right away, that means the // user typed that same char for this pos. @@ -215,7 +216,7 @@ ProximityType ProximityInfoState::getProximityType(const int index, const int co // If the non-accented, lowercased version of that first character matches c, then we have a // non-accented version of the accented character the user typed. Treat it as a close char. - if (toBaseLowerCase(firstCodePoint) == baseLowerC) { + if (CharUtils::toBaseLowerCase(firstCodePoint) == baseLowerC) { return PROXIMITY_CHAR; } @@ -257,8 +258,8 @@ ProximityType ProximityInfoState::getProximityTypeG(const int index, const int c if (!isUsed()) { return UNRELATED_CHAR; } - const int lowerCodePoint = toLowerCase(codePoint); - const int baseLowerCodePoint = toBaseCodePoint(lowerCodePoint); + const int lowerCodePoint = CharUtils::toLowerCase(codePoint); + const int baseLowerCodePoint = CharUtils::toBaseCodePoint(lowerCodePoint); for (int i = 0; i < static_cast<int>(mSampledSearchKeyVectors[index].size()); ++i) { if (mSampledSearchKeyVectors[index][i] == lowerCodePoint || mSampledSearchKeyVectors[index][i] == baseLowerCodePoint) { diff --git a/native/jni/src/suggest/core/layout/proximity_info_state.h b/native/jni/src/suggest/core/layout/proximity_info_state.h index fd09307fe..0079ab5b8 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_state.h +++ b/native/jni/src/suggest/core/layout/proximity_info_state.h @@ -20,11 +20,10 @@ #include <cstring> // for memset() #include <vector> -#include "char_utils.h" #include "defines.h" -#include "hash_map_compat.h" #include "suggest/core/layout/proximity_info_params.h" #include "suggest/core/layout/proximity_info_state_utils.h" +#include "utils/hash_map_compat.h" namespace latinime { diff --git a/native/jni/src/suggest/core/layout/proximity_info_state_utils.h b/native/jni/src/suggest/core/layout/proximity_info_state_utils.h index 1837c7ab6..66fe07926 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_state_utils.h +++ b/native/jni/src/suggest/core/layout/proximity_info_state_utils.h @@ -21,7 +21,7 @@ #include <vector> #include "defines.h" -#include "hash_map_compat.h" +#include "utils/hash_map_compat.h" namespace latinime { class ProximityInfo; diff --git a/native/jni/src/suggest/core/layout/proximity_info_utils.h b/native/jni/src/suggest/core/layout/proximity_info_utils.h index c3a275b3c..54f7539d1 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_utils.h +++ b/native/jni/src/suggest/core/layout/proximity_info_utils.h @@ -19,11 +19,11 @@ #include <cmath> -#include "char_utils.h" #include "defines.h" -#include "hash_map_compat.h" #include "suggest/core/layout/additional_proximity_chars.h" #include "suggest/core/layout/geometry_utils.h" +#include "utils/char_utils.h" +#include "utils/hash_map_compat.h" namespace latinime { class ProximityInfoUtils { @@ -37,7 +37,7 @@ class ProximityInfoUtils { if (c == NOT_A_CODE_POINT) { return NOT_AN_INDEX; } - const int lowerCode = toLowerCase(c); + const int lowerCode = CharUtils::toLowerCase(c); hash_map_compat<int, int>::const_iterator mapPos = codeToKeyMap->find(lowerCode); if (mapPos != codeToKeyMap->end()) { return mapPos->second; diff --git a/native/jni/src/suggest/core/policy/weighting.cpp b/native/jni/src/suggest/core/policy/weighting.cpp index d01531f07..0c57ca001 100644 --- a/native/jni/src/suggest/core/policy/weighting.cpp +++ b/native/jni/src/suggest/core/policy/weighting.cpp @@ -16,7 +16,6 @@ #include "suggest/core/policy/weighting.h" -#include "char_utils.h" #include "defines.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_profiler.h" @@ -143,7 +142,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n case CT_TERMINAL: { const float languageImprobability = DicNodeUtils::getBigramNodeImprobability( - traverseSession->getOffsetDict(), dicNode, multiBigramMap); + traverseSession->getBinaryDictionaryInfo(), dicNode, multiBigramMap); return weighting->getTerminalLanguageCost(traverseSession, dicNode, languageImprobability); } case CT_NEW_WORD_SPACE_SUBSTITUTION: diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.cpp b/native/jni/src/suggest/core/session/dic_traverse_session.cpp index 4e634500c..be293df42 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.cpp +++ b/native/jni/src/suggest/core/session/dic_traverse_session.cpp @@ -20,6 +20,7 @@ #include "dic_traverse_wrapper.h" #include "jni.h" #include "suggest/core/dicnode/dic_node_utils.h" +#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/dictionary.h" @@ -65,7 +66,8 @@ static TraverseSessionFactoryRegisterer traverseSessionFactoryRegisterer; void DicTraverseSession::init(const Dictionary *const dictionary, const int *prevWord, int prevWordLength, const SuggestOptions *const suggestOptions) { mDictionary = dictionary; - mMultiWordCostMultiplier = BinaryFormat::getMultiWordCostMultiplier(mDictionary->getDict(), + mMultiWordCostMultiplier = BinaryFormat::getMultiWordCostMultiplier( + mDictionary->getBinaryDictionaryInfo()->getDictBuf(), mDictionary->getDictSize()); mSuggestOptions = suggestOptions; if (!prevWord) { @@ -73,12 +75,14 @@ void DicTraverseSession::init(const Dictionary *const dictionary, const int *pre return; } // TODO: merge following similar calls to getTerminalPosition into one case-insensitive call. - mPrevWordPos = BinaryFormat::getTerminalPosition(dictionary->getOffsetDict(), prevWord, + mPrevWordPos = BinaryFormat::getTerminalPosition( + dictionary->getBinaryDictionaryInfo()->getDictRoot(), prevWord, prevWordLength, false /* forceLowerCaseSearch */); if (mPrevWordPos == NOT_VALID_WORD) { // Check bigrams for lower-cased previous word if original was not found. Useful for // auto-capitalized words like "The [current_word]". - mPrevWordPos = BinaryFormat::getTerminalPosition(dictionary->getOffsetDict(), prevWord, + mPrevWordPos = BinaryFormat::getTerminalPosition( + dictionary->getBinaryDictionaryInfo()->getDictRoot(), prevWord, prevWordLength, true /* forceLowerCaseSearch */); } } @@ -93,8 +97,8 @@ void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo, maxSpatialDistance, maxPointerCount); } -const uint8_t *DicTraverseSession::getOffsetDict() const { - return mDictionary->getOffsetDict(); +const BinaryDictionaryInfo *DicTraverseSession::getBinaryDictionaryInfo() const { + return mDictionary->getBinaryDictionaryInfo(); } int DicTraverseSession::getDictFlags() const { diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.h b/native/jni/src/suggest/core/session/dic_traverse_session.h index e5c7f8e0c..3b6a3dc8c 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.h +++ b/native/jni/src/suggest/core/session/dic_traverse_session.h @@ -28,6 +28,7 @@ namespace latinime { +class BinaryDictionaryInfo; class Dictionary; class ProximityInfo; class SuggestOptions; @@ -56,7 +57,7 @@ class DicTraverseSession { void resetCache(const int nextActiveCacheSize, const int maxWords); // TODO: Remove - const uint8_t *getOffsetDict() const; + const BinaryDictionaryInfo *getBinaryDictionaryInfo() const; int getDictFlags() const; //-------------------- diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp index 94441877a..1f108e400 100644 --- a/native/jni/src/suggest/core/suggest.cpp +++ b/native/jni/src/suggest/core/suggest.cpp @@ -16,7 +16,6 @@ #include "suggest/core/suggest.h" -#include "char_utils.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_priority_queue.h" #include "suggest/core/dicnode/dic_node_vector.h" @@ -106,8 +105,8 @@ void Suggest::initializeSearch(DicTraverseSession *traverseSession, int commitPo traverseSession->resetCache(TRAVERSAL->getMaxCacheSize(), MAX_RESULTS); // Create a new dic node here DicNode rootNode; - DicNodeUtils::initAsRoot(traverseSession->getDicRootPos(), - traverseSession->getOffsetDict(), traverseSession->getPrevWordPos(), &rootNode); + DicNodeUtils::initAsRoot(traverseSession->getBinaryDictionaryInfo(), + traverseSession->getPrevWordPos(), &rootNode); traverseSession->getDicTraverseCache()->copyPushActive(&rootNode); } } @@ -159,7 +158,7 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel); const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight) + doubleLetterCost; - const TerminalAttributes terminalAttributes(traverseSession->getOffsetDict(), + const TerminalAttributes terminalAttributes(traverseSession->getBinaryDictionaryInfo(), terminalDicNode->getFlags(), terminalDicNode->getAttributesPos()); const bool isPossiblyOffensiveWord = terminalDicNode->getProbability() <= 0; const bool isExactMatch = terminalDicNode->isExactMatch(); @@ -285,7 +284,7 @@ void Suggest::expandCurrentDicNodes(DicTraverseSession *traverseSession) const { } DicNodeUtils::getAllChildDicNodes( - &dicNode, traverseSession->getOffsetDict(), &childDicNodes); + &dicNode, traverseSession->getBinaryDictionaryInfo(), &childDicNodes); const int childDicNodesSize = childDicNodes.getSizeAndLock(); for (int i = 0; i < childDicNodesSize; ++i) { @@ -432,7 +431,8 @@ void Suggest::processDicNodeAsDigraph(DicTraverseSession *traverseSession, void Suggest::processDicNodeAsOmission( DicTraverseSession *traverseSession, DicNode *dicNode) const { DicNodeVector childDicNodes; - DicNodeUtils::getAllChildDicNodes(dicNode, traverseSession->getOffsetDict(), &childDicNodes); + DicNodeUtils::getAllChildDicNodes( + dicNode, traverseSession->getBinaryDictionaryInfo(), &childDicNodes); const int size = childDicNodes.getSizeAndLock(); for (int i = 0; i < size; i++) { @@ -457,7 +457,7 @@ void Suggest::processDicNodeAsInsertion(DicTraverseSession *traverseSession, DicNode *dicNode) const { const int16_t pointIndex = dicNode->getInputIndex(0); DicNodeVector childDicNodes; - DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getOffsetDict(), + DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getBinaryDictionaryInfo(), traverseSession->getProximityInfoState(0), pointIndex + 1, true, &childDicNodes); const int size = childDicNodes.getSizeAndLock(); for (int i = 0; i < size; i++) { @@ -475,14 +475,14 @@ void Suggest::processDicNodeAsTransposition(DicTraverseSession *traverseSession, DicNode *dicNode) const { const int16_t pointIndex = dicNode->getInputIndex(0); DicNodeVector childDicNodes1; - DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getOffsetDict(), + DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getBinaryDictionaryInfo(), traverseSession->getProximityInfoState(0), pointIndex + 1, false, &childDicNodes1); const int childSize1 = childDicNodes1.getSizeAndLock(); for (int i = 0; i < childSize1; i++) { if (childDicNodes1[i]->hasChildren()) { DicNodeVector childDicNodes2; DicNodeUtils::getProximityChildDicNodes( - childDicNodes1[i], traverseSession->getOffsetDict(), + childDicNodes1[i], traverseSession->getBinaryDictionaryInfo(), traverseSession->getProximityInfoState(0), pointIndex, false, &childDicNodes2); const int childSize2 = childDicNodes2.getSizeAndLock(); for (int j = 0; j < childSize2; j++) { @@ -522,8 +522,8 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode // Create a non-cached node here. DicNode newDicNode; - DicNodeUtils::initAsRootWithPreviousWord(traverseSession->getDicRootPos(), - traverseSession->getOffsetDict(), dicNode, &newDicNode); + DicNodeUtils::initAsRootWithPreviousWord( + traverseSession->getBinaryDictionaryInfo(), dicNode, &newDicNode); const CorrectionType correctionType = spaceSubstitution ? CT_NEW_WORD_SPACE_SUBSTITUTION : CT_NEW_WORD_SPACE_OMITTION; Weighting::addCostAndForwardInputIndex(WEIGHTING, correctionType, traverseSession, dicNode, diff --git a/native/jni/src/suggest_options.h b/native/jni/src/suggest/core/suggest_options.h index 1b21aafcf..1b21aafcf 100644 --- a/native/jni/src/suggest_options.h +++ b/native/jni/src/suggest/core/suggest_options.h diff --git a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h index b212fe101..e21b318e6 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h @@ -19,7 +19,6 @@ #include <stdint.h> -#include "char_utils.h" #include "defines.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" @@ -27,6 +26,7 @@ #include "suggest/core/policy/traversal.h" #include "suggest/core/session/dic_traverse_session.h" #include "suggest/policyimpl/typing/scoring_params.h" +#include "utils/char_utils.h" namespace latinime { class TypingTraversal : public Traversal { @@ -64,9 +64,9 @@ class TypingTraversal : public Traversal { } const int point0Index = dicNode->getInputIndex(0); const int currentBaseLowerCodePoint = - toBaseLowerCase(childDicNode->getNodeCodePoint()); + CharUtils::toBaseLowerCase(childDicNode->getNodeCodePoint()); const int typedBaseLowerCodePoint = - toBaseLowerCase(traverseSession->getProximityInfoState(0) + CharUtils::toBaseLowerCase(traverseSession->getProximityInfoState(0) ->getPrimaryCodePointAt(point0Index)); return (currentBaseLowerCodePoint != typedBaseLowerCodePoint); } @@ -172,7 +172,7 @@ class TypingTraversal : public Traversal { } const int c = dicNode->getOutputWordBuf()[0]; const bool shortCappedWord = dicNode->getDepth() - < ScoringParams::THRESHOLD_SHORT_WORD_LENGTH && isAsciiUpper(c); + < ScoringParams::THRESHOLD_SHORT_WORD_LENGTH && CharUtils::isAsciiUpper(c); return !shortCappedWord || probability >= ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED; } diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h index cb6abd574..17fa11082 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h @@ -23,6 +23,7 @@ #include "suggest/core/policy/weighting.h" #include "suggest/core/session/dic_traverse_session.h" #include "suggest/policyimpl/typing/scoring_params.h" +#include "utils/char_utils.h" namespace latinime { @@ -98,9 +99,9 @@ class TypingWeighting : public Weighting { bool isProximityDicNode(const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { const int pointIndex = dicNode->getInputIndex(0); - const int primaryCodePoint = toBaseLowerCase( + const int primaryCodePoint = CharUtils::toBaseLowerCase( traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt(pointIndex)); - const int dicNodeChar = toBaseLowerCase(dicNode->getNodeCodePoint()); + const int dicNodeChar = CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint()); return primaryCodePoint != dicNodeChar; } @@ -145,7 +146,7 @@ class TypingWeighting : public Weighting { float getNewWordBigramCost(const DicTraverseSession *const traverseSession, const DicNode *const dicNode, MultiBigramMap *const multiBigramMap) const { - return DicNodeUtils::getBigramNodeImprobability(traverseSession->getOffsetDict(), + return DicNodeUtils::getBigramNodeImprobability(traverseSession->getBinaryDictionaryInfo(), dicNode, multiBigramMap) * ScoringParams::DISTANCE_WEIGHT_LANGUAGE; } diff --git a/native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h b/native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h index ec1457455..81614bc9c 100644 --- a/native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h +++ b/native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h @@ -17,8 +17,8 @@ #ifndef LATINIME_DAEMARU_LEVENSHTEIN_EDIT_DISTANCE_POLICY_H #define LATINIME_DAEMARU_LEVENSHTEIN_EDIT_DISTANCE_POLICY_H -#include "char_utils.h" #include "suggest/policyimpl/utils/edit_distance_policy.h" +#include "utils/char_utils.h" namespace latinime { @@ -31,8 +31,8 @@ class DamerauLevenshteinEditDistancePolicy : public EditDistancePolicy { ~DamerauLevenshteinEditDistancePolicy() {} AK_FORCE_INLINE float getSubstitutionCost(const int index0, const int index1) const { - const int c0 = toBaseLowerCase(mString0[index0]); - const int c1 = toBaseLowerCase(mString1[index1]); + const int c0 = CharUtils::toBaseLowerCase(mString0[index0]); + const int c1 = CharUtils::toBaseLowerCase(mString1[index1]); return (c0 == c1) ? 0.0f : 1.0f; } @@ -45,10 +45,10 @@ class DamerauLevenshteinEditDistancePolicy : public EditDistancePolicy { } AK_FORCE_INLINE bool allowTransposition(const int index0, const int index1) const { - const int c0 = toBaseLowerCase(mString0[index0]); - const int c1 = toBaseLowerCase(mString1[index1]); - if (index0 > 0 && index1 > 0 && c0 == toBaseLowerCase(mString1[index1 - 1]) - && c1 == toBaseLowerCase(mString0[index0 - 1])) { + const int c0 = CharUtils::toBaseLowerCase(mString0[index0]); + const int c1 = CharUtils::toBaseLowerCase(mString1[index1]); + if (index0 > 0 && index1 > 0 && c0 == CharUtils::toBaseLowerCase(mString1[index1 - 1]) + && c1 == CharUtils::toBaseLowerCase(mString0[index0 - 1])) { return true; } return false; diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index 1133256c4..5820a1d0e 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -18,13 +18,15 @@ #define LOG_TAG "LatinIME: unigram_dictionary.cpp" -#include "char_utils.h" #include "defines.h" +#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/digraph_utils.h" +#include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/dictionary/terminal_attributes.h" #include "suggest/core/layout/proximity_info.h" +#include "utils/char_utils.h" #include "unigram_dictionary.h" #include "words_priority_queue.h" #include "words_priority_queue_pool.h" @@ -32,8 +34,9 @@ namespace latinime { // TODO: check the header -UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, const unsigned int dictFlags) - : DICT_ROOT(streamStart), ROOT_POS(0), +UnigramDictionary::UnigramDictionary( + const BinaryDictionaryInfo *const binaryDicitonaryInfo, const uint8_t dictFlags) + : mBinaryDicitonaryInfo(binaryDicitonaryInfo), MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH), DICT_FLAGS(dictFlags) { if (DEBUG_DICT) { AKLOGI("UnigramDictionary - constructor"); @@ -315,9 +318,10 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, correction->setCorrectionParams(0, 0, 0, -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, doAutoCompletion, maxErrors); - int rootPosition = ROOT_POS; + int rootPosition = mBinaryDicitonaryInfo->getRootPosition(); // Get the number of children of root, then increment the position - int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition); + int childCount = BinaryFormat::getGroupCountAndForwardPointer( + mBinaryDicitonaryInfo->getDictRoot(), &rootPosition); int outputIndex = 0; correction->initCorrectionState(rootPosition, childCount, (inputSize <= 0)); @@ -696,8 +700,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); int pos = startPos; int codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); - int baseChar = toBaseLowerCase(codePoint); - const int wChar = toBaseLowerCase(inWord[startInputIndex]); + int baseChar = CharUtils::toBaseLowerCase(codePoint); + const int wChar = CharUtils::toBaseLowerCase(inWord[startInputIndex]); if (baseChar != wChar) { *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos; @@ -709,8 +713,9 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, if (hasMultipleChars) { codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); while (NOT_A_CODE_POINT != codePoint) { - baseChar = toBaseLowerCase(codePoint); - if (inputIndex + 1 >= inputSize || toBaseLowerCase(inWord[++inputIndex]) != baseChar) { + baseChar = CharUtils::toBaseLowerCase(codePoint); + if (inputIndex + 1 >= inputSize + || CharUtils::toBaseLowerCase(inWord[++inputIndex]) != baseChar) { *outPos = BinaryFormat::skipOtherCharacters(root, pos); *outInputIndex = startInputIndex; return false; @@ -746,7 +751,7 @@ int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, con int newWord[MAX_WORD_LENGTH]; int depth = 0; int maxFreq = -1; - const uint8_t *const root = DICT_ROOT; + const uint8_t *const root = mBinaryDicitonaryInfo->getDictRoot(); int stackChildCount[MAX_WORD_LENGTH]; int stackInputIndex[MAX_WORD_LENGTH]; int stackSiblingPos[MAX_WORD_LENGTH]; @@ -805,7 +810,7 @@ int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, con } int UnigramDictionary::getProbability(const int *const inWord, const int length) const { - const uint8_t *const root = DICT_ROOT; + const uint8_t *const root = mBinaryDicitonaryInfo->getDictRoot(); int pos = BinaryFormat::getTerminalPosition(root, inWord, length, false /* forceLowerCaseSearch */); if (NOT_VALID_WORD == pos) { @@ -822,7 +827,7 @@ int UnigramDictionary::getProbability(const int *const inWord, const int length) if (hasMultipleChars) { pos = BinaryFormat::skipOtherCharacters(root, pos); } else { - BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); + BinaryFormat::getCodePointAndForwardPointer(root, &pos); } const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(root, pos); return unigramProbability; @@ -864,7 +869,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not. // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children) // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not - const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos); + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer( + mBinaryDicitonaryInfo->getDictRoot(), &pos); const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags)); @@ -875,7 +881,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // else if FLAG_IS_TERMINAL: the probability // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address // Note that you can't have a node that both is not a terminal and has no children. - int c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); + int c = BinaryFormat::getCodePointAndForwardPointer( + mBinaryDicitonaryInfo->getDictRoot(), &pos); ASSERT(NOT_A_CODE_POINT != c); // We are going to loop through each character and make it look like it's a different @@ -889,8 +896,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // We prefetch the next char. If 'c' is the last char of this node, we will have // NOT_A_CODE_POINT in the next char. From this we can decide whether this virtual node // should behave as a terminal or not and whether we have children. - const int nextc = hasMultipleChars - ? BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CODE_POINT; + const int nextc = hasMultipleChars ? BinaryFormat::getCodePointAndForwardPointer( + mBinaryDicitonaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT; const bool isLastChar = (NOT_A_CODE_POINT == nextc); // If there are more chars in this nodes, then this virtual node is not a terminal. // If we are on the last char, this virtual node is a terminal if this node is. @@ -910,11 +917,11 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // We don't have to output other values because we return false, as in // "don't traverse children". if (!isLastChar) { - pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); + pos = BinaryFormat::skipOtherCharacters(mBinaryDicitonaryInfo->getDictRoot(), pos); } pos = BinaryFormat::skipProbability(flags, pos); - *nextSiblingPosition = - BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes( + mBinaryDicitonaryInfo->getDictRoot(), flags, pos); return false; } @@ -927,15 +934,15 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, if (isTerminalNode) { // The probability should be here, because we come here only if this is actually // a terminal node, and we are on its last char. - const int unigramProbability = - BinaryFormat::readProbabilityWithoutMovingPointer(DICT_ROOT, pos); + const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer( + mBinaryDicitonaryInfo->getDictRoot(), pos); const int childrenAddressPos = BinaryFormat::skipProbability(flags, pos); const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); - TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); + TerminalAttributes terminalAttributes(mBinaryDicitonaryInfo, flags, attributesPos); // bigramMap contains the bigram frequencies indexed by addresses for fast lookup. // bigramFilter is a bloom filter of said frequencies for even faster rejection. - const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter, - unigramProbability); + const int probability = ProbabilityUtils::getProbability( + initialPos, bigramMap, bigramFilter, unigramProbability); onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); @@ -951,16 +958,16 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // remaining char in this group for there can't be any. if (!hasChildren) { pos = BinaryFormat::skipProbability(flags, pos); - *nextSiblingPosition = - BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes( + mBinaryDicitonaryInfo->getDictRoot(), flags, pos); return false; } // Optimization: Prune out words that are too long compared to how much was typed. if (correction->needsToPrune()) { pos = BinaryFormat::skipProbability(flags, pos); - *nextSiblingPosition = - BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes( + mBinaryDicitonaryInfo->getDictRoot(), flags, pos); if (DEBUG_DICT_FULL) { AKLOGI("Traversing was pruned."); } @@ -979,9 +986,12 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, // Once this is read, we still need to output the number of nodes in the immediate children of // this node, so we read and output it before returning true, as in "please traverse children". pos = BinaryFormat::skipProbability(flags, pos); - int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos); - *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); - *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos); + int childrenPos = BinaryFormat::readChildrenPosition( + mBinaryDicitonaryInfo->getDictRoot(), flags, pos); + *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes( + mBinaryDicitonaryInfo->getDictRoot(), flags, pos); + *newCount = BinaryFormat::getGroupCountAndForwardPointer( + mBinaryDicitonaryInfo->getDictRoot(), &childrenPos); *newChildrenPosition = childrenPos; return true; } diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h index a50503256..4edd1f847 100644 --- a/native/jni/src/unigram_dictionary.h +++ b/native/jni/src/unigram_dictionary.h @@ -25,6 +25,7 @@ namespace latinime { +class BinaryDictionaryInfo; class Correction; class ProximityInfo; class TerminalAttributes; @@ -39,7 +40,10 @@ class UnigramDictionary { static const int FLAG_MULTIPLE_SUGGEST_ABORT = 0; static const int FLAG_MULTIPLE_SUGGEST_SKIP = 1; static const int FLAG_MULTIPLE_SUGGEST_CONTINUE = 2; - UnigramDictionary(const uint8_t *const streamStart, const unsigned int dictFlags); + + UnigramDictionary(const BinaryDictionaryInfo *const binaryDicitonaryInfo, + const uint8_t dictFlags); + virtual ~UnigramDictionary(); int getProbability(const int *const inWord, const int length) const; int getBigramPosition(int pos, int *word, int offset, int length) const; int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, @@ -48,7 +52,6 @@ class UnigramDictionary { const bool useFullEditDistance, int *outWords, int *frequencies, int *outputTypes) const; int getDictFlags() const { return DICT_FLAGS; } - virtual ~UnigramDictionary(); private: DISALLOW_IMPLICIT_CONSTRUCTORS(UnigramDictionary); @@ -108,8 +111,7 @@ class UnigramDictionary { const int outputWordLength, int *freqArray, int *wordLengthArray, int *outputWord) const; - const uint8_t *const DICT_ROOT; - const int ROOT_POS; + const BinaryDictionaryInfo *const mBinaryDicitonaryInfo; const int MAX_DIGRAPH_SEARCH_DEPTH; const int DICT_FLAGS; }; diff --git a/native/jni/src/char_utils.cpp b/native/jni/src/utils/char_utils.cpp index e219beb62..0e7039610 100644 --- a/native/jni/src/char_utils.cpp +++ b/native/jni/src/utils/char_utils.cpp @@ -14,9 +14,10 @@ * limitations under the License. */ +#include "utils/char_utils.h" + #include <cstdlib> -#include "char_utils.h" #include "defines.h" namespace latinime { @@ -36,8 +37,7 @@ struct LatinCapitalSmallPair { * $ apt-get install libicu-dev * * 3. Build the following code - * (You need this file, char_utils.h, and defines.h) - * $ g++ -o char_utils -DUPDATING_CHAR_UTILS char_utils.cpp -licuuc + * $ g++ -o char_utils -I.. -DUPDATING_CHAR_UTILS char_utils.cpp -licuuc */ #ifdef UPDATING_CHAR_UTILS #include <stdio.h> @@ -47,7 +47,7 @@ extern "C" int main() { for (unsigned short c = 0; c < 0xFFFF; c++) { if (c <= 0x7F) continue; const unsigned short icu4cLowerC = u_tolower(c); - const unsigned short myLowerC = latin_tolower(c); + const unsigned short myLowerC = CharUtils::latin_tolower(c); if (c != icu4cLowerC) { #ifdef CONFIRMING_CHAR_UTILS if (icu4cLowerC != myLowerC) { @@ -70,7 +70,7 @@ extern "C" int main() { * * 5. Update the SORTED_CHAR_MAP[] array below with the output above. * Then, rebuild with -DCONFIRMING_CHAR_UTILS and confirm the program exits successfully. - * $ g++ -o char_utils -DUPDATING_CHAR_UTILS -DCONFIRMING_CHAR_UTILS char_utils.cpp -licuuc + * $ g++ -o char_utils -I.. -DUPDATING_CHAR_UTILS -DCONFIRMING_CHAR_UTILS char_utils.cpp -licuuc * $ ./char_utils * $ */ @@ -1054,7 +1054,7 @@ static int compare_pair_capital(const void *a, const void *b) { - static_cast<int>((static_cast<const struct LatinCapitalSmallPair *>(b))->capital); } -unsigned short latin_tolower(const unsigned short c) { +/* static */ unsigned short CharUtils::latin_tolower(const unsigned short c) { struct LatinCapitalSmallPair *p = static_cast<struct LatinCapitalSmallPair *>(bsearch(&c, SORTED_CHAR_MAP, NELEMS(SORTED_CHAR_MAP), sizeof(SORTED_CHAR_MAP[0]), compare_pair_capital)); @@ -1063,7 +1063,7 @@ unsigned short latin_tolower(const unsigned short c) { /* * Table mapping most combined Latin, Greek, and Cyrillic characters - * to their base characters. If c is in range, BASE_CHARS[c] == c + * to their base characters. If c is in range, CharUtils::BASE_CHARS[c] == c * if c is not a combined character, or the base character if it * is combined. * @@ -1074,7 +1074,7 @@ unsigned short latin_tolower(const unsigned short c) { * for ($j = $i; $j < $i + 8; $j++) { \ * printf("0x%04X, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }' */ -const unsigned short BASE_CHARS[BASE_CHARS_SIZE] = { +/* static */ const unsigned short CharUtils::BASE_CHARS[CharUtils::BASE_CHARS_SIZE] = { /* U+0000 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, /* U+0008 */ 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, /* U+0010 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h new file mode 100644 index 000000000..2e735a81c --- /dev/null +++ b/native/jni/src/utils/char_utils.h @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_CHAR_UTILS_H +#define LATINIME_CHAR_UTILS_H + +#include <cctype> + +#include "defines.h" + +namespace latinime { + +class CharUtils { + public: + static AK_FORCE_INLINE bool isAsciiUpper(int c) { + // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to + // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...). + return (c >= 'A' && c <= 'Z'); + } + + static AK_FORCE_INLINE int toAsciiLower(int c) { + return c - 'A' + 'a'; + } + + static AK_FORCE_INLINE bool isAscii(int c) { + return isascii(c) != 0; + } + + static AK_FORCE_INLINE int toLowerCase(const int c) { + if (isAsciiUpper(c)) { + return toAsciiLower(c); + } + if (isAscii(c)) { + return c; + } + return static_cast<int>(latin_tolower(static_cast<unsigned short>(c))); + } + + static AK_FORCE_INLINE int toBaseLowerCase(const int c) { + return toLowerCase(toBaseCodePoint(c)); + } + + static AK_FORCE_INLINE bool isIntentionalOmissionCodePoint(const int codePoint) { + // TODO: Do not hardcode here + return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS; + } + + static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) { + int size = 0; + for (; size < arraySize; ++size) { + if (codePoints[size] == '\0') { + break; + } + } + return size; + } + + static AK_FORCE_INLINE int toBaseCodePoint(int c) { + if (c < BASE_CHARS_SIZE) { + return static_cast<int>(BASE_CHARS[c]); + } + return c; + } + + static unsigned short latin_tolower(const unsigned short c); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); + + /** + * Table mapping most combined Latin, Greek, and Cyrillic characters + * to their base characters. If c is in range, BASE_CHARS[c] == c + * if c is not a combined character, or the base character if it + * is combined. + */ + static const int BASE_CHARS_SIZE = 0x0500; + static const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; +}; +} // namespace latinime +#endif // LATINIME_CHAR_UTILS_H diff --git a/native/jni/src/hash_map_compat.h b/native/jni/src/utils/hash_map_compat.h index a1e982bc4..a1e982bc4 100644 --- a/native/jni/src/hash_map_compat.h +++ b/native/jni/src/utils/hash_map_compat.h |