aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--java/res/values-ru/strings.xml10
-rw-r--r--java/res/values-zh-rCN/strings.xml4
-rw-r--r--java/src/com/android/inputmethod/dictionarypack/DictionarySettingsFragment.java16
-rw-r--r--java/src/com/android/inputmethod/dictionarypack/WordListPreference.java9
-rw-r--r--java/src/com/android/inputmethod/latin/AssetFileAddress.java2
-rw-r--r--java/src/com/android/inputmethod/latin/DictionaryFactory.java23
-rw-r--r--java/src/com/android/inputmethod/latin/LatinIME.java13
-rw-r--r--java/src/com/android/inputmethod/latin/Suggest.java7
-rw-r--r--java/src/com/android/inputmethod/research/ResearchLogger.java32
-rw-r--r--native/jni/Android.mk8
-rw-r--r--native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp11
-rw-r--r--native/jni/src/char_utils.h88
-rw-r--r--native/jni/src/correction.cpp4
-rw-r--r--native/jni/src/correction.h9
-rw-r--r--native/jni/src/suggest/core/dicnode/dic_node.h6
-rw-r--r--native/jni/src/suggest/core/dicnode/dic_node_utils.cpp106
-rw-r--r--native/jni/src/suggest/core/dicnode/dic_node_utils.h26
-rw-r--r--native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp (renamed from native/jni/src/bigram_dictionary.cpp)23
-rw-r--r--native/jni/src/suggest/core/dictionary/bigram_dictionary.h (renamed from native/jni/src/bigram_dictionary.h)8
-rw-r--r--native/jni/src/suggest/core/dictionary/binary_dictionary_format.cpp84
-rw-r--r--native/jni/src/suggest/core/dictionary/binary_dictionary_format.h71
-rw-r--r--native/jni/src/suggest/core/dictionary/binary_dictionary_info.h58
-rw-r--r--native/jni/src/suggest/core/dictionary/binary_format.h65
-rw-r--r--native/jni/src/suggest/core/dictionary/byte_array_utils.cpp24
-rw-r--r--native/jni/src/suggest/core/dictionary/byte_array_utils.h148
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary.cpp16
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary.h11
-rw-r--r--native/jni/src/suggest/core/dictionary/digraph_utils.cpp4
-rw-r--r--native/jni/src/suggest/core/dictionary/multi_bigram_map.h25
-rw-r--r--native/jni/src/suggest/core/dictionary/probability_utils.h74
-rw-r--r--native/jni/src/suggest/core/dictionary/terminal_attributes.h24
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info.cpp4
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info.h2
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info_state.cpp11
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info_state.h3
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info_state_utils.h2
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info_utils.h6
-rw-r--r--native/jni/src/suggest/core/policy/weighting.cpp3
-rw-r--r--native/jni/src/suggest/core/session/dic_traverse_session.cpp14
-rw-r--r--native/jni/src/suggest/core/session/dic_traverse_session.h3
-rw-r--r--native/jni/src/suggest/core/suggest.cpp22
-rw-r--r--native/jni/src/suggest/core/suggest_options.h (renamed from native/jni/src/suggest_options.h)0
-rw-r--r--native/jni/src/suggest/policyimpl/typing/typing_traversal.h8
-rw-r--r--native/jni/src/suggest/policyimpl/typing/typing_weighting.h7
-rw-r--r--native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h14
-rw-r--r--native/jni/src/unigram_dictionary.cpp72
-rw-r--r--native/jni/src/unigram_dictionary.h10
-rw-r--r--native/jni/src/utils/char_utils.cpp (renamed from native/jni/src/char_utils.cpp)16
-rw-r--r--native/jni/src/utils/char_utils.h93
-rw-r--r--native/jni/src/utils/hash_map_compat.h (renamed from native/jni/src/hash_map_compat.h)0
50 files changed, 902 insertions, 397 deletions
diff --git a/java/res/values-ru/strings.xml b/java/res/values-ru/strings.xml
index 3ee986930..b00fa65a0 100644
--- a/java/res/values-ru/strings.xml
+++ b/java/res/values-ru/strings.xml
@@ -33,19 +33,19 @@
<string name="misc_category" msgid="6894192814868233453">"Другие варианты"</string>
<string name="advanced_settings" msgid="362895144495591463">"Расширенные настройки"</string>
<string name="advanced_settings_summary" msgid="4487980456152830271">"Для опытных пользователей"</string>
- <string name="include_other_imes_in_language_switch_list" msgid="4533689960308565519">"Другой способ ввода"</string>
+ <string name="include_other_imes_in_language_switch_list" msgid="4533689960308565519">"Смена способов ввода"</string>
<string name="include_other_imes_in_language_switch_list_summary" msgid="840637129103317635">"Клавиша переключения языков также служит для смены способа ввода"</string>
<string name="show_language_switch_key" msgid="5915478828318774384">"Клавиша смены языка"</string>
<string name="show_language_switch_key_summary" msgid="7343403647474265713">"Показывать, когда включено несколько раскладок"</string>
- <string name="sliding_key_input_preview" msgid="6604262359510068370">"Показывать индикатор перехода"</string>
- <string name="sliding_key_input_preview_summary" msgid="6340524345729093886">"Индикатор перехода между регистрами или цифр. и букв. режимами"</string>
+ <string name="sliding_key_input_preview" msgid="6604262359510068370">"След от переключателя режима"</string>
+ <string name="sliding_key_input_preview_summary" msgid="6340524345729093886">"Показывать след при проведении пальцем от кнопок Shift и \"Символы\""</string>
<string name="key_preview_popup_dismiss_delay" msgid="6213164897443068248">"Задержка закрытия"</string>
<string name="key_preview_popup_dismiss_no_delay" msgid="2096123151571458064">"Без задержки"</string>
<string name="key_preview_popup_dismiss_default_delay" msgid="2166964333903906734">"По умолчанию"</string>
<string name="abbreviation_unit_milliseconds" msgid="8700286094028323363">"<xliff:g id="MILLISECONDS">%s</xliff:g> мс"</string>
<string name="use_contacts_dict" msgid="4435317977804180815">"Подсказывать имена"</string>
<string name="use_contacts_dict_summary" msgid="6599983334507879959">"Подсказывать исправления на основе имен из списка контактов"</string>
- <string name="use_double_space_period" msgid="8781529969425082860">"Точка с пробелом"</string>
+ <string name="use_double_space_period" msgid="8781529969425082860">"Ставить точки автоматически"</string>
<string name="use_double_space_period_summary" msgid="6532892187247952799">"Вводить точку с пробелом двойным нажатием кнопки \"Пробел\"."</string>
<string name="auto_cap" msgid="1719746674854628252">"Заглавные автоматически"</string>
<string name="auto_cap_summary" msgid="7934452761022946874">"Писать первое слово предложения с прописной буквы"</string>
@@ -164,7 +164,7 @@
<string name="prefs_key_longpress_timeout_settings" msgid="6102240298932897873">"Долгое нажатие"</string>
<string name="prefs_keypress_vibration_duration_settings" msgid="7918341459947439226">"Вибросигнал при нажатии клавиш"</string>
<string name="prefs_keypress_sound_volume_settings" msgid="6027007337036891623">"Звук при нажатии клавиш"</string>
- <string name="prefs_read_external_dictionary" msgid="2588931418575013067">"Считывать данные из внешнего словаря"</string>
+ <string name="prefs_read_external_dictionary" msgid="2588931418575013067">"Загрузить словарь из файла"</string>
<string name="read_external_dictionary_no_files_message" msgid="4947420942224623792">"В папке \"Загрузки\" нет словарей"</string>
<string name="read_external_dictionary_multiple_files_title" msgid="7637749044265808628">"Выберите файл словаря"</string>
<string name="read_external_dictionary_confirm_install_message" msgid="6898610163768980870">"Установить этот файл для следующего языка: <xliff:g id="LOCALE_NAME">%s</xliff:g>?"</string>
diff --git a/java/res/values-zh-rCN/strings.xml b/java/res/values-zh-rCN/strings.xml
index 4bb2ce438..a5bbab2a8 100644
--- a/java/res/values-zh-rCN/strings.xml
+++ b/java/res/values-zh-rCN/strings.xml
@@ -212,9 +212,9 @@
<string name="install_dict" msgid="180852772562189365">"安装"</string>
<string name="cancel_download_dict" msgid="7843340278507019303">"取消"</string>
<string name="delete_dict" msgid="756853268088330054">"删除"</string>
- <string name="should_download_over_metered_prompt" msgid="2878629598667658845">"支持您移动设备上所选语言的词典现已可供下载啦!&lt;br/&gt;建议您&lt;b&gt;下载&lt;/b&gt;这部<xliff:g id="LANGUAGE">%1$s</xliff:g>词典,以享受更好的输入体验。&lt;br/&gt;&lt;br/&gt;通过 3G 进行下载可能需要 1 到 2 分钟的时间。如果您使用的不是&lt;b&gt;无流量限制的套餐&lt;/b&gt;,则可能需要支付一定的费用。&lt;br/&gt;如果您不确定自己使用的是哪种流量套餐,建议您使用 Wi-Fi 连接自动开始下载。&lt;br/&gt;&lt;br/&gt;提示:您可以访问移动设备的&lt;b&gt;设置&lt;/b&gt;菜单中的&lt;b&gt;语言和输入法&lt;/b&gt;,来下载和删除词典。"</string>
+ <string name="should_download_over_metered_prompt" msgid="2878629598667658845">"支持您移动设备上所选语言的词典现已可供下载啦!&lt;br/&gt;建议您&lt;b&gt;下载&lt;/b&gt;这部<xliff:g id="LANGUAGE">%1$s</xliff:g>词典,以享受更好的输入体验。&lt;br/&gt;&lt;br/&gt;通过 3G 进行下载可能需要 1 到 2 分钟的时间。如果您使用的不是&lt;b&gt;无流量限制的套餐&lt;/b&gt;,则可能需要支付一定的费用。&lt;br/&gt;如果您不确定自己使用的是哪种流量套餐,建议您使用 WLAN 连接自动开始下载。&lt;br/&gt;&lt;br/&gt;提示:您可以访问移动设备的&lt;b&gt;设置&lt;/b&gt;菜单中的&lt;b&gt;语言和输入法&lt;/b&gt;,来下载和删除词典。"</string>
<string name="download_over_metered" msgid="1643065851159409546">"立即下载 (<xliff:g id="SIZE_IN_MEGABYTES">%1$.1f</xliff:g>MB)"</string>
- <string name="do_not_download_over_metered" msgid="2176209579313941583">"通过 Wi-Fi 下载"</string>
+ <string name="do_not_download_over_metered" msgid="2176209579313941583">"通过 WLAN 下载"</string>
<string name="dict_available_notification_title" msgid="6514288591959117288">"<xliff:g id="LANGUAGE">%1$s</xliff:g>词典可供下载"</string>
<string name="dict_available_notification_description" msgid="1075194169443163487">"按此通知即可查看和下载"</string>
<string name="toast_downloading_suggestions" msgid="1313027353588566660">"下载中:很快就能启用<xliff:g id="LANGUAGE">%1$s</xliff:g>的词典建议服务了!"</string>
diff --git a/java/src/com/android/inputmethod/dictionarypack/DictionarySettingsFragment.java b/java/src/com/android/inputmethod/dictionarypack/DictionarySettingsFragment.java
index 1e93e7e7a..4b89d20bb 100644
--- a/java/src/com/android/inputmethod/dictionarypack/DictionarySettingsFragment.java
+++ b/java/src/com/android/inputmethod/dictionarypack/DictionarySettingsFragment.java
@@ -304,7 +304,7 @@ public final class DictionarySettingsFragment extends PreferenceFragment
// the description.
final String key = matchLevelString + "." + description + "." + wordlistId;
final WordListPreference existingPref = prefMap.get(key);
- if (null == existingPref || hasPriority(status, existingPref.mStatus)) {
+ if (null == existingPref || existingPref.hasPriorityOver(status)) {
final WordListPreference oldPreference = mCurrentPreferenceMap.get(key);
final WordListPreference pref;
if (null != oldPreference
@@ -315,7 +315,7 @@ public final class DictionarySettingsFragment extends PreferenceFragment
// need to be the same, others have been tested through the key of the
// map. Also, status may differ so we don't want to use #equals() here.
pref = oldPreference;
- pref.mStatus = status;
+ pref.setStatus(status);
} else {
// Otherwise, discard it and create a new one instead.
pref = new WordListPreference(activity, mDictionaryListInterfaceState,
@@ -331,18 +331,6 @@ public final class DictionarySettingsFragment extends PreferenceFragment
}
}
- /**
- * Finds out if a given status has priority over another for display order.
- *
- * @param newStatus
- * @param oldStatus
- * @return whether newStatus has priority over oldStatus.
- */
- private static boolean hasPriority(final int newStatus, final int oldStatus) {
- // Both of these should be one of MetadataDbHelper.STATUS_*
- return newStatus > oldStatus;
- }
-
@Override
public boolean onOptionsItemSelected(final MenuItem item) {
switch (item.getItemId()) {
diff --git a/java/src/com/android/inputmethod/dictionarypack/WordListPreference.java b/java/src/com/android/inputmethod/dictionarypack/WordListPreference.java
index a1031c2ca..7ec7e9c13 100644
--- a/java/src/com/android/inputmethod/dictionarypack/WordListPreference.java
+++ b/java/src/com/android/inputmethod/dictionarypack/WordListPreference.java
@@ -61,7 +61,7 @@ public final class WordListPreference extends Preference {
public final Locale mLocale;
public final String mDescription;
// The status
- public int mStatus;
+ private int mStatus;
// The size of the dictionary file
private final int mFilesize;
@@ -92,7 +92,7 @@ public final class WordListPreference extends Preference {
setKey(wordlistId);
}
- private void setStatus(final int status) {
+ public void setStatus(final int status) {
if (status == mStatus) return;
mStatus = status;
setSummary(getSummary(status));
@@ -106,6 +106,11 @@ public final class WordListPreference extends Preference {
return mInterfaceState.addToCacheAndReturnView(newView);
}
+ public boolean hasPriorityOver(final int otherPrefStatus) {
+ // Both of these should be one of MetadataDbHelper.STATUS_*
+ return mStatus > otherPrefStatus;
+ }
+
private String getSummary(final int status) {
switch (status) {
// If we are deleting the word list, for the user it's like it's already deleted.
diff --git a/java/src/com/android/inputmethod/latin/AssetFileAddress.java b/java/src/com/android/inputmethod/latin/AssetFileAddress.java
index 47c750f54..875192554 100644
--- a/java/src/com/android/inputmethod/latin/AssetFileAddress.java
+++ b/java/src/com/android/inputmethod/latin/AssetFileAddress.java
@@ -24,7 +24,7 @@ import java.io.File;
* the package file. Opening it correctly thus requires the name of the package it is in, but
* also the offset in the file and the length of this data. This class encapsulates these three.
*/
-final class AssetFileAddress {
+public final class AssetFileAddress {
public final String mFilename;
public final long mOffset;
public final long mLength;
diff --git a/java/src/com/android/inputmethod/latin/DictionaryFactory.java b/java/src/com/android/inputmethod/latin/DictionaryFactory.java
index 40e51672a..4514ec2ec 100644
--- a/java/src/com/android/inputmethod/latin/DictionaryFactory.java
+++ b/java/src/com/android/inputmethod/latin/DictionaryFactory.java
@@ -21,6 +21,8 @@ import android.content.res.AssetFileDescriptor;
import android.content.res.Resources;
import android.util.Log;
+import com.android.inputmethod.annotations.UsedForTesting;
+
import java.io.File;
import java.util.ArrayList;
import java.util.LinkedList;
@@ -126,21 +128,22 @@ public final class DictionaryFactory {
/**
* Create a dictionary from passed data. This is intended for unit tests only.
- * @param dictionary the file to read
- * @param startOffset the offset in the file where the data starts
- * @param length the length of the data
+ * @param dictionaryList the list of files to read, with their offsets and lengths
* @param useFullEditDistance whether to use the full edit distance in suggestions
* @return the created dictionary, or null.
*/
- public static Dictionary createDictionaryForTest(File dictionary, long startOffset, long length,
+ @UsedForTesting
+ public static Dictionary createDictionaryForTest(final AssetFileAddress[] dictionaryList,
final boolean useFullEditDistance, Locale locale) {
- if (dictionary.isFile()) {
- return new BinaryDictionary(dictionary.getAbsolutePath(), startOffset, length,
- useFullEditDistance, locale, Dictionary.TYPE_MAIN);
- } else {
- Log.e(TAG, "Could not find the file. path=" + dictionary.getAbsolutePath());
- return null;
+ final DictionaryCollection dictionaryCollection =
+ new DictionaryCollection(Dictionary.TYPE_MAIN);
+ for (final AssetFileAddress address : dictionaryList) {
+ final BinaryDictionary binaryDictionary = new BinaryDictionary(address.mFilename,
+ address.mOffset, address.mLength, useFullEditDistance, locale,
+ Dictionary.TYPE_MAIN);
+ dictionaryCollection.addDictionary(binaryDictionary);
}
+ return dictionaryCollection;
}
/**
diff --git a/java/src/com/android/inputmethod/latin/LatinIME.java b/java/src/com/android/inputmethod/latin/LatinIME.java
index c9a42a3a4..70f8d0de8 100644
--- a/java/src/com/android/inputmethod/latin/LatinIME.java
+++ b/java/src/com/android/inputmethod/latin/LatinIME.java
@@ -1797,8 +1797,6 @@ public class LatinIME extends InputMethodService implements KeyboardActionListen
if (ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS) {
final String word = mWordComposer.getTypedWord();
ResearchLogger.latinIME_handleBackspace_batch(word, 1);
- ResearchLogger.getInstance().uncommitCurrentLogUnit(
- word, false /* dumpCurrentLogUnit */);
}
final String rejectedSuggestion = mWordComposer.getTypedWord();
mWordComposer.reset();
@@ -1825,6 +1823,9 @@ public class LatinIME extends InputMethodService implements KeyboardActionListen
// like the smiley key or the .com key.
final int length = mEnteredText.length();
mConnection.deleteSurroundingText(length, 0);
+ if (ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS) {
+ ResearchLogger.latinIME_handleBackspace_cancelTextInput(mEnteredText);
+ }
mEnteredText = null;
// If we have mEnteredText, then we know that mHasUncommittedTypedChars == false.
// In addition we know that spaceState is false, and that we should not be
@@ -1858,7 +1859,8 @@ public class LatinIME extends InputMethodService implements KeyboardActionListen
mLastSelectionEnd = mLastSelectionStart;
mConnection.deleteSurroundingText(numCharsDeleted, 0);
if (ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS) {
- ResearchLogger.latinIME_handleBackspace(numCharsDeleted);
+ ResearchLogger.latinIME_handleBackspace(numCharsDeleted,
+ false /* shouldUncommitLogUnit */);
}
} else {
// There is no selection, just delete one character.
@@ -1876,12 +1878,13 @@ public class LatinIME extends InputMethodService implements KeyboardActionListen
mConnection.deleteSurroundingText(1, 0);
}
if (ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS) {
- ResearchLogger.latinIME_handleBackspace(1);
+ ResearchLogger.latinIME_handleBackspace(1, true /* shouldUncommitLogUnit */);
}
if (mDeleteCount > DELETE_ACCELERATE_AT) {
mConnection.deleteSurroundingText(1, 0);
if (ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS) {
- ResearchLogger.latinIME_handleBackspace(1);
+ ResearchLogger.latinIME_handleBackspace(1,
+ true /* shouldUncommitLogUnit */);
}
}
}
diff --git a/java/src/com/android/inputmethod/latin/Suggest.java b/java/src/com/android/inputmethod/latin/Suggest.java
index 5d580f29b..e783e6d51 100644
--- a/java/src/com/android/inputmethod/latin/Suggest.java
+++ b/java/src/com/android/inputmethod/latin/Suggest.java
@@ -23,7 +23,6 @@ import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.keyboard.ProximityInfo;
import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
-import java.io.File;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
@@ -77,9 +76,9 @@ public final class Suggest {
}
@UsedForTesting
- Suggest(final File dictionary, final long startOffset, final long length, final Locale locale) {
- final Dictionary mainDict = DictionaryFactory.createDictionaryForTest(dictionary,
- startOffset, length /* useFullEditDistance */, false, locale);
+ Suggest(final AssetFileAddress[] dictionaryList, final Locale locale) {
+ final Dictionary mainDict = DictionaryFactory.createDictionaryForTest(dictionaryList,
+ false /* useFullEditDistance */, locale);
mLocale = locale;
mMainDictionary = mainDict;
addOrReplaceDictionary(mDictionaries, Dictionary.TYPE_MAIN, mainDict);
diff --git a/java/src/com/android/inputmethod/research/ResearchLogger.java b/java/src/com/android/inputmethod/research/ResearchLogger.java
index d84f69659..ec54616b7 100644
--- a/java/src/com/android/inputmethod/research/ResearchLogger.java
+++ b/java/src/com/android/inputmethod/research/ResearchLogger.java
@@ -863,7 +863,10 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
// Check that expected word matches.
if (oldLogUnit != null) {
final String oldLogUnitWords = oldLogUnit.getWordsAsString();
- if (oldLogUnitWords != null && !oldLogUnitWords.equals(expectedWord)) {
+ // Because the word is stored in the LogUnit with digits scrubbed, the comparison must
+ // be made on a scrubbed version of the expectedWord as well.
+ if (oldLogUnitWords != null && !oldLogUnitWords.equals(
+ scrubDigitsFromString(expectedWord))) {
return;
}
}
@@ -1274,6 +1277,16 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
}
/**
+ * Log a revert of onTextInput() (known in the IME as "EnteredText").
+ *
+ * SystemResponse: Remove the LogUnit recording the textInput
+ */
+ public static void latinIME_handleBackspace_cancelTextInput(final String text) {
+ final ResearchLogger researchLogger = getInstance();
+ researchLogger.uncommitCurrentLogUnit(text, true /* dumpCurrentLogUnit */);
+ }
+
+ /**
* Log a call to LatinIME.pickSuggestionManually().
*
* UserAction: The user has chosen a specific word from the suggestion strip.
@@ -1811,17 +1824,26 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
SystemClock.uptimeMillis());
}
+ private static final LogStatement LOGSTATEMENT_LATINIME_HANDLEBACKSPACE =
+ new LogStatement("LatinIMEHandleBackspace", true, false, "numCharacters");
/**
* Log a call to LatinIME.handleBackspace() that is not a batch delete.
*
* UserInput: The user is deleting one or more characters by hitting the backspace key once.
* This covers single character deletes as well as deleting selections.
+ *
+ * @param numCharacters how many characters the backspace operation deleted
+ * @param shouldUncommitLogUnit whether to uncommit the last {@code LogUnit} in the
+ * {@code LogBuffer}
*/
- private static final LogStatement LOGSTATEMENT_LATINIME_HANDLEBACKSPACE =
- new LogStatement("LatinIMEHandleBackspace", true, false, "numCharacters");
- public static void latinIME_handleBackspace(final int numCharacters) {
+ public static void latinIME_handleBackspace(final int numCharacters,
+ final boolean shouldUncommitLogUnit) {
final ResearchLogger researchLogger = getInstance();
researchLogger.enqueueEvent(LOGSTATEMENT_LATINIME_HANDLEBACKSPACE, numCharacters);
+ if (shouldUncommitLogUnit) {
+ ResearchLogger.getInstance().uncommitCurrentLogUnit(
+ null, true /* dumpCurrentLogUnit */);
+ }
}
/**
@@ -1839,6 +1861,8 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
numCharacters);
researchLogger.mStatistics.recordGestureDelete(deletedText.length(),
SystemClock.uptimeMillis());
+ researchLogger.uncommitCurrentLogUnit(deletedText.toString(),
+ false /* dumpCurrentLogUnit */);
}
/**
diff --git a/native/jni/Android.mk b/native/jni/Android.mk
index 34b352433..1518dad17 100644
--- a/native/jni/Android.mk
+++ b/native/jni/Android.mk
@@ -46,8 +46,6 @@ LATIN_IME_JNI_SRC_FILES := \
jni_common.cpp
LATIN_IME_CORE_SRC_FILES := \
- bigram_dictionary.cpp \
- char_utils.cpp \
correction.cpp \
dic_traverse_wrapper.cpp \
unigram_dictionary.cpp \
@@ -58,6 +56,9 @@ LATIN_IME_CORE_SRC_FILES := \
dic_node_utils.cpp \
dic_nodes_cache.cpp) \
$(addprefix suggest/core/dictionary/, \
+ bigram_dictionary.cpp \
+ binary_dictionary_format.cpp \
+ byte_array_utils.cpp \
dictionary.cpp \
digraph_utils.cpp) \
$(addprefix suggest/core/layout/, \
@@ -74,7 +75,8 @@ LATIN_IME_CORE_SRC_FILES := \
typing_scoring.cpp \
typing_suggest_policy.cpp \
typing_traversal.cpp \
- typing_weighting.cpp)
+ typing_weighting.cpp) \
+ utils/char_utils.cpp
LOCAL_SRC_FILES := \
$(LATIN_IME_JNI_SRC_FILES) \
diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
index 9f5e2ae73..e94120587 100644
--- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
+++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
@@ -34,9 +34,10 @@
#include "correction.h"
#include "jni.h"
#include "jni_common.h"
-#include "suggest_options.h"
-#include "suggest/core/dictionary/binary_format.h"
+#include "suggest/core/dictionary/binary_dictionary_format.h"
+#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/dictionary.h"
+#include "suggest/core/suggest_options.h"
namespace latinime {
@@ -110,8 +111,8 @@ static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring s
return 0;
}
Dictionary *dictionary = 0;
- if (BinaryFormat::UNKNOWN_FORMAT
- == BinaryFormat::detectFormat(static_cast<uint8_t *>(dictBuf),
+ if (BinaryDictionaryFormat::UNKNOWN_VERSION
+ == BinaryDictionaryFormat::detectFormatVersion(static_cast<uint8_t *>(dictBuf),
static_cast<int>(dictSize))) {
AKLOGE("DICT: dictionary format is unknown, bad magic number");
#ifdef USE_MMAP_FOR_DICTIONARY
@@ -260,7 +261,7 @@ static jint latinime_BinaryDictionary_editDistance(JNIEnv *env, jclass clazz, ji
static void latinime_BinaryDictionary_close(JNIEnv *env, jclass clazz, jlong dict) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
if (!dictionary) return;
- const void *dictBuf = dictionary->getDict();
+ const void *dictBuf = dictionary->getBinaryDictionaryInfo()->getDictBuf();
if (!dictBuf) return;
#ifdef USE_MMAP_FOR_DICTIONARY
releaseDictBuf(static_cast<const char *>(dictBuf) - dictionary->getDictBufAdjust(),
diff --git a/native/jni/src/char_utils.h b/native/jni/src/char_utils.h
deleted file mode 100644
index b429f40b2..000000000
--- a/native/jni/src/char_utils.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (C) 2010 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LATINIME_CHAR_UTILS_H
-#define LATINIME_CHAR_UTILS_H
-
-#include <cctype>
-
-#include "defines.h"
-
-namespace latinime {
-
-inline static bool isAsciiUpper(int c) {
- // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to
- // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...).
- return (c >= 'A' && c <= 'Z');
-}
-
-inline static int toAsciiLower(int c) {
- return c - 'A' + 'a';
-}
-
-inline static bool isAscii(int c) {
- return isascii(c) != 0;
-}
-
-unsigned short latin_tolower(const unsigned short c);
-
-/**
- * Table mapping most combined Latin, Greek, and Cyrillic characters
- * to their base characters. If c is in range, BASE_CHARS[c] == c
- * if c is not a combined character, or the base character if it
- * is combined.
- */
-static const int BASE_CHARS_SIZE = 0x0500;
-extern const unsigned short BASE_CHARS[BASE_CHARS_SIZE];
-
-inline static int toBaseCodePoint(int c) {
- if (c < BASE_CHARS_SIZE) {
- return static_cast<int>(BASE_CHARS[c]);
- }
- return c;
-}
-
-AK_FORCE_INLINE static int toLowerCase(const int c) {
- if (isAsciiUpper(c)) {
- return toAsciiLower(c);
- }
- if (isAscii(c)) {
- return c;
- }
- return static_cast<int>(latin_tolower(static_cast<unsigned short>(c)));
-}
-
-AK_FORCE_INLINE static int toBaseLowerCase(const int c) {
- return toLowerCase(toBaseCodePoint(c));
-}
-
-inline static bool isIntentionalOmissionCodePoint(const int codePoint) {
- // TODO: Do not hardcode here
- return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS;
-}
-
-inline static int getCodePointCount(const int arraySize, const int *const codePoints) {
- int size = 0;
- for (; size < arraySize; ++size) {
- if (codePoints[size] == '\0') {
- break;
- }
- }
- return size;
-}
-
-} // namespace latinime
-#endif // LATINIME_CHAR_UTILS_H
diff --git a/native/jni/src/correction.cpp b/native/jni/src/correction.cpp
index e2ad557c5..feed5622b 100644
--- a/native/jni/src/correction.cpp
+++ b/native/jni/src/correction.cpp
@@ -18,13 +18,13 @@
#include <cmath>
-#include "char_utils.h"
#include "correction.h"
#include "defines.h"
#include "suggest/core/layout/proximity_info_state.h"
#include "suggest/core/layout/touch_position_correction_utils.h"
#include "suggest/policyimpl/utils/edit_distance.h"
#include "suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h"
+#include "utils/char_utils.h"
namespace latinime {
@@ -528,7 +528,7 @@ inline static int getQuoteCount(const int *word, const int length) {
}
inline static bool isUpperCase(unsigned short c) {
- return isAsciiUpper(toBaseCodePoint(c));
+ return CharUtils::isAsciiUpper(CharUtils::toBaseCodePoint(c));
}
//////////////////////
diff --git a/native/jni/src/correction.h b/native/jni/src/correction.h
index 75b49952c..84d6429ba 100644
--- a/native/jni/src/correction.h
+++ b/native/jni/src/correction.h
@@ -22,6 +22,7 @@
#include "correction_state.h"
#include "defines.h"
#include "suggest/core/layout/proximity_info_state.h"
+#include "utils/char_utils.h"
namespace latinime {
@@ -342,13 +343,13 @@ AK_FORCE_INLINE static void calcEditDistanceOneStep(int *editDistanceTable, cons
const int *const prevprev =
outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputSize + 1) : 0;
current[0] = outputLength;
- const int co = toBaseLowerCase(output[outputLength - 1]);
- const int prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0;
+ const int co = CharUtils::toBaseLowerCase(output[outputLength - 1]);
+ const int prevCO = outputLength >= 2 ? CharUtils::toBaseLowerCase(output[outputLength - 2]) : 0;
for (int i = 1; i <= inputSize; ++i) {
- const int ci = toBaseLowerCase(input[i - 1]);
+ const int ci = CharUtils::toBaseLowerCase(input[i - 1]);
const int cost = (ci == co) ? 0 : 1;
current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost));
- if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) {
+ if (i >= 2 && prevprev && ci == prevCO && co == CharUtils::toBaseLowerCase(input[i - 2])) {
current[i] = min(current[i], prevprev[i - 2] + 1);
}
}
diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h
index 1510e3d5e..3f64d07b2 100644
--- a/native/jni/src/suggest/core/dicnode/dic_node.h
+++ b/native/jni/src/suggest/core/dicnode/dic_node.h
@@ -17,13 +17,13 @@
#ifndef LATINIME_DIC_NODE_H
#define LATINIME_DIC_NODE_H
-#include "char_utils.h"
#include "defines.h"
#include "suggest/core/dicnode/dic_node_state.h"
#include "suggest/core/dicnode/dic_node_profiler.h"
#include "suggest/core/dicnode/dic_node_properties.h"
#include "suggest/core/dicnode/dic_node_release_listener.h"
#include "suggest/core/dictionary/digraph_utils.h"
+#include "utils/char_utils.h"
#if DEBUG_DICT
#define LOGI_SHOW_ADD_COST_PROP \
@@ -221,7 +221,7 @@ class DicNode {
bool isFirstCharUppercase() const {
const int c = getOutputWordBuf()[0];
- return isAsciiUpper(c);
+ return CharUtils::isAsciiUpper(c);
}
bool isFirstWord() const {
@@ -375,7 +375,7 @@ class DicNode {
// Whether the current codepoint can be an intentional omission, in which case the traversal
// algorithm will always check for a possible omission here.
bool canBeIntentionalOmission() const {
- return isIntentionalOmissionCodePoint(getNodeCodePoint());
+ return CharUtils::isIntentionalOmissionCodePoint(getNodeCodePoint());
}
// Whether the omission is so frequent that it should incur zero cost.
diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp
index 7f0d0ed0e..3deee1a42 100644
--- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp
+++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp
@@ -20,10 +20,13 @@
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_utils.h"
#include "suggest/core/dicnode/dic_node_vector.h"
+#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h"
#include "suggest/core/dictionary/multi_bigram_map.h"
+#include "suggest/core/dictionary/probability_utils.h"
#include "suggest/core/layout/proximity_info.h"
#include "suggest/core/layout/proximity_info_state.h"
+#include "utils/char_utils.h"
namespace latinime {
@@ -31,20 +34,23 @@ namespace latinime {
// Node initialization utils //
///////////////////////////////
-/* static */ void DicNodeUtils::initAsRoot(const int rootPos, const uint8_t *const dicRoot,
- const int prevWordNodePos, DicNode *newRootNode) {
- int curPos = rootPos;
+/* static */ void DicNodeUtils::initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo,
+ const int prevWordNodePos, DicNode *const newRootNode) {
+ int curPos = binaryDictionaryInfo->getRootPosition();
const int pos = curPos;
- const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos);
+ const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(
+ binaryDictionaryInfo->getDictRoot(), &curPos);
const int childrenPos = curPos;
newRootNode->initAsRoot(pos, childrenPos, childrenCount, prevWordNodePos);
}
-/*static */ void DicNodeUtils::initAsRootWithPreviousWord(const int rootPos,
- const uint8_t *const dicRoot, DicNode *prevWordLastNode, DicNode *newRootNode) {
- int curPos = rootPos;
+/*static */ void DicNodeUtils::initAsRootWithPreviousWord(
+ const BinaryDictionaryInfo *const binaryDictionaryInfo,
+ DicNode *const prevWordLastNode, DicNode *const newRootNode) {
+ int curPos = binaryDictionaryInfo->getRootPosition();
const int pos = curPos;
- const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos);
+ const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(
+ binaryDictionaryInfo->getDictRoot(), &curPos);
const int childrenPos = curPos;
newRootNode->initAsRootWithPreviousWord(prevWordLastNode, pos, childrenPos, childrenCount);
}
@@ -62,24 +68,27 @@ namespace latinime {
DicNodeVector *childDicNodes) {
// Passing multiple chars node. No need to traverse child
const int codePoint = dicNode->getNodeTypedCodePoint();
- const int baseLowerCaseCodePoint = toBaseLowerCase(codePoint);
+ const int baseLowerCaseCodePoint = CharUtils::toBaseLowerCase(codePoint);
const bool isMatch = isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, codePoint);
- if (isMatch || isIntentionalOmissionCodePoint(baseLowerCaseCodePoint)) {
+ if (isMatch || CharUtils::isIntentionalOmissionCodePoint(baseLowerCaseCodePoint)) {
childDicNodes->pushPassingChild(dicNode);
}
}
/* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos,
- const uint8_t *const dicRoot, const int terminalDepth, const ProximityInfoState *pInfoState,
- const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter,
- const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) {
+ const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalDepth,
+ const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
+ const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo,
+ DicNodeVector *childDicNodes) {
int nextPos = pos;
- const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos);
+ const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(
+ binaryDictionaryInfo->getDictRoot(), &pos);
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);
- int codePoint = BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos);
+ int codePoint = BinaryFormat::getCodePointAndForwardPointer(
+ binaryDictionaryInfo->getDictRoot(), &pos);
ASSERT(NOT_A_CODE_POINT != codePoint);
const int nodeCodePoint = codePoint;
// TODO: optimize this
@@ -89,7 +98,8 @@ namespace latinime {
do {
const int nextCodePoint = hasMultipleChars
- ? BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos) : NOT_A_CODE_POINT;
+ ? BinaryFormat::getCodePointAndForwardPointer(
+ binaryDictionaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT;
const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint);
if (!isLastChar) {
additionalWordBuf[additionalSubwordLength++] = nextCodePoint;
@@ -97,12 +107,14 @@ namespace latinime {
codePoint = nextCodePoint;
} while (NOT_A_CODE_POINT != codePoint);
- const int probability =
- isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(dicRoot, pos) : -1;
+ const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(
+ binaryDictionaryInfo->getDictRoot(), pos) : -1;
pos = BinaryFormat::skipProbability(flags, pos);
- int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(dicRoot, flags, pos) : 0;
+ int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(
+ binaryDictionaryInfo->getDictRoot(), flags, pos) : 0;
const int attributesPos = BinaryFormat::skipChildrenPosition(flags, pos);
- const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(dicRoot, flags, pos);
+ const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(
+ binaryDictionaryInfo->getDictRoot(), flags, pos);
if (isDicNodeFilteredOut(nodeCodePoint, pInfo, codePointsFilter)) {
return siblingPos;
@@ -110,8 +122,8 @@ namespace latinime {
if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, nodeCodePoint)) {
return siblingPos;
}
- const int childrenCount = hasChildren
- ? BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &childrenPos) : 0;
+ const int childrenCount = hasChildren ? BinaryFormat::getGroupCountAndForwardPointer(
+ binaryDictionaryInfo->getDictRoot(), &childrenPos) : 0;
childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, siblingPos,
nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal,
hasMultipleChars, hasChildren, additionalSubwordLength, additionalWordBuf);
@@ -125,13 +137,13 @@ namespace latinime {
return false;
}
if (pInfo && (pInfo->getKeyIndexOf(nodeCodePoint) == NOT_AN_INDEX
- || isIntentionalOmissionCodePoint(nodeCodePoint))) {
+ || CharUtils::isIntentionalOmissionCodePoint(nodeCodePoint))) {
// If normalized nodeCodePoint is not on the keyboard or skippable, this child is never
// filtered.
return false;
}
- const int lowerCodePoint = toLowerCase(nodeCodePoint);
- const int baseLowerCodePoint = toBaseCodePoint(lowerCodePoint);
+ const int lowerCodePoint = CharUtils::toLowerCase(nodeCodePoint);
+ const int baseLowerCodePoint = CharUtils::toBaseCodePoint(lowerCodePoint);
// TODO: Avoid linear search
for (int i = 0; i < filterSize; ++i) {
// Checking if a normalized code point is in filter characters when pInfo is not
@@ -147,16 +159,18 @@ namespace latinime {
}
/* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode,
- const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex,
- const bool exactOnly, const std::vector<int> *const codePointsFilter,
- const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) {
+ const BinaryDictionaryInfo *const binaryDictionaryInfo,
+ const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
+ const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo,
+ DicNodeVector *childDicNodes) {
const int terminalDepth = dicNode->getLeavingDepth();
const int childCount = dicNode->getChildrenCount();
int nextPos = dicNode->getChildrenPos();
for (int i = 0; i < childCount; i++) {
const int filterSize = codePointsFilter ? codePointsFilter->size() : 0;
- nextPos = createAndGetLeavingChildNode(dicNode, nextPos, dicRoot, terminalDepth, pInfoState,
- pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes);
+ nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo,
+ terminalDepth, pInfoState, pointIndex, exactOnly, codePointsFilter, pInfo,
+ childDicNodes);
if (!pInfo && filterSize > 0 && childDicNodes->exceeds(filterSize)) {
// All code points have been found.
break;
@@ -164,14 +178,15 @@ namespace latinime {
}
}
-/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot,
- DicNodeVector *childDicNodes) {
- getProximityChildDicNodes(dicNode, dicRoot, 0, 0, false, childDicNodes);
+/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode,
+ const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes) {
+ getProximityChildDicNodes(dicNode, binaryDictionaryInfo, 0, 0, false, childDicNodes);
}
/* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode,
- const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex,
- bool exactOnly, DicNodeVector *childDicNodes) {
+ const BinaryDictionaryInfo *const binaryDictionaryInfo,
+ const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly,
+ DicNodeVector *childDicNodes) {
if (dicNode->isTotalInputSizeExceedingLimit()) {
return;
}
@@ -179,9 +194,9 @@ namespace latinime {
DicNodeUtils::createAndGetPassingChildNode(dicNode, pInfoState, pointIndex, exactOnly,
childDicNodes);
} else {
- DicNodeUtils::createAndGetAllLeavingChildNodes(dicNode, dicRoot, pInfoState, pointIndex,
- exactOnly, 0 /* codePointsFilter */, 0 /* pInfo */,
- childDicNodes);
+ DicNodeUtils::createAndGetAllLeavingChildNodes(
+ dicNode, binaryDictionaryInfo, pInfoState, pointIndex, exactOnly,
+ 0 /* codePointsFilter */, 0 /* pInfo */, childDicNodes);
}
}
@@ -191,32 +206,35 @@ namespace latinime {
/**
* Computes the combined bigram / unigram cost for the given dicNode.
*/
-/* static */ float DicNodeUtils::getBigramNodeImprobability(const uint8_t *const dicRoot,
+/* static */ float DicNodeUtils::getBigramNodeImprobability(
+ const BinaryDictionaryInfo *const binaryDictionaryInfo,
const DicNode *const node, MultiBigramMap *multiBigramMap) {
if (node->isImpossibleBigramWord()) {
return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
}
- const int probability = getBigramNodeProbability(dicRoot, node, multiBigramMap);
+ const int probability = getBigramNodeProbability(binaryDictionaryInfo, node, multiBigramMap);
// TODO: This equation to calculate the improbability looks unreasonable. Investigate this.
const float cost = static_cast<float>(MAX_PROBABILITY - probability)
/ static_cast<float>(MAX_PROBABILITY);
return cost;
}
-/* static */ int DicNodeUtils::getBigramNodeProbability(const uint8_t *const dicRoot,
+/* static */ int DicNodeUtils::getBigramNodeProbability(
+ const BinaryDictionaryInfo *const binaryDictionaryInfo,
const DicNode *const node, MultiBigramMap *multiBigramMap) {
const int unigramProbability = node->getProbability();
const int wordPos = node->getPos();
const int prevWordPos = node->getPrevWordPos();
if (NOT_VALID_WORD == wordPos || NOT_VALID_WORD == prevWordPos) {
// Note: Normally wordPos comes from the dictionary and should never equal NOT_VALID_WORD.
- return backoff(unigramProbability);
+ return ProbabilityUtils::backoff(unigramProbability);
}
if (multiBigramMap) {
return multiBigramMap->getBigramProbability(
- dicRoot, prevWordPos, wordPos, unigramProbability);
+ binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability);
}
- return BinaryFormat::getBigramProbability(dicRoot, prevWordPos, wordPos, unigramProbability);
+ return BinaryFormat::getBigramProbability(
+ binaryDictionaryInfo->getDictRoot(), prevWordPos, wordPos, unigramProbability);
}
///////////////////////////////////////
diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.h b/native/jni/src/suggest/core/dicnode/dic_node_utils.h
index 5bc542d05..e198d6181 100644
--- a/native/jni/src/suggest/core/dicnode/dic_node_utils.h
+++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.h
@@ -24,6 +24,7 @@
namespace latinime {
+class BinaryDictionaryInfo;
class DicNode;
class DicNodeVector;
class ProximityInfo;
@@ -34,19 +35,20 @@ class DicNodeUtils {
public:
static int appendTwoWords(const int *src0, const int16_t length0, const int *src1,
const int16_t length1, int *dest);
- static void initAsRoot(const int rootPos, const uint8_t *const dicRoot,
+ static void initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo,
const int prevWordNodePos, DicNode *newRootNode);
- static void initAsRootWithPreviousWord(const int rootPos, const uint8_t *const dicRoot,
+ static void initAsRootWithPreviousWord(const BinaryDictionaryInfo *const binaryDictionaryInfo,
DicNode *prevWordLastNode, DicNode *newRootNode);
static void initByCopy(DicNode *srcNode, DicNode *destNode);
- static void getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot,
- DicNodeVector *childDicNodes);
- static float getBigramNodeImprobability(const uint8_t *const dicRoot,
+ static void getAllChildDicNodes(DicNode *dicNode,
+ const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes);
+ static float getBigramNodeImprobability(const BinaryDictionaryInfo *const binaryDictionaryInfo,
const DicNode *const node, MultiBigramMap *const multiBigramMap);
static bool isDicNodeFilteredOut(const int nodeCodePoint, const ProximityInfo *const pInfo,
const std::vector<int> *const codePointsFilter);
// TODO: Move to private
- static void getProximityChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot,
+ static void getProximityChildDicNodes(DicNode *dicNode,
+ const BinaryDictionaryInfo *const binaryDictionaryInfo,
const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly,
DicNodeVector *childDicNodes);
@@ -60,16 +62,18 @@ class DicNodeUtils {
// Max number of bigrams to look up
static const int MAX_BIGRAMS_CONSIDERED_PER_CONTEXT = 500;
- static int getBigramNodeProbability(const uint8_t *const dicRoot, const DicNode *const node,
- MultiBigramMap *multiBigramMap);
+ static int getBigramNodeProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo,
+ const DicNode *const node, MultiBigramMap *multiBigramMap);
static void createAndGetPassingChildNode(DicNode *dicNode, const ProximityInfoState *pInfoState,
const int pointIndex, const bool exactOnly, DicNodeVector *childDicNodes);
- static void createAndGetAllLeavingChildNodes(DicNode *dicNode, const uint8_t *const dicRoot,
+ static void createAndGetAllLeavingChildNodes(DicNode *dicNode,
+ const BinaryDictionaryInfo *const binaryDictionaryInfo,
const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
const std::vector<int> *const codePointsFilter,
const ProximityInfo *const pInfo, DicNodeVector *childDicNodes);
- static int createAndGetLeavingChildNode(DicNode *dicNode, int pos, const uint8_t *const dicRoot,
- const int terminalDepth, const ProximityInfoState *pInfoState, const int pointIndex,
+ static int createAndGetLeavingChildNode(DicNode *dicNode, int pos,
+ const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalDepth,
+ const ProximityInfoState *pInfoState, const int pointIndex,
const bool exactOnly, const std::vector<int> *const codePointsFilter,
const ProximityInfo *const pInfo, DicNodeVector *childDicNodes);
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp
index c592542bd..59d1b19b6 100644
--- a/native/jni/src/bigram_dictionary.cpp
+++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp
@@ -20,15 +20,18 @@
#include "bigram_dictionary.h"
-#include "char_utils.h"
#include "defines.h"
+#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h"
#include "suggest/core/dictionary/bloom_filter.h"
#include "suggest/core/dictionary/dictionary.h"
+#include "suggest/core/dictionary/probability_utils.h"
+#include "utils/char_utils.h"
namespace latinime {
-BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT(streamStart) {
+BigramDictionary::BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo)
+ : mBinaryDictionaryInfo(binaryDictionaryInfo) {
if (DEBUG_DICT) {
AKLOGI("BigramDictionary - constructor");
}
@@ -52,7 +55,7 @@ void BigramDictionary::addWordBigram(int *word, int length, int probability, int
int insertAt = 0;
while (insertAt < MAX_RESULTS) {
if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability
- && length < getCodePointCount(MAX_WORD_LENGTH,
+ && length < CharUtils::getCodePointCount(MAX_WORD_LENGTH,
bigramCodePoints + insertAt * MAX_WORD_LENGTH))) {
break;
}
@@ -103,7 +106,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
// TODO: remove unused arguments, and refrain from storing stuff in members of this class
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name
- const uint8_t *const root = DICT_ROOT;
+ const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
int pos = getBigramListPositionForWord(prevWord, prevWordLength,
false /* forceLowerCaseSearch */);
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
@@ -134,7 +137,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
// resulting probability is 8 - although in the practice it's never bigger than 3 or 4
// in very bad cases. This means that sometimes, we'll see some bigrams interverted
// here, but it can't get too bad.
- const int probability = BinaryFormat::computeProbabilityForBigram(
+ const int probability = ProbabilityUtils::computeProbabilityForBigram(
unigramProbability, bigramProbabilityTemp);
addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints,
outputTypes);
@@ -149,7 +152,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
const bool forceLowerCaseSearch) const {
if (0 >= prevWordLength) return 0;
- const uint8_t *const root = DICT_ROOT;
+ const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength,
forceLowerCaseSearch);
@@ -170,7 +173,7 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in
void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord,
const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
- const uint8_t *const root = DICT_ROOT;
+ const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
int pos = getBigramListPositionForWord(prevWord, prevWordLength,
false /* forceLowerCaseSearch */);
if (0 == pos) {
@@ -196,9 +199,9 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons
// what user typed.
int maxAlt = MAX_ALTERNATIVES;
- const int firstBaseLowerCodePoint = toBaseLowerCase(*word);
+ const int firstBaseLowerCodePoint = CharUtils::toBaseLowerCase(*word);
while (maxAlt > 0) {
- if (toBaseLowerCase(*inputCodePoints) == firstBaseLowerCodePoint) {
+ if (CharUtils::toBaseLowerCase(*inputCodePoints) == firstBaseLowerCodePoint) {
return true;
}
inputCodePoints++;
@@ -209,7 +212,7 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons
bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2,
int length2) const {
- const uint8_t *const root = DICT_ROOT;
+ const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
if (0 == pos) return false;
diff --git a/native/jni/src/bigram_dictionary.h b/native/jni/src/suggest/core/dictionary/bigram_dictionary.h
index b86e564c3..8b7a253a2 100644
--- a/native/jni/src/bigram_dictionary.h
+++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.h
@@ -24,9 +24,12 @@
namespace latinime {
+class BinaryDictionaryInfo;
+
class BigramDictionary {
public:
- BigramDictionary(const uint8_t *const streamStart);
+ BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo);
+
int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
int *frequencies, int *outputTypes) const;
void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength,
@@ -35,13 +38,14 @@ class BigramDictionary {
~BigramDictionary();
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary);
+
void addWordBigram(int *word, int length, int probability, int *bigramProbability,
int *bigramCodePoints, int *outputTypes) const;
bool checkFirstCharacter(int *word, int *inputCodePoints) const;
int getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
const bool forceLowerCaseSearch) const;
- const uint8_t *const DICT_ROOT;
+ const BinaryDictionaryInfo *const mBinaryDictionaryInfo;
// TODO: Re-implement proximity correction for bigram correction
static const int MAX_ALTERNATIVES = 1;
};
diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_format.cpp b/native/jni/src/suggest/core/dictionary/binary_dictionary_format.cpp
new file mode 100644
index 000000000..50e0211d7
--- /dev/null
+++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_format.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/core/dictionary/binary_dictionary_format.h"
+
+namespace latinime {
+
+/**
+ * Dictionary size
+ */
+// Any file smaller than this is not a dictionary.
+const int BinaryDictionaryFormat::DICTIONARY_MINIMUM_SIZE = 4;
+
+/**
+ * Format versions
+ */
+// Originally, format version 1 had a 16-bit magic number, then the version number `01',
+// then options that must be 0. Hence the first 32 bits of the format are always as follows,
+// and it's okay to consider them a magic number as a whole.
+const uint32_t BinaryDictionaryFormat::FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
+const int BinaryDictionaryFormat::FORMAT_VERSION_1_HEADER_SIZE = 5;
+
+// The versions of Latin IME that only handle format version 1 only test for the magic
+// number, so we had to change it so that version 2 files would be rejected by older
+// implementations. On this occasion, we made the magic number 32 bits long.
+const uint32_t BinaryDictionaryFormat::FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
+// Magic number (4 bytes), version (2 bytes), options (2 bytes), header size (4 bytes) = 12
+const int BinaryDictionaryFormat::FORMAT_VERSION_2_MINIMUM_SIZE = 12;
+const int BinaryDictionaryFormat::VERSION_2_MAGIC_NUMBER_SIZE = 4;
+const int BinaryDictionaryFormat::VERSION_2_DICTIONARY_VERSION_SIZE = 2;
+const int BinaryDictionaryFormat::VERSION_2_DICTIONARY_FLAG_SIZE = 2;
+
+/* static */ BinaryDictionaryFormat::FORMAT_VERSION BinaryDictionaryFormat::detectFormatVersion(
+ const uint8_t *const dict, const int dictSize) {
+ // The magic number is stored big-endian.
+ // If the dictionary is less than 4 bytes, we can't even read the magic number, so we don't
+ // understand this format.
+ if (dictSize < DICTIONARY_MINIMUM_SIZE) {
+ return UNKNOWN_VERSION;
+ }
+ const uint32_t magicNumber = ByteArrayUtils::readUint32(dict, 0);
+ switch (magicNumber) {
+ case FORMAT_VERSION_1_MAGIC_NUMBER:
+ // Format 1 header is exactly 5 bytes long and looks like:
+ // Magic number (2 bytes) 0x78 0xB1
+ // Version number (1 byte) 0x01
+ // Options (2 bytes) must be 0x00 0x00
+ return VERSION_1;
+ case FORMAT_VERSION_2_MAGIC_NUMBER:
+ // Version 2 dictionaries are at least 12 bytes long.
+ // If this dictionary has the version 2 magic number but is less than 12 bytes long,
+ // then it's an unknown format and we need to avoid confidently reading the next bytes.
+ if (dictSize < FORMAT_VERSION_2_MINIMUM_SIZE) {
+ return UNKNOWN_VERSION;
+ }
+ // Format 2 header is as follows:
+ // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
+ // Version number (2 bytes) 0x00 0x02
+ // Options (2 bytes)
+ // Header size (4 bytes) : integer, big endian
+ if (ByteArrayUtils::readUint16(dict, 4) == 2) {
+ return VERSION_2;
+ } else {
+ return UNKNOWN_VERSION;
+ }
+ default:
+ return UNKNOWN_VERSION;
+ }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_format.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_format.h
new file mode 100644
index 000000000..3aa1662da
--- /dev/null
+++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_format.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BINARY_DICTIONARY_FORMAT_H
+#define LATINIME_BINARY_DICTIONARY_FORMAT_H
+
+#include <stdint.h>
+
+#include "defines.h"
+#include "suggest/core/dictionary/byte_array_utils.h"
+
+namespace latinime {
+
+/**
+ * Methods to handle binary dictionary format version.
+ *
+ * Currently, we have a file with a similar name, binary_format.h. binary_format.h contains binary
+ * reading methods and utility methods for various purposes.
+ * On the other hand, this file deals only with the dictionary format version.
+ */
+class BinaryDictionaryFormat {
+ public:
+ // TODO: Remove obsolete version logic
+ enum FORMAT_VERSION {
+ VERSION_1,
+ VERSION_2,
+ UNKNOWN_VERSION
+ };
+
+ static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize);
+
+ static AK_FORCE_INLINE int getHeaderSize(
+ const uint8_t *const dict, const FORMAT_VERSION format) {
+ switch (format) {
+ case VERSION_1:
+ return FORMAT_VERSION_1_HEADER_SIZE;
+ case VERSION_2:
+ // See the header layout described in detectFormatVersion() (binary_dictionary_format.cpp)
+ return ByteArrayUtils::readUint32(dict, 8);
+ default:
+ return S_INT_MAX;
+ }
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryFormat);
+
+ static const int DICTIONARY_MINIMUM_SIZE;
+ static const uint32_t FORMAT_VERSION_1_MAGIC_NUMBER;
+ static const int FORMAT_VERSION_1_HEADER_SIZE;
+ static const uint32_t FORMAT_VERSION_2_MAGIC_NUMBER;
+ static const int FORMAT_VERSION_2_MINIMUM_SIZE;
+ static const int VERSION_2_MAGIC_NUMBER_SIZE;
+ static const int VERSION_2_DICTIONARY_VERSION_SIZE ;
+ static const int VERSION_2_DICTIONARY_FLAG_SIZE;
+};
+} // namespace latinime
+#endif /* LATINIME_BINARY_DICTIONARY_FORMAT_H */
diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h
new file mode 100644
index 000000000..8508c6786
--- /dev/null
+++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BINARY_DICTIONARY_INFO_H
+#define LATINIME_BINARY_DICTIONARY_INFO_H
+
+#include <stdint.h>
+
+#include "defines.h"
+#include "suggest/core/dictionary/binary_dictionary_format.h"
+
+namespace latinime {
+
+class BinaryDictionaryInfo {
+ public:
+ BinaryDictionaryInfo(const uint8_t *const dictBuf, const int dictSize)
+ : mDictBuf(dictBuf),
+ mFormat(BinaryDictionaryFormat::detectFormatVersion(mDictBuf, dictSize)),
+ mDictRoot(mDictBuf + BinaryDictionaryFormat::getHeaderSize(mDictBuf, mFormat)) {}
+
+ AK_FORCE_INLINE const uint8_t *getDictBuf() const {
+ return mDictBuf;
+ }
+
+ AK_FORCE_INLINE const uint8_t *getDictRoot() const {
+ return mDictRoot;
+ }
+
+ AK_FORCE_INLINE BinaryDictionaryFormat::FORMAT_VERSION getFormat() const {
+ return mFormat;
+ }
+
+ AK_FORCE_INLINE int getRootPosition() const {
+ return 0;
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryInfo);
+
+ const uint8_t *const mDictBuf;
+ const BinaryDictionaryFormat::FORMAT_VERSION mFormat;
+ const uint8_t *const mDictRoot;
+};
+}
+#endif /* LATINIME_BINARY_DICTIONARY_INFO_H */
diff --git a/native/jni/src/suggest/core/dictionary/binary_format.h b/native/jni/src/suggest/core/dictionary/binary_format.h
index 65c2e9115..1b57793fa 100644
--- a/native/jni/src/suggest/core/dictionary/binary_format.h
+++ b/native/jni/src/suggest/core/dictionary/binary_format.h
@@ -18,12 +18,12 @@
#define LATINIME_BINARY_FORMAT_H
#include <cstdlib>
-#include <map>
#include <stdint.h>
-#include "char_utils.h"
-#include "hash_map_compat.h"
#include "suggest/core/dictionary/bloom_filter.h"
+#include "suggest/core/dictionary/probability_utils.h"
+#include "utils/char_utils.h"
+#include "utils/hash_map_compat.h"
namespace latinime {
@@ -91,10 +91,6 @@ class BinaryFormat {
const int length, const bool forceLowerCaseSearch);
static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth,
int *outWord, int *outUnigramProbability);
- static int computeProbabilityForBigram(
- const int unigramProbability, const int bigramProbability);
- static int getProbability(const int position, const std::map<int, int> *bigramMap,
- const uint8_t *bigramFilter, const int unigramProbability);
static int getBigramProbabilityFromHashMap(const int position,
const hash_map_compat<int, int> *bigramMap, const int unigramProbability);
static float getMultiWordCostMultiplier(const uint8_t *const dict, const int dictSize);
@@ -473,7 +469,8 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root,
// there was no match (or we would have found it).
if (wordPos >= length) return NOT_VALID_WORD;
int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
- const int wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos];
+ const int wChar = forceLowerCaseSearch
+ ? CharUtils::toLowerCase(inWord[wordPos]) : inWord[wordPos];
while (true) {
// If there are no more character groups in this node, it means we could not
// find a matching character for this depth, therefore there is no match.
@@ -677,51 +674,18 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co
return 0;
}
-static inline int backoff(const int unigramProbability) {
- return unigramProbability;
- // For some reason, applying the backoff weight gives bad results in tests. To apply the
- // backoff weight, we divide the probability by 2, which in our storing format means
- // decreasing the score by 8.
- // TODO: figure out what's wrong with this.
- // return unigramProbability > 8 ? unigramProbability - 8 : (0 == unigramProbability ? 0 : 8);
-}
-
-inline int BinaryFormat::computeProbabilityForBigram(
- const int unigramProbability, const int bigramProbability) {
- // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want the
- // unigram probability to be the median value of the 17th step from the top. A value of
- // 0 for the bigram probability represents the middle of the 16th step from the top,
- // while a value of 15 represents the middle of the top step.
- // See makedict.BinaryDictInputOutput for details.
- const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability)
- / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY);
- return unigramProbability
- + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize);
-}
-
-// This returns a probability in log space.
-inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
- const uint8_t *bigramFilter, const int unigramProbability) {
- if (!bigramMap || !bigramFilter) return backoff(unigramProbability);
- if (!isInFilter(bigramFilter, position)) return backoff(unigramProbability);
- const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position);
- if (bigramProbabilityIt != bigramMap->end()) {
- const int bigramProbability = bigramProbabilityIt->second;
- return computeProbabilityForBigram(unigramProbability, bigramProbability);
- }
- return backoff(unigramProbability);
-}
-
// This returns a probability in log space.
inline int BinaryFormat::getBigramProbabilityFromHashMap(const int position,
const hash_map_compat<int, int> *bigramMap, const int unigramProbability) {
- if (!bigramMap) return backoff(unigramProbability);
+ if (!bigramMap) {
+ return ProbabilityUtils::backoff(unigramProbability);
+ }
const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position);
if (bigramProbabilityIt != bigramMap->end()) {
const int bigramProbability = bigramProbabilityIt->second;
- return computeProbabilityForBigram(unigramProbability, bigramProbability);
+ return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, bigramProbability);
}
- return backoff(unigramProbability);
+ return ProbabilityUtils::backoff(unigramProbability);
}
AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap(
@@ -742,7 +706,9 @@ AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap(
AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root, int position,
const int nextPosition, const int unigramProbability) {
position = getBigramListPositionForWordPosition(root, position);
- if (0 == position) return backoff(unigramProbability);
+ if (0 == position) {
+ return ProbabilityUtils::backoff(unigramProbability);
+ }
uint8_t bigramFlags;
do {
@@ -751,10 +717,11 @@ AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root
root, bigramFlags, &position);
if (bigramPos == nextPosition) {
const int bigramProbability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
- return computeProbabilityForBigram(unigramProbability, bigramProbability);
+ return ProbabilityUtils::computeProbabilityForBigram(
+ unigramProbability, bigramProbability);
}
} while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
- return backoff(unigramProbability);
+ return ProbabilityUtils::backoff(unigramProbability);
}
// Returns a pointer to the start of the bigram list.
diff --git a/native/jni/src/suggest/core/dictionary/byte_array_utils.cpp b/native/jni/src/suggest/core/dictionary/byte_array_utils.cpp
new file mode 100644
index 000000000..68b1d5d15
--- /dev/null
+++ b/native/jni/src/suggest/core/dictionary/byte_array_utils.cpp
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/core/dictionary/byte_array_utils.h"
+
+namespace latinime {
+// Out-of-class definitions of the constants used by ByteArrayUtils when decoding code points.
+const uint8_t ByteArrayUtils::MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; // lowest 1-byte character
+const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F; // ends a code point array
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/core/dictionary/byte_array_utils.h b/native/jni/src/suggest/core/dictionary/byte_array_utils.h
new file mode 100644
index 000000000..832b74725
--- /dev/null
+++ b/native/jni/src/suggest/core/dictionary/byte_array_utils.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BYTE_ARRAY_UTILS_H
+#define LATINIME_BYTE_ARRAY_UTILS_H
+
+#include <stdint.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Utility methods for reading byte arrays.
+ *
+ * All multi-byte values are read in big endian order. No method checks the bounds of the
+ * buffer: callers have to make sure that every byte that gets read lies inside the buffer.
+ */
+class ByteArrayUtils {
+ public:
+    /**
+     * Integer
+     *
+     * Each method reads an integer of the corresponding size in a big endian manner.
+     * The *andAdvancePosition variants additionally move *pos behind the bytes they read.
+     */
+    static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) {
+        // Widen to uint32_t before shifting. A uint8_t operand is promoted to signed int, so
+        // shifting a byte >= 0x80 left by 24 would yield a value that int cannot represent.
+        return (static_cast<uint32_t>(buffer[pos]) << 24)
+                ^ (static_cast<uint32_t>(buffer[pos + 1]) << 16)
+                ^ (static_cast<uint32_t>(buffer[pos + 2]) << 8)
+                ^ static_cast<uint32_t>(buffer[pos + 3]);
+    }
+
+    static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) {
+        return (static_cast<uint32_t>(buffer[pos]) << 16)
+                ^ (static_cast<uint32_t>(buffer[pos + 1]) << 8)
+                ^ static_cast<uint32_t>(buffer[pos + 2]);
+    }
+
+    static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) {
+        return (buffer[pos] << 8) ^ buffer[pos + 1];
+    }
+
+    static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) {
+        return buffer[pos];
+    }
+
+    static AK_FORCE_INLINE uint32_t readUint32andAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint32_t value = readUint32(buffer, *pos);
+        *pos += 4;
+        return value;
+    }
+
+    static AK_FORCE_INLINE uint32_t readUint24andAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint32_t value = readUint24(buffer, *pos);
+        *pos += 3;
+        return value;
+    }
+
+    static AK_FORCE_INLINE uint16_t readUint16andAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint16_t value = readUint16(buffer, *pos);
+        *pos += 2;
+        return value;
+    }
+
+    static AK_FORCE_INLINE uint8_t readUint8andAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        return buffer[(*pos)++];
+    }
+
+    /**
+     * Code Point
+     *
+     * 1 byte = bbbbbbbb match
+     * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
+     * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
+     *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
+     *       00011111 would be outside unicode.
+     * else: iso-latin-1 code
+     * This allows for the whole unicode range to be encoded, including chars outside of
+     * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
+     * characters which should never happen anyway (and still work, but take 3 bytes).
+     */
+    // Reads the code point at pos without moving any position.
+    // Returns NOT_A_CODE_POINT when the byte at pos is the terminator.
+    static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
+        int p = pos;
+        return readCodePointAndAdvancePosition(buffer, &p);
+    }
+
+    // Reads the code point at *pos and moves *pos behind it (by 1 or 3 bytes).
+    // Returns NOT_A_CODE_POINT, after skipping the terminator byte, when the terminator is found.
+    static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint8_t firstByte = readUint8(buffer, *pos);
+        if (firstByte < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
+            if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
+                *pos += 1;
+                return NOT_A_CODE_POINT;
+            } else {
+                // The first byte is the high byte of a 3-byte big endian value.
+                return readUint24andAdvancePosition(buffer, pos);
+            }
+        } else {
+            *pos += 1;
+            return firstByte;
+        }
+    }
+
+    /**
+     * String (array of code points)
+     *
+     * Reads code points until the terminator is found.
+     */
+    // Stores at most maxLength code points into outBuffer and returns the length of the string.
+    // When the terminator is met, *pos ends up right behind it. When the string is longer than
+    // maxLength, reading stops after one more code point has been consumed (it is not stored),
+    // so *pos is then not guaranteed to be behind the terminator.
+    static int readStringAndAdvancePosition(const uint8_t *const buffer, int *const pos,
+            int *const outBuffer, const int maxLength) {
+        int length = 0;
+        int codePoint = readCodePointAndAdvancePosition(buffer, pos);
+        while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
+            outBuffer[length++] = codePoint;
+            codePoint = readCodePointAndAdvancePosition(buffer, pos);
+        }
+        return length;
+    }
+
+    // Advances the position and returns the length of the string.
+    // Walks the string exactly like readStringAndAdvancePosition() but without storing the code
+    // points: at most maxLength code points are counted, then the scan stops.
+    static int advancePositionToBehindString(
+            const uint8_t *const buffer, int *const pos, const int maxLength) {
+        int length = 0;
+        int codePoint = readCodePointAndAdvancePosition(buffer, pos);
+        while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
+            // Count the code point that was just skipped. Without this the returned length is
+            // always 0 and maxLength never bounds the scan.
+            ++length;
+            codePoint = readCodePointAndAdvancePosition(buffer, pos);
+        }
+        return length;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils);
+
+    // First bytes below this value start a 3-byte code point or are the terminator.
+    static const uint8_t MINIMAL_ONE_BYTE_CHARACTER_VALUE;
+    // First byte that marks the end of a code point array.
+    static const uint8_t CHARACTER_ARRAY_TERMINATOR;
+};
+} // namespace latinime
+#endif /* LATINIME_BYTE_ARRAY_UTILS_H */
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp
index 1939c7420..6fd755dfe 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.cpp
+++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp
@@ -21,12 +21,12 @@
#include <map> // TODO: remove
#include <stdint.h>
-#include "bigram_dictionary.h"
#include "defines.h"
#include "dic_traverse_wrapper.h"
-#include "suggest_options.h"
-#include "suggest/core/suggest.h"
+#include "suggest/core/dictionary/bigram_dictionary.h"
#include "suggest/core/dictionary/binary_format.h"
+#include "suggest/core/suggest.h"
+#include "suggest/core/suggest_options.h"
#include "suggest/policyimpl/gesture/gesture_suggest_policy_factory.h"
#include "suggest/policyimpl/typing/typing_suggest_policy_factory.h"
#include "unigram_dictionary.h"
@@ -34,13 +34,11 @@
namespace latinime {
Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust)
- : mDict(static_cast<unsigned char *>(dict)),
- mOffsetDict((static_cast<unsigned char *>(dict))
- + BinaryFormat::getHeaderSize(mDict, dictSize)),
+ : mBinaryDicitonaryInfo(static_cast<const uint8_t *>(dict), dictSize),
mDictSize(dictSize), mMmapFd(mmapFd), mDictBufAdjust(dictBufAdjust),
- mUnigramDictionary(new UnigramDictionary(mOffsetDict,
- BinaryFormat::getFlags(mDict, dictSize))),
- mBigramDictionary(new BigramDictionary(mOffsetDict)),
+ mUnigramDictionary(new UnigramDictionary(&mBinaryDicitonaryInfo,
+ BinaryFormat::getFlags(mBinaryDicitonaryInfo.getDictBuf(), dictSize))),
+ mBigramDictionary(new BigramDictionary(&mBinaryDicitonaryInfo)),
mGestureSuggest(new Suggest(GestureSuggestPolicyFactory::getGestureSuggestPolicy())),
mTypingSuggest(new Suggest(TypingSuggestPolicyFactory::getTypingSuggestPolicy())) {
}
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h
index e6861a3dd..771837bc6 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.h
+++ b/native/jni/src/suggest/core/dictionary/dictionary.h
@@ -20,6 +20,7 @@
#include <stdint.h>
#include "defines.h"
+#include "suggest/core/dictionary/binary_dictionary_info.h"
namespace latinime {
@@ -64,11 +65,8 @@ class Dictionary {
int getProbability(const int *word, int length) const;
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
- const uint8_t *getDict() const { // required to release dictionary buffer
- return mDict;
- }
- const uint8_t *getOffsetDict() const {
- return mOffsetDict;
+ const BinaryDictionaryInfo *getBinaryDictionaryInfo() const {
+ return &mBinaryDicitonaryInfo;
}
int getDictSize() const { return mDictSize; }
int getMmapFd() const { return mMmapFd; }
@@ -78,9 +76,8 @@ class Dictionary {
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(Dictionary);
- const uint8_t *mDict;
- const uint8_t *mOffsetDict;
+ const BinaryDictionaryInfo mBinaryDicitonaryInfo;
// Used only for the mmap version of dictionary loading, but we use these as dummy variables
// also for the malloc version.
const int mDictSize;
diff --git a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp
index 7a0f755e5..f53e56ef1 100644
--- a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp
+++ b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp
@@ -16,9 +16,9 @@
#include "suggest/core/dictionary/digraph_utils.h"
-#include "char_utils.h"
#include "defines.h"
#include "suggest/core/dictionary/binary_format.h"
+#include "utils/char_utils.h"
namespace latinime {
@@ -122,7 +122,7 @@ const DigraphUtils::DigraphType DigraphUtils::USED_DIGRAPH_TYPES[] =
/* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForDigraphTypeAndCodePoint(
const DigraphUtils::DigraphType digraphType, const int compositeGlyphCodePoint) {
const DigraphUtils::digraph_t *digraphs = 0;
- const int compositeGlyphLowerCodePoint = toLowerCase(compositeGlyphCodePoint);
+ const int compositeGlyphLowerCodePoint = CharUtils::toLowerCase(compositeGlyphCodePoint);
const int digraphsSize =
DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(digraphType, &digraphs);
for (int i = 0; i < digraphsSize; i++) {
diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h
index fcac98f35..ba97e5842 100644
--- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h
+++ b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h
@@ -17,11 +17,10 @@
#ifndef LATINIME_MULTI_BIGRAM_MAP_H
#define LATINIME_MULTI_BIGRAM_MAP_H
-#include <stdint.h>
-
#include "defines.h"
-#include "hash_map_compat.h"
+#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h"
+#include "utils/hash_map_compat.h"
namespace latinime {
@@ -35,20 +34,20 @@ class MultiBigramMap {
// Look up the bigram probability for the given word pair from the cached bigram maps.
// Also caches the bigrams if there is space remaining and they have not been cached already.
- int getBigramProbability(const uint8_t *const dicRoot, const int wordPosition,
- const int nextWordPosition, const int unigramProbability) {
+ int getBigramProbability(const BinaryDictionaryInfo *const binaryDicitonaryInfo,
+ const int wordPosition, const int nextWordPosition, const int unigramProbability) {
hash_map_compat<int, BigramMap>::const_iterator mapPosition =
mBigramMaps.find(wordPosition);
if (mapPosition != mBigramMaps.end()) {
return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability);
}
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
- addBigramsForWordPosition(dicRoot, wordPosition);
+ addBigramsForWordPosition(binaryDicitonaryInfo, wordPosition);
return mBigramMaps[wordPosition].getBigramProbability(
nextWordPosition, unigramProbability);
}
- return BinaryFormat::getBigramProbability(
- dicRoot, wordPosition, nextWordPosition, unigramProbability);
+ return BinaryFormat::getBigramProbability(binaryDicitonaryInfo->getDictRoot(),
+ wordPosition, nextWordPosition, unigramProbability);
}
void clear() {
@@ -63,8 +62,9 @@ class MultiBigramMap {
BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP) {}
~BigramMap() {}
- void init(const uint8_t *const dicRoot, int position) {
- BinaryFormat::fillBigramProbabilityToHashMap(dicRoot, position, &mBigramMap);
+ void init(const BinaryDictionaryInfo *const binaryDicitonaryInfo, const int position) {
+ BinaryFormat::fillBigramProbabilityToHashMap(
+ binaryDicitonaryInfo->getDictRoot(), position, &mBigramMap);
}
inline int getBigramProbability(const int nextWordPosition, const int unigramProbability)
@@ -78,8 +78,9 @@ class MultiBigramMap {
hash_map_compat<int, int> mBigramMap;
};
- void addBigramsForWordPosition(const uint8_t *const dicRoot, const int position) {
- mBigramMaps[position].init(dicRoot, position);
+ void addBigramsForWordPosition(const BinaryDictionaryInfo *const binaryDicitonaryInfo,
+ const int position) {
+ mBigramMaps[position].init(binaryDicitonaryInfo, position);
}
hash_map_compat<int, BigramMap> mBigramMaps;
diff --git a/native/jni/src/suggest/core/dictionary/probability_utils.h b/native/jni/src/suggest/core/dictionary/probability_utils.h
new file mode 100644
index 000000000..14d2f8436
--- /dev/null
+++ b/native/jni/src/suggest/core/dictionary/probability_utils.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PROBABILITY_UTILS_H
+#define LATINIME_PROBABILITY_UTILS_H
+
+#include <map>
+#include <stdint.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+// Static helpers that combine an encoded unigram probability with bigram information.
+class ProbabilityUtils {
+ public:
+    // Returns the probability to use when no bigram applies. Currently this is the unigram
+    // probability unchanged.
+    static AK_FORCE_INLINE int backoff(const int unigramProbability) {
+        // For some reason, applying the backoff weight gives bad results in tests. To apply the
+        // backoff weight, we divide the probability by 2, which in our storing format means
+        // decreasing the score by 8.
+        // TODO: figure out what's wrong with this.
+        // return unigramProbability > 8 ?
+        //         unigramProbability - 8 : (0 == unigramProbability ? 0 : 8);
+        return unigramProbability;
+    }
+
+    // Maps an encoded bigram probability onto the range above the given unigram probability.
+    static AK_FORCE_INLINE int computeProbabilityForBigram(
+            const int unigramProbability, const int bigramProbability) {
+        // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want
+        // the unigram probability to be the median value of the 17th step from the top. A value of
+        // 0 for the bigram probability represents the middle of the 16th step from the top,
+        // while a value of 15 represents the middle of the top step.
+        // See makedict.BinaryDictInputOutput for details.
+        const float range = static_cast<float>(MAX_PROBABILITY - unigramProbability);
+        const float stepSize = range / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY);
+        const int boost = static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize);
+        return unigramProbability + boost;
+    }
+
+    // This returns a probability in log space.
+    // Falls back to backoff() unless both the map and the filter are given, the filter accepts
+    // the position, and the map has an entry for it.
+    // NOTE(review): isInFilter() is not declared by any header included in this file; presumably
+    // it comes from bloom_filter.h, which has to be included first - confirm.
+    static AK_FORCE_INLINE int getProbability(const int position,
+            const std::map<int, int> *const bigramMap,
+            const uint8_t *bigramFilter, const int unigramProbability) {
+        if (!bigramMap || !bigramFilter || !isInFilter(bigramFilter, position)) {
+            return backoff(unigramProbability);
+        }
+        const std::map<int, int>::const_iterator entry = bigramMap->find(position);
+        if (entry == bigramMap->end()) {
+            return backoff(unigramProbability);
+        }
+        return computeProbabilityForBigram(unigramProbability, entry->second);
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils);
+};
+}
+#endif /* LATINIME_PROBABILITY_UTILS_H */
diff --git a/native/jni/src/suggest/core/dictionary/terminal_attributes.h b/native/jni/src/suggest/core/dictionary/terminal_attributes.h
index 8377c603d..bbd9af090 100644
--- a/native/jni/src/suggest/core/dictionary/terminal_attributes.h
+++ b/native/jni/src/suggest/core/dictionary/terminal_attributes.h
@@ -19,6 +19,7 @@
#include <stdint.h>
+#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h"
namespace latinime {
@@ -32,8 +33,9 @@ class TerminalAttributes {
public:
class ShortcutIterator {
public:
- ShortcutIterator(const uint8_t *dict, const int pos, const uint8_t flags)
- : mDict(dict), mPos(pos),
+ ShortcutIterator(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int pos,
+ const uint8_t flags)
+ : mBinaryDicitionaryInfo(binaryDictionaryInfo), mPos(pos),
mHasNextShortcutTarget(0 != (flags & BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS)) {
}
@@ -44,11 +46,13 @@ class TerminalAttributes {
// Gets the shortcut target itself as an int string. For parameters and return value
// see BinaryFormat::getWordAtAddress.
inline int getNextShortcutTarget(const int maxDepth, int *outWord, int *outFreq) {
- const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
+ const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(
+ mBinaryDicitionaryInfo->getDictRoot(), &mPos);
mHasNextShortcutTarget = 0 != (shortcutFlags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT);
unsigned int i;
for (i = 0; i < MAX_WORD_LENGTH; ++i) {
- const int codePoint = BinaryFormat::getCodePointAndForwardPointer(mDict, &mPos);
+ const int codePoint = BinaryFormat::getCodePointAndForwardPointer(
+ mBinaryDicitionaryInfo->getDictRoot(), &mPos);
if (NOT_A_CODE_POINT == codePoint) break;
outWord[i] = codePoint;
}
@@ -57,19 +61,21 @@ class TerminalAttributes {
}
private:
- const uint8_t *const mDict;
+ const BinaryDictionaryInfo *const mBinaryDicitionaryInfo;
int mPos;
bool mHasNextShortcutTarget;
};
- TerminalAttributes(const uint8_t *const dict, const uint8_t flags, const int pos)
- : mDict(dict), mFlags(flags), mStartPos(pos) {
+ TerminalAttributes(const BinaryDictionaryInfo *const binaryDicitonaryInfo,
+ const uint8_t flags, const int pos)
+ : mBinaryDicitionaryInfo(binaryDicitonaryInfo), mFlags(flags), mStartPos(pos) {
}
inline ShortcutIterator getShortcutIterator() const {
// The size of the shortcuts is stored here so that the whole shortcut chunk can be
// skipped quickly, so we ignore it.
- return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
+ return ShortcutIterator(
+ mBinaryDicitionaryInfo, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
}
bool isBlacklistedOrNotAWord() const {
@@ -78,7 +84,7 @@ class TerminalAttributes {
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(TerminalAttributes);
- const uint8_t *const mDict;
+ const BinaryDictionaryInfo *const mBinaryDicitionaryInfo;
const uint8_t mFlags;
const int mStartPos;
};
diff --git a/native/jni/src/suggest/core/layout/proximity_info.cpp b/native/jni/src/suggest/core/layout/proximity_info.cpp
index 6dd88051c..80355c148 100644
--- a/native/jni/src/suggest/core/layout/proximity_info.cpp
+++ b/native/jni/src/suggest/core/layout/proximity_info.cpp
@@ -21,12 +21,12 @@
#include <cstring>
#include <cmath>
-#include "char_utils.h"
#include "defines.h"
#include "jni.h"
#include "suggest/core/layout/additional_proximity_chars.h"
#include "suggest/core/layout/geometry_utils.h"
#include "suggest/core/layout/proximity_info_params.h"
+#include "utils/char_utils.h"
namespace latinime {
@@ -165,7 +165,7 @@ void ProximityInfo::initializeG() {
// TODO: Optimize
for (int i = 0; i < KEY_COUNT; ++i) {
const int code = mKeyCodePoints[i];
- const int lowerCode = toLowerCase(code);
+ const int lowerCode = CharUtils::toLowerCase(code);
mCenterXsG[i] = mKeyXCoordinates[i] + mKeyWidths[i] / 2;
mCenterYsG[i] = mKeyYCoordinates[i] + mKeyHeights[i] / 2;
mCodeToKeyMap[lowerCode] = i;
diff --git a/native/jni/src/suggest/core/layout/proximity_info.h b/native/jni/src/suggest/core/layout/proximity_info.h
index 6d2ddd4bc..6ca2fdd7b 100644
--- a/native/jni/src/suggest/core/layout/proximity_info.h
+++ b/native/jni/src/suggest/core/layout/proximity_info.h
@@ -18,9 +18,9 @@
#define LATINIME_PROXIMITY_INFO_H
#include "defines.h"
-#include "hash_map_compat.h"
#include "jni.h"
#include "suggest/core/layout/proximity_info_utils.h"
+#include "utils/hash_map_compat.h"
namespace latinime {
diff --git a/native/jni/src/suggest/core/layout/proximity_info_state.cpp b/native/jni/src/suggest/core/layout/proximity_info_state.cpp
index 2bd3ceb7e..4e53992d4 100644
--- a/native/jni/src/suggest/core/layout/proximity_info_state.cpp
+++ b/native/jni/src/suggest/core/layout/proximity_info_state.cpp
@@ -26,6 +26,7 @@
#include "suggest/core/layout/geometry_utils.h"
#include "suggest/core/layout/proximity_info.h"
#include "suggest/core/layout/proximity_info_state_utils.h"
+#include "utils/char_utils.h"
namespace latinime {
@@ -175,7 +176,7 @@ float ProximityInfoState::getPointToKeyLength(
const int index = inputIndex * mProximityInfo->getKeyCount() + keyId;
return min(mSampledNormalizedSquaredLengthCache[index], mMaxPointToKeyLength);
}
- if (isIntentionalOmissionCodePoint(codePoint)) {
+ if (CharUtils::isIntentionalOmissionCodePoint(codePoint)) {
return 0.0f;
}
// If the char is not a key on the keyboard then return the max length.
@@ -203,7 +204,7 @@ ProximityType ProximityInfoState::getProximityType(const int index, const int co
const bool checkProximityChars, int *proximityIndex) const {
const int *currentCodePoints = getProximityCodePointsAt(index);
const int firstCodePoint = currentCodePoints[0];
- const int baseLowerC = toBaseLowerCase(codePoint);
+ const int baseLowerC = CharUtils::toBaseLowerCase(codePoint);
// The first char in the array is what user typed. If it matches right away, that means the
// user typed that same char for this pos.
@@ -215,7 +216,7 @@ ProximityType ProximityInfoState::getProximityType(const int index, const int co
// If the non-accented, lowercased version of that first character matches c, then we have a
// non-accented version of the accented character the user typed. Treat it as a close char.
- if (toBaseLowerCase(firstCodePoint) == baseLowerC) {
+ if (CharUtils::toBaseLowerCase(firstCodePoint) == baseLowerC) {
return PROXIMITY_CHAR;
}
@@ -257,8 +258,8 @@ ProximityType ProximityInfoState::getProximityTypeG(const int index, const int c
if (!isUsed()) {
return UNRELATED_CHAR;
}
- const int lowerCodePoint = toLowerCase(codePoint);
- const int baseLowerCodePoint = toBaseCodePoint(lowerCodePoint);
+ const int lowerCodePoint = CharUtils::toLowerCase(codePoint);
+ const int baseLowerCodePoint = CharUtils::toBaseCodePoint(lowerCodePoint);
for (int i = 0; i < static_cast<int>(mSampledSearchKeyVectors[index].size()); ++i) {
if (mSampledSearchKeyVectors[index][i] == lowerCodePoint
|| mSampledSearchKeyVectors[index][i] == baseLowerCodePoint) {
diff --git a/native/jni/src/suggest/core/layout/proximity_info_state.h b/native/jni/src/suggest/core/layout/proximity_info_state.h
index fd09307fe..0079ab5b8 100644
--- a/native/jni/src/suggest/core/layout/proximity_info_state.h
+++ b/native/jni/src/suggest/core/layout/proximity_info_state.h
@@ -20,11 +20,10 @@
#include <cstring> // for memset()
#include <vector>
-#include "char_utils.h"
#include "defines.h"
-#include "hash_map_compat.h"
#include "suggest/core/layout/proximity_info_params.h"
#include "suggest/core/layout/proximity_info_state_utils.h"
+#include "utils/hash_map_compat.h"
namespace latinime {
diff --git a/native/jni/src/suggest/core/layout/proximity_info_state_utils.h b/native/jni/src/suggest/core/layout/proximity_info_state_utils.h
index 1837c7ab6..66fe07926 100644
--- a/native/jni/src/suggest/core/layout/proximity_info_state_utils.h
+++ b/native/jni/src/suggest/core/layout/proximity_info_state_utils.h
@@ -21,7 +21,7 @@
#include <vector>
#include "defines.h"
-#include "hash_map_compat.h"
+#include "utils/hash_map_compat.h"
namespace latinime {
class ProximityInfo;
diff --git a/native/jni/src/suggest/core/layout/proximity_info_utils.h b/native/jni/src/suggest/core/layout/proximity_info_utils.h
index c3a275b3c..54f7539d1 100644
--- a/native/jni/src/suggest/core/layout/proximity_info_utils.h
+++ b/native/jni/src/suggest/core/layout/proximity_info_utils.h
@@ -19,11 +19,11 @@
#include <cmath>
-#include "char_utils.h"
#include "defines.h"
-#include "hash_map_compat.h"
#include "suggest/core/layout/additional_proximity_chars.h"
#include "suggest/core/layout/geometry_utils.h"
+#include "utils/char_utils.h"
+#include "utils/hash_map_compat.h"
namespace latinime {
class ProximityInfoUtils {
@@ -37,7 +37,7 @@ class ProximityInfoUtils {
if (c == NOT_A_CODE_POINT) {
return NOT_AN_INDEX;
}
- const int lowerCode = toLowerCase(c);
+ const int lowerCode = CharUtils::toLowerCase(c);
hash_map_compat<int, int>::const_iterator mapPos = codeToKeyMap->find(lowerCode);
if (mapPos != codeToKeyMap->end()) {
return mapPos->second;
diff --git a/native/jni/src/suggest/core/policy/weighting.cpp b/native/jni/src/suggest/core/policy/weighting.cpp
index d01531f07..0c57ca001 100644
--- a/native/jni/src/suggest/core/policy/weighting.cpp
+++ b/native/jni/src/suggest/core/policy/weighting.cpp
@@ -16,7 +16,6 @@
#include "suggest/core/policy/weighting.h"
-#include "char_utils.h"
#include "defines.h"
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_profiler.h"
@@ -143,7 +142,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
case CT_TERMINAL: {
const float languageImprobability =
DicNodeUtils::getBigramNodeImprobability(
- traverseSession->getOffsetDict(), dicNode, multiBigramMap);
+ traverseSession->getBinaryDictionaryInfo(), dicNode, multiBigramMap);
return weighting->getTerminalLanguageCost(traverseSession, dicNode, languageImprobability);
}
case CT_NEW_WORD_SPACE_SUBSTITUTION:
diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.cpp b/native/jni/src/suggest/core/session/dic_traverse_session.cpp
index 4e634500c..be293df42 100644
--- a/native/jni/src/suggest/core/session/dic_traverse_session.cpp
+++ b/native/jni/src/suggest/core/session/dic_traverse_session.cpp
@@ -20,6 +20,7 @@
#include "dic_traverse_wrapper.h"
#include "jni.h"
#include "suggest/core/dicnode/dic_node_utils.h"
+#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h"
#include "suggest/core/dictionary/dictionary.h"
@@ -65,7 +66,8 @@ static TraverseSessionFactoryRegisterer traverseSessionFactoryRegisterer;
void DicTraverseSession::init(const Dictionary *const dictionary, const int *prevWord,
int prevWordLength, const SuggestOptions *const suggestOptions) {
mDictionary = dictionary;
- mMultiWordCostMultiplier = BinaryFormat::getMultiWordCostMultiplier(mDictionary->getDict(),
+ mMultiWordCostMultiplier = BinaryFormat::getMultiWordCostMultiplier(
+ mDictionary->getBinaryDictionaryInfo()->getDictBuf(),
mDictionary->getDictSize());
mSuggestOptions = suggestOptions;
if (!prevWord) {
@@ -73,12 +75,14 @@ void DicTraverseSession::init(const Dictionary *const dictionary, const int *pre
return;
}
// TODO: merge following similar calls to getTerminalPosition into one case-insensitive call.
- mPrevWordPos = BinaryFormat::getTerminalPosition(dictionary->getOffsetDict(), prevWord,
+ mPrevWordPos = BinaryFormat::getTerminalPosition(
+ dictionary->getBinaryDictionaryInfo()->getDictRoot(), prevWord,
prevWordLength, false /* forceLowerCaseSearch */);
if (mPrevWordPos == NOT_VALID_WORD) {
// Check bigrams for lower-cased previous word if original was not found. Useful for
// auto-capitalized words like "The [current_word]".
- mPrevWordPos = BinaryFormat::getTerminalPosition(dictionary->getOffsetDict(), prevWord,
+ mPrevWordPos = BinaryFormat::getTerminalPosition(
+ dictionary->getBinaryDictionaryInfo()->getDictRoot(), prevWord,
prevWordLength, true /* forceLowerCaseSearch */);
}
}
@@ -93,8 +97,8 @@ void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo,
maxSpatialDistance, maxPointerCount);
}
-const uint8_t *DicTraverseSession::getOffsetDict() const {
- return mDictionary->getOffsetDict();
+const BinaryDictionaryInfo *DicTraverseSession::getBinaryDictionaryInfo() const {
+ return mDictionary->getBinaryDictionaryInfo();
}
int DicTraverseSession::getDictFlags() const {
diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.h b/native/jni/src/suggest/core/session/dic_traverse_session.h
index e5c7f8e0c..3b6a3dc8c 100644
--- a/native/jni/src/suggest/core/session/dic_traverse_session.h
+++ b/native/jni/src/suggest/core/session/dic_traverse_session.h
@@ -28,6 +28,7 @@
namespace latinime {
+class BinaryDictionaryInfo;
class Dictionary;
class ProximityInfo;
class SuggestOptions;
@@ -56,7 +57,7 @@ class DicTraverseSession {
void resetCache(const int nextActiveCacheSize, const int maxWords);
// TODO: Remove
- const uint8_t *getOffsetDict() const;
+ const BinaryDictionaryInfo *getBinaryDictionaryInfo() const;
int getDictFlags() const;
//--------------------
diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp
index 94441877a..1f108e400 100644
--- a/native/jni/src/suggest/core/suggest.cpp
+++ b/native/jni/src/suggest/core/suggest.cpp
@@ -16,7 +16,6 @@
#include "suggest/core/suggest.h"
-#include "char_utils.h"
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_priority_queue.h"
#include "suggest/core/dicnode/dic_node_vector.h"
@@ -106,8 +105,8 @@ void Suggest::initializeSearch(DicTraverseSession *traverseSession, int commitPo
traverseSession->resetCache(TRAVERSAL->getMaxCacheSize(), MAX_RESULTS);
// Create a new dic node here
DicNode rootNode;
- DicNodeUtils::initAsRoot(traverseSession->getDicRootPos(),
- traverseSession->getOffsetDict(), traverseSession->getPrevWordPos(), &rootNode);
+ DicNodeUtils::initAsRoot(traverseSession->getBinaryDictionaryInfo(),
+ traverseSession->getPrevWordPos(), &rootNode);
traverseSession->getDicTraverseCache()->copyPushActive(&rootNode);
}
}
@@ -159,7 +158,7 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen
terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel);
const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
+ doubleLetterCost;
- const TerminalAttributes terminalAttributes(traverseSession->getOffsetDict(),
+ const TerminalAttributes terminalAttributes(traverseSession->getBinaryDictionaryInfo(),
terminalDicNode->getFlags(), terminalDicNode->getAttributesPos());
const bool isPossiblyOffensiveWord = terminalDicNode->getProbability() <= 0;
const bool isExactMatch = terminalDicNode->isExactMatch();
@@ -285,7 +284,7 @@ void Suggest::expandCurrentDicNodes(DicTraverseSession *traverseSession) const {
}
DicNodeUtils::getAllChildDicNodes(
- &dicNode, traverseSession->getOffsetDict(), &childDicNodes);
+ &dicNode, traverseSession->getBinaryDictionaryInfo(), &childDicNodes);
const int childDicNodesSize = childDicNodes.getSizeAndLock();
for (int i = 0; i < childDicNodesSize; ++i) {
@@ -432,7 +431,8 @@ void Suggest::processDicNodeAsDigraph(DicTraverseSession *traverseSession,
void Suggest::processDicNodeAsOmission(
DicTraverseSession *traverseSession, DicNode *dicNode) const {
DicNodeVector childDicNodes;
- DicNodeUtils::getAllChildDicNodes(dicNode, traverseSession->getOffsetDict(), &childDicNodes);
+ DicNodeUtils::getAllChildDicNodes(
+ dicNode, traverseSession->getBinaryDictionaryInfo(), &childDicNodes);
const int size = childDicNodes.getSizeAndLock();
for (int i = 0; i < size; i++) {
@@ -457,7 +457,7 @@ void Suggest::processDicNodeAsInsertion(DicTraverseSession *traverseSession,
DicNode *dicNode) const {
const int16_t pointIndex = dicNode->getInputIndex(0);
DicNodeVector childDicNodes;
- DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getOffsetDict(),
+ DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getBinaryDictionaryInfo(),
traverseSession->getProximityInfoState(0), pointIndex + 1, true, &childDicNodes);
const int size = childDicNodes.getSizeAndLock();
for (int i = 0; i < size; i++) {
@@ -475,14 +475,14 @@ void Suggest::processDicNodeAsTransposition(DicTraverseSession *traverseSession,
DicNode *dicNode) const {
const int16_t pointIndex = dicNode->getInputIndex(0);
DicNodeVector childDicNodes1;
- DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getOffsetDict(),
+ DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getBinaryDictionaryInfo(),
traverseSession->getProximityInfoState(0), pointIndex + 1, false, &childDicNodes1);
const int childSize1 = childDicNodes1.getSizeAndLock();
for (int i = 0; i < childSize1; i++) {
if (childDicNodes1[i]->hasChildren()) {
DicNodeVector childDicNodes2;
DicNodeUtils::getProximityChildDicNodes(
- childDicNodes1[i], traverseSession->getOffsetDict(),
+ childDicNodes1[i], traverseSession->getBinaryDictionaryInfo(),
traverseSession->getProximityInfoState(0), pointIndex, false, &childDicNodes2);
const int childSize2 = childDicNodes2.getSizeAndLock();
for (int j = 0; j < childSize2; j++) {
@@ -522,8 +522,8 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode
// Create a non-cached node here.
DicNode newDicNode;
- DicNodeUtils::initAsRootWithPreviousWord(traverseSession->getDicRootPos(),
- traverseSession->getOffsetDict(), dicNode, &newDicNode);
+ DicNodeUtils::initAsRootWithPreviousWord(
+ traverseSession->getBinaryDictionaryInfo(), dicNode, &newDicNode);
const CorrectionType correctionType = spaceSubstitution ?
CT_NEW_WORD_SPACE_SUBSTITUTION : CT_NEW_WORD_SPACE_OMITTION;
Weighting::addCostAndForwardInputIndex(WEIGHTING, correctionType, traverseSession, dicNode,
diff --git a/native/jni/src/suggest_options.h b/native/jni/src/suggest/core/suggest_options.h
index 1b21aafcf..1b21aafcf 100644
--- a/native/jni/src/suggest_options.h
+++ b/native/jni/src/suggest/core/suggest_options.h
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h
index b212fe101..e21b318e6 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h
+++ b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h
@@ -19,7 +19,6 @@
#include <stdint.h>
-#include "char_utils.h"
#include "defines.h"
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h"
@@ -27,6 +26,7 @@
#include "suggest/core/policy/traversal.h"
#include "suggest/core/session/dic_traverse_session.h"
#include "suggest/policyimpl/typing/scoring_params.h"
+#include "utils/char_utils.h"
namespace latinime {
class TypingTraversal : public Traversal {
@@ -64,9 +64,9 @@ class TypingTraversal : public Traversal {
}
const int point0Index = dicNode->getInputIndex(0);
const int currentBaseLowerCodePoint =
- toBaseLowerCase(childDicNode->getNodeCodePoint());
+ CharUtils::toBaseLowerCase(childDicNode->getNodeCodePoint());
const int typedBaseLowerCodePoint =
- toBaseLowerCase(traverseSession->getProximityInfoState(0)
+ CharUtils::toBaseLowerCase(traverseSession->getProximityInfoState(0)
->getPrimaryCodePointAt(point0Index));
return (currentBaseLowerCodePoint != typedBaseLowerCodePoint);
}
@@ -172,7 +172,7 @@ class TypingTraversal : public Traversal {
}
const int c = dicNode->getOutputWordBuf()[0];
const bool shortCappedWord = dicNode->getDepth()
- < ScoringParams::THRESHOLD_SHORT_WORD_LENGTH && isAsciiUpper(c);
+ < ScoringParams::THRESHOLD_SHORT_WORD_LENGTH && CharUtils::isAsciiUpper(c);
return !shortCappedWord
|| probability >= ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED;
}
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h
index cb6abd574..17fa11082 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h
+++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h
@@ -23,6 +23,7 @@
#include "suggest/core/policy/weighting.h"
#include "suggest/core/session/dic_traverse_session.h"
#include "suggest/policyimpl/typing/scoring_params.h"
+#include "utils/char_utils.h"
namespace latinime {
@@ -98,9 +99,9 @@ class TypingWeighting : public Weighting {
// Returns true when the base-lower-cased primary code point typed at the position given by
// dicNode->getInputIndex(0) differs from the base-lower-cased code point of dicNode itself.
bool isProximityDicNode(const DicTraverseSession *const traverseSession,
const DicNode *const dicNode) const {
const int pointIndex = dicNode->getInputIndex(0);
- const int primaryCodePoint = toBaseLowerCase(
+ const int primaryCodePoint = CharUtils::toBaseLowerCase(
traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt(pointIndex));
- const int dicNodeChar = toBaseLowerCase(dicNode->getNodeCodePoint());
+ const int dicNodeChar = CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint());
return primaryCodePoint != dicNodeChar;
}
@@ -145,7 +146,7 @@ class TypingWeighting : public Weighting {
// Returns the value of DicNodeUtils::getBigramNodeImprobability for dicNode, multiplied by
// ScoringParams::DISTANCE_WEIGHT_LANGUAGE.
float getNewWordBigramCost(const DicTraverseSession *const traverseSession,
const DicNode *const dicNode,
MultiBigramMap *const multiBigramMap) const {
- return DicNodeUtils::getBigramNodeImprobability(traverseSession->getOffsetDict(),
+ return DicNodeUtils::getBigramNodeImprobability(traverseSession->getBinaryDictionaryInfo(),
dicNode, multiBigramMap) * ScoringParams::DISTANCE_WEIGHT_LANGUAGE;
}
diff --git a/native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h b/native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h
index ec1457455..81614bc9c 100644
--- a/native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h
+++ b/native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h
@@ -17,8 +17,8 @@
#ifndef LATINIME_DAEMARU_LEVENSHTEIN_EDIT_DISTANCE_POLICY_H
#define LATINIME_DAEMARU_LEVENSHTEIN_EDIT_DISTANCE_POLICY_H
-#include "char_utils.h"
#include "suggest/policyimpl/utils/edit_distance_policy.h"
+#include "utils/char_utils.h"
namespace latinime {
@@ -31,8 +31,8 @@ class DamerauLevenshteinEditDistancePolicy : public EditDistancePolicy {
~DamerauLevenshteinEditDistancePolicy() {}
// Substitution cost between mString0[index0] and mString1[index1]: 0.0f when the two code
// points are equal after base-lower-casing, 1.0f otherwise.
AK_FORCE_INLINE float getSubstitutionCost(const int index0, const int index1) const {
- const int c0 = toBaseLowerCase(mString0[index0]);
- const int c1 = toBaseLowerCase(mString1[index1]);
+ const int c0 = CharUtils::toBaseLowerCase(mString0[index0]);
+ const int c1 = CharUtils::toBaseLowerCase(mString1[index1]);
return (c0 == c1) ? 0.0f : 1.0f;
}
@@ -45,10 +45,10 @@ class DamerauLevenshteinEditDistancePolicy : public EditDistancePolicy {
}
AK_FORCE_INLINE bool allowTransposition(const int index0, const int index1) const {
- const int c0 = toBaseLowerCase(mString0[index0]);
- const int c1 = toBaseLowerCase(mString1[index1]);
- if (index0 > 0 && index1 > 0 && c0 == toBaseLowerCase(mString1[index1 - 1])
- && c1 == toBaseLowerCase(mString0[index0 - 1])) {
+ const int c0 = CharUtils::toBaseLowerCase(mString0[index0]);
+ const int c1 = CharUtils::toBaseLowerCase(mString1[index1]);
+ if (index0 > 0 && index1 > 0 && c0 == CharUtils::toBaseLowerCase(mString1[index1 - 1])
+ && c1 == CharUtils::toBaseLowerCase(mString0[index0 - 1])) {
return true;
}
return false;
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp
index 1133256c4..5820a1d0e 100644
--- a/native/jni/src/unigram_dictionary.cpp
+++ b/native/jni/src/unigram_dictionary.cpp
@@ -18,13 +18,15 @@
#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
-#include "char_utils.h"
#include "defines.h"
+#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h"
#include "suggest/core/dictionary/dictionary.h"
#include "suggest/core/dictionary/digraph_utils.h"
+#include "suggest/core/dictionary/probability_utils.h"
#include "suggest/core/dictionary/terminal_attributes.h"
#include "suggest/core/layout/proximity_info.h"
+#include "utils/char_utils.h"
#include "unigram_dictionary.h"
#include "words_priority_queue.h"
#include "words_priority_queue_pool.h"
@@ -32,8 +34,9 @@
namespace latinime {
// TODO: check the header
-UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, const unsigned int dictFlags)
- : DICT_ROOT(streamStart), ROOT_POS(0),
+UnigramDictionary::UnigramDictionary(
+ const BinaryDictionaryInfo *const binaryDicitonaryInfo, const uint8_t dictFlags)
+ : mBinaryDicitonaryInfo(binaryDicitonaryInfo),
MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH), DICT_FLAGS(dictFlags) {
if (DEBUG_DICT) {
AKLOGI("UnigramDictionary - constructor");
@@ -315,9 +318,10 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
correction->setCorrectionParams(0, 0, 0,
-1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
doAutoCompletion, maxErrors);
- int rootPosition = ROOT_POS;
+ int rootPosition = mBinaryDicitonaryInfo->getRootPosition();
// Get the number of children of root, then increment the position
- int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition);
+ int childCount = BinaryFormat::getGroupCountAndForwardPointer(
+ mBinaryDicitonaryInfo->getDictRoot(), &rootPosition);
int outputIndex = 0;
correction->initCorrectionState(rootPosition, childCount, (inputSize <= 0));
@@ -696,8 +700,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
int pos = startPos;
int codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
- int baseChar = toBaseLowerCase(codePoint);
- const int wChar = toBaseLowerCase(inWord[startInputIndex]);
+ int baseChar = CharUtils::toBaseLowerCase(codePoint);
+ const int wChar = CharUtils::toBaseLowerCase(inWord[startInputIndex]);
if (baseChar != wChar) {
*outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
@@ -709,8 +713,9 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
if (hasMultipleChars) {
codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
while (NOT_A_CODE_POINT != codePoint) {
- baseChar = toBaseLowerCase(codePoint);
- if (inputIndex + 1 >= inputSize || toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
+ baseChar = CharUtils::toBaseLowerCase(codePoint);
+ if (inputIndex + 1 >= inputSize
+ || CharUtils::toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
*outPos = BinaryFormat::skipOtherCharacters(root, pos);
*outInputIndex = startInputIndex;
return false;
@@ -746,7 +751,7 @@ int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, con
int newWord[MAX_WORD_LENGTH];
int depth = 0;
int maxFreq = -1;
- const uint8_t *const root = DICT_ROOT;
+ const uint8_t *const root = mBinaryDicitonaryInfo->getDictRoot();
int stackChildCount[MAX_WORD_LENGTH];
int stackInputIndex[MAX_WORD_LENGTH];
int stackSiblingPos[MAX_WORD_LENGTH];
@@ -805,7 +810,7 @@ int UnigramDictionary::getMostProbableWordLikeInner(const int *const inWord, con
}
int UnigramDictionary::getProbability(const int *const inWord, const int length) const {
- const uint8_t *const root = DICT_ROOT;
+ const uint8_t *const root = mBinaryDicitonaryInfo->getDictRoot();
int pos = BinaryFormat::getTerminalPosition(root, inWord, length,
false /* forceLowerCaseSearch */);
if (NOT_VALID_WORD == pos) {
@@ -822,7 +827,7 @@ int UnigramDictionary::getProbability(const int *const inWord, const int length)
if (hasMultipleChars) {
pos = BinaryFormat::skipOtherCharacters(root, pos);
} else {
- BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos);
+ BinaryFormat::getCodePointAndForwardPointer(root, &pos);
}
const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(root, pos);
return unigramProbability;
@@ -864,7 +869,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
// - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not.
// - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children)
// - FLAG_HAS_BIGRAMS: whether this node has bigrams or not
- const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos);
+ const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(
+ mBinaryDicitonaryInfo->getDictRoot(), &pos);
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
@@ -875,7 +881,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
// else if FLAG_IS_TERMINAL: the probability
// else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address
// Note that you can't have a node that both is not a terminal and has no children.
- int c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos);
+ int c = BinaryFormat::getCodePointAndForwardPointer(
+ mBinaryDicitonaryInfo->getDictRoot(), &pos);
ASSERT(NOT_A_CODE_POINT != c);
// We are going to loop through each character and make it look like it's a different
@@ -889,8 +896,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
// We prefetch the next char. If 'c' is the last char of this node, we will have
// NOT_A_CODE_POINT in the next char. From this we can decide whether this virtual node
// should behave as a terminal or not and whether we have children.
- const int nextc = hasMultipleChars
- ? BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CODE_POINT;
+ const int nextc = hasMultipleChars ? BinaryFormat::getCodePointAndForwardPointer(
+ mBinaryDicitonaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT;
const bool isLastChar = (NOT_A_CODE_POINT == nextc);
// If there are more chars in this nodes, then this virtual node is not a terminal.
// If we are on the last char, this virtual node is a terminal if this node is.
@@ -910,11 +917,11 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
// We don't have to output other values because we return false, as in
// "don't traverse children".
if (!isLastChar) {
- pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos);
+ pos = BinaryFormat::skipOtherCharacters(mBinaryDicitonaryInfo->getDictRoot(), pos);
}
pos = BinaryFormat::skipProbability(flags, pos);
- *nextSiblingPosition =
- BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
+ *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(
+ mBinaryDicitonaryInfo->getDictRoot(), flags, pos);
return false;
}
@@ -927,15 +934,15 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
if (isTerminalNode) {
// The probability should be here, because we come here only if this is actually
// a terminal node, and we are on its last char.
- const int unigramProbability =
- BinaryFormat::readProbabilityWithoutMovingPointer(DICT_ROOT, pos);
+ const int unigramProbability = BinaryFormat::readProbabilityWithoutMovingPointer(
+ mBinaryDicitonaryInfo->getDictRoot(), pos);
const int childrenAddressPos = BinaryFormat::skipProbability(flags, pos);
const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos);
- TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
+ TerminalAttributes terminalAttributes(mBinaryDicitonaryInfo, flags, attributesPos);
// bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
// bigramFilter is a bloom filter of said frequencies for even faster rejection.
- const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter,
- unigramProbability);
+ const int probability = ProbabilityUtils::getProbability(
+ initialPos, bigramMap, bigramFilter, unigramProbability);
onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
currentWordIndex);
@@ -951,16 +958,16 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
// remaining char in this group for there can't be any.
if (!hasChildren) {
pos = BinaryFormat::skipProbability(flags, pos);
- *nextSiblingPosition =
- BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
+ *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(
+ mBinaryDicitonaryInfo->getDictRoot(), flags, pos);
return false;
}
// Optimization: Prune out words that are too long compared to how much was typed.
if (correction->needsToPrune()) {
pos = BinaryFormat::skipProbability(flags, pos);
- *nextSiblingPosition =
- BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
+ *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(
+ mBinaryDicitonaryInfo->getDictRoot(), flags, pos);
if (DEBUG_DICT_FULL) {
AKLOGI("Traversing was pruned.");
}
@@ -979,9 +986,12 @@ bool UnigramDictionary::processCurrentNode(const int initialPos,
// Once this is read, we still need to output the number of nodes in the immediate children of
// this node, so we read and output it before returning true, as in "please traverse children".
pos = BinaryFormat::skipProbability(flags, pos);
- int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos);
- *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos);
- *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos);
+ int childrenPos = BinaryFormat::readChildrenPosition(
+ mBinaryDicitonaryInfo->getDictRoot(), flags, pos);
+ *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(
+ mBinaryDicitonaryInfo->getDictRoot(), flags, pos);
+ *newCount = BinaryFormat::getGroupCountAndForwardPointer(
+ mBinaryDicitonaryInfo->getDictRoot(), &childrenPos);
*newChildrenPosition = childrenPos;
return true;
}
diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h
index a50503256..4edd1f847 100644
--- a/native/jni/src/unigram_dictionary.h
+++ b/native/jni/src/unigram_dictionary.h
@@ -25,6 +25,7 @@
namespace latinime {
+class BinaryDictionaryInfo;
class Correction;
class ProximityInfo;
class TerminalAttributes;
@@ -39,7 +40,10 @@ class UnigramDictionary {
static const int FLAG_MULTIPLE_SUGGEST_ABORT = 0;
static const int FLAG_MULTIPLE_SUGGEST_SKIP = 1;
static const int FLAG_MULTIPLE_SUGGEST_CONTINUE = 2;
- UnigramDictionary(const uint8_t *const streamStart, const unsigned int dictFlags);
+
+ UnigramDictionary(const BinaryDictionaryInfo *const binaryDicitonaryInfo,
+ const uint8_t dictFlags);
+ virtual ~UnigramDictionary();
int getProbability(const int *const inWord, const int length) const;
int getBigramPosition(int pos, int *word, int offset, int length) const;
int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
@@ -48,7 +52,6 @@ class UnigramDictionary {
const bool useFullEditDistance, int *outWords, int *frequencies,
int *outputTypes) const;
int getDictFlags() const { return DICT_FLAGS; }
- virtual ~UnigramDictionary();
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(UnigramDictionary);
@@ -108,8 +111,7 @@ class UnigramDictionary {
const int outputWordLength, int *freqArray, int *wordLengthArray,
int *outputWord) const;
- const uint8_t *const DICT_ROOT;
- const int ROOT_POS;
+ const BinaryDictionaryInfo *const mBinaryDicitonaryInfo;
const int MAX_DIGRAPH_SEARCH_DEPTH;
const int DICT_FLAGS;
};
diff --git a/native/jni/src/char_utils.cpp b/native/jni/src/utils/char_utils.cpp
index e219beb62..0e7039610 100644
--- a/native/jni/src/char_utils.cpp
+++ b/native/jni/src/utils/char_utils.cpp
@@ -14,9 +14,10 @@
* limitations under the License.
*/
+#include "utils/char_utils.h"
+
#include <cstdlib>
-#include "char_utils.h"
#include "defines.h"
namespace latinime {
@@ -36,8 +37,7 @@ struct LatinCapitalSmallPair {
* $ apt-get install libicu-dev
*
* 3. Build the following code
- * (You need this file, char_utils.h, and defines.h)
- * $ g++ -o char_utils -DUPDATING_CHAR_UTILS char_utils.cpp -licuuc
+ * $ g++ -o char_utils -I.. -DUPDATING_CHAR_UTILS char_utils.cpp -licuuc
*/
#ifdef UPDATING_CHAR_UTILS
#include <stdio.h>
@@ -47,7 +47,7 @@ extern "C" int main() {
for (unsigned short c = 0; c < 0xFFFF; c++) {
if (c <= 0x7F) continue;
const unsigned short icu4cLowerC = u_tolower(c);
- const unsigned short myLowerC = latin_tolower(c);
+ const unsigned short myLowerC = CharUtils::latin_tolower(c);
if (c != icu4cLowerC) {
#ifdef CONFIRMING_CHAR_UTILS
if (icu4cLowerC != myLowerC) {
@@ -70,7 +70,7 @@ extern "C" int main() {
*
* 5. Update the SORTED_CHAR_MAP[] array below with the output above.
* Then, rebuild with -DCONFIRMING_CHAR_UTILS and confirm the program exits successfully.
- * $ g++ -o char_utils -DUPDATING_CHAR_UTILS -DCONFIRMING_CHAR_UTILS char_utils.cpp -licuuc
+ * $ g++ -o char_utils -I.. -DUPDATING_CHAR_UTILS -DCONFIRMING_CHAR_UTILS char_utils.cpp -licuuc
* $ ./char_utils
* $
*/
@@ -1054,7 +1054,7 @@ static int compare_pair_capital(const void *a, const void *b) {
- static_cast<int>((static_cast<const struct LatinCapitalSmallPair *>(b))->capital);
}
-unsigned short latin_tolower(const unsigned short c) {
+/* static */ unsigned short CharUtils::latin_tolower(const unsigned short c) {
struct LatinCapitalSmallPair *p =
static_cast<struct LatinCapitalSmallPair *>(bsearch(&c, SORTED_CHAR_MAP,
NELEMS(SORTED_CHAR_MAP), sizeof(SORTED_CHAR_MAP[0]), compare_pair_capital));
@@ -1063,7 +1063,7 @@ unsigned short latin_tolower(const unsigned short c) {
/*
* Table mapping most combined Latin, Greek, and Cyrillic characters
- * to their base characters. If c is in range, BASE_CHARS[c] == c
+ * to their base characters. If c is in range, CharUtils::BASE_CHARS[c] == c
* if c is not a combined character, or the base character if it
* is combined.
*
@@ -1074,7 +1074,7 @@ unsigned short latin_tolower(const unsigned short c) {
* for ($j = $i; $j < $i + 8; $j++) { \
* printf("0x%04X, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }'
*/
-const unsigned short BASE_CHARS[BASE_CHARS_SIZE] = {
+/* static */ const unsigned short CharUtils::BASE_CHARS[CharUtils::BASE_CHARS_SIZE] = {
/* U+0000 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
/* U+0008 */ 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
/* U+0010 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h
new file mode 100644
index 000000000..2e735a81c
--- /dev/null
+++ b/native/jni/src/utils/char_utils.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_CHAR_UTILS_H
+#define LATINIME_CHAR_UTILS_H
+
+#include <cctype>
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Static helpers for classifying and case-folding code points.
+ *
+ * Every member is static; see DISALLOW_IMPLICIT_CONSTRUCTORS in the private section.
+ */
+class CharUtils {
+ public:
+    // True only for the ASCII capital letters 'A'..'Z'.
+    static AK_FORCE_INLINE bool isAsciiUpper(int c) {
+        // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to
+        // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...).
+        return (c >= 'A' && c <= 'Z');
+    }
+
+    // Shifts c by the distance between 'A' and 'a'. No range check is done here: the result is
+    // only meaningful when isAsciiUpper(c) holds.
+    static AK_FORCE_INLINE int toAsciiLower(int c) {
+        return c - 'A' + 'a';
+    }
+
+    // Thin wrapper turning the int result of isascii() into a bool.
+    static AK_FORCE_INLINE bool isAscii(int c) {
+        return isascii(c) != 0;
+    }
+
+    // Lower-cases c: ASCII capitals are shifted directly, other ASCII values are returned
+    // unchanged, and remaining 16-bit values are looked up through latin_tolower().
+    static AK_FORCE_INLINE int toLowerCase(const int c) {
+        if (isAsciiUpper(c)) {
+            return toAsciiLower(c);
+        }
+        if (isAscii(c)) {
+            return c;
+        }
+        // latin_tolower() takes an unsigned short. Casting a negative value or a code point
+        // above 0xFFFF would silently truncate it to an unrelated 16-bit character, so such
+        // values are passed through unchanged instead.
+        if (c < 0 || c > 0xFFFF) {
+            return c;
+        }
+        return static_cast<int>(latin_tolower(static_cast<unsigned short>(c)));
+    }
+
+    // Maps c to its base code point, then lower-cases the result.
+    static AK_FORCE_INLINE int toBaseLowerCase(const int c) {
+        return toLowerCase(toBaseCodePoint(c));
+    }
+
+    // True for KEYCODE_SINGLE_QUOTE and KEYCODE_HYPHEN_MINUS.
+    static AK_FORCE_INLINE bool isIntentionalOmissionCodePoint(const int codePoint) {
+        // TODO: Do not hardcode here
+        return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS;
+    }
+
+    // Returns the number of leading non-zero entries in codePoints, scanning at most arraySize
+    // entries.
+    static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) {
+        int size = 0;
+        for (; size < arraySize; ++size) {
+            if (codePoints[size] == '\0') {
+                break;
+            }
+        }
+        return size;
+    }
+
+    // Returns BASE_CHARS[c] when c is a valid index into the table, and c itself otherwise.
+    static AK_FORCE_INLINE int toBaseCodePoint(int c) {
+        // The lower bound matters: a negative value must not be used as a table index.
+        if (c >= 0 && c < BASE_CHARS_SIZE) {
+            return static_cast<int>(BASE_CHARS[c]);
+        }
+        return c;
+    }
+
+    // Declared here, defined out of line (not in this header).
+    static unsigned short latin_tolower(const unsigned short c);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
+
+    /**
+     * Table mapping most combined Latin, Greek, and Cyrillic characters
+     * to their base characters. If c is in range, BASE_CHARS[c] == c
+     * if c is not a combined character, or the base character if it
+     * is combined.
+     */
+    static const int BASE_CHARS_SIZE = 0x0500;
+    static const unsigned short BASE_CHARS[BASE_CHARS_SIZE];
+};
+} // namespace latinime
+#endif // LATINIME_CHAR_UTILS_H
diff --git a/native/jni/src/hash_map_compat.h b/native/jni/src/utils/hash_map_compat.h
index a1e982bc4..a1e982bc4 100644
--- a/native/jni/src/hash_map_compat.h
+++ b/native/jni/src/utils/hash_map_compat.h