diff options
25 files changed, 132 insertions, 552 deletions
diff --git a/Android.mk b/Android.mk index aa869112c..17eeba8f0 100644 --- a/Android.mk +++ b/Android.mk @@ -12,5 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -subdirs := native java tests tools +subdirs := common native java tests tools include $(call all-named-subdir-makefiles, $(subdirs)) diff --git a/common/Android.mk b/common/Android.mk new file mode 100644 index 000000000..99aed4c5d --- /dev/null +++ b/common/Android.mk @@ -0,0 +1,26 @@ +# Copyright (C) 2014 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LOCAL_PATH:= $(call my-dir) +include $(CLEAR_VARS) +LOCAL_SRC_FILES := $(call all-java-files-under, src) +LOCAL_MODULE := latinime-common +LOCAL_SDK_VERSION := 21 +include $(BUILD_STATIC_JAVA_LIBRARY) + +# Also build a host side library +include $(CLEAR_VARS) +LOCAL_MODULE := latinime-common-host +LOCAL_SRC_FILES := $(call all-java-files-under, src) +include $(BUILD_HOST_JAVA_LIBRARY) diff --git a/tests/src/com/android/inputmethod/latin/makedict/CodePointUtils.java b/common/src/com/android/inputmethod/latin/common/CodePointUtils.java index a270ee774..38aba7bd2 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/CodePointUtils.java +++ b/common/src/com/android/inputmethod/latin/common/CodePointUtils.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.android.inputmethod.latin.makedict; +package com.android.inputmethod.latin.common; import java.util.Random; diff --git a/java/Android.mk b/java/Android.mk index 0d12c45fe..a2c5697d3 100644 --- a/java/Android.mk +++ b/java/Android.mk @@ -25,7 +25,8 @@ LOCAL_CERTIFICATE := shared LOCAL_JNI_SHARED_LIBRARIES := libjni_latinime -LOCAL_STATIC_JAVA_LIBRARIES := android-common inputmethod-common android-support-v4 jsr305 +LOCAL_STATIC_JAVA_LIBRARIES := \ + android-common inputmethod-common android-support-v4 jsr305 latinime-common # Do not compress dictionary files to mmap dict data runtime LOCAL_AAPT_FLAGS := -0 .dict diff --git a/java/res/values-ca/strings.xml b/java/res/values-ca/strings.xml index da93a58cf..17615e538 100644 --- a/java/res/values-ca/strings.xml +++ b/java/res/values-ca/strings.xml @@ -78,7 +78,7 @@ <string name="voice_input_disabled_summary" msgid="8141750303464726129">"No hi ha cap mètode d\'introducció activat. Comprova la configuració d\'Idioma i introducció de text."</string> <string name="configure_input_method" msgid="373356270290742459">"Configura mètodes d\'entrada"</string> <string name="language_selection_title" msgid="3666971864764478269">"Idiomes"</string> - <string name="help_and_feedback" msgid="5328219371839879161">"Ajuda i opinió"</string> + <string name="help_and_feedback" msgid="5328219371839879161">"Ajuda i suggeriments"</string> <string name="select_language" msgid="5709487854987078367">"Idiomes"</string> <string name="hint_add_to_dictionary" msgid="573678656946085380">"Torna a tocar per desar"</string> <string name="hint_add_to_dictionary_without_word" msgid="3040385779511255101">"Toca aquí per desar."</string> diff --git a/java/res/values-fi/strings.xml b/java/res/values-fi/strings.xml index 3739cdf79..b8fcd8131 100644 --- a/java/res/values-fi/strings.xml +++ b/java/res/values-fi/strings.xml @@ -74,7 +74,7 @@ <string name="gesture_floating_preview_text_summary" msgid="4472696213996203533">"Näytä ehdotettu sana piirron aikana"</string> <string name="gesture_space_aware" msgid="2078291600664682496">"Ilmausele"</string> <string name="gesture_space_aware_summary" msgid="4371385818348528538">"Lisää välilyöntejä eleiden aikana liukumalla välilyöntinäppäim."</string> - <string name="voice_input" msgid="3583258583521397548">"Äänisyöteavain"</string> + <string name="voice_input" msgid="3583258583521397548">"Äänisyötenäppäin"</string> <string name="voice_input_disabled_summary" msgid="8141750303464726129">"Äänen syöttötapoja ei ole otettu käyttöön. Tarkista Kieli ja syöttötapa -asetukset."</string> <string name="configure_input_method" msgid="373356270290742459">"Määritä syöttötavat"</string> <string name="language_selection_title" msgid="3666971864764478269">"Kielet"</string> diff --git a/java/res/values-vi/strings.xml b/java/res/values-vi/strings.xml index 3c803a366..1f9590ef1 100644 --- a/java/res/values-vi/strings.xml +++ b/java/res/values-vi/strings.xml @@ -25,7 +25,7 @@ <string name="use_contacts_for_spellchecking_option_summary" msgid="8754413382543307713">"Trình kiểm tra chính tả sử dụng các mục nhập từ danh sách liên hệ của bạn"</string> <string name="vibrate_on_keypress" msgid="5258079494276955460">"Rung khi nhấn phím"</string> <string name="sound_on_keypress" msgid="6093592297198243644">"Âm thanh khi nhấn phím"</string> - <string name="popup_on_keypress" msgid="123894815723512944">"Cửa sổ bật lên khi nhấn phím"</string> + <string name="popup_on_keypress" msgid="123894815723512944">"Bật lên khi nhấn phím"</string> <string name="settings_screen_preferences" msgid="2696713156722014624">"Tùy chọn"</string> <string name="settings_screen_accounts" msgid="7570397912370223287">"Tài khoản và bảo mật"</string> <string name="settings_screen_appearance" msgid="9153102634339912029">"Giao diện và bố cục"</string> @@ -48,8 +48,8 @@ <string name="use_contacts_dict_summary" msgid="6599983334507879959">"Sử dụng tên từ Danh bạ cho các đề xuất và chỉnh sửa"</string> <string name="use_personalized_dicts" msgid="5167396352105467626">"Đề xuất được cá nhân hóa"</string> <string name="enable_metrics_logging" msgid="5506372337118822837">"Cải thiện <xliff:g id="APPLICATION_NAME">%s</xliff:g>"</string> - <string name="use_double_space_period" msgid="8781529969425082860">"Dấu cách đôi"</string> - <string name="use_double_space_period_summary" msgid="6532892187247952799">"Nhấn đúp vào phím cách sẽ chèn thêm một dấu sau dấu cách"</string> + <string name="use_double_space_period" msgid="8781529969425082860">"Nhấn đúp phím cách chèn dấu chấm câu"</string> + <string name="use_double_space_period_summary" msgid="6532892187247952799">"Nhấn đúp phím cách sẽ chèn thêm một dấu chấm câu, theo sau là dấu cách"</string> <string name="auto_cap" msgid="1719746674854628252">"Tự động viết hoa"</string> <string name="auto_cap_summary" msgid="7934452761022946874">"Viết hoa chữ đầu tiên của mỗi câu"</string> <string name="edit_personal_dictionary" msgid="3996910038952940420">"Từ điển cá nhân"</string> diff --git a/java/src/com/android/inputmethod/latin/makedict/WordProperty.java b/java/src/com/android/inputmethod/latin/makedict/WordProperty.java index b129c3e40..e7808e46e 100644 --- a/java/src/com/android/inputmethod/latin/makedict/WordProperty.java +++ b/java/src/com/android/inputmethod/latin/makedict/WordProperty.java @@ -87,7 +87,7 @@ public final class WordProperty implements Comparable<WordProperty> { final boolean isPossiblyOffensive, final boolean hasBigram, final boolean hasShortcuts, final boolean isBeginningOfSentence, final int[] probabilityInfo, final ArrayList<int[][]> ngramPrevWordsArray, - final ArrayList<boolean[]> outNgramPrevWordIsBeginningOfSentenceArray, + final ArrayList<boolean[]> ngramPrevWordIsBeginningOfSentenceArray, final ArrayList<int[]> ngramTargets, final ArrayList<int[]> ngramProbabilityInfo, final ArrayList<int[]> shortcutTargets, final ArrayList<Integer> shortcutProbabilities) { @@ -102,16 +102,22 @@ public final class WordProperty implements Comparable<WordProperty> { mHasNgrams = hasBigram; final int relatedNgramCount = ngramTargets.size(); - final WordInfo currentWordInfo = - mIsBeginningOfSentence ? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO - : new WordInfo(mWord); - final NgramContext ngramContext = new NgramContext(currentWordInfo); for (int i = 0; i < relatedNgramCount; i++) { final String ngramTargetString = StringUtils.getStringFromNullTerminatedCodePointArray(ngramTargets.get(i)); final WeightedString ngramTarget = new WeightedString(ngramTargetString, createProbabilityInfoFromArray(ngramProbabilityInfo.get(i))); - // TODO: Support n-gram. + final int[][] prevWords = ngramPrevWordsArray.get(i); + final boolean[] isBeginningOfSentenceArray = + ngramPrevWordIsBeginningOfSentenceArray.get(i); + final WordInfo[] wordInfoArray = new WordInfo[prevWords.length]; + for (int j = 0; j < prevWords.length; j++) { + wordInfoArray[j] = isBeginningOfSentenceArray[j] + ? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO + : new WordInfo(StringUtils.getStringFromNullTerminatedCodePointArray( + prevWords[j])); + } + final NgramContext ngramContext = new NgramContext(wordInfoArray); ngrams.add(new NgramProperty(ngramTarget, ngramContext)); } mNgrams = ngrams.isEmpty() ? null : ngrams; @@ -126,6 +132,7 @@ public final class WordProperty implements Comparable<WordProperty> { } // TODO: Remove + @UsedForTesting public ArrayList<WeightedString> getBigrams() { if (null == mNgrams) { return null; diff --git a/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java b/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java index 248246232..4e0f5f583 100644 --- a/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java +++ b/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java @@ -17,6 +17,7 @@ package com.android.inputmethod.latin.utils; import com.android.inputmethod.latin.makedict.DictionaryHeader; +import com.android.inputmethod.latin.makedict.NgramProperty; import com.android.inputmethod.latin.makedict.ProbabilityInfo; import com.android.inputmethod.latin.makedict.WeightedString; import com.android.inputmethod.latin.makedict.WordProperty; @@ -26,6 +27,8 @@ import java.util.HashMap; public class CombinedFormatUtils { public static final String DICTIONARY_TAG = "dictionary"; public static final String BIGRAM_TAG = "bigram"; + public static final String NGRAM_TAG = "ngram"; + public static final String NGRAM_PREV_WORD_TAG = "prev_word"; public static final String SHORTCUT_TAG = "shortcut"; public static final String PROBABILITY_TAG = "f"; public static final String HISTORICAL_INFO_TAG = "historicalInfo"; @@ -76,12 +79,19 @@ public class CombinedFormatUtils { } } if (wordProperty.mHasNgrams) { - // TODO: Support ngram. - for (final WeightedString bigram : wordProperty.getBigrams()) { - builder.append(" " + BIGRAM_TAG + "=" + bigram.mWord); + for (final NgramProperty ngramProperty : wordProperty.mNgrams) { + builder.append(" " + NGRAM_TAG + "=" + ngramProperty.mTargetWord.mWord); builder.append(","); - builder.append(formatProbabilityInfo(bigram.mProbabilityInfo)); + builder.append(formatProbabilityInfo(ngramProperty.mTargetWord.mProbabilityInfo)); builder.append("\n"); + for (int i = 0; i < ngramProperty.mNgramContext.getPrevWordCount(); i++) { + builder.append(" " + NGRAM_PREV_WORD_TAG + "[" + i + "]=" + + ngramProperty.mNgramContext.getNthPrevWord(i + 1)); + if (ngramProperty.mNgramContext.isNthPrevWordBeginningOfSontence(i + 1)) { + builder.append("," + BEGINNING_OF_SENTENCE_TAG + "=true"); + } + builder.append("\n"); + } } } return builder.toString(); diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 461d1d859..9239c8400 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -327,8 +327,8 @@ static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz, static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz, jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints, - jbooleanArray outFlags, jintArray outProbabilityInfo, jobject /* outNgramPrevWordsArray */, - jobject /* outNgramPrevWordIsBeginningOfSentenceArray */, jobject outNgramTargets, + jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outNgramPrevWordsArray, + jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets, jobject outNgramProbabilityInfo, jobject outShortcutTargets, jobject outShortcutProbabilities) { Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); @@ -352,6 +352,7 @@ static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz, const WordProperty wordProperty = dictionary->getWordProperty( CodePointArrayView(wordCodePoints, codePointCount)); wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo, + outNgramPrevWordsArray, outNgramPrevWordIsBeginningOfSentenceArray, outNgramTargets, outNgramProbabilityInfo, outShortcutTargets, outShortcutProbabilities); } diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.cpp b/native/jni/src/suggest/core/dictionary/property/word_property.cpp index a707f1ba2..019f0880f 100644 --- a/native/jni/src/suggest/core/dictionary/property/word_property.cpp +++ b/native/jni/src/suggest/core/dictionary/property/word_property.cpp @@ -22,8 +22,9 @@ namespace latinime { void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, - jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets, - jobject outBigramProbabilities, jobject outShortcutTargets, + jbooleanArray outFlags, jintArray outProbabilityInfo, + jobject outNgramPrevWordsArray, jobject outNgramPrevWordIsBeginningOfSentenceArray, + jobject outNgramTargets, jobject outNgramProbabilities, jobject outShortcutTargets, jobject outShortcutProbabilities) const { JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */, MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(), @@ -43,16 +44,39 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, jclass arrayListClass = env->FindClass("java/util/ArrayList"); jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z"); - // Output bigrams. - // TODO: Support n-gram + // Output ngrams. + jclass intArrayClass = env->FindClass("[I"); for (const auto &ngramProperty : mNgrams) { - const std::vector<int> *const word1CodePoints = ngramProperty.getTargetCodePoints(); - jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size()); - JniDataUtils::outputCodePoints(env, bigramWord1CodePointArray, 0 /* start */, - word1CodePoints->size(), word1CodePoints->data(), word1CodePoints->size(), - false /* needsNullTermination */); - env->CallBooleanMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray); - env->DeleteLocalRef(bigramWord1CodePointArray); + const NgramContext *const ngramContext = ngramProperty.getNgramContext(); + jobjectArray prevWordWordCodePointsArray = env->NewObjectArray( + ngramContext->getPrevWordCount(), intArrayClass, nullptr); + jbooleanArray prevWordIsBeginningOfSentenceArray = + env->NewBooleanArray(ngramContext->getPrevWordCount()); + for (size_t i = 0; i < ngramContext->getPrevWordCount(); ++i) { + const CodePointArrayView codePoints = ngramContext->getNthPrevWordCodePoints(i + 1); + jintArray prevWordCodePoints = env->NewIntArray(codePoints.size()); + JniDataUtils::outputCodePoints(env, prevWordCodePoints, 0 /* start */, + codePoints.size(), codePoints.data(), codePoints.size(), + false /* needsNullTermination */); + env->SetObjectArrayElement(prevWordWordCodePointsArray, i, prevWordCodePoints); + env->DeleteLocalRef(prevWordCodePoints); + JniDataUtils::putBooleanToArray(env, prevWordIsBeginningOfSentenceArray, i, + ngramContext->isNthPrevWordBeginningOfSentence(i + 1)); + } + env->CallBooleanMethod(outNgramPrevWordsArray, addMethodId, prevWordWordCodePointsArray); + env->CallBooleanMethod(outNgramPrevWordIsBeginningOfSentenceArray, addMethodId, + prevWordIsBeginningOfSentenceArray); + env->DeleteLocalRef(prevWordWordCodePointsArray); + env->DeleteLocalRef(prevWordIsBeginningOfSentenceArray); + + const std::vector<int> *const targetWordCodePoints = ngramProperty.getTargetCodePoints(); + jintArray targetWordCodePointArray = env->NewIntArray(targetWordCodePoints->size()); + JniDataUtils::outputCodePoints(env, targetWordCodePointArray, 0 /* start */, + targetWordCodePoints->size(), targetWordCodePoints->data(), + targetWordCodePoints->size(), false /* needsNullTermination */); + env->CallBooleanMethod(outNgramTargets, addMethodId, targetWordCodePointArray); + env->DeleteLocalRef(targetWordCodePointArray); + const HistoricalInfo &ngramHistoricalInfo = ngramProperty.getHistoricalInfo(); int bigramProbabilityInfo[] = {ngramProperty.getProbability(), ngramHistoricalInfo.getTimestamp(), ngramHistoricalInfo.getLevel(), @@ -60,7 +84,7 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo)); env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */, NELEMS(bigramProbabilityInfo), bigramProbabilityInfo); - env->CallBooleanMethod(outBigramProbabilities, addMethodId, bigramProbabilityInfoArray); + env->CallBooleanMethod(outNgramProbabilities, addMethodId, bigramProbabilityInfoArray); env->DeleteLocalRef(bigramProbabilityInfoArray); } diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.h b/native/jni/src/suggest/core/dictionary/property/word_property.h index 01b8987b5..b5314faaa 100644 --- a/native/jni/src/suggest/core/dictionary/property/word_property.h +++ b/native/jni/src/suggest/core/dictionary/property/word_property.h @@ -39,8 +39,10 @@ class WordProperty { mNgrams(*ngrams) {} void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags, - jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities, - jobject outShortcutTargets, jobject outShortcutProbabilities) const; + jintArray outProbabilityInfo, jobject outNgramPrevWordsArray, + jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets, + jobject outNgramProbabilities, jobject outShortcutTargets, + jobject outShortcutProbabilities) const; const UnigramProperty *getUnigramProperty() const { return &mUnigramProperty; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp index b96290437..509bd683b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp @@ -90,8 +90,8 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in // probabilityEntry. const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); - return WordAttributes(probability, unigramProbabilityEntry.isNotAWord(), - unigramProbabilityEntry.isBlacklisted(), + return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(), + unigramProbabilityEntry.isNotAWord(), unigramProbabilityEntry.isPossiblyOffensive()); } // Cannot find the word. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index 193326d82..249d822b2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -488,9 +488,6 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty( AKLOGE("getWordProperty is called for invalid word."); return WordProperty(); } - const int ptNodePos = - mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); - const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); const LanguageModelDictContent *const languageModelDictContent = mBuffers->getLanguageModelDictContent(); // Fetch ngram information. @@ -541,12 +538,13 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty( shortcutProbability); } } - const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry( - ptNodeParams.getTerminalId()); + const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes( + WordIdArrayView(), wordId, mHeaderPolicy); + const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId); const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(), - probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(), - probabilityEntry.isPossiblyOffensive(), probabilityEntry.getProbability(), + wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(), + wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(), *historicalInfo, std::move(shortcuts)); return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams); } diff --git a/tests/Android.mk b/tests/Android.mk index a084ad10d..7810184d3 100644 --- a/tests/Android.mk +++ b/tests/Android.mk @@ -24,7 +24,7 @@ LOCAL_AAPT_FLAGS += -0 .dict # Do not compress test data file LOCAL_AAPT_FLAGS += -0 .txt -LOCAL_STATIC_JAVA_LIBRARIES := mockito-target android-support-test +LOCAL_STATIC_JAVA_LIBRARIES := android-support-test latinime-common mockito-target # Include all test java files. LOCAL_SRC_FILES := $(call all-java-files-under, src) diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java index 991dd0b28..15f7568c8 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java @@ -21,8 +21,8 @@ import android.test.suitebuilder.annotation.LargeTest; import android.util.Pair; import com.android.inputmethod.latin.NgramContext.WordInfo; +import com.android.inputmethod.latin.common.CodePointUtils; import com.android.inputmethod.latin.makedict.BinaryDictIOUtils; -import com.android.inputmethod.latin.makedict.CodePointUtils; import com.android.inputmethod.latin.makedict.DictDecoder; import com.android.inputmethod.latin.makedict.DictionaryHeader; import com.android.inputmethod.latin.makedict.FormatSpec; diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java index 5d6378937..5a72e417e 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java @@ -22,7 +22,7 @@ import android.text.TextUtils; import android.util.Pair; import com.android.inputmethod.latin.NgramContext.WordInfo; -import com.android.inputmethod.latin.makedict.CodePointUtils; +import com.android.inputmethod.latin.common.CodePointUtils; import com.android.inputmethod.latin.makedict.DictionaryHeader; import com.android.inputmethod.latin.makedict.FormatSpec; import com.android.inputmethod.latin.makedict.WeightedString; diff --git a/tests/src/com/android/inputmethod/latin/LatinImeStressTests.java b/tests/src/com/android/inputmethod/latin/LatinImeStressTests.java index f5e993de8..22114b7a0 100644 --- a/tests/src/com/android/inputmethod/latin/LatinImeStressTests.java +++ b/tests/src/com/android/inputmethod/latin/LatinImeStressTests.java @@ -18,7 +18,7 @@ package com.android.inputmethod.latin; import android.test.suitebuilder.annotation.LargeTest; -import com.android.inputmethod.latin.makedict.CodePointUtils; +import com.android.inputmethod.latin.common.CodePointUtils; import java.util.Random; diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java index d1cb14196..a35fa13ce 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java @@ -23,6 +23,7 @@ import android.util.Pair; import android.util.SparseArray; import com.android.inputmethod.latin.BinaryDictionary; +import com.android.inputmethod.latin.common.CodePointUtils; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; diff --git a/tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java b/tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java index afabbbd38..6ccb79d76 100644 --- a/tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java +++ b/tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java @@ -31,7 +31,7 @@ import com.android.inputmethod.latin.DictionaryFacilitator; import com.android.inputmethod.latin.ExpandableBinaryDictionary; import com.android.inputmethod.latin.RichInputMethodManager; import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback; -import com.android.inputmethod.latin.makedict.CodePointUtils; +import com.android.inputmethod.latin.common.CodePointUtils; import com.android.inputmethod.latin.settings.SpacingAndPunctuations; import android.test.AndroidTestCase; diff --git a/tools/dicttool/Android.mk b/tools/dicttool/Android.mk index 7f34ccf20..ff6c1e433 100644 --- a/tools/dicttool/Android.mk +++ b/tools/dicttool/Android.mk @@ -93,7 +93,7 @@ LOCAL_SRC_FILES := $(LOCAL_TOOL_SRC_FILES) \ $(call all-java-files-under, $(DICTTOOL_ONDEVICE_TESTS_DIR)) LOCAL_JAVA_LIBRARIES := junit -LOCAL_STATIC_JAVA_LIBRARIES := jsr305lib +LOCAL_STATIC_JAVA_LIBRARIES := jsr305lib latinime-common-host LOCAL_REQUIRED_MODULES := $(LATINIME_HOST_NATIVE_LIBNAME) LOCAL_JAR_MANIFEST := etc/manifest.txt LOCAL_MODULE := dicttool_aosp diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java index 3ef03f4bd..4c7187fcd 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java @@ -22,8 +22,6 @@ import com.android.inputmethod.latin.makedict.DictDecoder; import com.android.inputmethod.latin.makedict.FusionDictionary; import com.android.inputmethod.latin.makedict.UnsupportedFormatException; -import org.xml.sax.SAXException; - import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.BufferedReader; @@ -36,8 +34,6 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.util.ArrayList; -import javax.xml.parsers.ParserConfigurationException; - /** * Class grouping utilities for offline dictionary making. * @@ -177,14 +173,6 @@ public final class BinaryDictOffdeviceUtils { System.out.println("Size : " + file.length() + " bytes"); } try { - if (XmlDictInputOutput.isXmlUnigramDictionary(filename)) { - if (report) { - System.out.println("Format : XML unigram list"); - } - return XmlDictInputOutput.readDictionaryXml( - new BufferedInputStream(new FileInputStream(file)), - null /* shortcuts */, null /* bigrams */); - } final DecoderChainSpec decodedSpec = getRawDictionaryOrNull(file); if (null == decodedSpec) { throw new RuntimeException("Does not seem to be a dictionary file " + filename); @@ -209,8 +197,7 @@ public final class BinaryDictOffdeviceUtils { System.out.println("Uncompressed size : " + decodedSpec.mFile.length()); } return dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */); - } catch (final IOException | SAXException | ParserConfigurationException | - UnsupportedFormatException e) { + } catch (final IOException | UnsupportedFormatException e) { throw new RuntimeException("Can't read file " + filename, e); } } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java index 2925fdc34..e04751ddc 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java @@ -27,8 +27,6 @@ import com.android.inputmethod.latin.makedict.UnsupportedFormatException; import com.android.inputmethod.latin.makedict.Ver2DictEncoder; import com.android.inputmethod.latin.makedict.Ver4DictEncoder; -import org.xml.sax.SAXException; - import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -41,8 +39,6 @@ import java.io.InputStreamReader; import java.util.Arrays; import java.util.LinkedList; -import javax.xml.parsers.ParserConfigurationException; - /** * Main class/method for DictionaryMaker. */ @@ -52,10 +48,7 @@ public class DictionaryMaker { private static final String OPTION_VERSION_2 = "-2"; private static final String OPTION_VERSION_4 = "-4"; private static final String OPTION_INPUT_SOURCE = "-s"; - private static final String OPTION_INPUT_BIGRAM_XML = "-b"; - private static final String OPTION_INPUT_SHORTCUT_XML = "-c"; private static final String OPTION_OUTPUT_BINARY = "-d"; - private static final String OPTION_OUTPUT_XML = "-x"; private static final String OPTION_OUTPUT_COMBINED = "-o"; private static final String OPTION_HELP = "-h"; private static final String OPTION_CODE_POINT_TABLE = "-t"; @@ -63,11 +56,7 @@ public class DictionaryMaker { private static final String OPTION_CODE_POINT_TABLE_ON = "on"; public final String mInputBinary; public final String mInputCombined; - public final String mInputUnigramXml; - public final String mInputShortcutXml; - public final String mInputBigramXml; public final String mOutputBinary; - public final String mOutputXml; public final String mOutputCombined; public final int mOutputBinaryFormatVersion; public final int mCodePointTableMode; @@ -76,39 +65,20 @@ public class DictionaryMaker { checkHasExactlyOneInput(); checkHasAtLeastOneOutput(); checkNotSameFile(mInputBinary, mOutputBinary); - checkNotSameFile(mInputBinary, mOutputXml); checkNotSameFile(mInputCombined, mOutputBinary); - checkNotSameFile(mInputCombined, mOutputXml); - checkNotSameFile(mInputUnigramXml, mOutputBinary); - checkNotSameFile(mInputUnigramXml, mOutputXml); - checkNotSameFile(mInputUnigramXml, mOutputCombined); - checkNotSameFile(mInputShortcutXml, mOutputBinary); - checkNotSameFile(mInputShortcutXml, mOutputXml); - checkNotSameFile(mInputShortcutXml, mOutputCombined); - checkNotSameFile(mInputBigramXml, mOutputBinary); - checkNotSameFile(mInputBigramXml, mOutputXml); - checkNotSameFile(mInputBigramXml, mOutputCombined); - checkNotSameFile(mOutputBinary, mOutputXml); checkNotSameFile(mOutputBinary, mOutputCombined); - checkNotSameFile(mOutputXml, mOutputCombined); } private void checkHasExactlyOneInput() { - if (null == mInputUnigramXml && null == mInputBinary && null == mInputCombined) { + if (null == mInputBinary && null == mInputCombined) { throw new RuntimeException("No input file specified"); - } else if ((null != mInputUnigramXml && null != mInputBinary) - || (null != mInputUnigramXml && null != mInputCombined) - || (null != mInputBinary && null != mInputCombined)) { + } else if (null != mInputBinary && null != mInputCombined) { throw new RuntimeException("Several input files specified"); - } else if ((null != mInputBinary || null != mInputCombined) - && (null != mInputBigramXml || null != mInputShortcutXml)) { - throw new RuntimeException("Separate bigrams/shortcut files are only supported" - + " with XML input (other formats include bigrams and shortcuts already)"); } } private void checkHasAtLeastOneOutput() { - if (null == mOutputBinary && null == mOutputXml && null == mOutputCombined) { + if (null == mOutputBinary && null == mOutputCombined) { throw new RuntimeException("No output specified"); } } @@ -131,16 +101,14 @@ public class DictionaryMaker { public static String getHelp() { return "Usage: makedict " - + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>] " + "| [-s <combined format input]" - + "| [-s <binary input>] [-d <binary output>] [-x <xml output>] " + + "| [-s <binary input>] [-d <binary output>]" + " [-o <combined output>] [-t <code point table switch: on/off/auto>]" + "[-2] [-3] [-4]\n" + "\n" + " Converts a source dictionary file to one or several outputs.\n" - + " Source can be an XML file, with an optional XML bigrams file, or a\n" - + " binary dictionary file.\n" - + " Binary version 2 (Jelly Bean), 3, 4, XML and\n" + + " Source can be a binary dictionary file or a combined format file.\n" + + " Binary version 2 (Jelly Bean), 3, 4, and\n" + " combined format outputs are supported."; } @@ -151,11 +119,7 @@ public class DictionaryMaker { } String inputBinary = null; String inputCombined = null; - String inputUnigramXml = null; - String inputShortcutXml = null; - String inputBigramXml = null; String outputBinary = null; - String outputXml = null; String outputCombined = null; int outputBinaryFormatVersion = FormatSpec.VERSION201; // the default version is 201. // Don't use code point table by default. @@ -180,9 +144,7 @@ public class DictionaryMaker { String argValue = args.get(0); args.remove(0); if (OPTION_INPUT_SOURCE.equals(arg)) { - if (XmlDictInputOutput.isXmlUnigramDictionary(argValue)) { - inputUnigramXml = argValue; - } else if (CombinedInputOutput.isCombinedDictionary(argValue)) { + if (CombinedInputOutput.isCombinedDictionary(argValue)) { inputCombined = argValue; } else if (BinaryDictDecoderUtils.isBinaryDictionary(argValue)) { inputBinary = argValue; @@ -190,14 +152,8 @@ public class DictionaryMaker { throw new IllegalArgumentException( "Unknown format for file " + argValue); } - } else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) { - inputShortcutXml = argValue; - } else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) { - inputBigramXml = argValue; } else if (OPTION_OUTPUT_BINARY.equals(arg)) { outputBinary = argValue; - } else if (OPTION_OUTPUT_XML.equals(arg)) { - outputXml = argValue; } else if (OPTION_OUTPUT_COMBINED.equals(arg)) { outputCombined = argValue; } else if (OPTION_CODE_POINT_TABLE.equals(arg)) { @@ -214,13 +170,13 @@ public class DictionaryMaker { } } } else { - if (null == inputBinary && null == inputUnigramXml) { + if (null == inputBinary) { if (BinaryDictDecoderUtils.isBinaryDictionary(arg)) { inputBinary = arg; } else if (CombinedInputOutput.isCombinedDictionary(arg)) { inputCombined = arg; } else { - inputUnigramXml = arg; + throw new IllegalArgumentException("Unknown format for file " + arg); } } else if (null == outputBinary) { outputBinary = arg; @@ -232,11 +188,7 @@ public class DictionaryMaker { mInputBinary = inputBinary; mInputCombined = inputCombined; - mInputUnigramXml = inputUnigramXml; - mInputShortcutXml = inputShortcutXml; - mInputBigramXml = inputBigramXml; mOutputBinary = outputBinary; - mOutputXml = outputXml; mOutputCombined = outputCombined; mOutputBinaryFormatVersion = outputBinaryFormatVersion; mCodePointTableMode = codePointTableMode; @@ -245,8 +197,7 @@ public class DictionaryMaker { } public static void main(String[] args) - throws FileNotFoundException, ParserConfigurationException, SAXException, IOException, - UnsupportedFormatException { + throws FileNotFoundException, IOException, UnsupportedFormatException { final Arguments parsedArgs = new Arguments(args); FusionDictionary dictionary = readInputFromParsedArgs(parsedArgs); writeOutputToParsedArgs(parsedArgs, dictionary); @@ -259,14 +210,11 @@ public class DictionaryMaker { * @return the read dictionary. */ private static FusionDictionary readInputFromParsedArgs(final Arguments args) - throws IOException, UnsupportedFormatException, ParserConfigurationException, - SAXException, FileNotFoundException { + throws IOException, UnsupportedFormatException, FileNotFoundException { if (null != args.mInputBinary) { return readBinaryFile(args.mInputBinary); } else if (null != args.mInputCombined) { return readCombinedFile(args.mInputCombined); - } else if (null != args.mInputUnigramXml) { - return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml); } else { throw new RuntimeException("No input file specified"); } @@ -314,30 +262,6 @@ public class DictionaryMaker { } /** - * Read a dictionary from a unigram XML file, and optionally a bigram XML file. - * - * @param unigramXmlFilename the name of the unigram XML file. May not be null. - * @param shortcutXmlFilename the name of the shortcut/whitelist XML file, or null if none. - * @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams. - * @return the read dictionary. - * @throws FileNotFoundException if one of the files can't be found - * @throws SAXException if one or more of the XML files is not well-formed - * @throws IOException if one the input files can't be read - * @throws ParserConfigurationException if the system can't create a SAX parser - */ - private static FusionDictionary readXmlFile(final String unigramXmlFilename, - final String shortcutXmlFilename, final String bigramXmlFilename) - throws FileNotFoundException, SAXException, IOException, ParserConfigurationException { - try ( - final BufferedInputStream unigrams = getBufferedFileInputStream(unigramXmlFilename); - final BufferedInputStream shortcuts = getBufferedFileInputStream(shortcutXmlFilename); - final BufferedInputStream bigrams = getBufferedFileInputStream(bigramXmlFilename); - ) { - return XmlDictInputOutput.readDictionaryXml(unigrams, shortcuts, bigrams); - } - } - - /** * Invoke the right output method according to args. * * This will write the passed dictionary to the file(s) passed in the command line arguments. @@ -353,9 +277,6 @@ public class DictionaryMaker { writeBinaryDictionary(args.mOutputBinary, dict, args.mOutputBinaryFormatVersion, args.mCodePointTableMode); } - if (null != args.mOutputXml) { - writeXmlDictionary(args.mOutputXml, dict); - } if (null != args.mOutputCombined) { writeCombinedDictionary(args.mOutputCombined, dict); } @@ -387,21 +308,6 @@ public class DictionaryMaker { } /** - * Write the dictionary in XML format to the specified filename. - * - * @param outputFilename the name of the file to write to. - * @param dict the dictionary to write. - * @throws FileNotFoundException if the output file can't be created. - * @throws IOException if the output file can't be written to. - */ - private static void writeXmlDictionary(final String outputFilename, - final FusionDictionary dict) throws FileNotFoundException, IOException { - try (final BufferedWriter writer = new BufferedWriter(new FileWriter(outputFilename))) { - XmlDictInputOutput.writeDictionaryXml(writer, dict); - } - } - - /** * Write the dictionary in the combined format to the specified filename. * * @param outputFilename the name of the file to write to. diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java index 808e1d4c8..0b1fb88bc 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java @@ -20,8 +20,6 @@ import com.android.inputmethod.latin.makedict.UnsupportedFormatException; import java.io.FileNotFoundException; import java.io.IOException; -import javax.xml.parsers.ParserConfigurationException; -import org.xml.sax.SAXException; public class Makedict extends Dicttool.Command { public static final String COMMAND = "makedict"; @@ -35,8 +33,7 @@ public class Makedict extends Dicttool.Command { } @Override - public void run() throws FileNotFoundException, IOException, ParserConfigurationException, - SAXException, UnsupportedFormatException { + public void run() throws FileNotFoundException, IOException, UnsupportedFormatException { DictionaryMaker.main(mArgs); } } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java deleted file mode 100644 index 7f3337949..000000000 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Copyright (C) 2011 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ - -package com.android.inputmethod.latin.dicttool; - -import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; -import com.android.inputmethod.latin.makedict.FusionDictionary; -import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; -import com.android.inputmethod.latin.makedict.ProbabilityInfo; -import com.android.inputmethod.latin.makedict.WeightedString; -import com.android.inputmethod.latin.makedict.WordProperty; - -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.TreeSet; - -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; - -/** - * Reads and writes XML files for a FusionDictionary. - * - * All functions in this class are static. - */ -public class XmlDictInputOutput { - - private static final String ROOT_TAG = "wordlist"; - private static final String WORD_TAG = "w"; - private static final String BIGRAM_TAG = "bigram"; - private static final String SHORTCUT_TAG = "shortcut"; - private static final String PROBABILITY_ATTR = "f"; - private static final String WORD_ATTR = "word"; - private static final String NOT_A_WORD_ATTR = "not_a_word"; - - /** - * SAX handler for a unigram XML file. - */ - static private class UnigramHandler extends DefaultHandler { - // Parser states - private static final int START = 1; - private static final int WORD = 2; - private static final int UNKNOWN = 3; - private static final int SHORTCUT_ONLY_WORD_PROBABILITY = 1; - - FusionDictionary mDictionary; - int mState; // the state of the parser - int mFreq; // the currently read freq - String mWord; // the current word - final HashMap<String, ArrayList<WeightedString>> mShortcutsMap; - - /** - * Create the handler. - * - * @param shortcuts the shortcuts as a map. This may be empty, but may not be null. - */ - public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) { - mDictionary = null; - mShortcutsMap = shortcuts; - mWord = ""; - mState = START; - mFreq = 0; - } - - public FusionDictionary getFinalDictionary() { - final FusionDictionary dict = mDictionary; - for (final String shortcutOnly : mShortcutsMap.keySet()) { - if (dict.hasWord(shortcutOnly)) continue; - dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY), - mShortcutsMap.get(shortcutOnly), true /* isNotAWord */, - false /* isPossiblyOffensive */); - } - mDictionary = null; - mShortcutsMap.clear(); - mWord = ""; - mState = START; - mFreq = 0; - return dict; - } - - @Override - public void startElement(String uri, String localName, String qName, Attributes attrs) { - if (WORD_TAG.equals(localName)) { - mState = WORD; - mWord = ""; - for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { - final String attrName = attrs.getLocalName(attrIndex); - if (PROBABILITY_ATTR.equals(attrName)) { - mFreq = Integer.parseInt(attrs.getValue(attrIndex)); - } - } - } else if (ROOT_TAG.equals(localName)) { - final HashMap<String, String> attributes = new HashMap<>(); - for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { - final String attrName = attrs.getLocalName(attrIndex); - attributes.put(attrName, attrs.getValue(attrIndex)); - } - mDictionary = new FusionDictionary(new PtNodeArray(), - new DictionaryOptions(attributes)); - } else { - mState = UNKNOWN; - } - } - - @Override - public void characters(char[] ch, int start, int length) { - if (WORD == mState) { - // The XML parser is free to return text in arbitrary chunks one after the - // other. In particular, this happens in some implementations when it finds - // an escape code like "&". - mWord += String.copyValueOf(ch, start, length); - } - } - - @Override - public void endElement(String uri, String localName, String qName) { - if (WORD == mState) { - mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord), - false /* isNotAWord */, false /* isPossiblyOffensive */); - mState = START; - } - } - } - - static private class AssociativeListHandler extends DefaultHandler { - private final String SRC_TAG; - private final String SRC_ATTRIBUTE; - private final String DST_TAG; - private final String DST_ATTRIBUTE; - private final String DST_FREQ; - - // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX - private final static int XML_MAX = 256; - // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX - private final static int MEMORY_MAX = 256; - private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX; - - private String mSrc; - private final HashMap<String, ArrayList<WeightedString>> mAssocMap; - - public AssociativeListHandler(final String srcTag, final String srcAttribute, - final String dstTag, final String dstAttribute, final String dstFreq) { - SRC_TAG = srcTag; - SRC_ATTRIBUTE = srcAttribute; - DST_TAG = dstTag; - DST_ATTRIBUTE = dstAttribute; - DST_FREQ = dstFreq; - mSrc = null; - mAssocMap = new HashMap<>(); - } - - @Override - public void startElement(String uri, String localName, String qName, Attributes attrs) { - if (SRC_TAG.equals(localName)) { - mSrc = attrs.getValue(uri, SRC_ATTRIBUTE); - } else if (DST_TAG.equals(localName)) { - String dst = attrs.getValue(uri, DST_ATTRIBUTE); - int freq = getValueFromFreqString(attrs.getValue(uri, DST_FREQ)); - WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO); - ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc); - if (null == bigramList) bigramList = new ArrayList<>(); - bigramList.add(bigram); - mAssocMap.put(mSrc, bigramList); - } - } - - protected int getValueFromFreqString(final String freqString) { - return Integer.parseInt(freqString); - } - - // This may return an empty map, but will never return null. - public HashMap<String, ArrayList<WeightedString>> getAssocMap() { - return mAssocMap; - } - } - - /** - * SAX handler for a bigram XML file. - */ - static private class BigramHandler extends AssociativeListHandler { - private final static String BIGRAM_W1_TAG = "bi"; - private final static String BIGRAM_W2_TAG = "w"; - private final static String BIGRAM_W1_ATTRIBUTE = "w1"; - private final static String BIGRAM_W2_ATTRIBUTE = "w2"; - private final static String BIGRAM_FREQ_ATTRIBUTE = "p"; - - public BigramHandler() { - super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE, - BIGRAM_FREQ_ATTRIBUTE); - } - - // As per getAssocMap(), this never returns null. - public HashMap<String, ArrayList<WeightedString>> getBigramMap() { - return getAssocMap(); - } - } - - /** - * SAX handler for a shortcut & whitelist XML file. - */ - static private class ShortcutAndWhitelistHandler extends AssociativeListHandler { - private final static String ENTRY_TAG = "entry"; - private final static String ENTRY_ATTRIBUTE = "shortcut"; - private final static String TARGET_TAG = "target"; - private final static String REPLACEMENT_ATTRIBUTE = "replacement"; - private final static String TARGET_PRIORITY_ATTRIBUTE = "priority"; - private final static String WHITELIST_MARKER = "whitelist"; - private final static int WHITELIST_FREQ_VALUE = 15; - private final static int MIN_FREQ = 0; - private final static int MAX_FREQ = 14; - - public ShortcutAndWhitelistHandler() { - super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE, - TARGET_PRIORITY_ATTRIBUTE); - } - - @Override - protected int getValueFromFreqString(final String freqString) { - if (WHITELIST_MARKER.equals(freqString)) { - return WHITELIST_FREQ_VALUE; - } - final int intValue = super.getValueFromFreqString(freqString); - if (intValue < MIN_FREQ || intValue > MAX_FREQ) { - throw new RuntimeException("Shortcut freq out of range. Accepted range is " - + MIN_FREQ + ".." + MAX_FREQ); - } - return intValue; - } - - // As per getAssocMap(), this never returns null. - public HashMap<String, ArrayList<WeightedString>> getShortcutAndWhitelistMap() { - return getAssocMap(); - } - } - - /** - * Basic test to find out whether the file is in the unigram XML format or not. - * - * Concretely this only tests the header line. - * - * @param filename The name of the file to test. - * @return true if the file is in the unigram XML format, false otherwise - */ - public static boolean isXmlUnigramDictionary(final String filename) { - try (final BufferedReader reader = new BufferedReader( - new InputStreamReader(new FileInputStream(filename), "UTF-8"))) { - final String firstLine = reader.readLine(); - return firstLine.matches("^\\s*<wordlist .*>\\s*$"); - } catch (final IOException e) { - return false; - } - } - - /** - * Reads a dictionary from an XML file. - * - * This is the public method that will parse an XML file and return the corresponding memory - * representation. - * - * @param unigrams the file to read the data from. - * @param shortcuts the file to read the shortcuts & whitelist from, or null. - * @param bigrams the file to read the bigrams from, or null. - * @return the in-memory representation of the dictionary. - */ - public static FusionDictionary readDictionaryXml(final BufferedInputStream unigrams, - final BufferedInputStream shortcuts, final BufferedInputStream bigrams) - throws SAXException, IOException, ParserConfigurationException { - final SAXParserFactory factory = SAXParserFactory.newInstance(); - factory.setNamespaceAware(true); - final SAXParser parser = factory.newSAXParser(); - final BigramHandler bigramHandler = new BigramHandler(); - if (null != bigrams) parser.parse(bigrams, bigramHandler); - - final ShortcutAndWhitelistHandler shortcutAndWhitelistHandler = - new ShortcutAndWhitelistHandler(); - if (null != shortcuts) parser.parse(shortcuts, shortcutAndWhitelistHandler); - - final UnigramHandler unigramHandler = - new UnigramHandler(shortcutAndWhitelistHandler.getShortcutAndWhitelistMap()); - parser.parse(unigrams, unigramHandler); - final FusionDictionary dict = unigramHandler.getFinalDictionary(); - final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap(); - for (final String firstWord : bigramMap.keySet()) { - if (!dict.hasWord(firstWord)) continue; - final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord); - for (final WeightedString bigram : bigramList) { - if (!dict.hasWord(bigram.mWord)) continue; - dict.setBigram(firstWord, bigram.mWord, bigram.mProbabilityInfo); - } - } - return dict; - } - - /** - * Reads a dictionary in the first, legacy XML format - * - * This method reads data from the parser and creates a new FusionDictionary with it. - * The format parsed by this method is the format used before Ice Cream Sandwich, - * which has no support for bigrams or shortcuts/whitelist. - * It is important to note that this method expects the parser to have already eaten - * the first, all-encompassing tag. - * - * @param xpp the parser to read the data from. - * @return the parsed dictionary. - */ - - /** - * Writes a dictionary to an XML file. - * - * The output format is the "second" format, which supports bigrams and shortcuts/whitelist. - * - * @param destination a destination stream to write to. - * @param dict the dictionary to write. - */ - public static void writeDictionaryXml(final BufferedWriter destination, - final FusionDictionary dict) throws IOException { - final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>(); - for (WordProperty wordProperty : dict) { - wordPropertiesInDict.add(wordProperty); - } - // TODO: use an XMLSerializer if this gets big - destination.write("<wordlist format=\"2\""); - for (final String key : dict.mOptions.mAttributes.keySet()) { - final String value = dict.mOptions.mAttributes.get(key); - destination.write(" " + key + "=\"" + value + "\""); - } - destination.write(">\n"); - destination.write("<!-- Warning: there is no code to read this format yet. -->\n"); - for (WordProperty wordProperty : wordPropertiesInDict) { - destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + wordProperty.mWord - + "\" " + PROBABILITY_ATTR + "=\"" + wordProperty.getProbability() - + (wordProperty.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "") - + "\">"); - if (wordProperty.mHasShortcuts) { - destination.write("\n"); - for (WeightedString target : wordProperty.mShortcutTargets) { - destination.write(" <" + SHORTCUT_TAG + " " + PROBABILITY_ATTR + "=\"" - + target.getProbability() + "\">" + target.mWord + "</" + SHORTCUT_TAG - + ">\n"); - } - destination.write(" "); - } - if (wordProperty.mHasNgrams) { - destination.write("\n"); - for (WeightedString bigram : wordProperty.getBigrams()) { - destination.write(" <" + BIGRAM_TAG + " " + PROBABILITY_ATTR + "=\"" - + bigram.getProbability() + "\">" + bigram.mWord - + "</" + BIGRAM_TAG + ">\n"); - } - destination.write(" "); - } - destination.write("</" + WORD_TAG + ">\n"); - } - destination.write("</wordlist>\n"); - destination.close(); - } -} |