diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/utils')
6 files changed, 134 insertions, 177 deletions
diff --git a/java/src/com/android/inputmethod/latin/utils/DistracterFilter.java b/java/src/com/android/inputmethod/latin/utils/DistracterFilter.java index 355d00dac..525212c96 100644 --- a/java/src/com/android/inputmethod/latin/utils/DistracterFilter.java +++ b/java/src/com/android/inputmethod/latin/utils/DistracterFilter.java @@ -16,14 +16,16 @@ package com.android.inputmethod.latin.utils; -import java.util.List; -import java.util.Locale; - import android.view.inputmethod.InputMethodSubtype; import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.NgramContext; +import java.util.List; +import java.util.Locale; + +import javax.annotation.Nonnull; + public interface DistracterFilter { /** * Determine whether a word is a distracter to words in dictionaries. @@ -68,8 +70,9 @@ public interface DistracterFilter { public static boolean shouldBeHandledAsOov(final int handlingType) { return (handlingType & SHOULD_BE_HANDLED_AS_OOV) != 0; } - }; + } + @Nonnull public static final DistracterFilter EMPTY_DISTRACTER_FILTER = new DistracterFilter() { @Override public boolean isDistracterToWordsInDictionaries(NgramContext ngramContext, diff --git a/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java b/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java deleted file mode 100644 index 3e5cb33ca..000000000 --- a/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.inputmethod.latin.utils; - -import android.util.Log; - -import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.latin.Dictionary; -import com.android.inputmethod.latin.NgramContext; -import com.android.inputmethod.latin.settings.SpacingAndPunctuations; -import com.android.inputmethod.latin.utils.DistracterFilter.HandlingType; - -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; - -// Note: this class is used as a parameter type of a native method. You should be careful when you -// rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative(). -public final class LanguageModelParam { - private static final String TAG = LanguageModelParam.class.getSimpleName(); - private static final boolean DEBUG = false; - private static final boolean DEBUG_TOKEN = false; - - // For now, these probability values are being referred to only when we add new entries to - // decaying dynamic binary dictionaries. When these are referred to, what matters is 0 or - // non-0. Thus, it's not meaningful to compare 10, 100, and so on. - // TODO: Revise the logic in ForgettingCurveUtils in native code. - private static final int UNIGRAM_PROBABILITY_FOR_VALID_WORD = 100; - private static final int UNIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY; - private static final int BIGRAM_PROBABILITY_FOR_VALID_WORD = 10; - private static final int BIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY; - - public final CharSequence mTargetWord; - public final int[] mWord0; - public final int[] mWord1; - // TODO: this needs to be a list of shortcuts - public final int[] mShortcutTarget; - public final int mUnigramProbability; - public final int mBigramProbability; - public final int mShortcutProbability; - public final boolean mIsNotAWord; - public final boolean mIsPossiblyOffensive; - // Time stamp in seconds. - public final int mTimestamp; - - // Constructor for unigram. TODO: support shortcuts - @UsedForTesting - public LanguageModelParam(final CharSequence word, final int unigramProbability, - final int timestamp) { - this(null /* word0 */, word, unigramProbability, Dictionary.NOT_A_PROBABILITY, timestamp); - } - - // Constructor for unigram and bigram. - @UsedForTesting - public LanguageModelParam(final CharSequence word0, final CharSequence word1, - final int unigramProbability, final int bigramProbability, - final int timestamp) { - mTargetWord = word1; - mWord0 = (word0 == null) ? null : StringUtils.toCodePointArray(word0); - mWord1 = StringUtils.toCodePointArray(word1); - mShortcutTarget = null; - mUnigramProbability = unigramProbability; - mBigramProbability = bigramProbability; - mShortcutProbability = Dictionary.NOT_A_PROBABILITY; - mIsNotAWord = false; - mIsPossiblyOffensive = false; - mTimestamp = timestamp; - } - - // Process a list of words and return a list of {@link LanguageModelParam} objects. - public static ArrayList<LanguageModelParam> createLanguageModelParamsFrom( - final List<String> tokens, final int timestamp, - final SpacingAndPunctuations spacingAndPunctuations, final Locale locale, - final DistracterFilter distracterFilter) { - final ArrayList<LanguageModelParam> languageModelParams = new ArrayList<>(); - final int N = tokens.size(); - NgramContext ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO; - for (int i = 0; i < N; ++i) { - final String tempWord = tokens.get(i); - if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) { - // just skip this token - if (DEBUG_TOKEN) { - Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\""); - } - continue; - } - if (!DictionaryInfoUtils.looksValidForDictionaryInsertion( - tempWord, spacingAndPunctuations)) { - if (DEBUG_TOKEN) { - Log.d(TAG, "--- not looksValidForDictionaryInsertion: \"" - + tempWord + "\""); - } - // Sentence terminator found. Split. - ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO; - continue; - } - if (DEBUG_TOKEN) { - Log.d(TAG, "--- word: \"" + tempWord + "\""); - } - final LanguageModelParam languageModelParam = - detectWhetherVaildWordOrNotAndGetLanguageModelParam( - ngramContext, tempWord, timestamp, locale, distracterFilter); - if (languageModelParam == null) { - continue; - } - languageModelParams.add(languageModelParam); - ngramContext = ngramContext.getNextNgramContext( - new NgramContext.WordInfo(tempWord)); - } - return languageModelParams; - } - - private static LanguageModelParam detectWhetherVaildWordOrNotAndGetLanguageModelParam( - final NgramContext ngramContext, final String targetWord, final int timestamp, - final Locale locale, final DistracterFilter distracterFilter) { - if (locale == null) { - return null; - } - final int wordHandlingType = distracterFilter.getWordHandlingType(ngramContext, - targetWord, locale); - final String word = HandlingType.shouldBeLowerCased(wordHandlingType) ? - targetWord.toLowerCase(locale) : targetWord; - if (distracterFilter.isDistracterToWordsInDictionaries(ngramContext, targetWord, locale)) { - // The word is a distracter. - return null; - } - return createAndGetLanguageModelParamOfWord(ngramContext, word, timestamp, - !HandlingType.shouldBeHandledAsOov(wordHandlingType)); - } - - private static LanguageModelParam createAndGetLanguageModelParamOfWord( - final NgramContext ngramContext, final String word, final int timestamp, - final boolean isValidWord) { - final int unigramProbability = isValidWord ? - UNIGRAM_PROBABILITY_FOR_VALID_WORD : UNIGRAM_PROBABILITY_FOR_OOV_WORD; - if (!ngramContext.isValid()) { - if (DEBUG) { - Log.d(TAG, "--- add unigram: current(" - + (isValidWord ? "Valid" : "OOV") + ") = " + word); - } - return new LanguageModelParam(word, unigramProbability, timestamp); - } - if (DEBUG) { - Log.d(TAG, "--- add bigram: prev = " + ngramContext + ", current(" - + (isValidWord ? "Valid" : "OOV") + ") = " + word); - } - final int bigramProbability = isValidWord ? - BIGRAM_PROBABILITY_FOR_VALID_WORD : BIGRAM_PROBABILITY_FOR_OOV_WORD; - return new LanguageModelParam(ngramContext.getNthPrevWord(1 /* n */), word, - unigramProbability, bigramProbability, timestamp); - } -} diff --git a/java/src/com/android/inputmethod/latin/utils/NgramContextUtils.java b/java/src/com/android/inputmethod/latin/utils/NgramContextUtils.java index 95a1f0fb2..ba436777d 100644 --- a/java/src/com/android/inputmethod/latin/utils/NgramContextUtils.java +++ b/java/src/com/android/inputmethod/latin/utils/NgramContextUtils.java @@ -16,14 +16,16 @@ package com.android.inputmethod.latin.utils; -import java.util.Arrays; -import java.util.regex.Pattern; - import com.android.inputmethod.latin.Constants; import com.android.inputmethod.latin.NgramContext; import com.android.inputmethod.latin.NgramContext.WordInfo; import com.android.inputmethod.latin.settings.SpacingAndPunctuations; +import java.util.Arrays; +import java.util.regex.Pattern; + +import javax.annotation.Nonnull; + public final class NgramContextUtils { private NgramContextUtils() { // Intentional empty constructor for utility class. @@ -52,6 +54,7 @@ public final class NgramContextUtils { // (n = 2) "abc|" -> beginning-of-sentence // (n = 2) "abc |" -> beginning-of-sentence // (n = 2) "abc. def|" -> beginning-of-sentence + @Nonnull public static NgramContext getNgramContextFromNthPreviousWord(final CharSequence prev, final SpacingAndPunctuations spacingAndPunctuations, final int n) { if (prev == null) return NgramContext.EMPTY_PREV_WORDS_INFO; diff --git a/java/src/com/android/inputmethod/latin/utils/ResourceUtils.java b/java/src/com/android/inputmethod/latin/utils/ResourceUtils.java index 093c5a6c1..d1fc642f3 100644 --- a/java/src/com/android/inputmethod/latin/utils/ResourceUtils.java +++ b/java/src/com/android/inputmethod/latin/utils/ResourceUtils.java @@ -110,7 +110,6 @@ public final class ResourceUtils { * are true for the specified key value pairs. * * For example, "condition,constant" has the following format. - * (See {@link ResourceUtilsTests#testFindConstantForKeyValuePairsRegexp()}) * - HARDWARE=mako,constantForNexus4 * - MODEL=Nexus 4:MANUFACTURER=LGE,constantForNexus4 * - ,defaultConstant @@ -119,6 +118,7 @@ public final class ResourceUtils { * @param conditionConstantArray an array of "condition,constant" elements to be searched. * @return the constant part of the matched "condition,constant" element. Returns null if no * condition matches. + * @see com.android.inputmethod.latin.utils.ResourceUtilsTests#testFindConstantForKeyValuePairsRegexp() */ @UsedForTesting static String findConstantForKeyValuePairs(final HashMap<String, String> keyValuePairs, diff --git a/java/src/com/android/inputmethod/latin/utils/StringUtils.java b/java/src/com/android/inputmethod/latin/utils/StringUtils.java index bbcef990d..bc068ac53 100644 --- a/java/src/com/android/inputmethod/latin/utils/StringUtils.java +++ b/java/src/com/android/inputmethod/latin/utils/StringUtils.java @@ -521,12 +521,12 @@ public final class StringUtils { * {@code charSequence.toString().split(regex, preserveTrailingEmptySegments ? -1 : 0)} * except that the spans are preserved in the result array. * </p> - * @param input the character sequence to be split. + * @param charSequence the character sequence to be split. * @param regex the regex pattern to be used as the separator. * @param preserveTrailingEmptySegments {@code true} to preserve the trailing empty * segments. Otherwise, trailing empty segments will be removed before being returned. - * @return the array which contains the result. All the spans in the {@param input} is - * preserved. + * @return the array which contains the result. All the spans in the <code>charSequence</code> + * is preserved. */ @UsedForTesting public static CharSequence[] split(final CharSequence charSequence, final String regex, diff --git a/java/src/com/android/inputmethod/latin/utils/WordInputEventForPersonalization.java b/java/src/com/android/inputmethod/latin/utils/WordInputEventForPersonalization.java new file mode 100644 index 000000000..644fda57f --- /dev/null +++ b/java/src/com/android/inputmethod/latin/utils/WordInputEventForPersonalization.java @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.utils; + +import android.util.Log; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.Constants; +import com.android.inputmethod.latin.NgramContext; +import com.android.inputmethod.latin.settings.SpacingAndPunctuations; +import com.android.inputmethod.latin.utils.DistracterFilter.HandlingType; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +// Note: this class is used as a parameter type of a native method. You should be careful when you +// rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative(). +public final class WordInputEventForPersonalization { + private static final String TAG = WordInputEventForPersonalization.class.getSimpleName(); + private static final boolean DEBUG_TOKEN = false; + + public final int[] mTargetWord; + public final int mPrevWordsCount; + public final int[][] mPrevWordArray = new int[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM][]; + public final boolean[] mIsPrevWordBeginningOfSentenceArray = + new boolean[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + public final boolean mIsValid; + // Time stamp in seconds. + public final int mTimestamp; + + @UsedForTesting + public WordInputEventForPersonalization(final CharSequence targetWord, + final NgramContext ngramContext, final boolean isValid, final int timestamp) { + mTargetWord = StringUtils.toCodePointArray(targetWord); + mPrevWordsCount = ngramContext.getPrevWordCount(); + ngramContext.outputToArray(mPrevWordArray, mIsPrevWordBeginningOfSentenceArray); + mIsValid = isValid; + mTimestamp = timestamp; + } + + // Process a list of words and return a list of {@link WordInputEventForPersonalization} + // objects. + public static ArrayList<WordInputEventForPersonalization> createInputEventFrom( + final List<String> tokens, final int timestamp, + final SpacingAndPunctuations spacingAndPunctuations, final Locale locale, + final DistracterFilter distracterFilter) { + final ArrayList<WordInputEventForPersonalization> inputEvents = new ArrayList<>(); + final int N = tokens.size(); + NgramContext ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO; + for (int i = 0; i < N; ++i) { + final String tempWord = tokens.get(i); + if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) { + // just skip this token + if (DEBUG_TOKEN) { + Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\""); + } + continue; + } + if (!DictionaryInfoUtils.looksValidForDictionaryInsertion( + tempWord, spacingAndPunctuations)) { + if (DEBUG_TOKEN) { + Log.d(TAG, "--- not looksValidForDictionaryInsertion: \"" + + tempWord + "\""); + } + // Sentence terminator found. Split. + // TODO: Detect whether the context is beginning-of-sentence. + ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO; + continue; + } + if (DEBUG_TOKEN) { + Log.d(TAG, "--- word: \"" + tempWord + "\""); + } + final WordInputEventForPersonalization inputEvent = + detectWhetherVaildWordOrNotAndGetInputEvent( + ngramContext, tempWord, timestamp, locale, distracterFilter); + if (inputEvent == null) { + continue; + } + inputEvents.add(inputEvent); + ngramContext = ngramContext.getNextNgramContext(new NgramContext.WordInfo(tempWord)); + } + return inputEvents; + } + + private static WordInputEventForPersonalization detectWhetherVaildWordOrNotAndGetInputEvent( + final NgramContext ngramContext, final String targetWord, final int timestamp, + final Locale locale, final DistracterFilter distracterFilter) { + if (locale == null) { + return null; + } + final int wordHandlingType = distracterFilter.getWordHandlingType(ngramContext, + targetWord, locale); + final String word = HandlingType.shouldBeLowerCased(wordHandlingType) ? + targetWord.toLowerCase(locale) : targetWord; + if (distracterFilter.isDistracterToWordsInDictionaries(ngramContext, targetWord, locale)) { + // The word is a distracter. + return null; + } + return new WordInputEventForPersonalization(word, ngramContext, + !HandlingType.shouldBeHandledAsOov(wordHandlingType), timestamp); + } +} |