diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/utils')
4 files changed, 76 insertions, 220 deletions
diff --git a/java/src/com/android/inputmethod/latin/utils/DictionaryInfoUtils.java b/java/src/com/android/inputmethod/latin/utils/DictionaryInfoUtils.java index 306735779..a15556511 100644 --- a/java/src/com/android/inputmethod/latin/utils/DictionaryInfoUtils.java +++ b/java/src/com/android/inputmethod/latin/utils/DictionaryInfoUtils.java @@ -29,7 +29,7 @@ import com.android.inputmethod.latin.BinaryDictionaryGetter; import com.android.inputmethod.latin.Constants; import com.android.inputmethod.latin.R; import com.android.inputmethod.latin.makedict.BinaryDictIOUtils; -import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; +import com.android.inputmethod.latin.makedict.DictionaryHeader; import com.android.inputmethod.latin.settings.SpacingAndPunctuations; import java.io.File; @@ -282,7 +282,7 @@ public class DictionaryInfoUtils { BinaryDictionaryGetter.ID_CATEGORY_SEPARATOR + locale.getLanguage().toString(); } - public static FileHeader getDictionaryFileHeaderOrNull(final File file) { + public static DictionaryHeader getDictionaryFileHeaderOrNull(final File file) { return BinaryDictIOUtils.getDictionaryFileHeaderOrNull(file, 0, file.length()); } @@ -294,7 +294,7 @@ public class DictionaryInfoUtils { */ private static DictionaryInfo createDictionaryInfoFromFileAddress( final AssetFileAddress fileAddress) { - final FileHeader header = BinaryDictIOUtils.getDictionaryFileHeaderOrNull( + final DictionaryHeader header = BinaryDictIOUtils.getDictionaryFileHeaderOrNull( new File(fileAddress.mFilename), fileAddress.mOffset, fileAddress.mLength); if (header == null) { return null; diff --git a/java/src/com/android/inputmethod/latin/utils/StringUtils.java b/java/src/com/android/inputmethod/latin/utils/StringUtils.java index c632a71a9..e7932b5a6 100644 --- a/java/src/com/android/inputmethod/latin/utils/StringUtils.java +++ b/java/src/com/android/inputmethod/latin/utils/StringUtils.java @@ -46,7 +46,7 @@ public final class StringUtils { public static String newSingleCodePointString(int codePoint) { if (Character.charCount(codePoint) == 1) { - // Optimization: avoid creating an temporary array for characters that are + // Optimization: avoid creating a temporary array for characters that are // represented by a single char value return String.valueOf((char) codePoint); } @@ -205,6 +205,24 @@ public final class StringUtils { return codePoints; } + /** + * Construct a String from a code point array + * + * @param codePoints a code point array that is null terminated when its logical length is + * shorter than the array length. + * @return a string constructed from the code point array. + */ + public static String getStringFromNullTerminatedCodePointArray(final int[] codePoints) { + int stringLength = codePoints.length; + for (int i = 0; i < codePoints.length; i++) { + if (codePoints[i] == 0) { + stringLength = i; + break; + } + } + return new String(codePoints, 0 /* offset */, stringLength); + } + // This method assumes the text is not null. For the empty string, it returns CAPITALIZE_NONE. public static int getCapitalizationType(final String text) { // If the first char is not uppercase, then the word is either all lower case or diff --git a/java/src/com/android/inputmethod/latin/utils/UserHistoryDictIOUtils.java b/java/src/com/android/inputmethod/latin/utils/UserHistoryDictIOUtils.java deleted file mode 100644 index 7af03da59..000000000 --- a/java/src/com/android/inputmethod/latin/utils/UserHistoryDictIOUtils.java +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (C) 2012 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.inputmethod.latin.utils; - -import android.util.Log; - -import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.latin.makedict.BinaryDictIOUtils; -import com.android.inputmethod.latin.makedict.DictDecoder; -import com.android.inputmethod.latin.makedict.DictEncoder; -import com.android.inputmethod.latin.makedict.FormatSpec; -import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; -import com.android.inputmethod.latin.makedict.FusionDictionary; -import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; -import com.android.inputmethod.latin.makedict.PendingAttribute; -import com.android.inputmethod.latin.makedict.UnsupportedFormatException; -import com.android.inputmethod.latin.personalization.UserHistoryDictionaryBigramList; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map.Entry; -import java.util.TreeMap; -import java.util.concurrent.TimeUnit; - -/** - * Reads and writes Binary files for a UserHistoryDictionary. - * - * All the methods in this class are static. - */ -public final class UserHistoryDictIOUtils { - private static final String TAG = UserHistoryDictIOUtils.class.getSimpleName(); - private static final boolean DEBUG = false; - - public interface OnAddWordListener { - /** - * Callback to be notified when a word is added to the dictionary. - * @param word The added word. - * @param shortcutTarget A shortcut target for this word, or null if none. - * @param frequency The frequency for this word. - * @param shortcutFreq The frequency of the shortcut (0~15, with 15 = whitelist). - * Unspecified if shortcutTarget is null - do not rely on its value. - */ - public void setUnigram(final String word, final String shortcutTarget, final int frequency, - final int shortcutFreq); - public void setBigram(final String word1, final String word2, final int frequency); - } - - @UsedForTesting - public interface BigramDictionaryInterface { - public int getFrequency(final String word1, final String word2); - } - - /** - * Writes dictionary to file. - */ - @UsedForTesting - public static void writeDictionary(final DictEncoder dictEncoder, - final BigramDictionaryInterface dict, final UserHistoryDictionaryBigramList bigrams, - final FormatOptions formatOptions, final HashMap<String, String> options) { - final FusionDictionary fusionDict = constructFusionDictionary(dict, bigrams, options); - fusionDict.addOptionAttribute(FormatSpec.FileHeader.USES_FORGETTING_CURVE_KEY, - FormatSpec.FileHeader.ATTRIBUTE_VALUE_TRUE); - fusionDict.addOptionAttribute(FormatSpec.FileHeader.DICTIONARY_DATE_KEY, - String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()))); - try { - dictEncoder.writeDictionary(fusionDict, formatOptions); - Log.d(TAG, "end writing"); - } catch (IOException e) { - Log.e(TAG, "IO exception while writing file", e); - } catch (UnsupportedFormatException e) { - Log.e(TAG, "Unsupported format", e); - } - } - - /** - * Constructs a new FusionDictionary from BigramDictionaryInterface. - */ - @UsedForTesting - static FusionDictionary constructFusionDictionary(final BigramDictionaryInterface dict, - final UserHistoryDictionaryBigramList bigrams, final HashMap<String, String> options) { - final FusionDictionary fusionDict = new FusionDictionary(new PtNodeArray(), - new FusionDictionary.DictionaryOptions(options)); - int profTotal = 0; - for (final String word1 : bigrams.keySet()) { - final HashMap<String, Byte> word1Bigrams = bigrams.getBigrams(word1); - for (final String word2 : word1Bigrams.keySet()) { - final int freq = dict.getFrequency(word1, word2); - if (freq == -1) { - // don't add this bigram. - continue; - } - if (DEBUG) { - if (word1 == null) { - Log.d(TAG, "add unigram: " + word2 + "," + Integer.toString(freq)); - } else { - Log.d(TAG, "add bigram: " + word1 - + "," + word2 + "," + Integer.toString(freq)); - } - profTotal++; - } - if (word1 == null) { // unigram - fusionDict.add(word2, freq, null, false /* isNotAWord */); - } else { // bigram - if (FusionDictionary.findWordInTree(fusionDict.mRootNodeArray, word1) == null) { - fusionDict.add(word1, 2, null, false /* isNotAWord */); - } - fusionDict.setBigram(word1, word2, freq); - } - bigrams.updateBigram(word1, word2, (byte)freq); - } - } - if (DEBUG) { - Log.d(TAG, "add " + profTotal + "words"); - } - return fusionDict; - } - - /** - * Reads dictionary from file. - */ - public static void readDictionaryBinary(final DictDecoder dictDecoder, - final OnAddWordListener dict) { - final TreeMap<Integer, String> unigrams = CollectionUtils.newTreeMap(); - final TreeMap<Integer, Integer> frequencies = CollectionUtils.newTreeMap(); - final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams = CollectionUtils.newTreeMap(); - try { - dictDecoder.readUnigramsAndBigramsBinary(unigrams, frequencies, bigrams); - } catch (IOException e) { - Log.e(TAG, "IO exception while reading file", e); - } catch (UnsupportedFormatException e) { - Log.e(TAG, "Unsupported format", e); - } catch (ArrayIndexOutOfBoundsException e) { - Log.e(TAG, "ArrayIndexOutOfBoundsException while reading file", e); - } - addWordsFromWordMap(unigrams, frequencies, bigrams, dict); - } - - /** - * Adds all unigrams and bigrams in maps to OnAddWordListener. - */ - @UsedForTesting - static void addWordsFromWordMap(final TreeMap<Integer, String> unigrams, - final TreeMap<Integer, Integer> frequencies, - final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams, - final OnAddWordListener to) { - for (Entry<Integer, String> entry : unigrams.entrySet()) { - final String word1 = entry.getValue(); - final int unigramFrequency = frequencies.get(entry.getKey()); - to.setUnigram(word1, null /* shortcutTarget */, unigramFrequency, 0 /* shortcutFreq */); - final ArrayList<PendingAttribute> attrList = bigrams.get(entry.getKey()); - if (attrList != null) { - for (final PendingAttribute attr : attrList) { - final String word2 = unigrams.get(attr.mAddress); - if (word1 == null || word2 == null) { - Log.e(TAG, "Invalid bigram pair detected: " + word1 + ", " + word2); - continue; - } - to.setBigram(word1, word2, - BinaryDictIOUtils.reconstructBigramFrequency(unigramFrequency, - attr.mFrequency)); - } - } - } - - } -} diff --git a/java/src/com/android/inputmethod/latin/utils/WordProperty.java b/java/src/com/android/inputmethod/latin/utils/WordProperty.java index ba9b114b0..da56b213f 100644 --- a/java/src/com/android/inputmethod/latin/utils/WordProperty.java +++ b/java/src/com/android/inputmethod/latin/utils/WordProperty.java @@ -20,6 +20,7 @@ package com.android.inputmethod.latin.utils; import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.BinaryDictionary; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; +import com.android.inputmethod.latin.makedict.ProbabilityInfo; import java.util.ArrayList; @@ -37,32 +38,12 @@ public class WordProperty { public final ArrayList<ProbabilityInfo> mBigramProbabilityInfo = CollectionUtils.newArrayList(); public final ArrayList<WeightedString> mShortcutTargets = CollectionUtils.newArrayList(); - // TODO: Use this kind of Probability class for dictionary read/write code under the makedict - // package. - public static final class ProbabilityInfo { - public final int mProbability; - // wTimestamp, mLevel and mCount are historical info. These values are depend on the - // implementation in native code; thus, we must not use them and have any assumptions about - // them except for tests. - public final int mTimestamp; - public final int mLevel; - public final int mCount; - - public ProbabilityInfo(final int[] probabilityInfo) { - mProbability = probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_PROBABILITY_INDEX]; - mTimestamp = probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX]; - mLevel = probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_LEVEL_INDEX]; - mCount = probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_COUNT_INDEX]; - } - } - - private static int getCodePointCount(final int[] codePoints) { - for (int i = 0; i < codePoints.length; i++) { - if (codePoints[i] == 0) { - return i; - } - } - return codePoints.length; + private static ProbabilityInfo createProbabilityInfoFromArray(final int[] probabilityInfo) { + return new ProbabilityInfo( + probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_PROBABILITY_INDEX], + probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX], + probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_LEVEL_INDEX], + probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_COUNT_INDEX]); } // This represents invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY. @@ -72,20 +53,19 @@ public class WordProperty { final ArrayList<int[]> bigramTargets, final ArrayList<int[]> bigramProbabilityInfo, final ArrayList<int[]> shortcutTargets, final ArrayList<Integer> shortcutProbabilities) { - mCodePoints = new String(codePoints, 0 /* offset */, getCodePointCount(codePoints)); + mCodePoints = StringUtils.getStringFromNullTerminatedCodePointArray(codePoints); mIsNotAWord = isNotAWord; mIsBlacklisted = isBlacklisted; mHasBigrams = hasBigram; mHasShortcuts = hasShortcuts; - mProbabilityInfo = new ProbabilityInfo(probabilityInfo); + mProbabilityInfo = createProbabilityInfoFromArray(probabilityInfo); final int bigramTargetCount = bigramTargets.size(); for (int i = 0; i < bigramTargetCount; i++) { - final int[] bigramTargetCodePointArray = bigramTargets.get(i); - final String bigramTargetString = new String(bigramTargetCodePointArray, - 0 /* offset */, getCodePointCount(bigramTargetCodePointArray)); + final String bigramTargetString = + StringUtils.getStringFromNullTerminatedCodePointArray(bigramTargets.get(i)); final ProbabilityInfo bigramProbability = - new ProbabilityInfo(bigramProbabilityInfo.get(i)); + createProbabilityInfoFromArray(bigramProbabilityInfo.get(i)); mBigramTargets.add( new WeightedString(bigramTargetString, bigramProbability.mProbability)); mBigramProbabilityInfo.add(bigramProbability); @@ -93,9 +73,8 @@ public class WordProperty { final int shortcutTargetCount = shortcutTargets.size(); for (int i = 0; i < shortcutTargetCount; i++) { - final int[] shortcutTargetCodePointArray = shortcutTargets.get(i); - final String shortcutTargetString = new String(shortcutTargetCodePointArray, - 0 /* offset */, getCodePointCount(shortcutTargetCodePointArray)); + final String shortcutTargetString = + StringUtils.getStringFromNullTerminatedCodePointArray(shortcutTargets.get(i)); mShortcutTargets.add( new WeightedString(shortcutTargetString, shortcutProbabilities.get(i))); } @@ -105,4 +84,44 @@ public class WordProperty { public boolean isValid() { return mProbabilityInfo.mProbability != BinaryDictionary.NOT_A_PROBABILITY; } + + @Override + public String toString() { + // TODO: Move this logic to CombinedInputOutput. + final StringBuffer builder = new StringBuffer(); + builder.append(" word=" + mCodePoints); + builder.append(","); + builder.append("f=" + mProbabilityInfo.mProbability); + if (mIsNotAWord) { + builder.append(","); + builder.append("not_a_word=true"); + } + if (mIsBlacklisted) { + builder.append(","); + builder.append("blacklisted=true"); + } + if (mProbabilityInfo.mTimestamp != BinaryDictionary.NOT_A_VALID_TIMESTAMP) { + builder.append(","); + builder.append("historicalInfo=" + mProbabilityInfo); + } + builder.append("\n"); + for (int i = 0; i < mBigramTargets.size(); i++) { + builder.append(" bigram=" + mBigramTargets.get(i).mWord); + builder.append(","); + builder.append("f=" + mBigramTargets.get(i).mFrequency); + if (mBigramProbabilityInfo.get(i).mTimestamp + != BinaryDictionary.NOT_A_VALID_TIMESTAMP) { + builder.append(","); + builder.append("historicalInfo=" + mBigramProbabilityInfo.get(i)); + } + builder.append("\n"); + } + for (int i = 0; i < mShortcutTargets.size(); i++) { + builder.append(" shortcut=" + mShortcutTargets.get(i).mWord); + builder.append(","); + builder.append("f=" + mShortcutTargets.get(i).mFrequency); + builder.append("\n"); + } + return builder.toString(); + } }
\ No newline at end of file |