diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin')
4 files changed, 93 insertions, 21 deletions
diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionaryGetter.java b/java/src/com/android/inputmethod/latin/BinaryDictionaryGetter.java index 063243e1b..dd11aaa37 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionaryGetter.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionaryGetter.java @@ -16,6 +16,8 @@ package com.android.inputmethod.latin; +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; + import android.content.Context; import android.content.SharedPreferences; import android.content.pm.PackageManager.NameNotFoundException; @@ -23,6 +25,7 @@ import android.content.res.AssetFileDescriptor; import android.util.Log; import java.io.File; +import java.io.RandomAccessFile; import java.util.ArrayList; import java.util.HashMap; import java.util.Locale; @@ -51,6 +54,9 @@ class BinaryDictionaryGetter { private static final String MAIN_DICTIONARY_CATEGORY = "main"; public static final String ID_CATEGORY_SEPARATOR = ":"; + // The key considered to read the version attribute in a dictionary file. + private static String VERSION_KEY = "version"; + // Prevents this from being instantiated private BinaryDictionaryGetter() {} @@ -336,6 +342,42 @@ class BinaryDictionaryGetter { return MAIN_DICTIONARY_CATEGORY.equals(idArray[0]); } + // ## HACK ## we prevent usage of a dictionary before version 18 for English only. The reason + // for this is, since those do not include whitelist entries, the new code with an old version + // of the dictionary would lose whitelist functionality. + private static boolean hackCanUseDictionaryFile(final Locale locale, final File f) { + // Only for English - other languages didn't have a whitelist, hence this + // ad-hock ## HACK ## + if (!Locale.ENGLISH.getLanguage().equals(locale.getLanguage())) return true; + + try { + // Read the version of the file + final RandomAccessFile raf = new RandomAccessFile(f, "r"); + final int magic = raf.readInt(); + if (magic != BinaryDictInputOutput.VERSION_2_MAGIC_NUMBER) { + return false; + } + final int formatVersion = raf.readInt(); + final int headerSize = raf.readInt(); + final HashMap<String, String> options = new HashMap<String, String>(); + BinaryDictInputOutput.populateOptionsFromFile(raf, headerSize, options); + final String version = options.get(VERSION_KEY); + if (null == version) { + // No version in the options : the format is unexpected + return false; + } + // Version 18 is the first one to include the whitelist + // Obviously this is a big ## HACK ## + return Integer.parseInt(version) >= 18; + } catch (java.io.FileNotFoundException e) { + return false; + } catch (java.io.IOException e) { + return false; + } catch (NumberFormatException e) { + return false; + } + } + /** * Returns a list of file addresses for a given locale, trying relevant methods in order. * @@ -366,14 +408,15 @@ class BinaryDictionaryGetter { // cachedWordLists may not be null, see doc for getCachedDictionaryList for (final File f : cachedWordLists) { final String wordListId = getWordListIdFromFileName(f.getName()); - if (isMainWordListId(wordListId)) { + final boolean canUse = f.canRead() && hackCanUseDictionaryFile(locale, f); + if (canUse && isMainWordListId(wordListId)) { foundMainDict = true; } if (!dictPackSettings.isWordListActive(wordListId)) continue; - if (f.canRead()) { + if (canUse) { fileList.add(AssetFileAddress.makeFromFileName(f.getPath())); } else { - Log.e(TAG, "Found a cached dictionary file but cannot read it"); + Log.e(TAG, "Found a cached dictionary file but cannot read or use it"); } } diff --git a/java/src/com/android/inputmethod/latin/LatinIME.java b/java/src/com/android/inputmethod/latin/LatinIME.java index 884e6db29..c20f3a3a9 100644 --- a/java/src/com/android/inputmethod/latin/LatinIME.java +++ b/java/src/com/android/inputmethod/latin/LatinIME.java @@ -2001,6 +2001,7 @@ public class LatinIME extends InputMethodService implements KeyboardActionListen private CharSequence addToUserHistoryDictionary(final CharSequence suggestion) { if (TextUtils.isEmpty(suggestion)) return null; + if (mSuggest == null) return null; // If correction is not enabled, we don't add words to the user history dictionary. // That's to avoid unintended additions in some sensitive fields, or fields that diff --git a/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java b/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java index 3bb670c9a..d516e72ad 100644 --- a/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java +++ b/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java @@ -52,14 +52,14 @@ public class UserHistoryDictionary extends ExpandableDictionary { private static final int FREQUENCY_FOR_TYPED = 2; /** Maximum number of pairs. Pruning will start when databases goes above this number. */ - private static int sMaxHistoryBigrams = 10000; + public static final int sMaxHistoryBigrams = 10000; /** * When it hits maximum bigram pair, it will delete until you are left with * only (sMaxHistoryBigrams - sDeleteHistoryBigrams) pairs. * Do not keep this number small to avoid deleting too often. */ - private static int sDeleteHistoryBigrams = 1000; + public static final int sDeleteHistoryBigrams = 1000; /** * Database version should increase if the database structure changes @@ -109,12 +109,8 @@ public class UserHistoryDictionary extends ExpandableDictionary { private static DatabaseHelper sOpenHelper = null; - public void setDatabaseMax(int maxHistoryBigram) { - sMaxHistoryBigrams = maxHistoryBigram; - } - - public void setDatabaseDelete(int deleteHistoryBigram) { - sDeleteHistoryBigrams = deleteHistoryBigram; + public String getLocale() { + return mLocale; } public synchronized static UserHistoryDictionary getInstance( @@ -502,9 +498,11 @@ public class UserHistoryDictionary extends ExpandableDictionary { needsToSave(fc, isValid, addLevel0Bigram)) { freq = fc; } else { + // Delete this entry freq = -1; } } else { + // Delete this entry freq = -1; } } @@ -541,6 +539,7 @@ public class UserHistoryDictionary extends ExpandableDictionary { getContentValues(word1, word2, mLocale)); pairId = pairIdLong.intValue(); } + // Eliminate freq == 0 because that word is profanity. if (freq > 0) { if (PROFILE_SAVE_RESTORE) { ++profInsert; diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 2c3eee74c..b23b7db34 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -124,7 +124,7 @@ public class BinaryDictInputOutput { */ private static final int VERSION_1_MAGIC_NUMBER = 0x78B1; - private static final int VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; + public static final int VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; private static final int MINIMUM_SUPPORTED_VERSION = 1; private static final int MAXIMUM_SUPPORTED_VERSION = 2; private static final int NOT_A_VERSION_NUMBER = -1; @@ -783,10 +783,10 @@ public class BinaryDictInputOutput { // their lower bound and exclude their higher bound so we need to have the first step // start at exactly 1 unit higher than floor(unigramFreq + half a step). // Note : to reconstruct the score, the dictionary reader will need to divide - // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise, and add - // (discretizedFrequency + 0.5) times this value to get the median value of the step, - // which is the best approximation. This is how we get the most precise result with - // only four bits. + // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise to get the value of the step, + // and add (discretizedFrequency + 0.5 + 0.5) times this value to get the best + // approximation. (0.5 to get the first step start, and 0.5 to get the middle of the + // step pointed by the discretized frequency. final float stepSize = (MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5f + MAX_BIGRAM_FREQUENCY); final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f); @@ -1328,6 +1328,21 @@ public class BinaryDictInputOutput { } /** + * Reads options from a file and populate a map with their contents. + * + * The file is read at the current file pointer, so the caller must take care the pointer + * is in the right place before calling this. + */ + public static void populateOptionsFromFile(final RandomAccessFile source, final long headerSize, + final HashMap<String, String> options) throws IOException { + while (source.getFilePointer() < headerSize) { + final String key = CharEncoding.readString(source); + final String value = CharEncoding.readString(source); + options.put(key, value); + } + } + + /** * Reads a random access file and returns the memory representation of the dictionary. * * This high-level method takes a binary file and reads its contents, populating a @@ -1358,11 +1373,7 @@ public class BinaryDictInputOutput { } else { headerSize = (source.readUnsignedByte() << 24) + (source.readUnsignedByte() << 16) + (source.readUnsignedByte() << 8) + source.readUnsignedByte(); - while (source.getFilePointer() < headerSize) { - final String key = CharEncoding.readString(source); - final String value = CharEncoding.readString(source); - options.put(key, value); - } + populateOptionsFromFile(source, headerSize, options); source.seek(headerSize); } @@ -1410,4 +1421,22 @@ public class BinaryDictInputOutput { return false; } } + + /** + * Calculate bigram frequency from compressed value + * + * @see #makeBigramFlags + * + * @param unigramFrequency + * @param bigramFrequency compressed frequency + * @return approximate bigram frequency + */ + public static int reconstructBigramFrequency(final int unigramFrequency, + final int bigramFrequency) { + final float stepSize = (MAX_TERMINAL_FREQUENCY - unigramFrequency) + / (1.5f + MAX_BIGRAM_FREQUENCY); + final float resultFreqFloat = (float)unigramFrequency + + stepSize * (bigramFrequency + 1.0f); + return (int)resultFreqFloat; + } } |