diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict')
5 files changed, 146 insertions, 127 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/DictionaryHeader.java b/java/src/com/android/inputmethod/latin/makedict/DictionaryHeader.java index df447fd75..644818ba6 100644 --- a/java/src/com/android/inputmethod/latin/makedict/DictionaryHeader.java +++ b/java/src/com/android/inputmethod/latin/makedict/DictionaryHeader.java @@ -38,15 +38,13 @@ public final class DictionaryHeader { public static final String DICTIONARY_DATE_KEY = "date"; public static final String HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO"; public static final String USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE"; - public static final String FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY = - "FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP"; public static final String FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; - public static final String FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY = - "FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS"; - public static final String MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT"; - public static final String MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT"; + public static final String MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_ENTRY_COUNT"; + public static final String MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_ENTRY_COUNT"; + public static final String MAX_TRIGRAM_COUNT_KEY = "MAX_TRIGRAM_ENTRY_COUNT"; public static final String ATTRIBUTE_VALUE_TRUE = "1"; + public static final String CODE_POINT_TABLE_KEY = "codePointTable"; public DictionaryHeader(final int headerSize, final DictionaryOptions dictionaryOptions, final FormatOptions formatOptions) throws UnsupportedFormatException { diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index a2ae74b20..78d79ae50 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -17,7 +17,7 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.latin.Constants; +import com.android.inputmethod.latin.common.Constants; import java.util.Date; import java.util.HashMap; @@ -36,9 +36,7 @@ public final class FormatSpec { * sion * * o | - * p | not used 3 bits - * t | each unigram and bigram entry has a time stamp? - * i | 1 bit, 1 = yes, 0 = no : CONTAINS_TIMESTAMP_FLAG + * p | not used, 2 bytes. * o | * nflags * @@ -48,7 +46,7 @@ public final class FormatSpec { * d | * ersize * - * | attributes list + * attributes list * * attributes list is: * <key> = | string of characters at the char format described below, with the terminator used @@ -86,27 +84,16 @@ public final class FormatSpec { */ /* Node (FusionDictionary.PtNode) layout is as follows: - * | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED - * | This must be the same as FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES - * | 01 = yes : FLAG_IS_MOVED - * f | the new address is stored in the same place as the parent address - * l | is deleted? 10 = yes : FLAG_IS_DELETED + * | CHILDREN_ADDRESS_TYPE 2 bits, 11 : FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES + * | 10 : FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES + * f | 01 : FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE + * l | 00 : FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS * a | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS * g | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL * s | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS * | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD - * | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED - * - * p | - * a | parent address, 3byte - * r | 1 byte = bbbbbbbb match - * e | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte) - * n | otherwise => (bbbbbbbb << 16) + (next byte << 8) + next byte - * t | This address is relative to the head of the PtNode. - * a | If the node doesn't have a parent, this field is set to 0. - * d | - * dress + * | is possibly offensive ? 1 bit, 1 = yes, 0 = no : FLAG_IS_POSSIBLY_OFFENSIVE * * c | IF FLAG_HAS_MULTIPLE_CHARS * h | char, char, char, char n * (1 or 3 bytes) : use PtNodeInfo for i/o helpers @@ -121,15 +108,10 @@ public final class FormatSpec { * q | * * c | - * h | children address, 3 bytes - * i | 1 byte = bbbbbbbb match - * l | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte) - * d | otherwise => (bbbbbbbb<<16) + (next byte << 8) + next byte - * r | if this node doesn't have children, this field is set to 0. - * e | (see BinaryDictEncoderUtils#writeVariableSignedAddress) - * n | This address is relative to the position of this field. - * a | - * ddress + * h | children address, CHILDREN_ADDRESS_TYPE bytes + * i | This address is relative to the position of this field. + * l | + * drenaddress * * | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS * | shortcut string list @@ -179,17 +161,17 @@ public final class FormatSpec { public static final int MAGIC_NUMBER = 0x9BC13AFE; static final int NOT_A_VERSION_NUMBER = -1; - static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3; - static final int FIRST_VERSION_WITH_TERMINAL_ID = 4; // These MUST have the same values as the relevant constants in format_utils.h. - // From version 4 on, we use version * 100 + revision as a version number. That allows + // From version 2.01 on, we use version * 100 + revision as a version number. That allows // us to change the format during development while having testing devices remove // older files with each upgrade, while still having a readable versioning scheme. // When we bump up the dictionary format version, we should update // ExpandableDictionary.needsToMigrateDictionary() and // ExpandableDictionary.matchesExpectedBinaryDictFormatVersionForThisType(). public static final int VERSION2 = 2; + public static final int VERSION201 = 201; + public static final int MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE = VERSION201; // Dictionary version used for testing. public static final int VERSION4_ONLY_FOR_TESTING = 399; public static final int VERSION401 = 401; @@ -202,9 +184,6 @@ public final class FormatSpec { // use it in the reading code. static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH; - static final int PARENT_ADDRESS_SIZE = 3; - static final int FORWARD_LINK_ADDRESS_SIZE = 3; - // These flags are used only in the static dictionary. static final int MASK_CHILDREN_ADDRESS_TYPE = 0xC0; static final int FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS = 0x00; @@ -218,14 +197,7 @@ public final class FormatSpec { static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08; static final int FLAG_HAS_BIGRAMS = 0x04; static final int FLAG_IS_NOT_A_WORD = 0x02; - static final int FLAG_IS_BLACKLISTED = 0x01; - - // These flags are used only in the dynamic dictionary. - static final int MASK_MOVE_AND_DELETE_FLAG = 0xC0; - static final int FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE = 0x40; - static final int FLAG_IS_MOVED = 0x00 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE; - static final int FLAG_IS_NOT_MOVED = 0x80 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE; - static final int FLAG_IS_DELETED = 0x80; + static final int FLAG_IS_POSSIBLY_OFFENSIVE = 0x01; static final int FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT = 0x80; static final int FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE = 0x40; @@ -240,52 +212,12 @@ public final class FormatSpec { static final int PTNODE_TERMINATOR_SIZE = 1; static final int PTNODE_FLAGS_SIZE = 1; static final int PTNODE_FREQUENCY_SIZE = 1; - static final int PTNODE_TERMINAL_ID_SIZE = 4; static final int PTNODE_MAX_ADDRESS_SIZE = 3; static final int PTNODE_ATTRIBUTE_FLAGS_SIZE = 1; static final int PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE = 3; static final int PTNODE_SHORTCUT_LIST_SIZE_SIZE = 2; - // These values are used only by version 4 or later. They MUST match the definitions in - // ver4_dict_constants.cpp. - static final String TRIE_FILE_EXTENSION = ".trie"; - public static final String HEADER_FILE_EXTENSION = ".header"; - static final String FREQ_FILE_EXTENSION = ".freq"; - // tat = Terminal Address Table - static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; - static final String BIGRAM_FILE_EXTENSION = ".bigram"; - static final String SHORTCUT_FILE_EXTENSION = ".shortcut"; - static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup"; - static final String CONTENT_TABLE_FILE_SUFFIX = "_index"; - static final int FLAGS_IN_FREQ_FILE_SIZE = 1; - static final int FREQUENCY_AND_FLAGS_SIZE = 2; - static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; - static final int UNIGRAM_TIMESTAMP_SIZE = 4; - static final int UNIGRAM_COUNTER_SIZE = 1; - static final int UNIGRAM_LEVEL_SIZE = 1; - - // With the English main dictionary as of October 2013, the size of bigram address table is - // is 345KB with the block size being 16. - // This is 54% of that of full address table. - static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16; - static final int BIGRAM_CONTENT_COUNT = 1; - static final int BIGRAM_FREQ_CONTENT_INDEX = 0; - static final String BIGRAM_FREQ_CONTENT_ID = "_freq"; - static final int BIGRAM_TIMESTAMP_SIZE = 4; - static final int BIGRAM_COUNTER_SIZE = 1; - static final int BIGRAM_LEVEL_SIZE = 1; - - static final int SHORTCUT_CONTENT_COUNT = 1; - static final int SHORTCUT_CONTENT_INDEX = 0; - // With the English main dictionary as of October 2013, the size of shortcut address table is - // 26KB with the block size being 64. - // This is only 4.4% of that of full address table. - static final int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; - static final String SHORTCUT_CONTENT_ID = "_shortcut"; - static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; - static final int NO_PARENT_ADDRESS = 0; - static final int NO_FORWARD_LINK_ADDRESS = 0; static final int INVALID_CHARACTER = -1; static final int MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT = 0x7F; // 127 @@ -302,14 +234,13 @@ public final class FormatSpec { // This option needs to be the same numeric value as the one in binary_format.h. static final int NOT_VALID_WORD = -99; - static final int SIGNED_CHILDREN_ADDRESS_SIZE = 3; static final int UINT8_MAX = 0xFF; static final int UINT16_MAX = 0xFFFF; static final int UINT24_MAX = 0xFFFFFF; - static final int SINT24_MAX = 0x7FFFFF; static final int MSB8 = 0x80; - static final int MSB24 = 0x800000; + static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; + static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; /** * Options about file format. diff --git a/java/src/com/android/inputmethod/latin/makedict/NgramProperty.java b/java/src/com/android/inputmethod/latin/makedict/NgramProperty.java new file mode 100644 index 000000000..b1d19dc3c --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/NgramProperty.java @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.latin.NgramContext; + +public class NgramProperty { + public final WeightedString mTargetWord; + public final NgramContext mNgramContext; + + public NgramProperty(final WeightedString targetWord, final NgramContext ngramContext) { + mTargetWord = targetWord; + mNgramContext = ngramContext; + } + + @Override + public int hashCode() { + return mTargetWord.hashCode() ^ mNgramContext.hashCode(); + } + + @Override + public boolean equals(Object o) { + if (o == this) return true; + if (!(o instanceof NgramProperty)) return false; + final NgramProperty n = (NgramProperty)o; + return mTargetWord.equals(n.mTargetWord) && mNgramContext.equals(n.mNgramContext); + } +} diff --git a/java/src/com/android/inputmethod/latin/makedict/ProbabilityInfo.java b/java/src/com/android/inputmethod/latin/makedict/ProbabilityInfo.java index 5fcbb6357..03c2ece1d 100644 --- a/java/src/com/android/inputmethod/latin/makedict/ProbabilityInfo.java +++ b/java/src/com/android/inputmethod/latin/makedict/ProbabilityInfo.java @@ -40,11 +40,8 @@ public final class ProbabilityInfo { if (probabilityInfo2 == null) { return probabilityInfo1; } - if (probabilityInfo1.mProbability > probabilityInfo2.mProbability) { - return probabilityInfo1; - } else { - return probabilityInfo2; - } + return (probabilityInfo1.mProbability > probabilityInfo2.mProbability) ? probabilityInfo1 + : probabilityInfo2; } public ProbabilityInfo(final int probability) { @@ -67,9 +64,8 @@ public final class ProbabilityInfo { public int hashCode() { if (hasHistoricalInfo()) { return Arrays.hashCode(new Object[] { mProbability, mTimestamp, mLevel, mCount }); - } else { - return Arrays.hashCode(new Object[] { mProbability }); } + return Arrays.hashCode(new Object[] { mProbability }); } @Override diff --git a/java/src/com/android/inputmethod/latin/makedict/WordProperty.java b/java/src/com/android/inputmethod/latin/makedict/WordProperty.java index cd78e2235..e7808e46e 100644 --- a/java/src/com/android/inputmethod/latin/makedict/WordProperty.java +++ b/java/src/com/android/inputmethod/latin/makedict/WordProperty.java @@ -18,12 +18,17 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.BinaryDictionary; +import com.android.inputmethod.latin.Dictionary; +import com.android.inputmethod.latin.NgramContext; +import com.android.inputmethod.latin.NgramContext.WordInfo; import com.android.inputmethod.latin.utils.CombinedFormatUtils; import com.android.inputmethod.latin.utils.StringUtils; import java.util.ArrayList; import java.util.Arrays; +import javax.annotation.Nullable; + /** * Utility class for a word with a probability. * @@ -33,29 +38,38 @@ public final class WordProperty implements Comparable<WordProperty> { public final String mWord; public final ProbabilityInfo mProbabilityInfo; public final ArrayList<WeightedString> mShortcutTargets; - public final ArrayList<WeightedString> mBigrams; + public final ArrayList<NgramProperty> mNgrams; // TODO: Support mIsBeginningOfSentence. public final boolean mIsBeginningOfSentence; public final boolean mIsNotAWord; - public final boolean mIsBlacklistEntry; + public final boolean mIsPossiblyOffensive; public final boolean mHasShortcuts; - public final boolean mHasBigrams; + public final boolean mHasNgrams; private int mHashCode = 0; + // TODO: Support n-gram. @UsedForTesting public WordProperty(final String word, final ProbabilityInfo probabilityInfo, final ArrayList<WeightedString> shortcutTargets, - final ArrayList<WeightedString> bigrams, - final boolean isNotAWord, final boolean isBlacklistEntry) { + @Nullable final ArrayList<WeightedString> bigrams, + final boolean isNotAWord, final boolean isPossiblyOffensive) { mWord = word; mProbabilityInfo = probabilityInfo; mShortcutTargets = shortcutTargets; - mBigrams = bigrams; + if (null == bigrams) { + mNgrams = null; + } else { + mNgrams = new ArrayList<>(); + final NgramContext ngramContext = new NgramContext(new WordInfo(mWord)); + for (final WeightedString bigramTarget : bigrams) { + mNgrams.add(new NgramProperty(bigramTarget, ngramContext)); + } + } mIsBeginningOfSentence = false; mIsNotAWord = isNotAWord; - mIsBlacklistEntry = isBlacklistEntry; - mHasBigrams = bigrams != null && !bigrams.isEmpty(); + mIsPossiblyOffensive = isPossiblyOffensive; + mHasNgrams = bigrams != null && !bigrams.isEmpty(); mHasShortcuts = shortcutTargets != null && !shortcutTargets.isEmpty(); } @@ -70,28 +84,43 @@ public final class WordProperty implements Comparable<WordProperty> { // Construct word property using information from native code. // This represents invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY. public WordProperty(final int[] codePoints, final boolean isNotAWord, - final boolean isBlacklisted, final boolean hasBigram, final boolean hasShortcuts, + final boolean isPossiblyOffensive, final boolean hasBigram, final boolean hasShortcuts, final boolean isBeginningOfSentence, final int[] probabilityInfo, - final ArrayList<int[]> bigramTargets, final ArrayList<int[]> bigramProbabilityInfo, + final ArrayList<int[][]> ngramPrevWordsArray, + final ArrayList<boolean[]> ngramPrevWordIsBeginningOfSentenceArray, + final ArrayList<int[]> ngramTargets, final ArrayList<int[]> ngramProbabilityInfo, final ArrayList<int[]> shortcutTargets, final ArrayList<Integer> shortcutProbabilities) { mWord = StringUtils.getStringFromNullTerminatedCodePointArray(codePoints); mProbabilityInfo = createProbabilityInfoFromArray(probabilityInfo); mShortcutTargets = new ArrayList<>(); - mBigrams = new ArrayList<>(); + final ArrayList<NgramProperty> ngrams = new ArrayList<>(); mIsBeginningOfSentence = isBeginningOfSentence; mIsNotAWord = isNotAWord; - mIsBlacklistEntry = isBlacklisted; + mIsPossiblyOffensive = isPossiblyOffensive; mHasShortcuts = hasShortcuts; - mHasBigrams = hasBigram; - - final int bigramTargetCount = bigramTargets.size(); - for (int i = 0; i < bigramTargetCount; i++) { - final String bigramTargetString = - StringUtils.getStringFromNullTerminatedCodePointArray(bigramTargets.get(i)); - mBigrams.add(new WeightedString(bigramTargetString, - createProbabilityInfoFromArray(bigramProbabilityInfo.get(i)))); + mHasNgrams = hasBigram; + + final int relatedNgramCount = ngramTargets.size(); + for (int i = 0; i < relatedNgramCount; i++) { + final String ngramTargetString = + StringUtils.getStringFromNullTerminatedCodePointArray(ngramTargets.get(i)); + final WeightedString ngramTarget = new WeightedString(ngramTargetString, + createProbabilityInfoFromArray(ngramProbabilityInfo.get(i))); + final int[][] prevWords = ngramPrevWordsArray.get(i); + final boolean[] isBeginningOfSentenceArray = + ngramPrevWordIsBeginningOfSentenceArray.get(i); + final WordInfo[] wordInfoArray = new WordInfo[prevWords.length]; + for (int j = 0; j < prevWords.length; j++) { + wordInfoArray[j] = isBeginningOfSentenceArray[j] + ? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO + : new WordInfo(StringUtils.getStringFromNullTerminatedCodePointArray( + prevWords[j])); + } + final NgramContext ngramContext = new NgramContext(wordInfoArray); + ngrams.add(new NgramProperty(ngramTarget, ngramContext)); } + mNgrams = ngrams.isEmpty() ? null : ngrams; final int shortcutTargetCount = shortcutTargets.size(); for (int i = 0; i < shortcutTargetCount; i++) { @@ -102,6 +131,21 @@ public final class WordProperty implements Comparable<WordProperty> { } } + // TODO: Remove + @UsedForTesting + public ArrayList<WeightedString> getBigrams() { + if (null == mNgrams) { + return null; + } + final ArrayList<WeightedString> bigrams = new ArrayList<>(); + for (final NgramProperty ngram : mNgrams) { + if (ngram.mNgramContext.getPrevWordCount() == 1) { + bigrams.add(ngram.mTargetWord); + } + } + return bigrams; + } + public int getProbability() { return mProbabilityInfo.mProbability; } @@ -110,10 +154,10 @@ public final class WordProperty implements Comparable<WordProperty> { return Arrays.hashCode(new Object[] { word.mWord, word.mProbabilityInfo, - word.mShortcutTargets.hashCode(), - word.mBigrams.hashCode(), + word.mShortcutTargets, + word.mNgrams, word.mIsNotAWord, - word.mIsBlacklistEntry + word.mIsPossiblyOffensive }); } @@ -142,9 +186,17 @@ public final class WordProperty implements Comparable<WordProperty> { if (!(o instanceof WordProperty)) return false; WordProperty w = (WordProperty)o; return mProbabilityInfo.equals(w.mProbabilityInfo) && mWord.equals(w.mWord) - && mShortcutTargets.equals(w.mShortcutTargets) && mBigrams.equals(w.mBigrams) - && mIsNotAWord == w.mIsNotAWord && mIsBlacklistEntry == w.mIsBlacklistEntry - && mHasBigrams == w.mHasBigrams && mHasShortcuts && w.mHasBigrams; + && mShortcutTargets.equals(w.mShortcutTargets) && equals(mNgrams, w.mNgrams) + && mIsNotAWord == w.mIsNotAWord && mIsPossiblyOffensive == w.mIsPossiblyOffensive + && mHasNgrams == w.mHasNgrams && mHasShortcuts && w.mHasNgrams; + } + + // TDOO: Have a utility method like java.util.Objects.equals. + private static <T> boolean equals(final ArrayList<T> a, final ArrayList<T> b) { + if (null == a) { + return null == b; + } + return a.equals(b); } @Override @@ -157,7 +209,7 @@ public final class WordProperty implements Comparable<WordProperty> { @UsedForTesting public boolean isValid() { - return getProbability() != BinaryDictionary.NOT_A_PROBABILITY; + return getProbability() != Dictionary.NOT_A_PROBABILITY; } @Override |