diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict')
4 files changed, 83 insertions, 100 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/DictionaryHeader.java b/java/src/com/android/inputmethod/latin/makedict/DictionaryHeader.java index df447fd75..3f9ffd285 100644 --- a/java/src/com/android/inputmethod/latin/makedict/DictionaryHeader.java +++ b/java/src/com/android/inputmethod/latin/makedict/DictionaryHeader.java @@ -47,6 +47,7 @@ public final class DictionaryHeader { public static final String MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT"; public static final String MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT"; public static final String ATTRIBUTE_VALUE_TRUE = "1"; + public static final String CODE_POINT_TABLE_KEY = "codePointTable"; public DictionaryHeader(final int headerSize, final DictionaryOptions dictionaryOptions, final FormatOptions formatOptions) throws UnsupportedFormatException { diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index a2ae74b20..34edfa0da 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -36,9 +36,7 @@ public final class FormatSpec { * sion * * o | - * p | not used 3 bits - * t | each unigram and bigram entry has a time stamp? - * i | 1 bit, 1 = yes, 0 = no : CONTAINS_TIMESTAMP_FLAG + * p | not used, 2 bytes. * o | * nflags * @@ -48,7 +46,7 @@ public final class FormatSpec { * d | * ersize * - * | attributes list + * attributes list * * attributes list is: * <key> = | string of characters at the char format described below, with the terminator used @@ -86,11 +84,10 @@ public final class FormatSpec { */ /* Node (FusionDictionary.PtNode) layout is as follows: - * | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED - * | This must be the same as FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES - * | 01 = yes : FLAG_IS_MOVED - * f | the new address is stored in the same place as the parent address - * l | is deleted? 10 = yes : FLAG_IS_DELETED + * | CHILDREN_ADDRESS_TYPE 2 bits, 11 : FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES + * | 10 : FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES + * f | 01 : FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE + * l | 00 : FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS * a | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS * g | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL * s | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS @@ -98,16 +95,6 @@ public final class FormatSpec { * | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD * | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED * - * p | - * a | parent address, 3byte - * r | 1 byte = bbbbbbbb match - * e | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte) - * n | otherwise => (bbbbbbbb << 16) + (next byte << 8) + next byte - * t | This address is relative to the head of the PtNode. - * a | If the node doesn't have a parent, this field is set to 0. - * d | - * dress - * * c | IF FLAG_HAS_MULTIPLE_CHARS * h | char, char, char, char n * (1 or 3 bytes) : use PtNodeInfo for i/o helpers * a | end 1 byte, = 0 @@ -121,15 +108,10 @@ public final class FormatSpec { * q | * * c | - * h | children address, 3 bytes - * i | 1 byte = bbbbbbbb match - * l | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte) - * d | otherwise => (bbbbbbbb<<16) + (next byte << 8) + next byte - * r | if this node doesn't have children, this field is set to 0. - * e | (see BinaryDictEncoderUtils#writeVariableSignedAddress) - * n | This address is relative to the position of this field. - * a | - * ddress + * h | children address, CHILDREN_ADDRESS_TYPE bytes + * i | This address is relative to the position of this field. + * l | + * drenaddress * * | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS * | shortcut string list @@ -179,17 +161,17 @@ public final class FormatSpec { public static final int MAGIC_NUMBER = 0x9BC13AFE; static final int NOT_A_VERSION_NUMBER = -1; - static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3; - static final int FIRST_VERSION_WITH_TERMINAL_ID = 4; // These MUST have the same values as the relevant constants in format_utils.h. - // From version 4 on, we use version * 100 + revision as a version number. That allows + // From version 2.01 on, we use version * 100 + revision as a version number. That allows // us to change the format during development while having testing devices remove // older files with each upgrade, while still having a readable versioning scheme. // When we bump up the dictionary format version, we should update // ExpandableDictionary.needsToMigrateDictionary() and // ExpandableDictionary.matchesExpectedBinaryDictFormatVersionForThisType(). public static final int VERSION2 = 2; + public static final int VERSION201 = 201; + public static final int MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE = VERSION201; // Dictionary version used for testing. public static final int VERSION4_ONLY_FOR_TESTING = 399; public static final int VERSION401 = 401; @@ -202,9 +184,6 @@ public final class FormatSpec { // use it in the reading code. static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH; - static final int PARENT_ADDRESS_SIZE = 3; - static final int FORWARD_LINK_ADDRESS_SIZE = 3; - // These flags are used only in the static dictionary. static final int MASK_CHILDREN_ADDRESS_TYPE = 0xC0; static final int FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS = 0x00; @@ -220,13 +199,6 @@ public final class FormatSpec { static final int FLAG_IS_NOT_A_WORD = 0x02; static final int FLAG_IS_BLACKLISTED = 0x01; - // These flags are used only in the dynamic dictionary. - static final int MASK_MOVE_AND_DELETE_FLAG = 0xC0; - static final int FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE = 0x40; - static final int FLAG_IS_MOVED = 0x00 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE; - static final int FLAG_IS_NOT_MOVED = 0x80 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE; - static final int FLAG_IS_DELETED = 0x80; - static final int FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT = 0x80; static final int FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE = 0x40; static final int MASK_BIGRAM_ATTR_ADDRESS_TYPE = 0x30; @@ -240,52 +212,12 @@ public final class FormatSpec { static final int PTNODE_TERMINATOR_SIZE = 1; static final int PTNODE_FLAGS_SIZE = 1; static final int PTNODE_FREQUENCY_SIZE = 1; - static final int PTNODE_TERMINAL_ID_SIZE = 4; static final int PTNODE_MAX_ADDRESS_SIZE = 3; static final int PTNODE_ATTRIBUTE_FLAGS_SIZE = 1; static final int PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE = 3; static final int PTNODE_SHORTCUT_LIST_SIZE_SIZE = 2; - // These values are used only by version 4 or later. They MUST match the definitions in - // ver4_dict_constants.cpp. - static final String TRIE_FILE_EXTENSION = ".trie"; - public static final String HEADER_FILE_EXTENSION = ".header"; - static final String FREQ_FILE_EXTENSION = ".freq"; - // tat = Terminal Address Table - static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; - static final String BIGRAM_FILE_EXTENSION = ".bigram"; - static final String SHORTCUT_FILE_EXTENSION = ".shortcut"; - static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup"; - static final String CONTENT_TABLE_FILE_SUFFIX = "_index"; - static final int FLAGS_IN_FREQ_FILE_SIZE = 1; - static final int FREQUENCY_AND_FLAGS_SIZE = 2; - static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; - static final int UNIGRAM_TIMESTAMP_SIZE = 4; - static final int UNIGRAM_COUNTER_SIZE = 1; - static final int UNIGRAM_LEVEL_SIZE = 1; - - // With the English main dictionary as of October 2013, the size of bigram address table is - // is 345KB with the block size being 16. - // This is 54% of that of full address table. - static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16; - static final int BIGRAM_CONTENT_COUNT = 1; - static final int BIGRAM_FREQ_CONTENT_INDEX = 0; - static final String BIGRAM_FREQ_CONTENT_ID = "_freq"; - static final int BIGRAM_TIMESTAMP_SIZE = 4; - static final int BIGRAM_COUNTER_SIZE = 1; - static final int BIGRAM_LEVEL_SIZE = 1; - - static final int SHORTCUT_CONTENT_COUNT = 1; - static final int SHORTCUT_CONTENT_INDEX = 0; - // With the English main dictionary as of October 2013, the size of shortcut address table is - // 26KB with the block size being 64. - // This is only 4.4% of that of full address table. - static final int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; - static final String SHORTCUT_CONTENT_ID = "_shortcut"; - static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; - static final int NO_PARENT_ADDRESS = 0; - static final int NO_FORWARD_LINK_ADDRESS = 0; static final int INVALID_CHARACTER = -1; static final int MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT = 0x7F; // 127 @@ -302,14 +234,13 @@ public final class FormatSpec { // This option needs to be the same numeric value as the one in binary_format.h. static final int NOT_VALID_WORD = -99; - static final int SIGNED_CHILDREN_ADDRESS_SIZE = 3; static final int UINT8_MAX = 0xFF; static final int UINT16_MAX = 0xFFFF; static final int UINT24_MAX = 0xFFFFFF; - static final int SINT24_MAX = 0x7FFFFF; static final int MSB8 = 0x80; - static final int MSB24 = 0x800000; + static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; + static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; /** * Options about file format. diff --git a/java/src/com/android/inputmethod/latin/makedict/NgramProperty.java b/java/src/com/android/inputmethod/latin/makedict/NgramProperty.java new file mode 100644 index 000000000..99e0e273f --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/NgramProperty.java @@ -0,0 +1,26 @@ +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.latin.NgramContext; + +public class NgramProperty { + public final WeightedString mTargetWord; + public final NgramContext mNgramContext; + + public NgramProperty(final WeightedString targetWord, final NgramContext ngramContext) { + mTargetWord = targetWord; + mNgramContext = ngramContext; + } + + @Override + public int hashCode() { + return mTargetWord.hashCode() ^ mNgramContext.hashCode(); + } + + @Override + public boolean equals(Object o) { + if (o == this) return true; + if (!(o instanceof NgramProperty)) return false; + final NgramProperty n = (NgramProperty)o; + return mTargetWord.equals(n.mTargetWord) && mNgramContext.equals(n.mNgramContext); + } +} diff --git a/java/src/com/android/inputmethod/latin/makedict/WordProperty.java b/java/src/com/android/inputmethod/latin/makedict/WordProperty.java index cd78e2235..46705f9db 100644 --- a/java/src/com/android/inputmethod/latin/makedict/WordProperty.java +++ b/java/src/com/android/inputmethod/latin/makedict/WordProperty.java @@ -18,6 +18,8 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.BinaryDictionary; +import com.android.inputmethod.latin.NgramContext; +import com.android.inputmethod.latin.NgramContext.WordInfo; import com.android.inputmethod.latin.utils.CombinedFormatUtils; import com.android.inputmethod.latin.utils.StringUtils; @@ -33,16 +35,17 @@ public final class WordProperty implements Comparable<WordProperty> { public final String mWord; public final ProbabilityInfo mProbabilityInfo; public final ArrayList<WeightedString> mShortcutTargets; - public final ArrayList<WeightedString> mBigrams; + public final ArrayList<NgramProperty> mNgrams; // TODO: Support mIsBeginningOfSentence. public final boolean mIsBeginningOfSentence; public final boolean mIsNotAWord; public final boolean mIsBlacklistEntry; public final boolean mHasShortcuts; - public final boolean mHasBigrams; + public final boolean mHasNgrams; private int mHashCode = 0; + // TODO: Support n-gram. @UsedForTesting public WordProperty(final String word, final ProbabilityInfo probabilityInfo, final ArrayList<WeightedString> shortcutTargets, @@ -51,11 +54,17 @@ public final class WordProperty implements Comparable<WordProperty> { mWord = word; mProbabilityInfo = probabilityInfo; mShortcutTargets = shortcutTargets; - mBigrams = bigrams; + mNgrams = new ArrayList<>(); + final NgramContext ngramContext = new NgramContext(new WordInfo(mWord)); + if (bigrams != null) { + for (final WeightedString bigramTarget : bigrams) { + mNgrams.add(new NgramProperty(bigramTarget, ngramContext)); + } + } mIsBeginningOfSentence = false; mIsNotAWord = isNotAWord; mIsBlacklistEntry = isBlacklistEntry; - mHasBigrams = bigrams != null && !bigrams.isEmpty(); + mHasNgrams = bigrams != null && !bigrams.isEmpty(); mHasShortcuts = shortcutTargets != null && !shortcutTargets.isEmpty(); } @@ -78,19 +87,24 @@ public final class WordProperty implements Comparable<WordProperty> { mWord = StringUtils.getStringFromNullTerminatedCodePointArray(codePoints); mProbabilityInfo = createProbabilityInfoFromArray(probabilityInfo); mShortcutTargets = new ArrayList<>(); - mBigrams = new ArrayList<>(); + mNgrams = new ArrayList<>(); mIsBeginningOfSentence = isBeginningOfSentence; mIsNotAWord = isNotAWord; mIsBlacklistEntry = isBlacklisted; mHasShortcuts = hasShortcuts; - mHasBigrams = hasBigram; - - final int bigramTargetCount = bigramTargets.size(); - for (int i = 0; i < bigramTargetCount; i++) { - final String bigramTargetString = + mHasNgrams = hasBigram; + + final int relatedNgramCount = bigramTargets.size(); + final WordInfo currentWordInfo = + mIsBeginningOfSentence ? WordInfo.BEGINNING_OF_SENTENCE : new WordInfo(mWord); + final NgramContext ngramContext = new NgramContext(currentWordInfo); + for (int i = 0; i < relatedNgramCount; i++) { + final String ngramTargetString = StringUtils.getStringFromNullTerminatedCodePointArray(bigramTargets.get(i)); - mBigrams.add(new WeightedString(bigramTargetString, - createProbabilityInfoFromArray(bigramProbabilityInfo.get(i)))); + final WeightedString ngramTarget = new WeightedString(ngramTargetString, + createProbabilityInfoFromArray(bigramProbabilityInfo.get(i))); + // TODO: Support n-gram. + mNgrams.add(new NgramProperty(ngramTarget, ngramContext)); } final int shortcutTargetCount = shortcutTargets.size(); @@ -102,6 +116,17 @@ public final class WordProperty implements Comparable<WordProperty> { } } + // TODO: Remove + public ArrayList<WeightedString> getBigrams() { + final ArrayList<WeightedString> bigrams = new ArrayList<>(); + for (final NgramProperty ngram : mNgrams) { + if (ngram.mNgramContext.getPrevWordCount() == 1) { + bigrams.add(ngram.mTargetWord); + } + } + return bigrams; + } + public int getProbability() { return mProbabilityInfo.mProbability; } @@ -110,8 +135,8 @@ public final class WordProperty implements Comparable<WordProperty> { return Arrays.hashCode(new Object[] { word.mWord, word.mProbabilityInfo, - word.mShortcutTargets.hashCode(), - word.mBigrams.hashCode(), + word.mShortcutTargets, + word.mNgrams, word.mIsNotAWord, word.mIsBlacklistEntry }); @@ -142,9 +167,9 @@ public final class WordProperty implements Comparable<WordProperty> { if (!(o instanceof WordProperty)) return false; WordProperty w = (WordProperty)o; return mProbabilityInfo.equals(w.mProbabilityInfo) && mWord.equals(w.mWord) - && mShortcutTargets.equals(w.mShortcutTargets) && mBigrams.equals(w.mBigrams) + && mShortcutTargets.equals(w.mShortcutTargets) && mNgrams.equals(w.mNgrams) && mIsNotAWord == w.mIsNotAWord && mIsBlacklistEntry == w.mIsBlacklistEntry - && mHasBigrams == w.mHasBigrams && mHasShortcuts && w.mHasBigrams; + && mHasNgrams == w.mHasNgrams && mHasShortcuts && w.mHasNgrams; } @Override |