diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict')
3 files changed, 363 insertions, 69 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java index 7b0231a6b..6210c923e 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java @@ -17,13 +17,17 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.latin.Constants; +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.CharEncoding; import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface; import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import java.io.IOException; +import java.io.OutputStream; import java.util.ArrayList; +import java.util.Iterator; import java.util.Map; import java.util.Stack; @@ -90,7 +94,9 @@ public final class BinaryDictIOUtils { final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags, formatOptions); - if (!isMovedGroup + final boolean isDeletedGroup = BinaryDictInputOutput.isDeletedGroup(info.mFlags, + formatOptions); + if (!isMovedGroup && !isDeletedGroup && info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word words.put(info.mOriginalAddress, new String(pushedChars, 0, index)); frequencies.put(info.mOriginalAddress, info.mFrequency); @@ -165,19 +171,19 @@ public final class BinaryDictIOUtils { if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; do { - int groupOffset = buffer.position() - header.mHeaderSize; final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); - groupOffset += BinaryDictInputOutput.getGroupCountSize(charGroupCount); - boolean foundNextCharGroup = false; for (int i = 0; i < charGroupCount; ++i) { final int charGroupPos = buffer.position(); final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, buffer.position(), header.mFormatOptions); - if (BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, - header.mFormatOptions)) { - continue; - } + final boolean isMovedGroup = + BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, + header.mFormatOptions); + final boolean isDeletedGroup = + BinaryDictInputOutput.isDeletedGroup(currentInfo.mFlags, + header.mFormatOptions); + if (isMovedGroup) continue; boolean same = true; for (int p = 0, j = word.offsetByCodePoints(0, wordPos); p < currentInfo.mCharacters.length; @@ -192,7 +198,8 @@ public final class BinaryDictIOUtils { if (same) { // found the group matches the word. if (wordPos + currentInfo.mCharacters.length == wordLen) { - if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) { + if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL + || isDeletedGroup) { return FormatSpec.NOT_VALID_WORD; } else { return charGroupPos; @@ -206,7 +213,6 @@ public final class BinaryDictIOUtils { buffer.position(currentInfo.mChildrenAddress); break; } - groupOffset = currentInfo.mEndAddress; } // If we found the next char group, it is under the file pointer. @@ -228,6 +234,10 @@ public final class BinaryDictIOUtils { return FormatSpec.NOT_VALID_WORD; } + private static int markAsDeleted(final int flags) { + return (flags & (~FormatSpec.MASK_GROUP_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED; + } + /** * Delete the word from the binary file. * @@ -245,21 +255,58 @@ public final class BinaryDictIOUtils { buffer.position(wordPosition); final int flags = buffer.readUnsignedByte(); - final int newFlags = flags ^ FormatSpec.FLAG_IS_TERMINAL; buffer.position(wordPosition); - buffer.put((byte)newFlags); + buffer.put((byte)markAsDeleted(flags)); } - private static void putSInt24(final FusionDictionaryBufferInterface buffer, + /** + * @return the size written, in bytes. Always 3 bytes. + */ + private static int writeSInt24ToBuffer(final FusionDictionaryBufferInterface buffer, final int value) { final int absValue = Math.abs(value); buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); buffer.put((byte)((absValue >> 8) & 0xFF)); buffer.put((byte)(absValue & 0xFF)); + return 3; + } + + /** + * @return the size written, in bytes. Always 3 bytes. + */ + private static int writeSInt24ToStream(final OutputStream destination, final int value) + throws IOException { + final int absValue = Math.abs(value); + destination.write((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); + destination.write((byte)((absValue >> 8) & 0xFF)); + destination.write((byte)(absValue & 0xFF)); + return 3; + } + + /** + * @return the size written, in bytes. 1, 2, or 3 bytes. + */ + private static int writeVariableAddress(final OutputStream destination, final int value) + throws IOException { + switch (BinaryDictInputOutput.getByteSize(value)) { + case 1: + destination.write((byte)value); + break; + case 2: + destination.write((byte)(0xFF & (value >> 8))); + destination.write((byte)(0xFF & value)); + break; + case 3: + destination.write((byte)(0xFF & (value >> 16))); + destination.write((byte)(0xFF & (value >> 8))); + destination.write((byte)(0xFF & value)); + break; + } + return BinaryDictInputOutput.getByteSize(value); } /** - * Update a parent address in a CharGroup that is addressed by groupOriginAddress. + * Update a parent address in a CharGroup that is referred to by groupOriginAddress. * * @param buffer the buffer to write. * @param groupOriginAddress the address of the group. @@ -275,8 +322,227 @@ public final class BinaryDictIOUtils { throw new RuntimeException("this file format does not support parent addresses"); } final int flags = buffer.readUnsignedByte(); + if (BinaryDictInputOutput.isMovedGroup(flags, formatOptions)) { + // if the group is moved, the parent address is stored in the destination group. + // We are guaranteed to process the destination group later, so there is no need to + // update anything here. + buffer.position(originalPosition); + return; + } + if (DBG) { + MakedictLog.d("update parent address flags=" + flags + ", " + groupOriginAddress); + } final int parentOffset = newParentAddress - groupOriginAddress; - putSInt24(buffer, parentOffset); + writeSInt24ToBuffer(buffer, parentOffset); + buffer.position(originalPosition); + } + + private static void skipCharGroup(final FusionDictionaryBufferInterface buffer, + final FormatOptions formatOptions) { + final int flags = buffer.readUnsignedByte(); + BinaryDictInputOutput.readParentAddress(buffer, formatOptions); + skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); + BinaryDictInputOutput.readChildrenAddress(buffer, flags, formatOptions); + if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); + if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) { + final int shortcutsSize = buffer.readUnsignedShort(); + buffer.position(buffer.position() + shortcutsSize + - FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE); + } + if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) { + int bigramCount = 0; + while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { + final int bigramFlags = buffer.readUnsignedByte(); + switch (bigramFlags & FormatSpec.MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + buffer.readUnsignedByte(); + break; + case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + buffer.readUnsignedShort(); + break; + case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + buffer.readUnsignedInt24(); + break; + } + if ((bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT) == 0) break; + } + if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { + throw new RuntimeException("Too many bigrams in a group."); + } + } + } + + /** + * Update parent addresses in a Node that is referred to by nodeOriginAddress. + * + * @param buffer the buffer to be modified. + * @param nodeOriginAddress the address of a modified Node. + * @param newParentAddress the address to be written + * @param formatOptions file format options + */ + public static void updateParentAddresses(final FusionDictionaryBufferInterface buffer, + final int nodeOriginAddress, final int newParentAddress, + final FormatOptions formatOptions) { + final int originalPosition = buffer.position(); + buffer.position(nodeOriginAddress); + do { + final int count = BinaryDictInputOutput.readCharGroupCount(buffer); + for (int i = 0; i < count; ++i) { + updateParentAddress(buffer, buffer.position(), newParentAddress, formatOptions); + skipCharGroup(buffer, formatOptions); + } + final int forwardLinkAddress = buffer.readUnsignedInt24(); + buffer.position(forwardLinkAddress); + } while (formatOptions.mSupportsDynamicUpdate + && buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS); + buffer.position(originalPosition); + } + + private static void skipString(final FusionDictionaryBufferInterface buffer, + final boolean hasMultipleChars) { + if (hasMultipleChars) { + int character = CharEncoding.readChar(buffer); + while (character != FormatSpec.INVALID_CHARACTER) { + character = CharEncoding.readChar(buffer); + } + } else { + CharEncoding.readChar(buffer); + } + } + + /** + * Write a string to a stream. + * + * @param destination the stream to write. + * @param word the string to be written. + * @return the size written, in bytes. + * @throws IOException + */ + private static int writeString(final OutputStream destination, final String word) + throws IOException { + int size = 0; + final int length = word.length(); + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + final int codePoint = word.codePointAt(i); + if (CharEncoding.getCharSize(codePoint) == 1) { + destination.write((byte)codePoint); + size++; + } else { + destination.write((byte)(0xFF & (codePoint >> 16))); + destination.write((byte)(0xFF & (codePoint >> 8))); + destination.write((byte)(0xFF & codePoint)); + size += 3; + } + } + destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR); + size++; + return size; + } + + /** + * Update a children address in a CharGroup that is addressed by groupOriginAddress. + * + * @param buffer the buffer to write. + * @param groupOriginAddress the address of the group. + * @param newChildrenAddress the absolute address of the child. + * @param formatOptions file format options. + */ + public static void updateChildrenAddress(final FusionDictionaryBufferInterface buffer, + final int groupOriginAddress, final int newChildrenAddress, + final FormatOptions formatOptions) { + final int originalPosition = buffer.position(); + buffer.position(groupOriginAddress); + final int flags = buffer.readUnsignedByte(); + final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions); + skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); + if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); + final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS + ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position(); + writeSInt24ToBuffer(buffer, childrenOffset); buffer.position(originalPosition); } + + /** + * Write a char group to an output stream. + * A char group is an in-memory representation of a node in trie. + * A char group info is an on-disk representation of a node. + * + * @param destination the stream to write. + * @param info the char group info to be written. + * @return the size written, in bytes. + */ + public static int writeCharGroup(final OutputStream destination, final CharGroupInfo info) + throws IOException { + int size = 1; + destination.write((byte)info.mFlags); + final int parentOffset = info.mParentAddress == FormatSpec.NO_PARENT_ADDRESS ? + FormatSpec.NO_PARENT_ADDRESS : info.mParentAddress - info.mOriginalAddress; + size += writeSInt24ToStream(destination, parentOffset); + + for (int i = 0; i < info.mCharacters.length; ++i) { + if (CharEncoding.getCharSize(info.mCharacters[i]) == 1) { + destination.write((byte)info.mCharacters[i]); + size++; + } else { + size += writeSInt24ToStream(destination, info.mCharacters[i]); + } + } + if (info.mCharacters.length > 1) { + destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR); + size++; + } + + if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) { + destination.write((byte)info.mFrequency); + size++; + } + + final int childrenOffset = info.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS ? + 0 : info.mChildrenAddress - info.mOriginalAddress; + writeSInt24ToStream(destination, childrenOffset); + + if (info.mShortcutTargets != null && info.mShortcutTargets.size() > 0) { + final int shortcutListSize = + BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets); + destination.write((byte)(shortcutListSize >> 8)); + destination.write((byte)(shortcutListSize & 0xFF)); + size += 2; + final Iterator<WeightedString> shortcutIterator = info.mShortcutTargets.iterator(); + while (shortcutIterator.hasNext()) { + final WeightedString target = shortcutIterator.next(); + destination.write((byte)BinaryDictInputOutput.makeShortcutFlags( + shortcutIterator.hasNext(), target.mFrequency)); + size++; + size += writeString(destination, target.mWord); + } + } + + if (info.mBigrams != null) { + // TODO: Consolidate this code with the code that computes the size of the bigram list + // in BinaryDictionaryInputOutput#computeActualNodeSize + for (int i = 0; i < info.mBigrams.size(); ++i) { + final int bigramOffset = info.mBigrams.get(i).mAddress - info.mOriginalAddress; + final int bigramFrequency = info.mBigrams.get(i).mFrequency; + int bigramFlags = (i < info.mBigrams.size() - 1) + ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0; + bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0; + switch (BinaryDictInputOutput.getByteSize(bigramOffset)) { + case 1: + bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + break; + case 2: + bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + break; + case 3: + bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + break; + } + bigramFlags |= bigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY; + destination.write((byte)bigramFlags); + size++; + size += writeVariableAddress(destination, Math.abs(bigramOffset)); + } + } + return size; + } } diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index b431a4da9..78171d03c 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -124,8 +124,7 @@ public final class BinaryDictInputOutput { /** * A class grouping utility function for our specific character encoding. */ - private static final class CharEncoding { - + static final class CharEncoding { private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; @@ -154,7 +153,7 @@ public final class BinaryDictInputOutput { * @param character the character code. * @return the size in binary encoded-form, either 1 or 3 bytes. */ - private static int getCharSize(final int character) { + static int getCharSize(final int character) { // See char encoding in FusionDictionary.java if (fitsOnOneByte(character)) return 1; if (FormatSpec.INVALID_CHARACTER == character) return 1; @@ -263,7 +262,7 @@ public final class BinaryDictInputOutput { * @param buffer the buffer, positioned over an encoded character. * @return the character code. */ - private static int readChar(final FusionDictionaryBufferInterface buffer) { + static int readChar(final FusionDictionaryBufferInterface buffer) { int character = buffer.readUnsignedByte(); if (!fitsOnOneByte(character)) { if (FormatSpec.GROUP_CHARACTERS_TERMINATOR == character) { @@ -338,7 +337,7 @@ public final class BinaryDictInputOutput { * This is known in advance and does not change according to position in the file * like address lists do. */ - private static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { + static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { if (null == shortcutList) return 0; int size = FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE; for (final WeightedString shortcut : shortcutList) { @@ -403,6 +402,14 @@ public final class BinaryDictInputOutput { } /** + * Helper method to check whether the group is deleted. + */ + public static boolean isDeletedGroup(final int flags, final FormatOptions formatOptions) { + return formatOptions.mSupportsDynamicUpdate + && ((flags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) == FormatSpec.FLAG_IS_DELETED); + } + + /** * Helper method to check whether the dictionary can be updated dynamically. */ public static boolean supportsDynamicUpdate(final FormatOptions options) { @@ -439,7 +446,7 @@ public final class BinaryDictInputOutput { * @param address the address * @return the byte size. */ - private static int getByteSize(final int address) { + static int getByteSize(final int address) { assert(address <= UINT24_MAX); if (!hasChildrenAddress(address)) { return 0; @@ -721,53 +728,60 @@ public final class BinaryDictInputOutput { return 3; } - private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress, - final int childrenOffset, final FormatOptions formatOptions) { + /** + * Makes the flag value for a char group. + * + * @param hasMultipleChars whether the group has multiple chars. + * @param isTerminal whether the group is terminal. + * @param childrenAddressSize the size of a children address. + * @param hasShortcuts whether the group has shortcuts. + * @param hasBigrams whether the group has bigrams. + * @param isNotAWord whether the group is not a word. + * @param isBlackListEntry whether the group is a blacklist entry. + * @param formatOptions file format options. + * @return the flags + */ + static int makeCharGroupFlags(final boolean hasMultipleChars, final boolean isTerminal, + final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams, + final boolean isNotAWord, final boolean isBlackListEntry, + final FormatOptions formatOptions) { byte flags = 0; - if (group.mChars.length > 1) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS; - if (group.mFrequency >= 0) { - flags |= FormatSpec.FLAG_IS_TERMINAL; - } - if (null != group.mChildren) { - final int byteSize = formatOptions.mSupportsDynamicUpdate - ? FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE : getByteSize(childrenOffset); - switch (byteSize) { - case 1: - flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; - break; - case 2: - flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; - break; - case 3: - flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; - break; - default: - throw new RuntimeException("Node with a strange address"); - } - } else if (formatOptions.mSupportsDynamicUpdate) { - flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; - } - if (null != group.mShortcutTargets) { - if (DBG && 0 == group.mShortcutTargets.size()) { - throw new RuntimeException("0-sized shortcut list must be null"); - } - flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS; - } - if (null != group.mBigrams) { - if (DBG && 0 == group.mBigrams.size()) { - throw new RuntimeException("0-sized bigram list must be null"); + if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS; + if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL; + if (formatOptions.mSupportsDynamicUpdate) { + flags |= FormatSpec.FLAG_IS_NOT_MOVED; + } else if (true) { + switch (childrenAddressSize) { + case 1: + flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; + break; + case 2: + flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; + break; + case 3: + flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; + break; + case 0: + flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS; + break; + default: + throw new RuntimeException("Node with a strange address"); } - flags |= FormatSpec.FLAG_HAS_BIGRAMS; - } - if (group.mIsNotAWord) { - flags |= FormatSpec.FLAG_IS_NOT_A_WORD; - } - if (group.mIsBlacklistEntry) { - flags |= FormatSpec.FLAG_IS_BLACKLISTED; } + if (hasShortcuts) flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS; + if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS; + if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD; + if (isBlackListEntry) flags |= FormatSpec.FLAG_IS_BLACKLISTED; return flags; } + private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress, + final int childrenOffset, final FormatOptions formatOptions) { + return (byte) makeCharGroupFlags(group.mChars.length > 1, group.mFrequency >= 0, + getByteSize(childrenOffset), group.mShortcutTargets != null, group.mBigrams != null, + group.mIsNotAWord, group.mIsBlacklistEntry, formatOptions); + } + /** * Makes the flag value for a bigram. * @@ -859,7 +873,7 @@ public final class BinaryDictInputOutput { * @param frequency the frequency of the attribute, 0..15 * @return the flags */ - private static final int makeShortcutFlags(final boolean more, final int frequency) { + static final int makeShortcutFlags(final boolean more, final int frequency) { return (more ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0) + (frequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY); } @@ -897,6 +911,7 @@ public final class BinaryDictInputOutput { */ private static int writePlacedNode(final FusionDictionary dict, byte[] buffer, final Node node, final FormatOptions formatOptions) { + // TODO: Make the code in common with BinaryDictIOUtils#writeCharGroup int index = node.mCachedAddress; final int groupCount = node.mData.size(); @@ -1194,7 +1209,7 @@ public final class BinaryDictInputOutput { } } - private static int readChildrenAddress(final FusionDictionaryBufferInterface buffer, + static int readChildrenAddress(final FusionDictionaryBufferInterface buffer, final int optionFlags, final FormatOptions options) { if (options.mSupportsDynamicUpdate) { final int address = buffer.readUnsignedInt24(); @@ -1219,7 +1234,7 @@ public final class BinaryDictInputOutput { } } - private static int readParentAddress(final FusionDictionaryBufferInterface buffer, + static int readParentAddress(final FusionDictionaryBufferInterface buffer, final FormatOptions formatOptions) { if (supportsDynamicUpdate(formatOptions)) { final int parentAddress = buffer.readUnsignedInt24(); @@ -1290,7 +1305,8 @@ public final class BinaryDictInputOutput { ArrayList<PendingAttribute> bigrams = null; if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { bigrams = new ArrayList<PendingAttribute>(); - while (true) { + int bigramCount = 0; + while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { final int bigramFlags = buffer.readUnsignedByte(); ++addressPointer; final int sign = 0 == (bigramFlags & FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE) @@ -1318,6 +1334,9 @@ public final class BinaryDictInputOutput { bigramAddress)); if (0 == (bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT)) break; } + if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { + MakedictLog.d("too many bigrams in a group."); + } } return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency, parentAddress, childrenAddress, shortcutTargets, bigrams); diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index b3fbb9fb5..ca851c6a9 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -59,9 +59,11 @@ public final class FormatSpec { * l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES * g | ELSE - * s | is moved ? 2 bits, 11 = no - * | 01 = yes + * s | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED + * | This must be the same as FLAG_GROUP_ADDRESS_TYPE_THREEBYTES + * | 01 = yes : FLAG_IS_MOVED * | the new address is stored in the same place as the parent address + * | is deleted? 10 = yes : FLAG_IS_DELETED * | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS * | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS @@ -170,6 +172,7 @@ public final class FormatSpec { static final int PARENT_ADDRESS_SIZE = 3; static final int FORWARD_LINK_ADDRESS_SIZE = 3; + // These flags are used only in the static dictionary. static final int MASK_GROUP_ADDRESS_TYPE = 0xC0; static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; @@ -183,7 +186,12 @@ public final class FormatSpec { static final int FLAG_HAS_BIGRAMS = 0x04; static final int FLAG_IS_NOT_A_WORD = 0x02; static final int FLAG_IS_BLACKLISTED = 0x01; - static final int FLAG_IS_MOVED = 0x40; + + // These flags are used only in the dynamic dictionary. + static final int FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE = 0x40; + static final int FLAG_IS_MOVED = 0x00 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE; + static final int FLAG_IS_NOT_MOVED = 0x80 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE; + static final int FLAG_IS_DELETED = 0x80; static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; @@ -210,6 +218,7 @@ public final class FormatSpec { static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127 static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767 + static final int MAX_BIGRAMS_IN_A_GROUP = 10000; static final int MAX_TERMINAL_FREQUENCY = 255; static final int MAX_BIGRAM_FREQUENCY = 15; |