diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict')
3 files changed, 164 insertions, 61 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java index 5ed910c50..6210c923e 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java @@ -28,7 +28,6 @@ import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.Iterator; -import java.util.List; import java.util.Map; import java.util.Stack; @@ -95,7 +94,9 @@ public final class BinaryDictIOUtils { final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags, formatOptions); - if (!isMovedGroup + final boolean isDeletedGroup = BinaryDictInputOutput.isDeletedGroup(info.mFlags, + formatOptions); + if (!isMovedGroup && !isDeletedGroup && info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word words.put(info.mOriginalAddress, new String(pushedChars, 0, index)); frequencies.put(info.mOriginalAddress, info.mFrequency); @@ -170,19 +171,19 @@ public final class BinaryDictIOUtils { if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; do { - int groupOffset = buffer.position() - header.mHeaderSize; final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); - groupOffset += BinaryDictInputOutput.getGroupCountSize(charGroupCount); - boolean foundNextCharGroup = false; for (int i = 0; i < charGroupCount; ++i) { final int charGroupPos = buffer.position(); final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, buffer.position(), header.mFormatOptions); - if (BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, - header.mFormatOptions)) { - continue; - } + final boolean isMovedGroup = + BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, + header.mFormatOptions); + final boolean isDeletedGroup = + BinaryDictInputOutput.isDeletedGroup(currentInfo.mFlags, + header.mFormatOptions); + if (isMovedGroup) continue; boolean same = true; for (int p = 0, j = word.offsetByCodePoints(0, wordPos); p < currentInfo.mCharacters.length; @@ -197,7 +198,8 @@ public final class BinaryDictIOUtils { if (same) { // found the group matches the word. if (wordPos + currentInfo.mCharacters.length == wordLen) { - if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) { + if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL + || isDeletedGroup) { return FormatSpec.NOT_VALID_WORD; } else { return charGroupPos; @@ -211,7 +213,6 @@ public final class BinaryDictIOUtils { buffer.position(currentInfo.mChildrenAddress); break; } - groupOffset = currentInfo.mEndAddress; } // If we found the next char group, it is under the file pointer. @@ -233,6 +234,10 @@ public final class BinaryDictIOUtils { return FormatSpec.NOT_VALID_WORD; } + private static int markAsDeleted(final int flags) { + return (flags & (~FormatSpec.MASK_GROUP_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED; + } + /** * Delete the word from the binary file. * @@ -250,9 +255,8 @@ public final class BinaryDictIOUtils { buffer.position(wordPosition); final int flags = buffer.readUnsignedByte(); - final int newFlags = flags ^ FormatSpec.FLAG_IS_TERMINAL; buffer.position(wordPosition); - buffer.put((byte)newFlags); + buffer.put((byte)markAsDeleted(flags)); } /** @@ -302,7 +306,7 @@ public final class BinaryDictIOUtils { } /** - * Update a parent address in a CharGroup that is addressed by groupOriginAddress. + * Update a parent address in a CharGroup that is referred to by groupOriginAddress. * * @param buffer the buffer to write. * @param groupOriginAddress the address of the group. @@ -318,11 +322,82 @@ public final class BinaryDictIOUtils { throw new RuntimeException("this file format does not support parent addresses"); } final int flags = buffer.readUnsignedByte(); + if (BinaryDictInputOutput.isMovedGroup(flags, formatOptions)) { + // if the group is moved, the parent address is stored in the destination group. + // We are guaranteed to process the destination group later, so there is no need to + // update anything here. + buffer.position(originalPosition); + return; + } + if (DBG) { + MakedictLog.d("update parent address flags=" + flags + ", " + groupOriginAddress); + } final int parentOffset = newParentAddress - groupOriginAddress; writeSInt24ToBuffer(buffer, parentOffset); buffer.position(originalPosition); } + private static void skipCharGroup(final FusionDictionaryBufferInterface buffer, + final FormatOptions formatOptions) { + final int flags = buffer.readUnsignedByte(); + BinaryDictInputOutput.readParentAddress(buffer, formatOptions); + skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); + BinaryDictInputOutput.readChildrenAddress(buffer, flags, formatOptions); + if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); + if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) { + final int shortcutsSize = buffer.readUnsignedShort(); + buffer.position(buffer.position() + shortcutsSize + - FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE); + } + if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) { + int bigramCount = 0; + while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { + final int bigramFlags = buffer.readUnsignedByte(); + switch (bigramFlags & FormatSpec.MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + buffer.readUnsignedByte(); + break; + case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + buffer.readUnsignedShort(); + break; + case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + buffer.readUnsignedInt24(); + break; + } + if ((bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT) == 0) break; + } + if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { + throw new RuntimeException("Too many bigrams in a group."); + } + } + } + + /** + * Update parent addresses in a Node that is referred to by nodeOriginAddress. + * + * @param buffer the buffer to be modified. + * @param nodeOriginAddress the address of a modified Node. + * @param newParentAddress the address to be written + * @param formatOptions file format options + */ + public static void updateParentAddresses(final FusionDictionaryBufferInterface buffer, + final int nodeOriginAddress, final int newParentAddress, + final FormatOptions formatOptions) { + final int originalPosition = buffer.position(); + buffer.position(nodeOriginAddress); + do { + final int count = BinaryDictInputOutput.readCharGroupCount(buffer); + for (int i = 0; i < count; ++i) { + updateParentAddress(buffer, buffer.position(), newParentAddress, formatOptions); + skipCharGroup(buffer, formatOptions); + } + final int forwardLinkAddress = buffer.readUnsignedInt24(); + buffer.position(forwardLinkAddress); + } while (formatOptions.mSupportsDynamicUpdate + && buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS); + buffer.position(originalPosition); + } + private static void skipString(final FusionDictionaryBufferInterface buffer, final boolean hasMultipleChars) { if (hasMultipleChars) { @@ -380,7 +455,7 @@ public final class BinaryDictIOUtils { final int flags = buffer.readUnsignedByte(); final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions); skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); - if ((FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); + if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position(); writeSInt24ToBuffer(buffer, childrenOffset); diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index ba29523c8..78171d03c 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -402,6 +402,14 @@ public final class BinaryDictInputOutput { } /** + * Helper method to check whether the group is deleted. + */ + public static boolean isDeletedGroup(final int flags, final FormatOptions formatOptions) { + return formatOptions.mSupportsDynamicUpdate + && ((flags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) == FormatSpec.FLAG_IS_DELETED); + } + + /** * Helper method to check whether the dictionary can be updated dynamically. */ public static boolean supportsDynamicUpdate(final FormatOptions options) { @@ -720,53 +728,60 @@ public final class BinaryDictInputOutput { return 3; } - private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress, - final int childrenOffset, final FormatOptions formatOptions) { + /** + * Makes the flag value for a char group. + * + * @param hasMultipleChars whether the group has multiple chars. + * @param isTerminal whether the group is terminal. + * @param childrenAddressSize the size of a children address. + * @param hasShortcuts whether the group has shortcuts. + * @param hasBigrams whether the group has bigrams. + * @param isNotAWord whether the group is not a word. + * @param isBlackListEntry whether the group is a blacklist entry. + * @param formatOptions file format options. + * @return the flags + */ + static int makeCharGroupFlags(final boolean hasMultipleChars, final boolean isTerminal, + final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams, + final boolean isNotAWord, final boolean isBlackListEntry, + final FormatOptions formatOptions) { byte flags = 0; - if (group.mChars.length > 1) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS; - if (group.mFrequency >= 0) { - flags |= FormatSpec.FLAG_IS_TERMINAL; - } - if (null != group.mChildren) { - final int byteSize = formatOptions.mSupportsDynamicUpdate - ? FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE : getByteSize(childrenOffset); - switch (byteSize) { - case 1: - flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; - break; - case 2: - flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; - break; - case 3: - flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; - break; - default: - throw new RuntimeException("Node with a strange address"); - } - } else if (formatOptions.mSupportsDynamicUpdate) { - flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; - } - if (null != group.mShortcutTargets) { - if (DBG && 0 == group.mShortcutTargets.size()) { - throw new RuntimeException("0-sized shortcut list must be null"); - } - flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS; - } - if (null != group.mBigrams) { - if (DBG && 0 == group.mBigrams.size()) { - throw new RuntimeException("0-sized bigram list must be null"); + if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS; + if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL; + if (formatOptions.mSupportsDynamicUpdate) { + flags |= FormatSpec.FLAG_IS_NOT_MOVED; + } else if (true) { + switch (childrenAddressSize) { + case 1: + flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; + break; + case 2: + flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; + break; + case 3: + flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; + break; + case 0: + flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS; + break; + default: + throw new RuntimeException("Node with a strange address"); } - flags |= FormatSpec.FLAG_HAS_BIGRAMS; - } - if (group.mIsNotAWord) { - flags |= FormatSpec.FLAG_IS_NOT_A_WORD; - } - if (group.mIsBlacklistEntry) { - flags |= FormatSpec.FLAG_IS_BLACKLISTED; } + if (hasShortcuts) flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS; + if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS; + if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD; + if (isBlackListEntry) flags |= FormatSpec.FLAG_IS_BLACKLISTED; return flags; } + private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress, + final int childrenOffset, final FormatOptions formatOptions) { + return (byte) makeCharGroupFlags(group.mChars.length > 1, group.mFrequency >= 0, + getByteSize(childrenOffset), group.mShortcutTargets != null, group.mBigrams != null, + group.mIsNotAWord, group.mIsBlacklistEntry, formatOptions); + } + /** * Makes the flag value for a bigram. * @@ -1194,7 +1209,7 @@ public final class BinaryDictInputOutput { } } - private static int readChildrenAddress(final FusionDictionaryBufferInterface buffer, + static int readChildrenAddress(final FusionDictionaryBufferInterface buffer, final int optionFlags, final FormatOptions options) { if (options.mSupportsDynamicUpdate) { final int address = buffer.readUnsignedInt24(); @@ -1290,7 +1305,8 @@ public final class BinaryDictInputOutput { ArrayList<PendingAttribute> bigrams = null; if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { bigrams = new ArrayList<PendingAttribute>(); - while (true) { + int bigramCount = 0; + while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { final int bigramFlags = buffer.readUnsignedByte(); ++addressPointer; final int sign = 0 == (bigramFlags & FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE) @@ -1318,6 +1334,9 @@ public final class BinaryDictInputOutput { bigramAddress)); if (0 == (bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT)) break; } + if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { + MakedictLog.d("too many bigrams in a group."); + } } return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency, parentAddress, childrenAddress, shortcutTargets, bigrams); diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index b3fbb9fb5..ca851c6a9 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -59,9 +59,11 @@ public final class FormatSpec { * l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES * g | ELSE - * s | is moved ? 2 bits, 11 = no - * | 01 = yes + * s | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED + * | This must be the same as FLAG_GROUP_ADDRESS_TYPE_THREEBYTES + * | 01 = yes : FLAG_IS_MOVED * | the new address is stored in the same place as the parent address + * | is deleted? 10 = yes : FLAG_IS_DELETED * | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS * | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS @@ -170,6 +172,7 @@ public final class FormatSpec { static final int PARENT_ADDRESS_SIZE = 3; static final int FORWARD_LINK_ADDRESS_SIZE = 3; + // These flags are used only in the static dictionary. static final int MASK_GROUP_ADDRESS_TYPE = 0xC0; static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; @@ -183,7 +186,12 @@ public final class FormatSpec { static final int FLAG_HAS_BIGRAMS = 0x04; static final int FLAG_IS_NOT_A_WORD = 0x02; static final int FLAG_IS_BLACKLISTED = 0x01; - static final int FLAG_IS_MOVED = 0x40; + + // These flags are used only in the dynamic dictionary. + static final int FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE = 0x40; + static final int FLAG_IS_MOVED = 0x00 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE; + static final int FLAG_IS_NOT_MOVED = 0x80 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE; + static final int FLAG_IS_DELETED = 0x80; static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; @@ -210,6 +218,7 @@ public final class FormatSpec { static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127 static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767 + static final int MAX_BIGRAMS_IN_A_GROUP = 10000; static final int MAX_TERMINAL_FREQUENCY = 255; static final int MAX_BIGRAM_FREQUENCY = 15; |