diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict')
9 files changed, 1243 insertions, 220 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java index 7a1b9dcb7..ee0e9cd7e 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java @@ -16,21 +16,34 @@ package com.android.inputmethod.latin.makedict; +import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.Constants; +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.CharEncoding; import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface; import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import java.io.IOException; +import java.io.OutputStream; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; import java.util.Map; import java.util.Stack; -public class BinaryDictIOUtils { +public final class BinaryDictIOUtils { private static final boolean DBG = false; + private static final int MSB24 = 0x800000; + private static final int SINT24_MAX = 0x7FFFFF; + private static final int MAX_JUMPS = 10000; - private static class Position { + private BinaryDictIOUtils() { + // This utility class is not publicly instantiable. + } + + private static final class Position { public static final int NOT_READ_GROUPCOUNT = -1; public int mAddress; @@ -77,7 +90,10 @@ public class BinaryDictIOUtils { p.mAddress += BinaryDictInputOutput.getGroupCountSize(p.mNumOfCharGroup); p.mPosition = 0; } - + if (p.mNumOfCharGroup == 0) { + stack.pop(); + continue; + } CharGroupInfo info = BinaryDictInputOutput.readCharGroup(buffer, p.mAddress - headerSize, formatOptions); for (int i = 0; i < info.mCharacters.length; ++i) { @@ -85,20 +101,36 @@ public class BinaryDictIOUtils { } p.mPosition++; - if (info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) { // found word + final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags, + formatOptions); + final boolean isDeletedGroup = BinaryDictInputOutput.isDeletedGroup(info.mFlags, + formatOptions); + if (!isMovedGroup && !isDeletedGroup + && info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word words.put(info.mOriginalAddress, new String(pushedChars, 0, index)); frequencies.put(info.mOriginalAddress, info.mFrequency); if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams); } if (p.mPosition == p.mNumOfCharGroup) { - stack.pop(); + if (formatOptions.mSupportsDynamicUpdate) { + final int forwardLinkAddress = buffer.readUnsignedInt24(); + if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) { + // the node has a forward link. + p.mNumOfCharGroup = Position.NOT_READ_GROUPCOUNT; + p.mAddress = forwardLinkAddress; + } else { + stack.pop(); + } + } else { + stack.pop(); + } } else { // the node has more groups. p.mAddress = buffer.position(); } - if (BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) { + if (!isMovedGroup && BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) { Position childrenPos = new Position(info.mChildrenAddress + headerSize, index); stack.push(childrenPos); } @@ -136,6 +168,7 @@ public class BinaryDictIOUtils { * @throws IOException * @throws UnsupportedFormatException */ + @UsedForTesting public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer, final String word) throws IOException, UnsupportedFormatException { if (word == null) return FormatSpec.NOT_VALID_WORD; @@ -146,48 +179,802 @@ public class BinaryDictIOUtils { final int wordLen = word.codePointCount(0, word.length()); for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) { if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; - int groupOffset = buffer.position() - header.mHeaderSize; + + do { + final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); + boolean foundNextCharGroup = false; + for (int i = 0; i < charGroupCount; ++i) { + final int charGroupPos = buffer.position(); + final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, + buffer.position(), header.mFormatOptions); + final boolean isMovedGroup = + BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, + header.mFormatOptions); + final boolean isDeletedGroup = + BinaryDictInputOutput.isDeletedGroup(currentInfo.mFlags, + header.mFormatOptions); + if (isMovedGroup) continue; + boolean same = true; + for (int p = 0, j = word.offsetByCodePoints(0, wordPos); + p < currentInfo.mCharacters.length; + ++p, j = word.offsetByCodePoints(j, 1)) { + if (wordPos + p >= wordLen + || word.codePointAt(j) != currentInfo.mCharacters[p]) { + same = false; + break; + } + } + + if (same) { + // found the group matches the word. + if (wordPos + currentInfo.mCharacters.length == wordLen) { + if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL + || isDeletedGroup) { + return FormatSpec.NOT_VALID_WORD; + } else { + return charGroupPos; + } + } + wordPos += currentInfo.mCharacters.length; + if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { + return FormatSpec.NOT_VALID_WORD; + } + foundNextCharGroup = true; + buffer.position(currentInfo.mChildrenAddress); + break; + } + } + + // If we found the next char group, it is under the file pointer. + // But if not, we are at the end of this node so we expect to have + // a forward link address that we need to consult and possibly resume + // search on the next node in the linked list. + if (foundNextCharGroup) break; + if (!header.mFormatOptions.mSupportsDynamicUpdate) { + return FormatSpec.NOT_VALID_WORD; + } + + final int forwardLinkAddress = buffer.readUnsignedInt24(); + if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { + return FormatSpec.NOT_VALID_WORD; + } + buffer.position(forwardLinkAddress); + } while(true); + } + return FormatSpec.NOT_VALID_WORD; + } + + private static int markAsDeleted(final int flags) { + return (flags & (~FormatSpec.MASK_GROUP_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED; + } + + /** + * Delete the word from the binary file. + * + * @param buffer the buffer to write. + * @param word the word we delete + * @throws IOException + * @throws UnsupportedFormatException + */ + @UsedForTesting + public static void deleteWord(final FusionDictionaryBufferInterface buffer, + final String word) throws IOException, UnsupportedFormatException { + buffer.position(0); + final FileHeader header = BinaryDictInputOutput.readHeader(buffer); + final int wordPosition = getTerminalPosition(buffer, word); + if (wordPosition == FormatSpec.NOT_VALID_WORD) return; + + buffer.position(wordPosition); + final int flags = buffer.readUnsignedByte(); + buffer.position(wordPosition); + buffer.put((byte)markAsDeleted(flags)); + } + + /** + * @return the size written, in bytes. Always 3 bytes. + */ + private static int writeSInt24ToBuffer(final FusionDictionaryBufferInterface buffer, + final int value) { + final int absValue = Math.abs(value); + buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); + buffer.put((byte)((absValue >> 8) & 0xFF)); + buffer.put((byte)(absValue & 0xFF)); + return 3; + } + + /** + * @return the size written, in bytes. Always 3 bytes. + */ + private static int writeSInt24ToStream(final OutputStream destination, final int value) + throws IOException { + final int absValue = Math.abs(value); + destination.write((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); + destination.write((byte)((absValue >> 8) & 0xFF)); + destination.write((byte)(absValue & 0xFF)); + return 3; + } + + /** + * @return the size written, in bytes. 1, 2, or 3 bytes. + */ + private static int writeVariableAddress(final OutputStream destination, final int value) + throws IOException { + switch (BinaryDictInputOutput.getByteSize(value)) { + case 1: + destination.write((byte)value); + break; + case 2: + destination.write((byte)(0xFF & (value >> 8))); + destination.write((byte)(0xFF & value)); + break; + case 3: + destination.write((byte)(0xFF & (value >> 16))); + destination.write((byte)(0xFF & (value >> 8))); + destination.write((byte)(0xFF & value)); + break; + } + return BinaryDictInputOutput.getByteSize(value); + } + + /** + * Update a parent address in a CharGroup that is referred to by groupOriginAddress. + * + * @param buffer the buffer to write. + * @param groupOriginAddress the address of the group. + * @param newParentAddress the absolute address of the parent. + * @param formatOptions file format options. + */ + public static void updateParentAddress(final FusionDictionaryBufferInterface buffer, + final int groupOriginAddress, final int newParentAddress, + final FormatOptions formatOptions) { + final int originalPosition = buffer.position(); + buffer.position(groupOriginAddress); + if (!formatOptions.mSupportsDynamicUpdate) { + throw new RuntimeException("this file format does not support parent addresses"); + } + final int flags = buffer.readUnsignedByte(); + if (BinaryDictInputOutput.isMovedGroup(flags, formatOptions)) { + // if the group is moved, the parent address is stored in the destination group. + // We are guaranteed to process the destination group later, so there is no need to + // update anything here. + buffer.position(originalPosition); + return; + } + if (DBG) { + MakedictLog.d("update parent address flags=" + flags + ", " + groupOriginAddress); + } + final int parentOffset = newParentAddress - groupOriginAddress; + writeSInt24ToBuffer(buffer, parentOffset); + buffer.position(originalPosition); + } + + private static void skipCharGroup(final FusionDictionaryBufferInterface buffer, + final FormatOptions formatOptions) { + final int flags = buffer.readUnsignedByte(); + BinaryDictInputOutput.readParentAddress(buffer, formatOptions); + skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); + BinaryDictInputOutput.readChildrenAddress(buffer, flags, formatOptions); + if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); + if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) { + final int shortcutsSize = buffer.readUnsignedShort(); + buffer.position(buffer.position() + shortcutsSize + - FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE); + } + if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) { + int bigramCount = 0; + while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { + final int bigramFlags = buffer.readUnsignedByte(); + switch (bigramFlags & FormatSpec.MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + buffer.readUnsignedByte(); + break; + case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + buffer.readUnsignedShort(); + break; + case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + buffer.readUnsignedInt24(); + break; + } + if ((bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT) == 0) break; + } + if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { + throw new RuntimeException("Too many bigrams in a group."); + } + } + } + + /** + * Update parent addresses in a Node that is referred to by nodeOriginAddress. + * + * @param buffer the buffer to be modified. + * @param nodeOriginAddress the address of a modified Node. + * @param newParentAddress the address to be written. + * @param formatOptions file format options. + */ + public static void updateParentAddresses(final FusionDictionaryBufferInterface buffer, + final int nodeOriginAddress, final int newParentAddress, + final FormatOptions formatOptions) { + final int originalPosition = buffer.position(); + buffer.position(nodeOriginAddress); + do { + final int count = BinaryDictInputOutput.readCharGroupCount(buffer); + for (int i = 0; i < count; ++i) { + updateParentAddress(buffer, buffer.position(), newParentAddress, formatOptions); + skipCharGroup(buffer, formatOptions); + } + final int forwardLinkAddress = buffer.readUnsignedInt24(); + buffer.position(forwardLinkAddress); + } while (formatOptions.mSupportsDynamicUpdate + && buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS); + buffer.position(originalPosition); + } + + private static void skipString(final FusionDictionaryBufferInterface buffer, + final boolean hasMultipleChars) { + if (hasMultipleChars) { + int character = CharEncoding.readChar(buffer); + while (character != FormatSpec.INVALID_CHARACTER) { + character = CharEncoding.readChar(buffer); + } + } else { + CharEncoding.readChar(buffer); + } + } + + /** + * Write a string to a stream. + * + * @param destination the stream to write. + * @param word the string to be written. + * @return the size written, in bytes. + * @throws IOException + */ + private static int writeString(final OutputStream destination, final String word) + throws IOException { + int size = 0; + final int length = word.length(); + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + final int codePoint = word.codePointAt(i); + if (CharEncoding.getCharSize(codePoint) == 1) { + destination.write((byte)codePoint); + size++; + } else { + destination.write((byte)(0xFF & (codePoint >> 16))); + destination.write((byte)(0xFF & (codePoint >> 8))); + destination.write((byte)(0xFF & codePoint)); + size += 3; + } + } + destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR); + size += FormatSpec.GROUP_TERMINATOR_SIZE; + return size; + } + + /** + * Update a children address in a CharGroup that is addressed by groupOriginAddress. + * + * @param buffer the buffer to write. + * @param groupOriginAddress the address of the group. + * @param newChildrenAddress the absolute address of the child. + * @param formatOptions file format options. + */ + public static void updateChildrenAddress(final FusionDictionaryBufferInterface buffer, + final int groupOriginAddress, final int newChildrenAddress, + final FormatOptions formatOptions) { + final int originalPosition = buffer.position(); + buffer.position(groupOriginAddress); + final int flags = buffer.readUnsignedByte(); + final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions); + skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); + if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); + final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS + ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position(); + writeSInt24ToBuffer(buffer, childrenOffset); + buffer.position(originalPosition); + } + + /** + * Write a char group to an output stream. + * A char group is an in-memory representation of a node in trie. + * A char group info is an on-disk representation of a node. + * + * @param destination the stream to write. + * @param info the char group info to be written. + * @return the size written, in bytes. + */ + public static int writeCharGroup(final OutputStream destination, final CharGroupInfo info) + throws IOException { + int size = FormatSpec.GROUP_FLAGS_SIZE; + destination.write((byte)info.mFlags); + final int parentOffset = info.mParentAddress == FormatSpec.NO_PARENT_ADDRESS ? + FormatSpec.NO_PARENT_ADDRESS : info.mParentAddress - info.mOriginalAddress; + size += writeSInt24ToStream(destination, parentOffset); + + for (int i = 0; i < info.mCharacters.length; ++i) { + if (CharEncoding.getCharSize(info.mCharacters[i]) == 1) { + destination.write((byte)info.mCharacters[i]); + size++; + } else { + size += writeSInt24ToStream(destination, info.mCharacters[i]); + } + } + if (info.mCharacters.length > 1) { + destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR); + size++; + } + + if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) { + destination.write((byte)info.mFrequency); + size++; + } + + if (DBG) { + MakedictLog.d("writeCharGroup origin=" + info.mOriginalAddress + ", size=" + size + + ", child=" + info.mChildrenAddress + ", characters =" + + new String(info.mCharacters, 0, info.mCharacters.length)); + } + final int childrenOffset = info.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS ? + 0 : info.mChildrenAddress - (info.mOriginalAddress + size); + writeSInt24ToStream(destination, childrenOffset); + size += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; + + if (info.mShortcutTargets != null && info.mShortcutTargets.size() > 0) { + final int shortcutListSize = + BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets); + destination.write((byte)(shortcutListSize >> 8)); + destination.write((byte)(shortcutListSize & 0xFF)); + size += 2; + final Iterator<WeightedString> shortcutIterator = info.mShortcutTargets.iterator(); + while (shortcutIterator.hasNext()) { + final WeightedString target = shortcutIterator.next(); + destination.write((byte)BinaryDictInputOutput.makeShortcutFlags( + shortcutIterator.hasNext(), target.mFrequency)); + size++; + size += writeString(destination, target.mWord); + } + } + + if (info.mBigrams != null) { + // TODO: Consolidate this code with the code that computes the size of the bigram list + // in BinaryDictionaryInputOutput#computeActualNodeSize + for (int i = 0; i < info.mBigrams.size(); ++i) { + + final int bigramFrequency = info.mBigrams.get(i).mFrequency; + int bigramFlags = (i < info.mBigrams.size() - 1) + ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0; + size++; + final int bigramOffset = info.mBigrams.get(i).mAddress - (info.mOriginalAddress + + size); + bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0; + switch (BinaryDictInputOutput.getByteSize(bigramOffset)) { + case 1: + bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + break; + case 2: + bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + break; + case 3: + bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + break; + } + bigramFlags |= bigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY; + destination.write((byte)bigramFlags); + size += writeVariableAddress(destination, Math.abs(bigramOffset)); + } + } + return size; + } + + @SuppressWarnings("unused") + private static void updateForwardLink(final FusionDictionaryBufferInterface buffer, + final int nodeOriginAddress, final int newNodeAddress, + final FormatOptions formatOptions) { + buffer.position(nodeOriginAddress); + int jumpCount = 0; + while (jumpCount++ < MAX_JUMPS) { + final int count = BinaryDictInputOutput.readCharGroupCount(buffer); + for (int i = 0; i < count; ++i) skipCharGroup(buffer, formatOptions); + final int forwardLinkAddress = buffer.readUnsignedInt24(); + if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { + buffer.position(buffer.position() - FormatSpec.FORWARD_LINK_ADDRESS_SIZE); + writeSInt24ToBuffer(buffer, newNodeAddress); + return; + } + buffer.position(forwardLinkAddress); + } + if (DBG && jumpCount >= MAX_JUMPS) { + throw new RuntimeException("too many jumps, probably a bug."); + } + } + + /** + * Helper method to move a char group to the tail of the file. + */ + private static int moveCharGroup(final OutputStream destination, + final FusionDictionaryBufferInterface buffer, final CharGroupInfo info, + final int nodeOriginAddress, final int oldGroupAddress, + final FormatOptions formatOptions) throws IOException { + updateParentAddress(buffer, oldGroupAddress, buffer.limit() + 1, formatOptions); + buffer.position(oldGroupAddress); + final int currentFlags = buffer.readUnsignedByte(); + buffer.position(oldGroupAddress); + buffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags + & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG)))); + int size = FormatSpec.GROUP_FLAGS_SIZE; + updateForwardLink(buffer, nodeOriginAddress, buffer.limit(), formatOptions); + size += writeNode(destination, new CharGroupInfo[] { info }); + return size; + } + + /** + * Compute the size of the char group. + */ + private static int computeGroupSize(final CharGroupInfo info, + final FormatOptions formatOptions) { + int size = FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE + + BinaryDictInputOutput.getGroupCharactersSize(info.mCharacters) + + BinaryDictInputOutput.getChildrenAddressSize(info.mFlags, formatOptions); + if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) { + size += FormatSpec.GROUP_FREQUENCY_SIZE; + } + if (info.mShortcutTargets != null && !info.mShortcutTargets.isEmpty()) { + size += BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets); + } + if (info.mBigrams != null) { + for (final PendingAttribute attr : info.mBigrams) { + size += FormatSpec.GROUP_FLAGS_SIZE; + size += BinaryDictInputOutput.getByteSize(attr.mAddress); + } + } + return size; + } + + /** + * Write a node to the stream. + * + * @param destination the stream to write. + * @param infos groups to be written. + * @return the size written, in bytes. + * @throws IOException + */ + private static int writeNode(final OutputStream destination, final CharGroupInfo[] infos) + throws IOException { + int size = BinaryDictInputOutput.getGroupCountSize(infos.length); + switch (BinaryDictInputOutput.getGroupCountSize(infos.length)) { + case 1: + destination.write((byte)infos.length); + break; + case 2: + destination.write((byte)(infos.length >> 8)); + destination.write((byte)(infos.length & 0xFF)); + break; + default: + throw new RuntimeException("Invalid group count size."); + } + for (final CharGroupInfo info : infos) size += writeCharGroup(destination, info); + writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS); + return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; + } + + /** + * Move a group that is referred to by oldGroupOrigin to the tail of the file. + * And set the children address to the byte after the group. + * + * @param nodeOrigin the address of the tail of the file. + * @param characters + * @param length + * @param flags + * @param frequency + * @param parentAddress + * @param shortcutTargets + * @param bigrams + * @param destination the stream representing the tail of the file. + * @param buffer the buffer representing the (constant-size) body of the file. + * @param oldNodeOrigin + * @param oldGroupOrigin + * @param formatOptions + * @return the size written, in bytes. + * @throws IOException + */ + private static int moveGroup(final int nodeOrigin, final int[] characters, final int length, + final int flags, final int frequency, final int parentAddress, + final ArrayList<WeightedString> shortcutTargets, + final ArrayList<PendingAttribute> bigrams, final OutputStream destination, + final FusionDictionaryBufferInterface buffer, final int oldNodeOrigin, + final int oldGroupOrigin, final FormatOptions formatOptions) throws IOException { + int size = 0; + final int newGroupOrigin = nodeOrigin + 1; + final int[] writtenCharacters = Arrays.copyOfRange(characters, 0, length); + final CharGroupInfo tmpInfo = new CharGroupInfo(newGroupOrigin, -1 /* endAddress */, + flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS, + shortcutTargets, bigrams); + size = computeGroupSize(tmpInfo, formatOptions); + final CharGroupInfo newInfo = new CharGroupInfo(newGroupOrigin, newGroupOrigin + size, + flags, writtenCharacters, frequency, parentAddress, + nodeOrigin + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets, + bigrams); + moveCharGroup(destination, buffer, newInfo, oldNodeOrigin, oldGroupOrigin, formatOptions); + return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; + } + + /** + * Insert a word into a binary dictionary. + * + * @param buffer + * @param destination + * @param word + * @param frequency + * @param bigramStrings + * @param shortcuts + * @throws IOException + * @throws UnsupportedFormatException + */ + // TODO: Support batch insertion. + // TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary. + @UsedForTesting + public static void insertWord(final FusionDictionaryBufferInterface buffer, + final OutputStream destination, final String word, final int frequency, + final ArrayList<WeightedString> bigramStrings, + final ArrayList<WeightedString> shortcuts, final boolean isNotAWord, + final boolean isBlackListEntry) + throws IOException, UnsupportedFormatException { + final ArrayList<PendingAttribute> bigrams = new ArrayList<PendingAttribute>(); + if (bigramStrings != null) { + for (final WeightedString bigram : bigramStrings) { + int position = getTerminalPosition(buffer, bigram.mWord); + if (position == FormatSpec.NOT_VALID_WORD) { + // TODO: figure out what is the correct thing to do here. + } else { + bigrams.add(new PendingAttribute(bigram.mFrequency, position)); + } + } + } + + final boolean isTerminal = true; + final boolean hasBigrams = !bigrams.isEmpty(); + final boolean hasShortcuts = shortcuts != null && !shortcuts.isEmpty(); + + // find the insert position of the word. + if (buffer.position() != 0) buffer.position(0); + final FileHeader header = BinaryDictInputOutput.readHeader(buffer); + + int wordPos = 0, address = buffer.position(), nodeOriginAddress = buffer.position(); + final int[] codePoints = FusionDictionary.getCodePoints(word); + final int wordLen = codePoints.length; + + for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) { + if (wordPos >= wordLen) break; + nodeOriginAddress = buffer.position(); + int nodeParentAddress = -1; final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); - groupOffset += BinaryDictInputOutput.getGroupCountSize(charGroupCount); + boolean foundNextGroup = false; for (int i = 0; i < charGroupCount; ++i) { - final int charGroupPos = buffer.position(); + address = buffer.position(); final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, buffer.position(), header.mFormatOptions); - boolean same = true; - for (int p = 0, j = word.offsetByCodePoints(0, wordPos); - p < currentInfo.mCharacters.length; - ++p, j = word.offsetByCodePoints(j, 1)) { - if (wordPos + p >= wordLen - || word.codePointAt(j) != currentInfo.mCharacters[p]) { - same = false; + final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, + header.mFormatOptions); + if (isMovedGroup) continue; + nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS) + ? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address; + boolean matched = true; + for (int p = 0; p < currentInfo.mCharacters.length; ++p) { + if (wordPos + p >= wordLen) { + /* + * splitting + * before + * abcd - ef + * + * insert "abc" + * + * after + * abc - d - ef + */ + final int newNodeAddress = buffer.limit(); + final int flags = BinaryDictInputOutput.makeCharGroupFlags(p > 1, + isTerminal, 0, hasShortcuts, hasBigrams, false /* isNotAWord */, + false /* isBlackListEntry */, header.mFormatOptions); + int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, flags, + frequency, nodeParentAddress, shortcuts, bigrams, destination, + buffer, nodeOriginAddress, address, header.mFormatOptions); + + final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p, + currentInfo.mCharacters.length); + if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { + updateParentAddresses(buffer, currentInfo.mChildrenAddress, + newNodeAddress + written + 1, header.mFormatOptions); + } + final CharGroupInfo newInfo2 = new CharGroupInfo( + newNodeAddress + written + 1, -1 /* endAddress */, + currentInfo.mFlags, characters2, currentInfo.mFrequency, + newNodeAddress + 1, currentInfo.mChildrenAddress, + currentInfo.mShortcutTargets, currentInfo.mBigrams); + writeNode(destination, new CharGroupInfo[] { newInfo2 }); + return; + } else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) { + if (p > 0) { + /* + * splitting + * before + * ab - cd + * + * insert "ac" + * + * after + * a - b - cd + * | + * - c + */ + + final int newNodeAddress = buffer.limit(); + final int childrenAddress = currentInfo.mChildrenAddress; + + // move prefix + final int prefixFlags = BinaryDictInputOutput.makeCharGroupFlags(p > 1, + false /* isTerminal */, 0 /* childrenAddressSize*/, + false /* hasShortcut */, false /* hasBigrams */, + false /* isNotAWord */, false /* isBlackListEntry */, + header.mFormatOptions); + int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, + prefixFlags, -1 /* frequency */, nodeParentAddress, null, null, + destination, buffer, nodeOriginAddress, address, + header.mFormatOptions); + + final int[] suffixCharacters = Arrays.copyOfRange( + currentInfo.mCharacters, p, currentInfo.mCharacters.length); + if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { + updateParentAddresses(buffer, currentInfo.mChildrenAddress, + newNodeAddress + written + 1, header.mFormatOptions); + } + final int suffixFlags = BinaryDictInputOutput.makeCharGroupFlags( + suffixCharacters.length > 1, + (currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0, + 0 /* childrenAddressSize */, + (currentInfo.mFlags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) + != 0, + (currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final CharGroupInfo suffixInfo = new CharGroupInfo( + newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags, + suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1, + currentInfo.mChildrenAddress, currentInfo.mShortcutTargets, + currentInfo.mBigrams); + written += computeGroupSize(suffixInfo, header.mFormatOptions) + 1; + + final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p, + codePoints.length); + final int flags = BinaryDictInputOutput.makeCharGroupFlags( + newCharacters.length > 1, isTerminal, + 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final CharGroupInfo newInfo = new CharGroupInfo( + newNodeAddress + written, -1 /* endAddress */, flags, + newCharacters, frequency, newNodeAddress + 1, + FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); + writeNode(destination, new CharGroupInfo[] { suffixInfo, newInfo }); + return; + } + matched = false; break; } } - if (same) { + if (matched) { if (wordPos + currentInfo.mCharacters.length == wordLen) { - if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) { - return FormatSpec.NOT_VALID_WORD; - } else { - return charGroupPos; - } + // the word exists in the dictionary. + // only update group. + final int newNodeAddress = buffer.limit(); + final boolean hasMultipleChars = currentInfo.mCharacters.length > 1; + final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars, + isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1, + -1 /* endAddress */, flags, currentInfo.mCharacters, frequency, + nodeParentAddress, currentInfo.mChildrenAddress, shortcuts, + bigrams); + moveCharGroup(destination, buffer, newInfo, nodeOriginAddress, address, + header.mFormatOptions); + return; } wordPos += currentInfo.mCharacters.length; if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { - return FormatSpec.NOT_VALID_WORD; + /* + * found the prefix of the word. + * make new node and link to the node from this group. + * + * before + * ab - cd + * + * insert "abcde" + * + * after + * ab - cd - e + */ + final int newNodeAddress = buffer.limit(); + updateChildrenAddress(buffer, address, newNodeAddress, + header.mFormatOptions); + final int newGroupAddress = newNodeAddress + 1; + final boolean hasMultipleChars = (wordLen - wordPos) > 1; + final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars, + isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); + final CharGroupInfo newInfo = new CharGroupInfo(newGroupAddress, -1, flags, + characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS, + shortcuts, bigrams); + writeNode(destination, new CharGroupInfo[] { newInfo }); + return; } buffer.position(currentInfo.mChildrenAddress); + foundNextGroup = true; break; } - groupOffset = currentInfo.mEndAddress; + } - // not found - if (i >= charGroupCount - 1) { - return FormatSpec.NOT_VALID_WORD; - } + if (foundNextGroup) continue; + + // reached the end of the array. + final int linkAddressPosition = buffer.position(); + int nextLink = buffer.readUnsignedInt24(); + if ((nextLink & MSB24) != 0) { + nextLink = -(nextLink & SINT24_MAX); + } + if (nextLink == FormatSpec.NO_FORWARD_LINK_ADDRESS) { + /* + * expand this node. + * + * before + * ab - cd + * + * insert "abef" + * + * after + * ab - cd + * | + * - ef + */ + + // change the forward link address. + final int newNodeAddress = buffer.limit(); + buffer.position(linkAddressPosition); + writeSInt24ToBuffer(buffer, newNodeAddress); + + final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); + final int flags = BinaryDictInputOutput.makeCharGroupFlags(characters.length > 1, + isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1, + -1 /* endAddress */, flags, characters, frequency, nodeParentAddress, + FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); + writeNode(destination, new CharGroupInfo[]{ newInfo }); + return; + } else { + depth--; + buffer.position(nextLink); } } - return FormatSpec.NOT_VALID_WORD; + } + + /** + * Find a word from the buffer. + * + * @param buffer the buffer representing the body of the dictionary file. + * @param word the word searched + * @return the found group + * @throws IOException + * @throws UnsupportedFormatException + */ + @UsedForTesting + public static CharGroupInfo findWordFromBuffer(final FusionDictionaryBufferInterface buffer, + final String word) throws IOException, UnsupportedFormatException { + int position = getTerminalPosition(buffer, word); + if (position != FormatSpec.NOT_VALID_WORD) { + buffer.position(0); + final FileHeader header = BinaryDictInputOutput.readHeader(buffer); + buffer.position(position); + return BinaryDictInputOutput.readCharGroup(buffer, position, header.mFormatOptions); + } + return null; } } diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 1d3e94bb7..d1a3c7b0a 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -16,6 +16,7 @@ package com.android.inputmethod.latin.makedict; +import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; @@ -36,7 +37,6 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import java.util.Stack; import java.util.TreeMap; /** @@ -44,7 +44,7 @@ import java.util.TreeMap; * * All the methods in this class are static. */ -public class BinaryDictInputOutput { +public final class BinaryDictInputOutput { private static final boolean DBG = MakedictLog.DBG; @@ -54,6 +54,7 @@ public class BinaryDictInputOutput { // If the number of passes exceeds this number, makedict bails with an exception on // suspicion that a bug might be causing an infinite loop. private static final int MAX_PASSES = 24; + private static final int MAX_JUMPS = 12; public interface FusionDictionaryBufferInterface { public int readUnsignedByte(); @@ -64,6 +65,7 @@ public class BinaryDictInputOutput { public void position(int newPosition); public void put(final byte b); public int limit(); + public int capacity(); } public static final class ByteBufferWrapper implements FusionDictionaryBufferInterface { @@ -75,12 +77,12 @@ public class BinaryDictInputOutput { @Override public int readUnsignedByte() { - return ((int)mBuffer.get()) & 0xFF; + return mBuffer.get() & 0xFF; } @Override public int readUnsignedShort() { - return ((int)mBuffer.getShort()) & 0xFFFF; + return mBuffer.getShort() & 0xFFFF; } @Override @@ -113,13 +115,17 @@ public class BinaryDictInputOutput { public int limit() { return mBuffer.limit(); } + + @Override + public int capacity() { + return mBuffer.capacity(); + } } /** * A class grouping utility function for our specific character encoding. */ - private static class CharEncoding { - + static final class CharEncoding { private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; @@ -148,7 +154,7 @@ public class BinaryDictInputOutput { * @param character the character code. * @return the size in binary encoded-form, either 1 or 3 bytes. */ - private static int getCharSize(final int character) { + static int getCharSize(final int character) { // See char encoding in FusionDictionary.java if (fitsOnOneByte(character)) return 1; if (FormatSpec.INVALID_CHARACTER == character) return 1; @@ -257,7 +263,7 @@ public class BinaryDictInputOutput { * @param buffer the buffer, positioned over an encoded character. * @return the character code. */ - private static int readChar(final FusionDictionaryBufferInterface buffer) { + static int readChar(final FusionDictionaryBufferInterface buffer) { int character = buffer.readUnsignedByte(); if (!fitsOnOneByte(character)) { if (FormatSpec.GROUP_CHARACTERS_TERMINATOR == character) { @@ -271,6 +277,21 @@ public class BinaryDictInputOutput { } /** + * Compute the binary size of the character array. + * + * If only one character, this is the size of this character. If many, it's the sum of their + * sizes + 1 byte for the terminator. + * + * @param characters the character array + * @return the size of the char array, including the terminator if any + */ + static int getGroupCharactersSize(final int[] characters) { + int size = CharEncoding.getCharArraySize(characters); + if (characters.length > 1) size += FormatSpec.GROUP_TERMINATOR_SIZE; + return size; + } + + /** * Compute the binary size of the character array in a group * * If only one character, this is the size of this character. If many, it's the sum of their @@ -280,9 +301,7 @@ public class BinaryDictInputOutput { * @return the size of the char array, including the terminator if any */ private static int getGroupCharactersSize(final CharGroup group) { - int size = CharEncoding.getCharArraySize(group.mChars); - if (group.hasSeveralChars()) size += FormatSpec.GROUP_TERMINATOR_SIZE; - return size; + return getGroupCharactersSize(group.mChars); } /** @@ -332,7 +351,7 @@ public class BinaryDictInputOutput { * This is known in advance and does not change according to position in the file * like address lists do. */ - private static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { + static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { if (null == shortcutList) return 0; int size = FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE; for (final WeightedString shortcut : shortcutList) { @@ -376,7 +395,7 @@ public class BinaryDictInputOutput { g.mCachedSize = groupSize; size += groupSize; } - if (options.mHasLinkedListNode) { + if (options.mSupportsDynamicUpdate) { size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE; } node.mCachedSize = size; @@ -390,11 +409,27 @@ public class BinaryDictInputOutput { } /** - * Helper method to check whether the CharGroup has a parent address. + * Helper method to check whether the group is moved. */ - private static boolean hasParentAddress(final FormatOptions options) { - return options.mVersion >= FormatSpec.FIRST_VERSION_WITH_PARENT_ADDRESS - && options.mHasParentAddress; + public static boolean isMovedGroup(final int flags, final FormatOptions options) { + return options.mSupportsDynamicUpdate + && ((flags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) == FormatSpec.FLAG_IS_MOVED); + } + + /** + * Helper method to check whether the group is deleted. + */ + public static boolean isDeletedGroup(final int flags, final FormatOptions formatOptions) { + return formatOptions.mSupportsDynamicUpdate + && ((flags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) == FormatSpec.FLAG_IS_DELETED); + } + + /** + * Helper method to check whether the dictionary can be updated dynamically. + */ + public static boolean supportsDynamicUpdate(final FormatOptions options) { + return options.mVersion >= FormatSpec.FIRST_VERSION_WITH_DYNAMIC_UPDATE + && options.mSupportsDynamicUpdate; } /** @@ -404,7 +439,7 @@ public class BinaryDictInputOutput { * @param options file format options. */ private static int getGroupHeaderSize(final CharGroup group, final FormatOptions options) { - if (hasParentAddress(options)) { + if (supportsDynamicUpdate(options)) { return FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE + getGroupCharactersSize(group); } else { @@ -412,6 +447,10 @@ public class BinaryDictInputOutput { } } + private static final int UINT8_MAX = 0xFF; + private static final int UINT16_MAX = 0xFFFF; + private static final int UINT24_MAX = 0xFFFFFF; + /** * Compute the size, in bytes, that an address will occupy. * @@ -422,18 +461,23 @@ public class BinaryDictInputOutput { * @param address the address * @return the byte size. */ - private static int getByteSize(final int address) { - assert(address < 0x1000000); + static int getByteSize(final int address) { + assert(address <= UINT24_MAX); if (!hasChildrenAddress(address)) { return 0; - } else if (Math.abs(address) < 0x100) { + } else if (Math.abs(address) <= UINT8_MAX) { return 1; - } else if (Math.abs(address) < 0x10000) { + } else if (Math.abs(address) <= UINT16_MAX) { return 2; } else { return 3; } } + + private static final int SINT24_MAX = 0x7FFFFF; + private static final int MSB8 = 0x80; + private static final int MSB24 = 0x800000; + // End utility methods. // This method is responsible for finding a nice ordering of the nodes that favors run-time @@ -509,13 +553,19 @@ public class BinaryDictInputOutput { } int groupSize = getGroupHeaderSize(group, formatOptions); if (group.isTerminal()) groupSize += FormatSpec.GROUP_FREQUENCY_SIZE; - if (null != group.mChildren) { + if (null == group.mChildren && formatOptions.mSupportsDynamicUpdate) { + groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; + } else if (null != group.mChildren) { final int offsetBasePoint = groupSize + node.mCachedAddress + size; final int offset = group.mChildren.mCachedAddress - offsetBasePoint; // assign my address to children's parent address group.mChildren.mCachedParentAddress = group.mCachedAddress - group.mChildren.mCachedAddress; - groupSize += getByteSize(offset); + if (formatOptions.mSupportsDynamicUpdate) { + groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; + } else { + groupSize += getByteSize(offset); + } } groupSize += getShortcutListSize(group.mShortcutTargets); if (null != group.mBigrams) { @@ -530,7 +580,7 @@ public class BinaryDictInputOutput { group.mCachedSize = groupSize; size += groupSize; } - if (formatOptions.mHasLinkedListNode) { + if (formatOptions.mSupportsDynamicUpdate) { size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE; } if (node.mCachedSize != size) { @@ -559,7 +609,8 @@ public class BinaryDictInputOutput { groupOffset += g.mCachedSize; } final int nodeSize = groupCountSize + groupOffset - + (formatOptions.mHasLinkedListNode ? FormatSpec.FORWARD_LINK_ADDRESS_SIZE : 0); + + (formatOptions.mSupportsDynamicUpdate + ? FormatSpec.FORWARD_LINK_ADDRESS_SIZE : 0); if (nodeSize != n.mCachedSize) { throw new RuntimeException("Bug : Stored and computed node size differ"); } @@ -668,49 +719,81 @@ public class BinaryDictInputOutput { } } - private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress, - final int childrenOffset) { - byte flags = 0; - if (group.mChars.length > 1) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS; - if (group.mFrequency >= 0) { - flags |= FormatSpec.FLAG_IS_TERMINAL; - } - if (null != group.mChildren) { - switch (getByteSize(childrenOffset)) { - case 1: - flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; - break; - case 2: - flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; - break; - case 3: - flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; - break; - default: - throw new RuntimeException("Node with a strange address"); - } - } - if (null != group.mShortcutTargets) { - if (DBG && 0 == group.mShortcutTargets.size()) { - throw new RuntimeException("0-sized shortcut list must be null"); - } - flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS; + /** + * Helper method to write a variable-size signed address to a file. + * + * @param buffer the buffer to write to. + * @param index the index in the buffer to write the address to. + * @param address the address to write. + * @return the size in bytes the address actually took. + */ + private static int writeVariableSignedAddress(final byte[] buffer, int index, + final int address) { + if (!hasChildrenAddress(address)) { + buffer[index] = buffer[index + 1] = buffer[index + 2] = 0; + } else { + final int absAddress = Math.abs(address); + buffer[index++] = (byte)((address < 0 ? MSB8 : 0) | (0xFF & (absAddress >> 16))); + buffer[index++] = (byte)(0xFF & (absAddress >> 8)); + buffer[index++] = (byte)(0xFF & absAddress); } - if (null != group.mBigrams) { - if (DBG && 0 == group.mBigrams.size()) { - throw new RuntimeException("0-sized bigram list must be null"); + return 3; + } + + /** + * Makes the flag value for a char group. + * + * @param hasMultipleChars whether the group has multiple chars. + * @param isTerminal whether the group is terminal. + * @param childrenAddressSize the size of a children address. + * @param hasShortcuts whether the group has shortcuts. + * @param hasBigrams whether the group has bigrams. + * @param isNotAWord whether the group is not a word. + * @param isBlackListEntry whether the group is a blacklist entry. + * @param formatOptions file format options. + * @return the flags + */ + static int makeCharGroupFlags(final boolean hasMultipleChars, final boolean isTerminal, + final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams, + final boolean isNotAWord, final boolean isBlackListEntry, + final FormatOptions formatOptions) { + byte flags = 0; + if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS; + if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL; + if (formatOptions.mSupportsDynamicUpdate) { + flags |= FormatSpec.FLAG_IS_NOT_MOVED; + } else if (true) { + switch (childrenAddressSize) { + case 1: + flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; + break; + case 2: + flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; + break; + case 3: + flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; + break; + case 0: + flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS; + break; + default: + throw new RuntimeException("Node with a strange address"); } - flags |= FormatSpec.FLAG_HAS_BIGRAMS; - } - if (group.mIsNotAWord) { - flags |= FormatSpec.FLAG_IS_NOT_A_WORD; - } - if (group.mIsBlacklistEntry) { - flags |= FormatSpec.FLAG_IS_BLACKLISTED; } + if (hasShortcuts) flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS; + if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS; + if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD; + if (isBlackListEntry) flags |= FormatSpec.FLAG_IS_BLACKLISTED; return flags; } + private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress, + final int childrenOffset, final FormatOptions formatOptions) { + return (byte) makeCharGroupFlags(group.mChars.length > 1, group.mFrequency >= 0, + getByteSize(childrenOffset), group.mShortcutTargets != null, group.mBigrams != null, + group.mIsNotAWord, group.mIsBlacklistEntry, formatOptions); + } + /** * Makes the flag value for a bigram. * @@ -792,8 +875,7 @@ public class BinaryDictInputOutput { return (options.mFrenchLigatureProcessing ? FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG : 0) + (options.mGermanUmlautProcessing ? FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG : 0) + (hasBigrams ? FormatSpec.CONTAINS_BIGRAMS_FLAG : 0) - + (formatOptions.mHasParentAddress ? FormatSpec.HAS_PARENT_ADDRESS : 0) - + (formatOptions.mHasLinkedListNode ? FormatSpec.HAS_LINKEDLIST_NODE : 0); + + (formatOptions.mSupportsDynamicUpdate ? FormatSpec.SUPPORTS_DYNAMIC_UPDATE : 0); } /** @@ -803,11 +885,30 @@ public class BinaryDictInputOutput { * @param frequency the frequency of the attribute, 0..15 * @return the flags */ - private static final int makeShortcutFlags(final boolean more, final int frequency) { + static final int makeShortcutFlags(final boolean more, final int frequency) { return (more ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0) + (frequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY); } + private static final int writeParentAddress(final byte[] buffer, final int index, + final int address, final FormatOptions formatOptions) { + if (supportsDynamicUpdate(formatOptions)) { + if (address == FormatSpec.NO_PARENT_ADDRESS) { + buffer[index] = buffer[index + 1] = buffer[index + 2] = 0; + } else { + final int absAddress = Math.abs(address); + assert(absAddress <= SINT24_MAX); + buffer[index] = (byte)((address < 0 ? MSB8 : 0) + | ((absAddress >> 16) & 0xFF)); + buffer[index + 1] = (byte)((absAddress >> 8) & 0xFF); + buffer[index + 2] = (byte)(absAddress & 0xFF); + } + return index + 3; + } else { + return index; + } + } + /** * Write a node to memory. The node is expected to have its final position cached. * @@ -822,6 +923,7 @@ public class BinaryDictInputOutput { */ private static int writePlacedNode(final FusionDictionary dict, byte[] buffer, final Node node, final FormatOptions formatOptions) { + // TODO: Make the code in common with BinaryDictIOUtils#writeCharGroup int index = node.mCachedAddress; final int groupCount = node.mData.size(); @@ -854,22 +956,15 @@ public class BinaryDictInputOutput { final int childrenOffset = null == group.mChildren ? FormatSpec.NO_CHILDREN_ADDRESS : group.mChildren.mCachedAddress - groupAddress; - byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset); + byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset, formatOptions); buffer[index++] = flags; - if (hasParentAddress(formatOptions)) { - if (parentAddress == FormatSpec.NO_PARENT_ADDRESS) { - // this node is the root node. - buffer[index] = buffer[index + 1] = buffer[index + 2] = 0; - } else { - // write parent address. (version 3) - final int actualParentAddress = Math.abs(parentAddress - + (node.mCachedAddress - group.mCachedAddress)); - buffer[index] = (byte)((actualParentAddress >> 16) & 0xFF); - buffer[index + 1] = (byte)((actualParentAddress >> 8) & 0xFF); - buffer[index + 2] = (byte)(actualParentAddress & 0xFF); - } - index += 3; + if (parentAddress == FormatSpec.NO_PARENT_ADDRESS) { + index = writeParentAddress(buffer, index, parentAddress, formatOptions); + } else { + index = writeParentAddress(buffer, index, + parentAddress + (node.mCachedAddress - group.mCachedAddress), + formatOptions); } index = CharEncoding.writeCharArray(group.mChars, buffer, index); @@ -879,7 +974,13 @@ public class BinaryDictInputOutput { if (group.mFrequency >= 0) { buffer[index++] = (byte) group.mFrequency; } - final int shift = writeVariableAddress(buffer, index, childrenOffset); + + final int shift; + if (formatOptions.mSupportsDynamicUpdate) { + shift = writeVariableSignedAddress(buffer, index, childrenOffset); + } else { + shift = writeVariableAddress(buffer, index, childrenOffset); + } index += shift; groupAddress += shift; @@ -927,7 +1028,7 @@ public class BinaryDictInputOutput { } } - if (formatOptions.mHasLinkedListNode) { + if (formatOptions.mSupportsDynamicUpdate) { buffer[index] = buffer[index + 1] = buffer[index + 2] = FormatSpec.NO_FORWARD_LINK_ADDRESS; index += FormatSpec.FORWARD_LINK_ADDRESS_SIZE; @@ -1104,6 +1205,58 @@ public class BinaryDictInputOutput { // Input methods: Read a binary dictionary to memory. // readDictionaryBinary is the public entry point for them. + static int getChildrenAddressSize(final int optionFlags, + final FormatOptions formatOptions) { + if (formatOptions.mSupportsDynamicUpdate) return FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; + switch (optionFlags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) { + case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: + return 1; + case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: + return 2; + case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: + return 3; + case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS: + default: + return 0; + } + } + + static int readChildrenAddress(final FusionDictionaryBufferInterface buffer, + final int optionFlags, final FormatOptions options) { + if (options.mSupportsDynamicUpdate) { + final int address = buffer.readUnsignedInt24(); + if (address == 0) return FormatSpec.NO_CHILDREN_ADDRESS; + if ((address & MSB24) != 0) { + return -(address & SINT24_MAX); + } else { + return address; + } + } + int address; + switch (optionFlags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) { + case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: + return buffer.readUnsignedByte(); + case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: + return buffer.readUnsignedShort(); + case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: + return buffer.readUnsignedInt24(); + case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS: + default: + return FormatSpec.NO_CHILDREN_ADDRESS; + } + } + + static int readParentAddress(final FusionDictionaryBufferInterface buffer, + final FormatOptions formatOptions) { + if (supportsDynamicUpdate(formatOptions)) { + final int parentAddress = buffer.readUnsignedInt24(); + final int sign = ((parentAddress & MSB24) != 0) ? -1 : 1; + return sign * (parentAddress & SINT24_MAX); + } else { + return FormatSpec.NO_PARENT_ADDRESS; + } + } + private static final int[] CHARACTER_BUFFER = new int[FormatSpec.MAX_WORD_LENGTH]; public static CharGroupInfo readCharGroup(final FusionDictionaryBufferInterface buffer, final int originalGroupAddress, final FormatOptions options) { @@ -1111,13 +1264,9 @@ public class BinaryDictInputOutput { final int flags = buffer.readUnsignedByte(); ++addressPointer; - final int parentAddress; - if (hasParentAddress(options)) { - // read the parent address. (version 3) - parentAddress = -buffer.readUnsignedInt24(); + final int parentAddress = readParentAddress(buffer, options); + if (supportsDynamicUpdate(options)) { addressPointer += 3; - } else { - parentAddress = FormatSpec.NO_PARENT_ADDRESS; } final int characters[]; @@ -1146,25 +1295,11 @@ public class BinaryDictInputOutput { } else { frequency = CharGroup.NOT_A_TERMINAL; } - int childrenAddress = addressPointer; - switch (flags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) { - case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: - childrenAddress += buffer.readUnsignedByte(); - addressPointer += 1; - break; - case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: - childrenAddress += buffer.readUnsignedShort(); - addressPointer += 2; - break; - case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: - childrenAddress += buffer.readUnsignedInt24(); - addressPointer += 3; - break; - case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS: - default: - childrenAddress = FormatSpec.NO_CHILDREN_ADDRESS; - break; + int childrenAddress = readChildrenAddress(buffer, flags, options); + if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { + childrenAddress += addressPointer; } + addressPointer += getChildrenAddressSize(flags, options); ArrayList<WeightedString> shortcutTargets = null; if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) { final int pointerBefore = buffer.position(); @@ -1182,7 +1317,8 @@ public class BinaryDictInputOutput { ArrayList<PendingAttribute> bigrams = null; if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { bigrams = new ArrayList<PendingAttribute>(); - while (true) { + int bigramCount = 0; + while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { final int bigramFlags = buffer.readUnsignedByte(); ++addressPointer; final int sign = 0 == (bigramFlags & FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE) @@ -1210,6 +1346,9 @@ public class BinaryDictInputOutput { bigramAddress)); if (0 == (bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT)) break; } + if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { + MakedictLog.d("too many bigrams in a group."); + } } return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency, parentAddress, childrenAddress, shortcutTargets, bigrams); @@ -1232,7 +1371,8 @@ public class BinaryDictInputOutput { // of this method. Since it performs direct, unbuffered random access to the file and // may be called hundreds of thousands of times, the resulting performance is not // reasonable without some kind of cache. Thus: - private static TreeMap<Integer, String> wordCache = new TreeMap<Integer, String>(); + private static TreeMap<Integer, WeightedString> wordCache = + new TreeMap<Integer, WeightedString>(); /** * Finds, as a string, the word at the address passed as an argument. * @@ -1240,18 +1380,19 @@ public class BinaryDictInputOutput { * @param headerSize the size of the header. * @param address the address to seek. * @param formatOptions file format options. - * @return the word, as a string. + * @return the word with its frequency, as a weighted string. */ - /* packages for tests */ static String getWordAtAddress( + /* package for tests */ static WeightedString getWordAtAddress( final FusionDictionaryBufferInterface buffer, final int headerSize, final int address, final FormatOptions formatOptions) { - final String cachedString = wordCache.get(address); + final WeightedString cachedString = wordCache.get(address); if (null != cachedString) return cachedString; - final String result; + final WeightedString result; final int originalPointer = buffer.position(); + buffer.position(address); - if (hasParentAddress(formatOptions)) { + if (supportsDynamicUpdate(formatOptions)) { result = getWordAtAddressWithParentAddress(buffer, headerSize, address, formatOptions); } else { result = getWordAtAddressWithoutParentAddress(buffer, headerSize, address, @@ -1263,38 +1404,53 @@ public class BinaryDictInputOutput { return result; } + // TODO: static!? This will behave erratically when used in multi-threaded code. + // We need to fix this private static int[] sGetWordBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; - private static String getWordAtAddressWithParentAddress( + private static WeightedString getWordAtAddressWithParentAddress( final FusionDictionaryBufferInterface buffer, final int headerSize, final int address, final FormatOptions options) { final StringBuilder builder = new StringBuilder(); int currentAddress = address; int index = FormatSpec.MAX_WORD_LENGTH - 1; + int frequency = Integer.MIN_VALUE; // the length of the path from the root to the leaf is limited by MAX_WORD_LENGTH for (int count = 0; count < FormatSpec.MAX_WORD_LENGTH; ++count) { - buffer.position(currentAddress + headerSize); - final CharGroupInfo currentInfo = readCharGroup(buffer, currentAddress, options); + CharGroupInfo currentInfo; + int loopCounter = 0; + do { + buffer.position(currentAddress + headerSize); + currentInfo = readCharGroup(buffer, currentAddress, options); + if (isMovedGroup(currentInfo.mFlags, options)) { + currentAddress = currentInfo.mParentAddress + currentInfo.mOriginalAddress; + } + if (DBG && loopCounter++ > MAX_JUMPS) { + MakedictLog.d("Too many jumps - probably a bug"); + } + } while (isMovedGroup(currentInfo.mFlags, options)); + if (Integer.MIN_VALUE == frequency) frequency = currentInfo.mFrequency; for (int i = 0; i < currentInfo.mCharacters.length; ++i) { sGetWordBuffer[index--] = currentInfo.mCharacters[currentInfo.mCharacters.length - i - 1]; } - if (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS) break; currentAddress = currentInfo.mParentAddress + currentInfo.mOriginalAddress; } - return new String(sGetWordBuffer, index + 1, FormatSpec.MAX_WORD_LENGTH - index - 1); + return new WeightedString( + new String(sGetWordBuffer, index + 1, FormatSpec.MAX_WORD_LENGTH - index - 1), + frequency); } - private static String getWordAtAddressWithoutParentAddress( + private static WeightedString getWordAtAddressWithoutParentAddress( final FusionDictionaryBufferInterface buffer, final int headerSize, final int address, final FormatOptions options) { buffer.position(headerSize); final int count = readCharGroupCount(buffer); int groupOffset = getGroupCountSize(count); final StringBuilder builder = new StringBuilder(); - String result = null; + WeightedString result = null; CharGroupInfo last = null; for (int i = count - 1; i >= 0; --i) { @@ -1302,7 +1458,7 @@ public class BinaryDictInputOutput { groupOffset = info.mEndAddress; if (info.mOriginalAddress == address) { builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); - result = builder.toString(); + result = new WeightedString(builder.toString(), info.mFrequency); break; // and return } if (hasChildrenAddress(info.mChildrenAddress)) { @@ -1357,14 +1513,17 @@ public class BinaryDictInputOutput { int groupOffset = nodeHeadPosition + getGroupCountSize(count); for (int i = count; i > 0; --i) { // Scan the array of CharGroup. CharGroupInfo info = readCharGroup(buffer, groupOffset, options); + if (isMovedGroup(info.mFlags, options)) continue; ArrayList<WeightedString> shortcutTargets = info.mShortcutTargets; ArrayList<WeightedString> bigrams = null; if (null != info.mBigrams) { bigrams = new ArrayList<WeightedString>(); for (PendingAttribute bigram : info.mBigrams) { - final String word = getWordAtAddress( + final WeightedString word = getWordAtAddress( buffer, headerSize, bigram.mAddress, options); - bigrams.add(new WeightedString(word, bigram.mFrequency)); + final int reconstructedFrequency = + reconstructBigramFrequency(word.mFrequency, bigram.mFrequency); + bigrams.add(new WeightedString(word.mWord, reconstructedFrequency)); } } if (hasChildrenAddress(info.mChildrenAddress)) { @@ -1392,7 +1551,7 @@ public class BinaryDictInputOutput { } // reach the end of the array. - if (options.mHasLinkedListNode) { + if (options.mSupportsDynamicUpdate) { final int nextAddress = buffer.readUnsignedInt24(); if (nextAddress >= 0 && nextAddress < buffer.limit()) { buffer.position(nextAddress); @@ -1400,7 +1559,7 @@ public class BinaryDictInputOutput { break; } } - } while (options.mHasLinkedListNode && + } while (options.mSupportsDynamicUpdate && buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS); final Node node = new Node(nodeContents); @@ -1469,8 +1628,7 @@ public class BinaryDictInputOutput { 0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG), 0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)), new FormatOptions(version, - 0 != (optionsFlags & FormatSpec.HAS_PARENT_ADDRESS), - 0 != (optionsFlags & FormatSpec.HAS_LINKEDLIST_NODE))); + 0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE))); return header; } @@ -1500,6 +1658,7 @@ public class BinaryDictInputOutput { * @param dict an optional dictionary to add words to, or null. * @return the created (or merged) dictionary. */ + @UsedForTesting public static FusionDictionary readDictionaryBinary( final FusionDictionaryBufferInterface buffer, final FusionDictionary dict) throws IOException, UnsupportedFormatException { @@ -1537,17 +1696,24 @@ public class BinaryDictInputOutput { } /** + * Helper method to pass a file name instead of a File object to isBinaryDictionary. + */ + public static boolean isBinaryDictionary(final String filename) { + final File file = new File(filename); + return isBinaryDictionary(file); + } + + /** * Basic test to find out whether the file is a binary dictionary or not. * * Concretely this only tests the magic number. * - * @param filename The name of the file to test. + * @param file The file to test. * @return true if it's a binary dictionary, false otherwise */ - public static boolean isBinaryDictionary(final String filename) { + public static boolean isBinaryDictionary(final File file) { FileInputStream inStream = null; try { - final File file = new File(filename); inStream = new FileInputStream(file); final ByteBuffer buffer = inStream.getChannel().map( FileChannel.MapMode.READ_ONLY, 0, file.length()); @@ -1582,8 +1748,7 @@ public class BinaryDictInputOutput { final int bigramFrequency) { final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY); - final float resultFreqFloat = (float)unigramFrequency - + stepSize * (bigramFrequency + 1.0f); + final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f); return (int)resultFreqFloat; } } diff --git a/java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java b/java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java index ed9388409..8e64082e6 100644 --- a/java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java +++ b/java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java @@ -23,7 +23,7 @@ import java.util.ArrayList; /** * Raw char group info straight out of a file. This will contain numbers for addresses. */ -public class CharGroupInfo { +public final class CharGroupInfo { public final int mOriginalAddress; public final int mEndAddress; diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index adc6037bb..705f66414 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -42,32 +42,43 @@ public final class FormatSpec { * ps * * f | - * o | IF HAS_LINKEDLIST_NODE (defined in the file header) + * o | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header) * r | forward link address, 3byte - * w | the address must be positive. - * a | - * rdlinkaddress + * w | 1 byte = bbbbbbbb match + * a | case 1xxxxxxx => -((xxxxxxx << 16) + (next byte << 8) + next byte) + * r | otherwise => (xxxxxxx << 16) + (next byte << 8) + next byte + * d | + * linkaddress */ /* Node(CharGroup) layout is as follows: - * | addressType xx : mask with MASK_GROUP_ADDRESS_TYPE - * 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS - * f | 01 = 1 byte : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE - * l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES - * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES - * g | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS - * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL + * | IF !SUPPORTS_DYNAMIC_UPDATE + * | addressType xx : mask with MASK_GROUP_ADDRESS_TYPE + * | 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS + * f | 01 = 1 byte : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE + * l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES + * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES + * g | ELSE + * s | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED + * | This must be the same as FLAG_GROUP_ADDRESS_TYPE_THREEBYTES + * | 01 = yes : FLAG_IS_MOVED + * | the new address is stored in the same place as the parent address + * | is deleted? 10 = yes : FLAG_IS_DELETED + * | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS + * | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS * | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD * | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED * * p | - * a | IF HAS_PARENT_ADDRESS (defined in the file header) + * a | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header) * r | parent address, 3byte - * e | the address must be negative, so the absolute value of the address is stored. - * n | - * taddress + * e | 1 byte = bbbbbbbb match + * n | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte) + * t | otherwise => (bbbbbbbb << 16) + (next byte << 8) + next byte + * a | + * ddress * * c | IF FLAG_HAS_MULTIPLE_CHARS * h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers @@ -145,17 +156,14 @@ public final class FormatSpec { static final int MAXIMUM_SUPPORTED_VERSION = 3; static final int NOT_A_VERSION_NUMBER = -1; static final int FIRST_VERSION_WITH_HEADER_SIZE = 2; - static final int FIRST_VERSION_WITH_PARENT_ADDRESS = 3; - static final int FIRST_VERSION_WITH_LINKEDLIST_NODE = 3; + static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3; // These options need to be the same numeric values as the one in the native reading code. static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; // TODO: Make the native reading code read this variable. - static final int HAS_PARENT_ADDRESS = 0x2; + static final int SUPPORTS_DYNAMIC_UPDATE = 0x2; static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; static final int CONTAINS_BIGRAMS_FLAG = 0x8; - // TODO: Make the native reading code read this variable. - static final int HAS_LINKEDLIST_NODE = 0x10; // TODO: Make this value adaptative to content data, store it in the header, and // use it in the reading code. @@ -164,6 +172,7 @@ public final class FormatSpec { static final int PARENT_ADDRESS_SIZE = 3; static final int FORWARD_LINK_ADDRESS_SIZE = 3; + // These flags are used only in the static dictionary. static final int MASK_GROUP_ADDRESS_TYPE = 0xC0; static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; @@ -178,6 +187,13 @@ public final class FormatSpec { static final int FLAG_IS_NOT_A_WORD = 0x02; static final int FLAG_IS_BLACKLISTED = 0x01; + // These flags are used only in the dynamic dictionary. + static final int MASK_MOVE_AND_DELETE_FLAG = 0xC0; + static final int FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE = 0x40; + static final int FLAG_IS_MOVED = 0x00 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE; + static final int FLAG_IS_NOT_MOVED = 0x80 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE; + static final int FLAG_IS_DELETED = 0x80; + static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; @@ -203,43 +219,33 @@ public final class FormatSpec { static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127 static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767 + static final int MAX_BIGRAMS_IN_A_GROUP = 10000; static final int MAX_TERMINAL_FREQUENCY = 255; static final int MAX_BIGRAM_FREQUENCY = 15; + public static final int SHORTCUT_WHITELIST_FREQUENCY = 15; + // This option needs to be the same numeric value as the one in binary_format.h. static final int NOT_VALID_WORD = -99; + static final int SIGNED_CHILDREN_ADDRESS_SIZE = 3; /** * Options about file format. */ - public static class FormatOptions { + public static final class FormatOptions { public final int mVersion; - public final boolean mHasParentAddress; - public final boolean mHasLinkedListNode; + public final boolean mSupportsDynamicUpdate; public FormatOptions(final int version) { this(version, false); } - public FormatOptions(final int version, final boolean hasParentAddress) { - this(version, hasParentAddress, false); - } - public FormatOptions(final int version, final boolean hasParentAddress, - final boolean hasLinkedListNode) { + public FormatOptions(final int version, final boolean supportsDynamicUpdate) { mVersion = version; - if (version < FIRST_VERSION_WITH_PARENT_ADDRESS && hasParentAddress) { - throw new RuntimeException("Parent addresses are only supported with versions " - + FIRST_VERSION_WITH_PARENT_ADDRESS + " and ulterior."); - } - mHasParentAddress = hasParentAddress; - - if (version < FIRST_VERSION_WITH_LINKEDLIST_NODE && hasLinkedListNode) { - throw new RuntimeException("Linked list nodes are only supported with versions " - + FIRST_VERSION_WITH_LINKEDLIST_NODE + " and ulterior."); - } - if (!hasParentAddress && hasLinkedListNode) { - throw new RuntimeException("Linked list nodes need parent addresses."); + if (version < FIRST_VERSION_WITH_DYNAMIC_UPDATE && supportsDynamicUpdate) { + throw new RuntimeException("Dynamic updates are only supported with versions " + + FIRST_VERSION_WITH_DYNAMIC_UPDATE + " and ulterior."); } - mHasLinkedListNode = hasLinkedListNode; + mSupportsDynamicUpdate = supportsDynamicUpdate; } } diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index 98cf308c8..b0b3777df 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -21,6 +21,7 @@ import com.android.inputmethod.latin.Constants; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; @@ -28,8 +29,7 @@ import java.util.LinkedList; /** * A dictionary that can fusion heads and tails of words for more compression. */ -public class FusionDictionary implements Iterable<Word> { - +public final class FusionDictionary implements Iterable<Word> { private static final boolean DBG = MakedictLog.DBG; /** @@ -40,7 +40,7 @@ public class FusionDictionary implements Iterable<Word> { * This class also contains fields to cache size and address, to help with binary * generation. */ - public static class Node { + public static final class Node { ArrayList<CharGroup> mData; // To help with binary generation int mCachedSize = Integer.MIN_VALUE; @@ -60,7 +60,7 @@ public class FusionDictionary implements Iterable<Word> { * * This represents an "attribute", that is either a bigram or a shortcut. */ - public static class WeightedString { + public static final class WeightedString { public final String mWord; public int mFrequency; public WeightedString(String word, int frequency) { @@ -94,7 +94,7 @@ public class FusionDictionary implements Iterable<Word> { * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or * bigrams, but a non-terminal may not. Moreover, children, if present, are null. */ - public static class CharGroup { + public static final class CharGroup { public static final int NOT_A_TERMINAL = -1; final int mChars[]; ArrayList<WeightedString> mShortcutTargets; @@ -142,6 +142,33 @@ public class FusionDictionary implements Iterable<Word> { return NOT_A_TERMINAL != mFrequency; } + public int getFrequency() { + return mFrequency; + } + + public boolean getIsNotAWord() { + return mIsNotAWord; + } + + public boolean getIsBlacklistEntry() { + return mIsBlacklistEntry; + } + + public ArrayList<WeightedString> getShortcutTargets() { + // We don't want write permission to escape outside the package, so we return a copy + if (null == mShortcutTargets) return null; + final ArrayList<WeightedString> copyOfShortcutTargets = + new ArrayList<WeightedString>(mShortcutTargets); + return copyOfShortcutTargets; + } + + public ArrayList<WeightedString> getBigrams() { + // We don't want write permission to escape outside the package, so we return a copy + if (null == mBigrams) return null; + final ArrayList<WeightedString> copyOfBigrams = new ArrayList<WeightedString>(mBigrams); + return copyOfBigrams; + } + public boolean hasSeveralChars() { assert(mChars.length > 0); return 1 < mChars.length; @@ -250,10 +277,8 @@ public class FusionDictionary implements Iterable<Word> { /** * Options global to the dictionary. - * - * There are no options at the moment, so this class is empty. */ - public static class DictionaryOptions { + public static final class DictionaryOptions { public final boolean mGermanUmlautProcessing; public final boolean mFrenchLigatureProcessing; public final HashMap<String, String> mAttributes; @@ -263,6 +288,43 @@ public class FusionDictionary implements Iterable<Word> { mGermanUmlautProcessing = germanUmlautProcessing; mFrenchLigatureProcessing = frenchLigatureProcessing; } + @Override + public String toString() { // Convenience method + return toString(0, false); + } + public String toString(final int indentCount, final boolean plumbing) { + final StringBuilder indent = new StringBuilder(); + if (plumbing) { + indent.append("H:"); + } else { + for (int i = 0; i < indentCount; ++i) { + indent.append(" "); + } + } + final StringBuilder s = new StringBuilder(); + for (final String optionKey : mAttributes.keySet()) { + s.append(indent); + s.append(optionKey); + s.append(" = "); + if ("date".equals(optionKey) && !plumbing) { + // Date needs a number of milliseconds, but the dictionary contains seconds + s.append(new Date( + 1000 * Long.parseLong(mAttributes.get(optionKey))).toString()); + } else { + s.append(mAttributes.get(optionKey)); + } + s.append("\n"); + } + if (mGermanUmlautProcessing) { + s.append(indent); + s.append("Needs German umlaut processing\n"); + } + if (mFrenchLigatureProcessing) { + s.append(indent); + s.append("Needs French ligature processing\n"); + } + return s.toString(); + } } public final DictionaryOptions mOptions; @@ -280,7 +342,7 @@ public class FusionDictionary implements Iterable<Word> { /** * Helper method to convert a String to an int array. */ - static private int[] getCodePoints(final String word) { + static int[] getCodePoints(final String word) { // TODO: this is a copy-paste of the contents of StringUtils.toCodePointArray, // which is not visible from the makedict package. Factor this code. final char[] characters = word.toCharArray(); @@ -359,6 +421,10 @@ public class FusionDictionary implements Iterable<Word> { if (charGroup2 == null) { add(getCodePoints(word2), 0, null, false /* isNotAWord */, false /* isBlacklistEntry */); + // The chargroup for the first word may have moved by the above insertion, + // if word1 and word2 share a common stem that happens not to have been + // a cutting point until now. In this case, we need to refresh charGroup. + charGroup = findWordInTree(mRoot, word1); } charGroup.addBigram(word2, frequency); } else { @@ -511,7 +577,7 @@ public class FusionDictionary implements Iterable<Word> { * is ignored. * This comparator imposes orderings that are inconsistent with equals. */ - static private class CharGroupComparator implements java.util.Comparator<CharGroup> { + static private final class CharGroupComparator implements java.util.Comparator<CharGroup> { @Override public int compare(CharGroup c1, CharGroup c2) { if (c1.mChars[0] == c2.mChars[0]) return 0; @@ -746,9 +812,8 @@ public class FusionDictionary implements Iterable<Word> { * * This is purely for convenience. */ - public static class DictionaryIterator implements Iterator<Word> { - - private static class Position { + public static final class DictionaryIterator implements Iterator<Word> { + private static final class Position { public Iterator<CharGroup> pos; public int length; public Position(ArrayList<CharGroup> groups) { diff --git a/java/src/com/android/inputmethod/latin/makedict/MakedictLog.java b/java/src/com/android/inputmethod/latin/makedict/MakedictLog.java index 3f0cd0796..6c6b00b6a 100644 --- a/java/src/com/android/inputmethod/latin/makedict/MakedictLog.java +++ b/java/src/com/android/inputmethod/latin/makedict/MakedictLog.java @@ -21,7 +21,7 @@ import android.util.Log; /** * Wrapper to redirect log events to the right output medium. */ -public class MakedictLog { +public final class MakedictLog { public static final boolean DBG = false; private static final String TAG = MakedictLog.class.getSimpleName(); diff --git a/java/src/com/android/inputmethod/latin/makedict/PendingAttribute.java b/java/src/com/android/inputmethod/latin/makedict/PendingAttribute.java index 5b41d27f2..5bb24da74 100644 --- a/java/src/com/android/inputmethod/latin/makedict/PendingAttribute.java +++ b/java/src/com/android/inputmethod/latin/makedict/PendingAttribute.java @@ -22,7 +22,7 @@ package com.android.inputmethod.latin.makedict; * An attribute is either a bigram or a shortcut. * All instances of this class are always immutable. */ -public class PendingAttribute { +public final class PendingAttribute { public final int mFrequency; public final int mAddress; public PendingAttribute(final int frequency, final int address) { diff --git a/java/src/com/android/inputmethod/latin/makedict/UnsupportedFormatException.java b/java/src/com/android/inputmethod/latin/makedict/UnsupportedFormatException.java index bd42fb8fa..dbb2ea870 100644 --- a/java/src/com/android/inputmethod/latin/makedict/UnsupportedFormatException.java +++ b/java/src/com/android/inputmethod/latin/makedict/UnsupportedFormatException.java @@ -19,7 +19,7 @@ package com.android.inputmethod.latin.makedict; /** * Simple exception thrown when a file format is not recognized. */ -public class UnsupportedFormatException extends Exception { +public final class UnsupportedFormatException extends Exception { public UnsupportedFormatException(String description) { super(description); } diff --git a/java/src/com/android/inputmethod/latin/makedict/Word.java b/java/src/com/android/inputmethod/latin/makedict/Word.java index 4683ef154..4c4f18f1a 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Word.java +++ b/java/src/com/android/inputmethod/latin/makedict/Word.java @@ -26,7 +26,7 @@ import java.util.Arrays; * * This is chiefly used to iterate a dictionary. */ -public class Word implements Comparable<Word> { +public final class Word implements Comparable<Word> { public final String mWord; public final int mFrequency; public final ArrayList<WeightedString> mShortcutTargets; |