diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java')
-rw-r--r-- | java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java | 845 |
1 files changed, 816 insertions, 29 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java index 7a1b9dcb7..ee0e9cd7e 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java @@ -16,21 +16,34 @@ package com.android.inputmethod.latin.makedict; +import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.Constants; +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.CharEncoding; import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface; import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import java.io.IOException; +import java.io.OutputStream; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; import java.util.Map; import java.util.Stack; -public class BinaryDictIOUtils { +public final class BinaryDictIOUtils { private static final boolean DBG = false; + private static final int MSB24 = 0x800000; + private static final int SINT24_MAX = 0x7FFFFF; + private static final int MAX_JUMPS = 10000; - private static class Position { + private BinaryDictIOUtils() { + // This utility class is not publicly instantiable. + } + + private static final class Position { public static final int NOT_READ_GROUPCOUNT = -1; public int mAddress; @@ -77,7 +90,10 @@ public class BinaryDictIOUtils { p.mAddress += BinaryDictInputOutput.getGroupCountSize(p.mNumOfCharGroup); p.mPosition = 0; } - + if (p.mNumOfCharGroup == 0) { + stack.pop(); + continue; + } CharGroupInfo info = BinaryDictInputOutput.readCharGroup(buffer, p.mAddress - headerSize, formatOptions); for (int i = 0; i < info.mCharacters.length; ++i) { @@ -85,20 +101,36 @@ public class BinaryDictIOUtils { } p.mPosition++; - if (info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) { // found word + final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags, + formatOptions); + final boolean isDeletedGroup = BinaryDictInputOutput.isDeletedGroup(info.mFlags, + formatOptions); + if (!isMovedGroup && !isDeletedGroup + && info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word words.put(info.mOriginalAddress, new String(pushedChars, 0, index)); frequencies.put(info.mOriginalAddress, info.mFrequency); if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams); } if (p.mPosition == p.mNumOfCharGroup) { - stack.pop(); + if (formatOptions.mSupportsDynamicUpdate) { + final int forwardLinkAddress = buffer.readUnsignedInt24(); + if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) { + // the node has a forward link. + p.mNumOfCharGroup = Position.NOT_READ_GROUPCOUNT; + p.mAddress = forwardLinkAddress; + } else { + stack.pop(); + } + } else { + stack.pop(); + } } else { // the node has more groups. p.mAddress = buffer.position(); } - if (BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) { + if (!isMovedGroup && BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) { Position childrenPos = new Position(info.mChildrenAddress + headerSize, index); stack.push(childrenPos); } @@ -136,6 +168,7 @@ public class BinaryDictIOUtils { * @throws IOException * @throws UnsupportedFormatException */ + @UsedForTesting public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer, final String word) throws IOException, UnsupportedFormatException { if (word == null) return FormatSpec.NOT_VALID_WORD; @@ -146,48 +179,802 @@ public class BinaryDictIOUtils { final int wordLen = word.codePointCount(0, word.length()); for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) { if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; - int groupOffset = buffer.position() - header.mHeaderSize; + + do { + final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); + boolean foundNextCharGroup = false; + for (int i = 0; i < charGroupCount; ++i) { + final int charGroupPos = buffer.position(); + final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, + buffer.position(), header.mFormatOptions); + final boolean isMovedGroup = + BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, + header.mFormatOptions); + final boolean isDeletedGroup = + BinaryDictInputOutput.isDeletedGroup(currentInfo.mFlags, + header.mFormatOptions); + if (isMovedGroup) continue; + boolean same = true; + for (int p = 0, j = word.offsetByCodePoints(0, wordPos); + p < currentInfo.mCharacters.length; + ++p, j = word.offsetByCodePoints(j, 1)) { + if (wordPos + p >= wordLen + || word.codePointAt(j) != currentInfo.mCharacters[p]) { + same = false; + break; + } + } + + if (same) { + // found the group matches the word. + if (wordPos + currentInfo.mCharacters.length == wordLen) { + if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL + || isDeletedGroup) { + return FormatSpec.NOT_VALID_WORD; + } else { + return charGroupPos; + } + } + wordPos += currentInfo.mCharacters.length; + if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { + return FormatSpec.NOT_VALID_WORD; + } + foundNextCharGroup = true; + buffer.position(currentInfo.mChildrenAddress); + break; + } + } + + // If we found the next char group, it is under the file pointer. + // But if not, we are at the end of this node so we expect to have + // a forward link address that we need to consult and possibly resume + // search on the next node in the linked list. + if (foundNextCharGroup) break; + if (!header.mFormatOptions.mSupportsDynamicUpdate) { + return FormatSpec.NOT_VALID_WORD; + } + + final int forwardLinkAddress = buffer.readUnsignedInt24(); + if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { + return FormatSpec.NOT_VALID_WORD; + } + buffer.position(forwardLinkAddress); + } while(true); + } + return FormatSpec.NOT_VALID_WORD; + } + + private static int markAsDeleted(final int flags) { + return (flags & (~FormatSpec.MASK_GROUP_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED; + } + + /** + * Delete the word from the binary file. + * + * @param buffer the buffer to write. + * @param word the word we delete + * @throws IOException + * @throws UnsupportedFormatException + */ + @UsedForTesting + public static void deleteWord(final FusionDictionaryBufferInterface buffer, + final String word) throws IOException, UnsupportedFormatException { + buffer.position(0); + final FileHeader header = BinaryDictInputOutput.readHeader(buffer); + final int wordPosition = getTerminalPosition(buffer, word); + if (wordPosition == FormatSpec.NOT_VALID_WORD) return; + + buffer.position(wordPosition); + final int flags = buffer.readUnsignedByte(); + buffer.position(wordPosition); + buffer.put((byte)markAsDeleted(flags)); + } + + /** + * @return the size written, in bytes. Always 3 bytes. + */ + private static int writeSInt24ToBuffer(final FusionDictionaryBufferInterface buffer, + final int value) { + final int absValue = Math.abs(value); + buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); + buffer.put((byte)((absValue >> 8) & 0xFF)); + buffer.put((byte)(absValue & 0xFF)); + return 3; + } + + /** + * @return the size written, in bytes. Always 3 bytes. + */ + private static int writeSInt24ToStream(final OutputStream destination, final int value) + throws IOException { + final int absValue = Math.abs(value); + destination.write((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); + destination.write((byte)((absValue >> 8) & 0xFF)); + destination.write((byte)(absValue & 0xFF)); + return 3; + } + + /** + * @return the size written, in bytes. 1, 2, or 3 bytes. + */ + private static int writeVariableAddress(final OutputStream destination, final int value) + throws IOException { + switch (BinaryDictInputOutput.getByteSize(value)) { + case 1: + destination.write((byte)value); + break; + case 2: + destination.write((byte)(0xFF & (value >> 8))); + destination.write((byte)(0xFF & value)); + break; + case 3: + destination.write((byte)(0xFF & (value >> 16))); + destination.write((byte)(0xFF & (value >> 8))); + destination.write((byte)(0xFF & value)); + break; + } + return BinaryDictInputOutput.getByteSize(value); + } + + /** + * Update a parent address in a CharGroup that is referred to by groupOriginAddress. + * + * @param buffer the buffer to write. + * @param groupOriginAddress the address of the group. + * @param newParentAddress the absolute address of the parent. + * @param formatOptions file format options. + */ + public static void updateParentAddress(final FusionDictionaryBufferInterface buffer, + final int groupOriginAddress, final int newParentAddress, + final FormatOptions formatOptions) { + final int originalPosition = buffer.position(); + buffer.position(groupOriginAddress); + if (!formatOptions.mSupportsDynamicUpdate) { + throw new RuntimeException("this file format does not support parent addresses"); + } + final int flags = buffer.readUnsignedByte(); + if (BinaryDictInputOutput.isMovedGroup(flags, formatOptions)) { + // if the group is moved, the parent address is stored in the destination group. + // We are guaranteed to process the destination group later, so there is no need to + // update anything here. + buffer.position(originalPosition); + return; + } + if (DBG) { + MakedictLog.d("update parent address flags=" + flags + ", " + groupOriginAddress); + } + final int parentOffset = newParentAddress - groupOriginAddress; + writeSInt24ToBuffer(buffer, parentOffset); + buffer.position(originalPosition); + } + + private static void skipCharGroup(final FusionDictionaryBufferInterface buffer, + final FormatOptions formatOptions) { + final int flags = buffer.readUnsignedByte(); + BinaryDictInputOutput.readParentAddress(buffer, formatOptions); + skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); + BinaryDictInputOutput.readChildrenAddress(buffer, flags, formatOptions); + if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); + if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) { + final int shortcutsSize = buffer.readUnsignedShort(); + buffer.position(buffer.position() + shortcutsSize + - FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE); + } + if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) { + int bigramCount = 0; + while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { + final int bigramFlags = buffer.readUnsignedByte(); + switch (bigramFlags & FormatSpec.MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + buffer.readUnsignedByte(); + break; + case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + buffer.readUnsignedShort(); + break; + case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + buffer.readUnsignedInt24(); + break; + } + if ((bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT) == 0) break; + } + if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) { + throw new RuntimeException("Too many bigrams in a group."); + } + } + } + + /** + * Update parent addresses in a Node that is referred to by nodeOriginAddress. + * + * @param buffer the buffer to be modified. + * @param nodeOriginAddress the address of a modified Node. + * @param newParentAddress the address to be written. + * @param formatOptions file format options. + */ + public static void updateParentAddresses(final FusionDictionaryBufferInterface buffer, + final int nodeOriginAddress, final int newParentAddress, + final FormatOptions formatOptions) { + final int originalPosition = buffer.position(); + buffer.position(nodeOriginAddress); + do { + final int count = BinaryDictInputOutput.readCharGroupCount(buffer); + for (int i = 0; i < count; ++i) { + updateParentAddress(buffer, buffer.position(), newParentAddress, formatOptions); + skipCharGroup(buffer, formatOptions); + } + final int forwardLinkAddress = buffer.readUnsignedInt24(); + buffer.position(forwardLinkAddress); + } while (formatOptions.mSupportsDynamicUpdate + && buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS); + buffer.position(originalPosition); + } + + private static void skipString(final FusionDictionaryBufferInterface buffer, + final boolean hasMultipleChars) { + if (hasMultipleChars) { + int character = CharEncoding.readChar(buffer); + while (character != FormatSpec.INVALID_CHARACTER) { + character = CharEncoding.readChar(buffer); + } + } else { + CharEncoding.readChar(buffer); + } + } + + /** + * Write a string to a stream. + * + * @param destination the stream to write. + * @param word the string to be written. + * @return the size written, in bytes. + * @throws IOException + */ + private static int writeString(final OutputStream destination, final String word) + throws IOException { + int size = 0; + final int length = word.length(); + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + final int codePoint = word.codePointAt(i); + if (CharEncoding.getCharSize(codePoint) == 1) { + destination.write((byte)codePoint); + size++; + } else { + destination.write((byte)(0xFF & (codePoint >> 16))); + destination.write((byte)(0xFF & (codePoint >> 8))); + destination.write((byte)(0xFF & codePoint)); + size += 3; + } + } + destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR); + size += FormatSpec.GROUP_TERMINATOR_SIZE; + return size; + } + + /** + * Update a children address in a CharGroup that is addressed by groupOriginAddress. + * + * @param buffer the buffer to write. + * @param groupOriginAddress the address of the group. + * @param newChildrenAddress the absolute address of the child. + * @param formatOptions file format options. + */ + public static void updateChildrenAddress(final FusionDictionaryBufferInterface buffer, + final int groupOriginAddress, final int newChildrenAddress, + final FormatOptions formatOptions) { + final int originalPosition = buffer.position(); + buffer.position(groupOriginAddress); + final int flags = buffer.readUnsignedByte(); + final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions); + skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); + if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); + final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS + ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position(); + writeSInt24ToBuffer(buffer, childrenOffset); + buffer.position(originalPosition); + } + + /** + * Write a char group to an output stream. + * A char group is an in-memory representation of a node in trie. + * A char group info is an on-disk representation of a node. + * + * @param destination the stream to write. + * @param info the char group info to be written. + * @return the size written, in bytes. + */ + public static int writeCharGroup(final OutputStream destination, final CharGroupInfo info) + throws IOException { + int size = FormatSpec.GROUP_FLAGS_SIZE; + destination.write((byte)info.mFlags); + final int parentOffset = info.mParentAddress == FormatSpec.NO_PARENT_ADDRESS ? + FormatSpec.NO_PARENT_ADDRESS : info.mParentAddress - info.mOriginalAddress; + size += writeSInt24ToStream(destination, parentOffset); + + for (int i = 0; i < info.mCharacters.length; ++i) { + if (CharEncoding.getCharSize(info.mCharacters[i]) == 1) { + destination.write((byte)info.mCharacters[i]); + size++; + } else { + size += writeSInt24ToStream(destination, info.mCharacters[i]); + } + } + if (info.mCharacters.length > 1) { + destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR); + size++; + } + + if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) { + destination.write((byte)info.mFrequency); + size++; + } + + if (DBG) { + MakedictLog.d("writeCharGroup origin=" + info.mOriginalAddress + ", size=" + size + + ", child=" + info.mChildrenAddress + ", characters =" + + new String(info.mCharacters, 0, info.mCharacters.length)); + } + final int childrenOffset = info.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS ? + 0 : info.mChildrenAddress - (info.mOriginalAddress + size); + writeSInt24ToStream(destination, childrenOffset); + size += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; + + if (info.mShortcutTargets != null && info.mShortcutTargets.size() > 0) { + final int shortcutListSize = + BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets); + destination.write((byte)(shortcutListSize >> 8)); + destination.write((byte)(shortcutListSize & 0xFF)); + size += 2; + final Iterator<WeightedString> shortcutIterator = info.mShortcutTargets.iterator(); + while (shortcutIterator.hasNext()) { + final WeightedString target = shortcutIterator.next(); + destination.write((byte)BinaryDictInputOutput.makeShortcutFlags( + shortcutIterator.hasNext(), target.mFrequency)); + size++; + size += writeString(destination, target.mWord); + } + } + + if (info.mBigrams != null) { + // TODO: Consolidate this code with the code that computes the size of the bigram list + // in BinaryDictionaryInputOutput#computeActualNodeSize + for (int i = 0; i < info.mBigrams.size(); ++i) { + + final int bigramFrequency = info.mBigrams.get(i).mFrequency; + int bigramFlags = (i < info.mBigrams.size() - 1) + ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0; + size++; + final int bigramOffset = info.mBigrams.get(i).mAddress - (info.mOriginalAddress + + size); + bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0; + switch (BinaryDictInputOutput.getByteSize(bigramOffset)) { + case 1: + bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + break; + case 2: + bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + break; + case 3: + bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + break; + } + bigramFlags |= bigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY; + destination.write((byte)bigramFlags); + size += writeVariableAddress(destination, Math.abs(bigramOffset)); + } + } + return size; + } + + @SuppressWarnings("unused") + private static void updateForwardLink(final FusionDictionaryBufferInterface buffer, + final int nodeOriginAddress, final int newNodeAddress, + final FormatOptions formatOptions) { + buffer.position(nodeOriginAddress); + int jumpCount = 0; + while (jumpCount++ < MAX_JUMPS) { + final int count = BinaryDictInputOutput.readCharGroupCount(buffer); + for (int i = 0; i < count; ++i) skipCharGroup(buffer, formatOptions); + final int forwardLinkAddress = buffer.readUnsignedInt24(); + if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { + buffer.position(buffer.position() - FormatSpec.FORWARD_LINK_ADDRESS_SIZE); + writeSInt24ToBuffer(buffer, newNodeAddress); + return; + } + buffer.position(forwardLinkAddress); + } + if (DBG && jumpCount >= MAX_JUMPS) { + throw new RuntimeException("too many jumps, probably a bug."); + } + } + + /** + * Helper method to move a char group to the tail of the file. + */ + private static int moveCharGroup(final OutputStream destination, + final FusionDictionaryBufferInterface buffer, final CharGroupInfo info, + final int nodeOriginAddress, final int oldGroupAddress, + final FormatOptions formatOptions) throws IOException { + updateParentAddress(buffer, oldGroupAddress, buffer.limit() + 1, formatOptions); + buffer.position(oldGroupAddress); + final int currentFlags = buffer.readUnsignedByte(); + buffer.position(oldGroupAddress); + buffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags + & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG)))); + int size = FormatSpec.GROUP_FLAGS_SIZE; + updateForwardLink(buffer, nodeOriginAddress, buffer.limit(), formatOptions); + size += writeNode(destination, new CharGroupInfo[] { info }); + return size; + } + + /** + * Compute the size of the char group. + */ + private static int computeGroupSize(final CharGroupInfo info, + final FormatOptions formatOptions) { + int size = FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE + + BinaryDictInputOutput.getGroupCharactersSize(info.mCharacters) + + BinaryDictInputOutput.getChildrenAddressSize(info.mFlags, formatOptions); + if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) { + size += FormatSpec.GROUP_FREQUENCY_SIZE; + } + if (info.mShortcutTargets != null && !info.mShortcutTargets.isEmpty()) { + size += BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets); + } + if (info.mBigrams != null) { + for (final PendingAttribute attr : info.mBigrams) { + size += FormatSpec.GROUP_FLAGS_SIZE; + size += BinaryDictInputOutput.getByteSize(attr.mAddress); + } + } + return size; + } + + /** + * Write a node to the stream. + * + * @param destination the stream to write. + * @param infos groups to be written. + * @return the size written, in bytes. + * @throws IOException + */ + private static int writeNode(final OutputStream destination, final CharGroupInfo[] infos) + throws IOException { + int size = BinaryDictInputOutput.getGroupCountSize(infos.length); + switch (BinaryDictInputOutput.getGroupCountSize(infos.length)) { + case 1: + destination.write((byte)infos.length); + break; + case 2: + destination.write((byte)(infos.length >> 8)); + destination.write((byte)(infos.length & 0xFF)); + break; + default: + throw new RuntimeException("Invalid group count size."); + } + for (final CharGroupInfo info : infos) size += writeCharGroup(destination, info); + writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS); + return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; + } + + /** + * Move a group that is referred to by oldGroupOrigin to the tail of the file. + * And set the children address to the byte after the group. + * + * @param nodeOrigin the address of the tail of the file. + * @param characters + * @param length + * @param flags + * @param frequency + * @param parentAddress + * @param shortcutTargets + * @param bigrams + * @param destination the stream representing the tail of the file. + * @param buffer the buffer representing the (constant-size) body of the file. + * @param oldNodeOrigin + * @param oldGroupOrigin + * @param formatOptions + * @return the size written, in bytes. + * @throws IOException + */ + private static int moveGroup(final int nodeOrigin, final int[] characters, final int length, + final int flags, final int frequency, final int parentAddress, + final ArrayList<WeightedString> shortcutTargets, + final ArrayList<PendingAttribute> bigrams, final OutputStream destination, + final FusionDictionaryBufferInterface buffer, final int oldNodeOrigin, + final int oldGroupOrigin, final FormatOptions formatOptions) throws IOException { + int size = 0; + final int newGroupOrigin = nodeOrigin + 1; + final int[] writtenCharacters = Arrays.copyOfRange(characters, 0, length); + final CharGroupInfo tmpInfo = new CharGroupInfo(newGroupOrigin, -1 /* endAddress */, + flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS, + shortcutTargets, bigrams); + size = computeGroupSize(tmpInfo, formatOptions); + final CharGroupInfo newInfo = new CharGroupInfo(newGroupOrigin, newGroupOrigin + size, + flags, writtenCharacters, frequency, parentAddress, + nodeOrigin + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets, + bigrams); + moveCharGroup(destination, buffer, newInfo, oldNodeOrigin, oldGroupOrigin, formatOptions); + return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; + } + + /** + * Insert a word into a binary dictionary. + * + * @param buffer + * @param destination + * @param word + * @param frequency + * @param bigramStrings + * @param shortcuts + * @throws IOException + * @throws UnsupportedFormatException + */ + // TODO: Support batch insertion. + // TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary. + @UsedForTesting + public static void insertWord(final FusionDictionaryBufferInterface buffer, + final OutputStream destination, final String word, final int frequency, + final ArrayList<WeightedString> bigramStrings, + final ArrayList<WeightedString> shortcuts, final boolean isNotAWord, + final boolean isBlackListEntry) + throws IOException, UnsupportedFormatException { + final ArrayList<PendingAttribute> bigrams = new ArrayList<PendingAttribute>(); + if (bigramStrings != null) { + for (final WeightedString bigram : bigramStrings) { + int position = getTerminalPosition(buffer, bigram.mWord); + if (position == FormatSpec.NOT_VALID_WORD) { + // TODO: figure out what is the correct thing to do here. + } else { + bigrams.add(new PendingAttribute(bigram.mFrequency, position)); + } + } + } + + final boolean isTerminal = true; + final boolean hasBigrams = !bigrams.isEmpty(); + final boolean hasShortcuts = shortcuts != null && !shortcuts.isEmpty(); + + // find the insert position of the word. + if (buffer.position() != 0) buffer.position(0); + final FileHeader header = BinaryDictInputOutput.readHeader(buffer); + + int wordPos = 0, address = buffer.position(), nodeOriginAddress = buffer.position(); + final int[] codePoints = FusionDictionary.getCodePoints(word); + final int wordLen = codePoints.length; + + for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) { + if (wordPos >= wordLen) break; + nodeOriginAddress = buffer.position(); + int nodeParentAddress = -1; final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); - groupOffset += BinaryDictInputOutput.getGroupCountSize(charGroupCount); + boolean foundNextGroup = false; for (int i = 0; i < charGroupCount; ++i) { - final int charGroupPos = buffer.position(); + address = buffer.position(); final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, buffer.position(), header.mFormatOptions); - boolean same = true; - for (int p = 0, j = word.offsetByCodePoints(0, wordPos); - p < currentInfo.mCharacters.length; - ++p, j = word.offsetByCodePoints(j, 1)) { - if (wordPos + p >= wordLen - || word.codePointAt(j) != currentInfo.mCharacters[p]) { - same = false; + final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, + header.mFormatOptions); + if (isMovedGroup) continue; + nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS) + ? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address; + boolean matched = true; + for (int p = 0; p < currentInfo.mCharacters.length; ++p) { + if (wordPos + p >= wordLen) { + /* + * splitting + * before + * abcd - ef + * + * insert "abc" + * + * after + * abc - d - ef + */ + final int newNodeAddress = buffer.limit(); + final int flags = BinaryDictInputOutput.makeCharGroupFlags(p > 1, + isTerminal, 0, hasShortcuts, hasBigrams, false /* isNotAWord */, + false /* isBlackListEntry */, header.mFormatOptions); + int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, flags, + frequency, nodeParentAddress, shortcuts, bigrams, destination, + buffer, nodeOriginAddress, address, header.mFormatOptions); + + final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p, + currentInfo.mCharacters.length); + if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { + updateParentAddresses(buffer, currentInfo.mChildrenAddress, + newNodeAddress + written + 1, header.mFormatOptions); + } + final CharGroupInfo newInfo2 = new CharGroupInfo( + newNodeAddress + written + 1, -1 /* endAddress */, + currentInfo.mFlags, characters2, currentInfo.mFrequency, + newNodeAddress + 1, currentInfo.mChildrenAddress, + currentInfo.mShortcutTargets, currentInfo.mBigrams); + writeNode(destination, new CharGroupInfo[] { newInfo2 }); + return; + } else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) { + if (p > 0) { + /* + * splitting + * before + * ab - cd + * + * insert "ac" + * + * after + * a - b - cd + * | + * - c + */ + + final int newNodeAddress = buffer.limit(); + final int childrenAddress = currentInfo.mChildrenAddress; + + // move prefix + final int prefixFlags = BinaryDictInputOutput.makeCharGroupFlags(p > 1, + false /* isTerminal */, 0 /* childrenAddressSize*/, + false /* hasShortcut */, false /* hasBigrams */, + false /* isNotAWord */, false /* isBlackListEntry */, + header.mFormatOptions); + int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, + prefixFlags, -1 /* frequency */, nodeParentAddress, null, null, + destination, buffer, nodeOriginAddress, address, + header.mFormatOptions); + + final int[] suffixCharacters = Arrays.copyOfRange( + currentInfo.mCharacters, p, currentInfo.mCharacters.length); + if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { + updateParentAddresses(buffer, currentInfo.mChildrenAddress, + newNodeAddress + written + 1, header.mFormatOptions); + } + final int suffixFlags = BinaryDictInputOutput.makeCharGroupFlags( + suffixCharacters.length > 1, + (currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0, + 0 /* childrenAddressSize */, + (currentInfo.mFlags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) + != 0, + (currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final CharGroupInfo suffixInfo = new CharGroupInfo( + newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags, + suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1, + currentInfo.mChildrenAddress, currentInfo.mShortcutTargets, + currentInfo.mBigrams); + written += computeGroupSize(suffixInfo, header.mFormatOptions) + 1; + + final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p, + codePoints.length); + final int flags = BinaryDictInputOutput.makeCharGroupFlags( + newCharacters.length > 1, isTerminal, + 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final CharGroupInfo newInfo = new CharGroupInfo( + newNodeAddress + written, -1 /* endAddress */, flags, + newCharacters, frequency, newNodeAddress + 1, + FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); + writeNode(destination, new CharGroupInfo[] { suffixInfo, newInfo }); + return; + } + matched = false; break; } } - if (same) { + if (matched) { if (wordPos + currentInfo.mCharacters.length == wordLen) { - if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) { - return FormatSpec.NOT_VALID_WORD; - } else { - return charGroupPos; - } + // the word exists in the dictionary. + // only update group. + final int newNodeAddress = buffer.limit(); + final boolean hasMultipleChars = currentInfo.mCharacters.length > 1; + final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars, + isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1, + -1 /* endAddress */, flags, currentInfo.mCharacters, frequency, + nodeParentAddress, currentInfo.mChildrenAddress, shortcuts, + bigrams); + moveCharGroup(destination, buffer, newInfo, nodeOriginAddress, address, + header.mFormatOptions); + return; } wordPos += currentInfo.mCharacters.length; if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { - return FormatSpec.NOT_VALID_WORD; + /* + * found the prefix of the word. + * make new node and link to the node from this group. + * + * before + * ab - cd + * + * insert "abcde" + * + * after + * ab - cd - e + */ + final int newNodeAddress = buffer.limit(); + updateChildrenAddress(buffer, address, newNodeAddress, + header.mFormatOptions); + final int newGroupAddress = newNodeAddress + 1; + final boolean hasMultipleChars = (wordLen - wordPos) > 1; + final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars, + isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); + final CharGroupInfo newInfo = new CharGroupInfo(newGroupAddress, -1, flags, + characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS, + shortcuts, bigrams); + writeNode(destination, new CharGroupInfo[] { newInfo }); + return; } buffer.position(currentInfo.mChildrenAddress); + foundNextGroup = true; break; } - groupOffset = currentInfo.mEndAddress; + } - // not found - if (i >= charGroupCount - 1) { - return FormatSpec.NOT_VALID_WORD; - } + if (foundNextGroup) continue; + + // reached the end of the array. + final int linkAddressPosition = buffer.position(); + int nextLink = buffer.readUnsignedInt24(); + if ((nextLink & MSB24) != 0) { + nextLink = -(nextLink & SINT24_MAX); + } + if (nextLink == FormatSpec.NO_FORWARD_LINK_ADDRESS) { + /* + * expand this node. + * + * before + * ab - cd + * + * insert "abef" + * + * after + * ab - cd + * | + * - ef + */ + + // change the forward link address. + final int newNodeAddress = buffer.limit(); + buffer.position(linkAddressPosition); + writeSInt24ToBuffer(buffer, newNodeAddress); + + final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen); + final int flags = BinaryDictInputOutput.makeCharGroupFlags(characters.length > 1, + isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams, + isNotAWord, isBlackListEntry, header.mFormatOptions); + final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1, + -1 /* endAddress */, flags, characters, frequency, nodeParentAddress, + FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams); + writeNode(destination, new CharGroupInfo[]{ newInfo }); + return; + } else { + depth--; + buffer.position(nextLink); } } - return FormatSpec.NOT_VALID_WORD; + } + + /** + * Find a word from the buffer. + * + * @param buffer the buffer representing the body of the dictionary file. + * @param word the word searched + * @return the found group + * @throws IOException + * @throws UnsupportedFormatException + */ + @UsedForTesting + public static CharGroupInfo findWordFromBuffer(final FusionDictionaryBufferInterface buffer, + final String word) throws IOException, UnsupportedFormatException { + int position = getTerminalPosition(buffer, word); + if (position != FormatSpec.NOT_VALID_WORD) { + buffer.position(0); + final FileHeader header = BinaryDictInputOutput.readHeader(buffer); + buffer.position(position); + return BinaryDictInputOutput.readCharGroup(buffer, position, header.mFormatOptions); + } + return null; } } |