aboutsummaryrefslogtreecommitdiffstats
path: root/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
diff options
context:
space:
mode:
authorSatoshi Kataoka <satok@google.com>2012-11-06 12:49:53 +0900
committerSatoshi Kataoka <satok@google.com>2012-11-06 12:49:53 +0900
commit555e15a96a00e8829981557d96e9fa7fc5a74f8c (patch)
tree9750c7af9777b63c176cc780c2a6de4849931434 /java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
parent9381ab669f12664f7e2debea846d3ce71f89b256 (diff)
parent5f2fa6b82cbb6714ab2996aebc16f10c62d0e673 (diff)
downloadlatinime-555e15a96a00e8829981557d96e9fa7fc5a74f8c.tar.gz
latinime-555e15a96a00e8829981557d96e9fa7fc5a74f8c.tar.xz
latinime-555e15a96a00e8829981557d96e9fa7fc5a74f8c.zip
Merge remote-tracking branch 'goog/master' into mergescriptpackage
Conflicts: java/res/values-ca/strings.xml java/res/values-cs/strings.xml java/res/values-de/strings.xml java/res/values-es/strings.xml java/res/values-hr/strings.xml java/res/values-hu/strings.xml java/res/values-it/strings.xml java/res/values-lv/strings.xml java/res/values-nb/strings.xml java/res/values-nl/strings.xml java/res/values-pl/strings.xml java/res/values-pt/strings.xml java/res/values-ro/strings.xml java/res/values-ru/strings.xml java/res/values-sv/strings.xml java/res/values-sw/strings.xml java/res/values-tr/strings.xml java/res/values-uk/strings.xml java/res/values-zh-rCN/strings.xml java/res/values-zh-rTW/strings.xml java/src/com/android/inputmethod/latin/RichInputConnection.java Change-Id: Iba00dd5b86cb16d72968bc7e40d75853845b6dcb
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java')
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java845
1 files changed, 816 insertions, 29 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
index 7a1b9dcb7..ee0e9cd7e 100644
--- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
+++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
@@ -16,21 +16,34 @@
package com.android.inputmethod.latin.makedict;
+import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.Constants;
+import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.CharEncoding;
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
+import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import java.io.IOException;
+import java.io.OutputStream;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
import java.util.Map;
import java.util.Stack;
-public class BinaryDictIOUtils {
+public final class BinaryDictIOUtils {
private static final boolean DBG = false;
+ private static final int MSB24 = 0x800000;
+ private static final int SINT24_MAX = 0x7FFFFF;
+ private static final int MAX_JUMPS = 10000;
- private static class Position {
+ private BinaryDictIOUtils() {
+ // This utility class is not publicly instantiable.
+ }
+
+ private static final class Position {
public static final int NOT_READ_GROUPCOUNT = -1;
public int mAddress;
@@ -77,7 +90,10 @@ public class BinaryDictIOUtils {
p.mAddress += BinaryDictInputOutput.getGroupCountSize(p.mNumOfCharGroup);
p.mPosition = 0;
}
-
+ if (p.mNumOfCharGroup == 0) {
+ stack.pop();
+ continue;
+ }
CharGroupInfo info = BinaryDictInputOutput.readCharGroup(buffer,
p.mAddress - headerSize, formatOptions);
for (int i = 0; i < info.mCharacters.length; ++i) {
@@ -85,20 +101,36 @@ public class BinaryDictIOUtils {
}
p.mPosition++;
- if (info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) { // found word
+ final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags,
+ formatOptions);
+ final boolean isDeletedGroup = BinaryDictInputOutput.isDeletedGroup(info.mFlags,
+ formatOptions);
+ if (!isMovedGroup && !isDeletedGroup
+ && info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word
words.put(info.mOriginalAddress, new String(pushedChars, 0, index));
frequencies.put(info.mOriginalAddress, info.mFrequency);
if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams);
}
if (p.mPosition == p.mNumOfCharGroup) {
- stack.pop();
+ if (formatOptions.mSupportsDynamicUpdate) {
+ final int forwardLinkAddress = buffer.readUnsignedInt24();
+ if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) {
+ // the node has a forward link.
+ p.mNumOfCharGroup = Position.NOT_READ_GROUPCOUNT;
+ p.mAddress = forwardLinkAddress;
+ } else {
+ stack.pop();
+ }
+ } else {
+ stack.pop();
+ }
} else {
// the node has more groups.
p.mAddress = buffer.position();
}
- if (BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) {
+ if (!isMovedGroup && BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) {
Position childrenPos = new Position(info.mChildrenAddress + headerSize, index);
stack.push(childrenPos);
}
@@ -136,6 +168,7 @@ public class BinaryDictIOUtils {
* @throws IOException
* @throws UnsupportedFormatException
*/
+ @UsedForTesting
public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer,
final String word) throws IOException, UnsupportedFormatException {
if (word == null) return FormatSpec.NOT_VALID_WORD;
@@ -146,48 +179,802 @@ public class BinaryDictIOUtils {
final int wordLen = word.codePointCount(0, word.length());
for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
- int groupOffset = buffer.position() - header.mHeaderSize;
+
+ do {
+ final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);
+ boolean foundNextCharGroup = false;
+ for (int i = 0; i < charGroupCount; ++i) {
+ final int charGroupPos = buffer.position();
+ final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
+ buffer.position(), header.mFormatOptions);
+ final boolean isMovedGroup =
+ BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags,
+ header.mFormatOptions);
+ final boolean isDeletedGroup =
+ BinaryDictInputOutput.isDeletedGroup(currentInfo.mFlags,
+ header.mFormatOptions);
+ if (isMovedGroup) continue;
+ boolean same = true;
+ for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
+ p < currentInfo.mCharacters.length;
+ ++p, j = word.offsetByCodePoints(j, 1)) {
+ if (wordPos + p >= wordLen
+ || word.codePointAt(j) != currentInfo.mCharacters[p]) {
+ same = false;
+ break;
+ }
+ }
+
+ if (same) {
+ // found the group matches the word.
+ if (wordPos + currentInfo.mCharacters.length == wordLen) {
+ if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL
+ || isDeletedGroup) {
+ return FormatSpec.NOT_VALID_WORD;
+ } else {
+ return charGroupPos;
+ }
+ }
+ wordPos += currentInfo.mCharacters.length;
+ if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
+ return FormatSpec.NOT_VALID_WORD;
+ }
+ foundNextCharGroup = true;
+ buffer.position(currentInfo.mChildrenAddress);
+ break;
+ }
+ }
+
+ // If we found the next char group, it is under the file pointer.
+ // But if not, we are at the end of this node so we expect to have
+ // a forward link address that we need to consult and possibly resume
+ // search on the next node in the linked list.
+ if (foundNextCharGroup) break;
+ if (!header.mFormatOptions.mSupportsDynamicUpdate) {
+ return FormatSpec.NOT_VALID_WORD;
+ }
+
+ final int forwardLinkAddress = buffer.readUnsignedInt24();
+ if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
+ return FormatSpec.NOT_VALID_WORD;
+ }
+ buffer.position(forwardLinkAddress);
+ } while(true);
+ }
+ return FormatSpec.NOT_VALID_WORD;
+ }
+
+ private static int markAsDeleted(final int flags) {
+ return (flags & (~FormatSpec.MASK_GROUP_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED;
+ }
+
+ /**
+ * Delete the word from the binary file.
+ *
+ * @param buffer the buffer to write.
+ * @param word the word we delete
+ * @throws IOException
+ * @throws UnsupportedFormatException
+ */
+ @UsedForTesting
+ public static void deleteWord(final FusionDictionaryBufferInterface buffer,
+ final String word) throws IOException, UnsupportedFormatException {
+ buffer.position(0);
+ final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
+ final int wordPosition = getTerminalPosition(buffer, word);
+ if (wordPosition == FormatSpec.NOT_VALID_WORD) return;
+
+ buffer.position(wordPosition);
+ final int flags = buffer.readUnsignedByte();
+ buffer.position(wordPosition);
+ buffer.put((byte)markAsDeleted(flags));
+ }
+
+ /**
+ * @return the size written, in bytes. Always 3 bytes.
+ */
+ private static int writeSInt24ToBuffer(final FusionDictionaryBufferInterface buffer,
+ final int value) {
+ final int absValue = Math.abs(value);
+ buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
+ buffer.put((byte)((absValue >> 8) & 0xFF));
+ buffer.put((byte)(absValue & 0xFF));
+ return 3;
+ }
+
+ /**
+ * @return the size written, in bytes. Always 3 bytes.
+ */
+ private static int writeSInt24ToStream(final OutputStream destination, final int value)
+ throws IOException {
+ final int absValue = Math.abs(value);
+ destination.write((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
+ destination.write((byte)((absValue >> 8) & 0xFF));
+ destination.write((byte)(absValue & 0xFF));
+ return 3;
+ }
+
+ /**
+ * @return the size written, in bytes. 1, 2, or 3 bytes.
+ */
+ private static int writeVariableAddress(final OutputStream destination, final int value)
+ throws IOException {
+ switch (BinaryDictInputOutput.getByteSize(value)) {
+ case 1:
+ destination.write((byte)value);
+ break;
+ case 2:
+ destination.write((byte)(0xFF & (value >> 8)));
+ destination.write((byte)(0xFF & value));
+ break;
+ case 3:
+ destination.write((byte)(0xFF & (value >> 16)));
+ destination.write((byte)(0xFF & (value >> 8)));
+ destination.write((byte)(0xFF & value));
+ break;
+ }
+ return BinaryDictInputOutput.getByteSize(value);
+ }
+
+ /**
+ * Update a parent address in a CharGroup that is referred to by groupOriginAddress.
+ *
+ * @param buffer the buffer to write.
+ * @param groupOriginAddress the address of the group.
+ * @param newParentAddress the absolute address of the parent.
+ * @param formatOptions file format options.
+ */
+ public static void updateParentAddress(final FusionDictionaryBufferInterface buffer,
+ final int groupOriginAddress, final int newParentAddress,
+ final FormatOptions formatOptions) {
+ final int originalPosition = buffer.position();
+ buffer.position(groupOriginAddress);
+ if (!formatOptions.mSupportsDynamicUpdate) {
+ throw new RuntimeException("this file format does not support parent addresses");
+ }
+ final int flags = buffer.readUnsignedByte();
+ if (BinaryDictInputOutput.isMovedGroup(flags, formatOptions)) {
+ // if the group is moved, the parent address is stored in the destination group.
+ // We are guaranteed to process the destination group later, so there is no need to
+ // update anything here.
+ buffer.position(originalPosition);
+ return;
+ }
+ if (DBG) {
+ MakedictLog.d("update parent address flags=" + flags + ", " + groupOriginAddress);
+ }
+ final int parentOffset = newParentAddress - groupOriginAddress;
+ writeSInt24ToBuffer(buffer, parentOffset);
+ buffer.position(originalPosition);
+ }
+
+ private static void skipCharGroup(final FusionDictionaryBufferInterface buffer,
+ final FormatOptions formatOptions) {
+ final int flags = buffer.readUnsignedByte();
+ BinaryDictInputOutput.readParentAddress(buffer, formatOptions);
+ skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
+ BinaryDictInputOutput.readChildrenAddress(buffer, flags, formatOptions);
+ if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte();
+ if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) {
+ final int shortcutsSize = buffer.readUnsignedShort();
+ buffer.position(buffer.position() + shortcutsSize
+ - FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE);
+ }
+ if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) {
+ int bigramCount = 0;
+ while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
+ final int bigramFlags = buffer.readUnsignedByte();
+ switch (bigramFlags & FormatSpec.MASK_ATTRIBUTE_ADDRESS_TYPE) {
+ case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
+ buffer.readUnsignedByte();
+ break;
+ case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
+ buffer.readUnsignedShort();
+ break;
+ case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
+ buffer.readUnsignedInt24();
+ break;
+ }
+ if ((bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT) == 0) break;
+ }
+ if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
+ throw new RuntimeException("Too many bigrams in a group.");
+ }
+ }
+ }
+
+ /**
+ * Update parent addresses in a Node that is referred to by nodeOriginAddress.
+ *
+ * @param buffer the buffer to be modified.
+ * @param nodeOriginAddress the address of a modified Node.
+ * @param newParentAddress the address to be written.
+ * @param formatOptions file format options.
+ */
+ public static void updateParentAddresses(final FusionDictionaryBufferInterface buffer,
+ final int nodeOriginAddress, final int newParentAddress,
+ final FormatOptions formatOptions) {
+ final int originalPosition = buffer.position();
+ buffer.position(nodeOriginAddress);
+ do {
+ final int count = BinaryDictInputOutput.readCharGroupCount(buffer);
+ for (int i = 0; i < count; ++i) {
+ updateParentAddress(buffer, buffer.position(), newParentAddress, formatOptions);
+ skipCharGroup(buffer, formatOptions);
+ }
+ final int forwardLinkAddress = buffer.readUnsignedInt24();
+ buffer.position(forwardLinkAddress);
+ } while (formatOptions.mSupportsDynamicUpdate
+ && buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS);
+ buffer.position(originalPosition);
+ }
+
+ private static void skipString(final FusionDictionaryBufferInterface buffer,
+ final boolean hasMultipleChars) {
+ if (hasMultipleChars) {
+ int character = CharEncoding.readChar(buffer);
+ while (character != FormatSpec.INVALID_CHARACTER) {
+ character = CharEncoding.readChar(buffer);
+ }
+ } else {
+ CharEncoding.readChar(buffer);
+ }
+ }
+
+ /**
+ * Write a string to a stream.
+ *
+ * @param destination the stream to write.
+ * @param word the string to be written.
+ * @return the size written, in bytes.
+ * @throws IOException
+ */
+ private static int writeString(final OutputStream destination, final String word)
+ throws IOException {
+ int size = 0;
+ final int length = word.length();
+ for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
+ final int codePoint = word.codePointAt(i);
+ if (CharEncoding.getCharSize(codePoint) == 1) {
+ destination.write((byte)codePoint);
+ size++;
+ } else {
+ destination.write((byte)(0xFF & (codePoint >> 16)));
+ destination.write((byte)(0xFF & (codePoint >> 8)));
+ destination.write((byte)(0xFF & codePoint));
+ size += 3;
+ }
+ }
+ destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR);
+ size += FormatSpec.GROUP_TERMINATOR_SIZE;
+ return size;
+ }
+
+ /**
+ * Update a children address in a CharGroup that is addressed by groupOriginAddress.
+ *
+ * @param buffer the buffer to write.
+ * @param groupOriginAddress the address of the group.
+ * @param newChildrenAddress the absolute address of the child.
+ * @param formatOptions file format options.
+ */
+ public static void updateChildrenAddress(final FusionDictionaryBufferInterface buffer,
+ final int groupOriginAddress, final int newChildrenAddress,
+ final FormatOptions formatOptions) {
+ final int originalPosition = buffer.position();
+ buffer.position(groupOriginAddress);
+ final int flags = buffer.readUnsignedByte();
+ final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions);
+ skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
+ if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte();
+ final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS
+ ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position();
+ writeSInt24ToBuffer(buffer, childrenOffset);
+ buffer.position(originalPosition);
+ }
+
+ /**
+ * Write a char group to an output stream.
+ * A char group is an in-memory representation of a node in trie.
+ * A char group info is an on-disk representation of a node.
+ *
+ * @param destination the stream to write.
+ * @param info the char group info to be written.
+ * @return the size written, in bytes.
+ */
+ public static int writeCharGroup(final OutputStream destination, final CharGroupInfo info)
+ throws IOException {
+ int size = FormatSpec.GROUP_FLAGS_SIZE;
+ destination.write((byte)info.mFlags);
+ final int parentOffset = info.mParentAddress == FormatSpec.NO_PARENT_ADDRESS ?
+ FormatSpec.NO_PARENT_ADDRESS : info.mParentAddress - info.mOriginalAddress;
+ size += writeSInt24ToStream(destination, parentOffset);
+
+ for (int i = 0; i < info.mCharacters.length; ++i) {
+ if (CharEncoding.getCharSize(info.mCharacters[i]) == 1) {
+ destination.write((byte)info.mCharacters[i]);
+ size++;
+ } else {
+ size += writeSInt24ToStream(destination, info.mCharacters[i]);
+ }
+ }
+ if (info.mCharacters.length > 1) {
+ destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR);
+ size++;
+ }
+
+ if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
+ destination.write((byte)info.mFrequency);
+ size++;
+ }
+
+ if (DBG) {
+ MakedictLog.d("writeCharGroup origin=" + info.mOriginalAddress + ", size=" + size
+ + ", child=" + info.mChildrenAddress + ", characters ="
+ + new String(info.mCharacters, 0, info.mCharacters.length));
+ }
+ final int childrenOffset = info.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS ?
+ 0 : info.mChildrenAddress - (info.mOriginalAddress + size);
+ writeSInt24ToStream(destination, childrenOffset);
+ size += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
+
+ if (info.mShortcutTargets != null && info.mShortcutTargets.size() > 0) {
+ final int shortcutListSize =
+ BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets);
+ destination.write((byte)(shortcutListSize >> 8));
+ destination.write((byte)(shortcutListSize & 0xFF));
+ size += 2;
+ final Iterator<WeightedString> shortcutIterator = info.mShortcutTargets.iterator();
+ while (shortcutIterator.hasNext()) {
+ final WeightedString target = shortcutIterator.next();
+ destination.write((byte)BinaryDictInputOutput.makeShortcutFlags(
+ shortcutIterator.hasNext(), target.mFrequency));
+ size++;
+ size += writeString(destination, target.mWord);
+ }
+ }
+
+ if (info.mBigrams != null) {
+ // TODO: Consolidate this code with the code that computes the size of the bigram list
+ // in BinaryDictionaryInputOutput#computeActualNodeSize
+ for (int i = 0; i < info.mBigrams.size(); ++i) {
+
+ final int bigramFrequency = info.mBigrams.get(i).mFrequency;
+ int bigramFlags = (i < info.mBigrams.size() - 1)
+ ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0;
+ size++;
+ final int bigramOffset = info.mBigrams.get(i).mAddress - (info.mOriginalAddress
+ + size);
+ bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0;
+ switch (BinaryDictInputOutput.getByteSize(bigramOffset)) {
+ case 1:
+ bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
+ break;
+ case 2:
+ bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
+ break;
+ case 3:
+ bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
+ break;
+ }
+ bigramFlags |= bigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY;
+ destination.write((byte)bigramFlags);
+ size += writeVariableAddress(destination, Math.abs(bigramOffset));
+ }
+ }
+ return size;
+ }
+
+ @SuppressWarnings("unused")
+ private static void updateForwardLink(final FusionDictionaryBufferInterface buffer,
+ final int nodeOriginAddress, final int newNodeAddress,
+ final FormatOptions formatOptions) {
+ buffer.position(nodeOriginAddress);
+ int jumpCount = 0;
+ while (jumpCount++ < MAX_JUMPS) {
+ final int count = BinaryDictInputOutput.readCharGroupCount(buffer);
+ for (int i = 0; i < count; ++i) skipCharGroup(buffer, formatOptions);
+ final int forwardLinkAddress = buffer.readUnsignedInt24();
+ if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
+ buffer.position(buffer.position() - FormatSpec.FORWARD_LINK_ADDRESS_SIZE);
+ writeSInt24ToBuffer(buffer, newNodeAddress);
+ return;
+ }
+ buffer.position(forwardLinkAddress);
+ }
+ if (DBG && jumpCount >= MAX_JUMPS) {
+ throw new RuntimeException("too many jumps, probably a bug.");
+ }
+ }
+
+ /**
+ * Helper method to move a char group to the tail of the file.
+ */
+ private static int moveCharGroup(final OutputStream destination,
+ final FusionDictionaryBufferInterface buffer, final CharGroupInfo info,
+ final int nodeOriginAddress, final int oldGroupAddress,
+ final FormatOptions formatOptions) throws IOException {
+ updateParentAddress(buffer, oldGroupAddress, buffer.limit() + 1, formatOptions);
+ buffer.position(oldGroupAddress);
+ final int currentFlags = buffer.readUnsignedByte();
+ buffer.position(oldGroupAddress);
+ buffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags
+ & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG))));
+ int size = FormatSpec.GROUP_FLAGS_SIZE;
+ updateForwardLink(buffer, nodeOriginAddress, buffer.limit(), formatOptions);
+ size += writeNode(destination, new CharGroupInfo[] { info });
+ return size;
+ }
+
+ /**
+ * Compute the size of the char group.
+ */
+ private static int computeGroupSize(final CharGroupInfo info,
+ final FormatOptions formatOptions) {
+ int size = FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
+ + BinaryDictInputOutput.getGroupCharactersSize(info.mCharacters)
+ + BinaryDictInputOutput.getChildrenAddressSize(info.mFlags, formatOptions);
+ if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
+ size += FormatSpec.GROUP_FREQUENCY_SIZE;
+ }
+ if (info.mShortcutTargets != null && !info.mShortcutTargets.isEmpty()) {
+ size += BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets);
+ }
+ if (info.mBigrams != null) {
+ for (final PendingAttribute attr : info.mBigrams) {
+ size += FormatSpec.GROUP_FLAGS_SIZE;
+ size += BinaryDictInputOutput.getByteSize(attr.mAddress);
+ }
+ }
+ return size;
+ }
+
+ /**
+ * Write a node to the stream.
+ *
+ * @param destination the stream to write.
+ * @param infos groups to be written.
+ * @return the size written, in bytes.
+ * @throws IOException
+ */
+ private static int writeNode(final OutputStream destination, final CharGroupInfo[] infos)
+ throws IOException {
+ int size = BinaryDictInputOutput.getGroupCountSize(infos.length);
+ switch (BinaryDictInputOutput.getGroupCountSize(infos.length)) {
+ case 1:
+ destination.write((byte)infos.length);
+ break;
+ case 2:
+ destination.write((byte)(infos.length >> 8));
+ destination.write((byte)(infos.length & 0xFF));
+ break;
+ default:
+ throw new RuntimeException("Invalid group count size.");
+ }
+ for (final CharGroupInfo info : infos) size += writeCharGroup(destination, info);
+ writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS);
+ return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
+ }
+
+ /**
+ * Move a group that is referred to by oldGroupOrigin to the tail of the file.
+ * And set the children address to the byte after the group.
+ *
+ * @param nodeOrigin the address of the tail of the file.
+ * @param characters
+ * @param length
+ * @param flags
+ * @param frequency
+ * @param parentAddress
+ * @param shortcutTargets
+ * @param bigrams
+ * @param destination the stream representing the tail of the file.
+ * @param buffer the buffer representing the (constant-size) body of the file.
+ * @param oldNodeOrigin
+ * @param oldGroupOrigin
+ * @param formatOptions
+ * @return the size written, in bytes.
+ * @throws IOException
+ */
+ private static int moveGroup(final int nodeOrigin, final int[] characters, final int length,
+ final int flags, final int frequency, final int parentAddress,
+ final ArrayList<WeightedString> shortcutTargets,
+ final ArrayList<PendingAttribute> bigrams, final OutputStream destination,
+ final FusionDictionaryBufferInterface buffer, final int oldNodeOrigin,
+ final int oldGroupOrigin, final FormatOptions formatOptions) throws IOException {
+ int size = 0;
+ final int newGroupOrigin = nodeOrigin + 1;
+ final int[] writtenCharacters = Arrays.copyOfRange(characters, 0, length);
+ final CharGroupInfo tmpInfo = new CharGroupInfo(newGroupOrigin, -1 /* endAddress */,
+ flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS,
+ shortcutTargets, bigrams);
+ size = computeGroupSize(tmpInfo, formatOptions);
+ final CharGroupInfo newInfo = new CharGroupInfo(newGroupOrigin, newGroupOrigin + size,
+ flags, writtenCharacters, frequency, parentAddress,
+ nodeOrigin + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets,
+ bigrams);
+ moveCharGroup(destination, buffer, newInfo, oldNodeOrigin, oldGroupOrigin, formatOptions);
+ return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
+ }
+
+ /**
+ * Insert a word into a binary dictionary.
+ *
+ * @param buffer
+ * @param destination
+ * @param word
+ * @param frequency
+ * @param bigramStrings
+ * @param shortcuts
+ * @throws IOException
+ * @throws UnsupportedFormatException
+ */
+ // TODO: Support batch insertion.
+ // TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary.
+ @UsedForTesting
+ public static void insertWord(final FusionDictionaryBufferInterface buffer,
+ final OutputStream destination, final String word, final int frequency,
+ final ArrayList<WeightedString> bigramStrings,
+ final ArrayList<WeightedString> shortcuts, final boolean isNotAWord,
+ final boolean isBlackListEntry)
+ throws IOException, UnsupportedFormatException {
+ final ArrayList<PendingAttribute> bigrams = new ArrayList<PendingAttribute>();
+ if (bigramStrings != null) {
+ for (final WeightedString bigram : bigramStrings) {
+ int position = getTerminalPosition(buffer, bigram.mWord);
+ if (position == FormatSpec.NOT_VALID_WORD) {
+ // TODO: figure out what is the correct thing to do here.
+ } else {
+ bigrams.add(new PendingAttribute(bigram.mFrequency, position));
+ }
+ }
+ }
+
+ final boolean isTerminal = true;
+ final boolean hasBigrams = !bigrams.isEmpty();
+ final boolean hasShortcuts = shortcuts != null && !shortcuts.isEmpty();
+
+ // find the insert position of the word.
+ if (buffer.position() != 0) buffer.position(0);
+ final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
+
+ int wordPos = 0, address = buffer.position(), nodeOriginAddress = buffer.position();
+ final int[] codePoints = FusionDictionary.getCodePoints(word);
+ final int wordLen = codePoints.length;
+
+ for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
+ if (wordPos >= wordLen) break;
+ nodeOriginAddress = buffer.position();
+ int nodeParentAddress = -1;
final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);
- groupOffset += BinaryDictInputOutput.getGroupCountSize(charGroupCount);
+ boolean foundNextGroup = false;
for (int i = 0; i < charGroupCount; ++i) {
- final int charGroupPos = buffer.position();
+ address = buffer.position();
final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
buffer.position(), header.mFormatOptions);
- boolean same = true;
- for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
- p < currentInfo.mCharacters.length;
- ++p, j = word.offsetByCodePoints(j, 1)) {
- if (wordPos + p >= wordLen
- || word.codePointAt(j) != currentInfo.mCharacters[p]) {
- same = false;
+ final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags,
+ header.mFormatOptions);
+ if (isMovedGroup) continue;
+ nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS)
+ ? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address;
+ boolean matched = true;
+ for (int p = 0; p < currentInfo.mCharacters.length; ++p) {
+ if (wordPos + p >= wordLen) {
+ /*
+ * splitting
+ * before
+ * abcd - ef
+ *
+ * insert "abc"
+ *
+ * after
+ * abc - d - ef
+ */
+ final int newNodeAddress = buffer.limit();
+ final int flags = BinaryDictInputOutput.makeCharGroupFlags(p > 1,
+ isTerminal, 0, hasShortcuts, hasBigrams, false /* isNotAWord */,
+ false /* isBlackListEntry */, header.mFormatOptions);
+ int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, flags,
+ frequency, nodeParentAddress, shortcuts, bigrams, destination,
+ buffer, nodeOriginAddress, address, header.mFormatOptions);
+
+ final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p,
+ currentInfo.mCharacters.length);
+ if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
+ updateParentAddresses(buffer, currentInfo.mChildrenAddress,
+ newNodeAddress + written + 1, header.mFormatOptions);
+ }
+ final CharGroupInfo newInfo2 = new CharGroupInfo(
+ newNodeAddress + written + 1, -1 /* endAddress */,
+ currentInfo.mFlags, characters2, currentInfo.mFrequency,
+ newNodeAddress + 1, currentInfo.mChildrenAddress,
+ currentInfo.mShortcutTargets, currentInfo.mBigrams);
+ writeNode(destination, new CharGroupInfo[] { newInfo2 });
+ return;
+ } else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) {
+ if (p > 0) {
+ /*
+ * splitting
+ * before
+ * ab - cd
+ *
+ * insert "ac"
+ *
+ * after
+ * a - b - cd
+ * |
+ * - c
+ */
+
+ final int newNodeAddress = buffer.limit();
+ final int childrenAddress = currentInfo.mChildrenAddress;
+
+ // move prefix
+ final int prefixFlags = BinaryDictInputOutput.makeCharGroupFlags(p > 1,
+ false /* isTerminal */, 0 /* childrenAddressSize*/,
+ false /* hasShortcut */, false /* hasBigrams */,
+ false /* isNotAWord */, false /* isBlackListEntry */,
+ header.mFormatOptions);
+ int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p,
+ prefixFlags, -1 /* frequency */, nodeParentAddress, null, null,
+ destination, buffer, nodeOriginAddress, address,
+ header.mFormatOptions);
+
+ final int[] suffixCharacters = Arrays.copyOfRange(
+ currentInfo.mCharacters, p, currentInfo.mCharacters.length);
+ if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
+ updateParentAddresses(buffer, currentInfo.mChildrenAddress,
+ newNodeAddress + written + 1, header.mFormatOptions);
+ }
+ final int suffixFlags = BinaryDictInputOutput.makeCharGroupFlags(
+ suffixCharacters.length > 1,
+ (currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0,
+ 0 /* childrenAddressSize */,
+ (currentInfo.mFlags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)
+ != 0,
+ (currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0,
+ isNotAWord, isBlackListEntry, header.mFormatOptions);
+ final CharGroupInfo suffixInfo = new CharGroupInfo(
+ newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags,
+ suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1,
+ currentInfo.mChildrenAddress, currentInfo.mShortcutTargets,
+ currentInfo.mBigrams);
+ written += computeGroupSize(suffixInfo, header.mFormatOptions) + 1;
+
+ final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p,
+ codePoints.length);
+ final int flags = BinaryDictInputOutput.makeCharGroupFlags(
+ newCharacters.length > 1, isTerminal,
+ 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
+ isNotAWord, isBlackListEntry, header.mFormatOptions);
+ final CharGroupInfo newInfo = new CharGroupInfo(
+ newNodeAddress + written, -1 /* endAddress */, flags,
+ newCharacters, frequency, newNodeAddress + 1,
+ FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams);
+ writeNode(destination, new CharGroupInfo[] { suffixInfo, newInfo });
+ return;
+ }
+ matched = false;
break;
}
}
- if (same) {
+ if (matched) {
if (wordPos + currentInfo.mCharacters.length == wordLen) {
- if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) {
- return FormatSpec.NOT_VALID_WORD;
- } else {
- return charGroupPos;
- }
+ // the word exists in the dictionary.
+ // only update group.
+ final int newNodeAddress = buffer.limit();
+ final boolean hasMultipleChars = currentInfo.mCharacters.length > 1;
+ final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars,
+ isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
+ isNotAWord, isBlackListEntry, header.mFormatOptions);
+ final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1,
+ -1 /* endAddress */, flags, currentInfo.mCharacters, frequency,
+ nodeParentAddress, currentInfo.mChildrenAddress, shortcuts,
+ bigrams);
+ moveCharGroup(destination, buffer, newInfo, nodeOriginAddress, address,
+ header.mFormatOptions);
+ return;
}
wordPos += currentInfo.mCharacters.length;
if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
- return FormatSpec.NOT_VALID_WORD;
+ /*
+ * found the prefix of the word.
+ * make new node and link to the node from this group.
+ *
+ * before
+ * ab - cd
+ *
+ * insert "abcde"
+ *
+ * after
+ * ab - cd - e
+ */
+ final int newNodeAddress = buffer.limit();
+ updateChildrenAddress(buffer, address, newNodeAddress,
+ header.mFormatOptions);
+ final int newGroupAddress = newNodeAddress + 1;
+ final boolean hasMultipleChars = (wordLen - wordPos) > 1;
+ final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars,
+ isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
+ isNotAWord, isBlackListEntry, header.mFormatOptions);
+ final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen);
+ final CharGroupInfo newInfo = new CharGroupInfo(newGroupAddress, -1, flags,
+ characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS,
+ shortcuts, bigrams);
+ writeNode(destination, new CharGroupInfo[] { newInfo });
+ return;
}
buffer.position(currentInfo.mChildrenAddress);
+ foundNextGroup = true;
break;
}
- groupOffset = currentInfo.mEndAddress;
+ }
- // not found
- if (i >= charGroupCount - 1) {
- return FormatSpec.NOT_VALID_WORD;
- }
+ if (foundNextGroup) continue;
+
+ // reached the end of the array.
+ final int linkAddressPosition = buffer.position();
+ int nextLink = buffer.readUnsignedInt24();
+ if ((nextLink & MSB24) != 0) {
+ nextLink = -(nextLink & SINT24_MAX);
+ }
+ if (nextLink == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
+ /*
+ * expand this node.
+ *
+ * before
+ * ab - cd
+ *
+ * insert "abef"
+ *
+ * after
+ * ab - cd
+ * |
+ * - ef
+ */
+
+ // change the forward link address.
+ final int newNodeAddress = buffer.limit();
+ buffer.position(linkAddressPosition);
+ writeSInt24ToBuffer(buffer, newNodeAddress);
+
+ final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen);
+ final int flags = BinaryDictInputOutput.makeCharGroupFlags(characters.length > 1,
+ isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
+ isNotAWord, isBlackListEntry, header.mFormatOptions);
+ final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1,
+ -1 /* endAddress */, flags, characters, frequency, nodeParentAddress,
+ FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams);
+ writeNode(destination, new CharGroupInfo[]{ newInfo });
+ return;
+ } else {
+ depth--;
+ buffer.position(nextLink);
}
}
- return FormatSpec.NOT_VALID_WORD;
+ }
+
+ /**
+ * Find a word from the buffer.
+ *
+ * @param buffer the buffer representing the body of the dictionary file.
+ * @param word the word searched
+ * @return the found group
+ * @throws IOException
+ * @throws UnsupportedFormatException
+ */
+ @UsedForTesting
+ public static CharGroupInfo findWordFromBuffer(final FusionDictionaryBufferInterface buffer,
+ final String word) throws IOException, UnsupportedFormatException {
+ int position = getTerminalPosition(buffer, word);
+ if (position != FormatSpec.NOT_VALID_WORD) {
+ buffer.position(0);
+ final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
+ buffer.position(position);
+ return BinaryDictInputOutput.readCharGroup(buffer, position, header.mFormatOptions);
+ }
+ return null;
}
}