aboutsummaryrefslogtreecommitdiffstats
path: root/java/src/com/android/inputmethod/latin/makedict
diff options
context:
space:
mode:
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict')
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java845
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java429
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java2
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/FormatSpec.java88
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java91
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/MakedictLog.java2
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/PendingAttribute.java2
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/UnsupportedFormatException.java2
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/Word.java2
9 files changed, 1243 insertions, 220 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
index 7a1b9dcb7..ee0e9cd7e 100644
--- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
+++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
@@ -16,21 +16,34 @@
package com.android.inputmethod.latin.makedict;
+import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.Constants;
+import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.CharEncoding;
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
+import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import java.io.IOException;
+import java.io.OutputStream;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
import java.util.Map;
import java.util.Stack;
-public class BinaryDictIOUtils {
+public final class BinaryDictIOUtils {
private static final boolean DBG = false;
+ private static final int MSB24 = 0x800000;
+ private static final int SINT24_MAX = 0x7FFFFF;
+ private static final int MAX_JUMPS = 10000;
- private static class Position {
+ private BinaryDictIOUtils() {
+ // This utility class is not publicly instantiable.
+ }
+
+ private static final class Position {
public static final int NOT_READ_GROUPCOUNT = -1;
public int mAddress;
@@ -77,7 +90,10 @@ public class BinaryDictIOUtils {
p.mAddress += BinaryDictInputOutput.getGroupCountSize(p.mNumOfCharGroup);
p.mPosition = 0;
}
-
+ if (p.mNumOfCharGroup == 0) {
+ stack.pop();
+ continue;
+ }
CharGroupInfo info = BinaryDictInputOutput.readCharGroup(buffer,
p.mAddress - headerSize, formatOptions);
for (int i = 0; i < info.mCharacters.length; ++i) {
@@ -85,20 +101,36 @@ public class BinaryDictIOUtils {
}
p.mPosition++;
- if (info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) { // found word
+ final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags,
+ formatOptions);
+ final boolean isDeletedGroup = BinaryDictInputOutput.isDeletedGroup(info.mFlags,
+ formatOptions);
+ if (!isMovedGroup && !isDeletedGroup
+ && info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word
words.put(info.mOriginalAddress, new String(pushedChars, 0, index));
frequencies.put(info.mOriginalAddress, info.mFrequency);
if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams);
}
if (p.mPosition == p.mNumOfCharGroup) {
- stack.pop();
+ if (formatOptions.mSupportsDynamicUpdate) {
+ final int forwardLinkAddress = buffer.readUnsignedInt24();
+ if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) {
+ // the node has a forward link.
+ p.mNumOfCharGroup = Position.NOT_READ_GROUPCOUNT;
+ p.mAddress = forwardLinkAddress;
+ } else {
+ stack.pop();
+ }
+ } else {
+ stack.pop();
+ }
} else {
// the node has more groups.
p.mAddress = buffer.position();
}
- if (BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) {
+ if (!isMovedGroup && BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) {
Position childrenPos = new Position(info.mChildrenAddress + headerSize, index);
stack.push(childrenPos);
}
@@ -136,6 +168,7 @@ public class BinaryDictIOUtils {
* @throws IOException
* @throws UnsupportedFormatException
*/
+ @UsedForTesting
public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer,
final String word) throws IOException, UnsupportedFormatException {
if (word == null) return FormatSpec.NOT_VALID_WORD;
@@ -146,48 +179,802 @@ public class BinaryDictIOUtils {
final int wordLen = word.codePointCount(0, word.length());
for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
- int groupOffset = buffer.position() - header.mHeaderSize;
+
+ do {
+ final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);
+ boolean foundNextCharGroup = false;
+ for (int i = 0; i < charGroupCount; ++i) {
+ final int charGroupPos = buffer.position();
+ final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
+ buffer.position(), header.mFormatOptions);
+ final boolean isMovedGroup =
+ BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags,
+ header.mFormatOptions);
+ final boolean isDeletedGroup =
+ BinaryDictInputOutput.isDeletedGroup(currentInfo.mFlags,
+ header.mFormatOptions);
+ if (isMovedGroup) continue;
+ boolean same = true;
+ for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
+ p < currentInfo.mCharacters.length;
+ ++p, j = word.offsetByCodePoints(j, 1)) {
+ if (wordPos + p >= wordLen
+ || word.codePointAt(j) != currentInfo.mCharacters[p]) {
+ same = false;
+ break;
+ }
+ }
+
+ if (same) {
+ // found the group matches the word.
+ if (wordPos + currentInfo.mCharacters.length == wordLen) {
+ if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL
+ || isDeletedGroup) {
+ return FormatSpec.NOT_VALID_WORD;
+ } else {
+ return charGroupPos;
+ }
+ }
+ wordPos += currentInfo.mCharacters.length;
+ if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
+ return FormatSpec.NOT_VALID_WORD;
+ }
+ foundNextCharGroup = true;
+ buffer.position(currentInfo.mChildrenAddress);
+ break;
+ }
+ }
+
+ // If we found the next char group, it is under the file pointer.
+ // But if not, we are at the end of this node so we expect to have
+ // a forward link address that we need to consult and possibly resume
+ // search on the next node in the linked list.
+ if (foundNextCharGroup) break;
+ if (!header.mFormatOptions.mSupportsDynamicUpdate) {
+ return FormatSpec.NOT_VALID_WORD;
+ }
+
+ final int forwardLinkAddress = buffer.readUnsignedInt24();
+ if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
+ return FormatSpec.NOT_VALID_WORD;
+ }
+ buffer.position(forwardLinkAddress);
+ } while(true);
+ }
+ return FormatSpec.NOT_VALID_WORD;
+ }
+
+ private static int markAsDeleted(final int flags) {
+ return (flags & (~FormatSpec.MASK_GROUP_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED;
+ }
+
+ /**
+ * Delete the word from the binary file.
+ *
+ * @param buffer the buffer to write.
+ * @param word the word we delete
+ * @throws IOException
+ * @throws UnsupportedFormatException
+ */
+ @UsedForTesting
+ public static void deleteWord(final FusionDictionaryBufferInterface buffer,
+ final String word) throws IOException, UnsupportedFormatException {
+ buffer.position(0);
+ final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
+ final int wordPosition = getTerminalPosition(buffer, word);
+ if (wordPosition == FormatSpec.NOT_VALID_WORD) return;
+
+ buffer.position(wordPosition);
+ final int flags = buffer.readUnsignedByte();
+ buffer.position(wordPosition);
+ buffer.put((byte)markAsDeleted(flags));
+ }
+
+ /**
+ * @return the size written, in bytes. Always 3 bytes.
+ */
+ private static int writeSInt24ToBuffer(final FusionDictionaryBufferInterface buffer,
+ final int value) {
+ final int absValue = Math.abs(value);
+ buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
+ buffer.put((byte)((absValue >> 8) & 0xFF));
+ buffer.put((byte)(absValue & 0xFF));
+ return 3;
+ }
+
+ /**
+ * @return the size written, in bytes. Always 3 bytes.
+ */
+ private static int writeSInt24ToStream(final OutputStream destination, final int value)
+ throws IOException {
+ final int absValue = Math.abs(value);
+ destination.write((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
+ destination.write((byte)((absValue >> 8) & 0xFF));
+ destination.write((byte)(absValue & 0xFF));
+ return 3;
+ }
+
+ /**
+ * @return the size written, in bytes. 1, 2, or 3 bytes.
+ */
+ private static int writeVariableAddress(final OutputStream destination, final int value)
+ throws IOException {
+ switch (BinaryDictInputOutput.getByteSize(value)) {
+ case 1:
+ destination.write((byte)value);
+ break;
+ case 2:
+ destination.write((byte)(0xFF & (value >> 8)));
+ destination.write((byte)(0xFF & value));
+ break;
+ case 3:
+ destination.write((byte)(0xFF & (value >> 16)));
+ destination.write((byte)(0xFF & (value >> 8)));
+ destination.write((byte)(0xFF & value));
+ break;
+ }
+ return BinaryDictInputOutput.getByteSize(value);
+ }
+
+ /**
+ * Update a parent address in a CharGroup that is referred to by groupOriginAddress.
+ *
+ * @param buffer the buffer to write.
+ * @param groupOriginAddress the address of the group.
+ * @param newParentAddress the absolute address of the parent.
+ * @param formatOptions file format options.
+ */
+ public static void updateParentAddress(final FusionDictionaryBufferInterface buffer,
+ final int groupOriginAddress, final int newParentAddress,
+ final FormatOptions formatOptions) {
+ final int originalPosition = buffer.position();
+ buffer.position(groupOriginAddress);
+ if (!formatOptions.mSupportsDynamicUpdate) {
+ throw new RuntimeException("this file format does not support parent addresses");
+ }
+ final int flags = buffer.readUnsignedByte();
+ if (BinaryDictInputOutput.isMovedGroup(flags, formatOptions)) {
+ // if the group is moved, the parent address is stored in the destination group.
+ // We are guaranteed to process the destination group later, so there is no need to
+ // update anything here.
+ buffer.position(originalPosition);
+ return;
+ }
+ if (DBG) {
+ MakedictLog.d("update parent address flags=" + flags + ", " + groupOriginAddress);
+ }
+ final int parentOffset = newParentAddress - groupOriginAddress;
+ writeSInt24ToBuffer(buffer, parentOffset);
+ buffer.position(originalPosition);
+ }
+
+ private static void skipCharGroup(final FusionDictionaryBufferInterface buffer,
+ final FormatOptions formatOptions) {
+ final int flags = buffer.readUnsignedByte();
+ BinaryDictInputOutput.readParentAddress(buffer, formatOptions);
+ skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
+ BinaryDictInputOutput.readChildrenAddress(buffer, flags, formatOptions);
+ if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte();
+ if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) {
+ final int shortcutsSize = buffer.readUnsignedShort();
+ buffer.position(buffer.position() + shortcutsSize
+ - FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE);
+ }
+ if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) {
+ int bigramCount = 0;
+ while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
+ final int bigramFlags = buffer.readUnsignedByte();
+ switch (bigramFlags & FormatSpec.MASK_ATTRIBUTE_ADDRESS_TYPE) {
+ case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
+ buffer.readUnsignedByte();
+ break;
+ case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
+ buffer.readUnsignedShort();
+ break;
+ case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
+ buffer.readUnsignedInt24();
+ break;
+ }
+ if ((bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT) == 0) break;
+ }
+ if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
+ throw new RuntimeException("Too many bigrams in a group.");
+ }
+ }
+ }
+
+ /**
+ * Update parent addresses in a Node that is referred to by nodeOriginAddress.
+ *
+ * @param buffer the buffer to be modified.
+ * @param nodeOriginAddress the address of a modified Node.
+ * @param newParentAddress the address to be written.
+ * @param formatOptions file format options.
+ */
+ public static void updateParentAddresses(final FusionDictionaryBufferInterface buffer,
+ final int nodeOriginAddress, final int newParentAddress,
+ final FormatOptions formatOptions) {
+ final int originalPosition = buffer.position();
+ buffer.position(nodeOriginAddress);
+ do {
+ final int count = BinaryDictInputOutput.readCharGroupCount(buffer);
+ for (int i = 0; i < count; ++i) {
+ updateParentAddress(buffer, buffer.position(), newParentAddress, formatOptions);
+ skipCharGroup(buffer, formatOptions);
+ }
+ final int forwardLinkAddress = buffer.readUnsignedInt24();
+ buffer.position(forwardLinkAddress);
+ } while (formatOptions.mSupportsDynamicUpdate
+ && buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS);
+ buffer.position(originalPosition);
+ }
+
+ private static void skipString(final FusionDictionaryBufferInterface buffer,
+ final boolean hasMultipleChars) {
+ if (hasMultipleChars) {
+ int character = CharEncoding.readChar(buffer);
+ while (character != FormatSpec.INVALID_CHARACTER) {
+ character = CharEncoding.readChar(buffer);
+ }
+ } else {
+ CharEncoding.readChar(buffer);
+ }
+ }
+
+ /**
+ * Write a string to a stream.
+ *
+ * @param destination the stream to write.
+ * @param word the string to be written.
+ * @return the size written, in bytes.
+ * @throws IOException
+ */
+ private static int writeString(final OutputStream destination, final String word)
+ throws IOException {
+ int size = 0;
+ final int length = word.length();
+ for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
+ final int codePoint = word.codePointAt(i);
+ if (CharEncoding.getCharSize(codePoint) == 1) {
+ destination.write((byte)codePoint);
+ size++;
+ } else {
+ destination.write((byte)(0xFF & (codePoint >> 16)));
+ destination.write((byte)(0xFF & (codePoint >> 8)));
+ destination.write((byte)(0xFF & codePoint));
+ size += 3;
+ }
+ }
+ destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR);
+ size += FormatSpec.GROUP_TERMINATOR_SIZE;
+ return size;
+ }
+
+ /**
+ * Update a children address in a CharGroup that is addressed by groupOriginAddress.
+ *
+ * @param buffer the buffer to write.
+ * @param groupOriginAddress the address of the group.
+ * @param newChildrenAddress the absolute address of the child.
+ * @param formatOptions file format options.
+ */
+ public static void updateChildrenAddress(final FusionDictionaryBufferInterface buffer,
+ final int groupOriginAddress, final int newChildrenAddress,
+ final FormatOptions formatOptions) {
+ final int originalPosition = buffer.position();
+ buffer.position(groupOriginAddress);
+ final int flags = buffer.readUnsignedByte();
+ final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions);
+ skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
+ if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte();
+ final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS
+ ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position();
+ writeSInt24ToBuffer(buffer, childrenOffset);
+ buffer.position(originalPosition);
+ }
+
+ /**
+ * Write a char group to an output stream.
+ * A char group is an in-memory representation of a node in trie.
+ * A char group info is an on-disk representation of a node.
+ *
+ * @param destination the stream to write.
+ * @param info the char group info to be written.
+ * @return the size written, in bytes.
+ */
+ public static int writeCharGroup(final OutputStream destination, final CharGroupInfo info)
+ throws IOException {
+ int size = FormatSpec.GROUP_FLAGS_SIZE;
+ destination.write((byte)info.mFlags);
+ final int parentOffset = info.mParentAddress == FormatSpec.NO_PARENT_ADDRESS ?
+ FormatSpec.NO_PARENT_ADDRESS : info.mParentAddress - info.mOriginalAddress;
+ size += writeSInt24ToStream(destination, parentOffset);
+
+ for (int i = 0; i < info.mCharacters.length; ++i) {
+ if (CharEncoding.getCharSize(info.mCharacters[i]) == 1) {
+ destination.write((byte)info.mCharacters[i]);
+ size++;
+ } else {
+ size += writeSInt24ToStream(destination, info.mCharacters[i]);
+ }
+ }
+ if (info.mCharacters.length > 1) {
+ destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR);
+ size++;
+ }
+
+ if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
+ destination.write((byte)info.mFrequency);
+ size++;
+ }
+
+ if (DBG) {
+ MakedictLog.d("writeCharGroup origin=" + info.mOriginalAddress + ", size=" + size
+ + ", child=" + info.mChildrenAddress + ", characters ="
+ + new String(info.mCharacters, 0, info.mCharacters.length));
+ }
+ final int childrenOffset = info.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS ?
+ 0 : info.mChildrenAddress - (info.mOriginalAddress + size);
+ writeSInt24ToStream(destination, childrenOffset);
+ size += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
+
+ if (info.mShortcutTargets != null && info.mShortcutTargets.size() > 0) {
+ final int shortcutListSize =
+ BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets);
+ destination.write((byte)(shortcutListSize >> 8));
+ destination.write((byte)(shortcutListSize & 0xFF));
+ size += 2;
+ final Iterator<WeightedString> shortcutIterator = info.mShortcutTargets.iterator();
+ while (shortcutIterator.hasNext()) {
+ final WeightedString target = shortcutIterator.next();
+ destination.write((byte)BinaryDictInputOutput.makeShortcutFlags(
+ shortcutIterator.hasNext(), target.mFrequency));
+ size++;
+ size += writeString(destination, target.mWord);
+ }
+ }
+
+ if (info.mBigrams != null) {
+ // TODO: Consolidate this code with the code that computes the size of the bigram list
+ // in BinaryDictionaryInputOutput#computeActualNodeSize
+ for (int i = 0; i < info.mBigrams.size(); ++i) {
+
+ final int bigramFrequency = info.mBigrams.get(i).mFrequency;
+ int bigramFlags = (i < info.mBigrams.size() - 1)
+ ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0;
+ size++;
+ final int bigramOffset = info.mBigrams.get(i).mAddress - (info.mOriginalAddress
+ + size);
+ bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0;
+ switch (BinaryDictInputOutput.getByteSize(bigramOffset)) {
+ case 1:
+ bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
+ break;
+ case 2:
+ bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
+ break;
+ case 3:
+ bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
+ break;
+ }
+ bigramFlags |= bigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY;
+ destination.write((byte)bigramFlags);
+ size += writeVariableAddress(destination, Math.abs(bigramOffset));
+ }
+ }
+ return size;
+ }
+
+ @SuppressWarnings("unused")
+ private static void updateForwardLink(final FusionDictionaryBufferInterface buffer,
+ final int nodeOriginAddress, final int newNodeAddress,
+ final FormatOptions formatOptions) {
+ buffer.position(nodeOriginAddress);
+ int jumpCount = 0;
+ while (jumpCount++ < MAX_JUMPS) {
+ final int count = BinaryDictInputOutput.readCharGroupCount(buffer);
+ for (int i = 0; i < count; ++i) skipCharGroup(buffer, formatOptions);
+ final int forwardLinkAddress = buffer.readUnsignedInt24();
+ if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
+ buffer.position(buffer.position() - FormatSpec.FORWARD_LINK_ADDRESS_SIZE);
+ writeSInt24ToBuffer(buffer, newNodeAddress);
+ return;
+ }
+ buffer.position(forwardLinkAddress);
+ }
+ if (DBG && jumpCount >= MAX_JUMPS) {
+ throw new RuntimeException("too many jumps, probably a bug.");
+ }
+ }
+
+ /**
+ * Helper method to move a char group to the tail of the file.
+ */
+ private static int moveCharGroup(final OutputStream destination,
+ final FusionDictionaryBufferInterface buffer, final CharGroupInfo info,
+ final int nodeOriginAddress, final int oldGroupAddress,
+ final FormatOptions formatOptions) throws IOException {
+ updateParentAddress(buffer, oldGroupAddress, buffer.limit() + 1, formatOptions);
+ buffer.position(oldGroupAddress);
+ final int currentFlags = buffer.readUnsignedByte();
+ buffer.position(oldGroupAddress);
+ buffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags
+ & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG))));
+ int size = FormatSpec.GROUP_FLAGS_SIZE;
+ updateForwardLink(buffer, nodeOriginAddress, buffer.limit(), formatOptions);
+ size += writeNode(destination, new CharGroupInfo[] { info });
+ return size;
+ }
+
+ /**
+ * Compute the size of the char group.
+ */
+ private static int computeGroupSize(final CharGroupInfo info,
+ final FormatOptions formatOptions) {
+ int size = FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
+ + BinaryDictInputOutput.getGroupCharactersSize(info.mCharacters)
+ + BinaryDictInputOutput.getChildrenAddressSize(info.mFlags, formatOptions);
+ if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
+ size += FormatSpec.GROUP_FREQUENCY_SIZE;
+ }
+ if (info.mShortcutTargets != null && !info.mShortcutTargets.isEmpty()) {
+ size += BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets);
+ }
+ if (info.mBigrams != null) {
+ for (final PendingAttribute attr : info.mBigrams) {
+ size += FormatSpec.GROUP_FLAGS_SIZE;
+ size += BinaryDictInputOutput.getByteSize(attr.mAddress);
+ }
+ }
+ return size;
+ }
+
+ /**
+ * Write a node to the stream.
+ *
+ * @param destination the stream to write.
+ * @param infos groups to be written.
+ * @return the size written, in bytes.
+ * @throws IOException
+ */
+ private static int writeNode(final OutputStream destination, final CharGroupInfo[] infos)
+ throws IOException {
+ int size = BinaryDictInputOutput.getGroupCountSize(infos.length);
+ switch (BinaryDictInputOutput.getGroupCountSize(infos.length)) {
+ case 1:
+ destination.write((byte)infos.length);
+ break;
+ case 2:
+ destination.write((byte)(infos.length >> 8));
+ destination.write((byte)(infos.length & 0xFF));
+ break;
+ default:
+ throw new RuntimeException("Invalid group count size.");
+ }
+ for (final CharGroupInfo info : infos) size += writeCharGroup(destination, info);
+ writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS);
+ return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
+ }
+
+ /**
+ * Move a group that is referred to by oldGroupOrigin to the tail of the file.
+ * And set the children address to the byte after the group.
+ *
+ * @param nodeOrigin the address of the tail of the file.
+ * @param characters
+ * @param length
+ * @param flags
+ * @param frequency
+ * @param parentAddress
+ * @param shortcutTargets
+ * @param bigrams
+ * @param destination the stream representing the tail of the file.
+ * @param buffer the buffer representing the (constant-size) body of the file.
+ * @param oldNodeOrigin
+ * @param oldGroupOrigin
+ * @param formatOptions
+ * @return the size written, in bytes.
+ * @throws IOException
+ */
+ private static int moveGroup(final int nodeOrigin, final int[] characters, final int length,
+ final int flags, final int frequency, final int parentAddress,
+ final ArrayList<WeightedString> shortcutTargets,
+ final ArrayList<PendingAttribute> bigrams, final OutputStream destination,
+ final FusionDictionaryBufferInterface buffer, final int oldNodeOrigin,
+ final int oldGroupOrigin, final FormatOptions formatOptions) throws IOException {
+ int size = 0;
+ final int newGroupOrigin = nodeOrigin + 1;
+ final int[] writtenCharacters = Arrays.copyOfRange(characters, 0, length);
+ final CharGroupInfo tmpInfo = new CharGroupInfo(newGroupOrigin, -1 /* endAddress */,
+ flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS,
+ shortcutTargets, bigrams);
+ size = computeGroupSize(tmpInfo, formatOptions);
+ final CharGroupInfo newInfo = new CharGroupInfo(newGroupOrigin, newGroupOrigin + size,
+ flags, writtenCharacters, frequency, parentAddress,
+ nodeOrigin + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets,
+ bigrams);
+ moveCharGroup(destination, buffer, newInfo, oldNodeOrigin, oldGroupOrigin, formatOptions);
+ return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
+ }
+
+ /**
+ * Insert a word into a binary dictionary.
+ *
+ * @param buffer
+ * @param destination
+ * @param word
+ * @param frequency
+ * @param bigramStrings
+ * @param shortcuts
+ * @throws IOException
+ * @throws UnsupportedFormatException
+ */
+ // TODO: Support batch insertion.
+ // TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary.
+ @UsedForTesting
+ public static void insertWord(final FusionDictionaryBufferInterface buffer,
+ final OutputStream destination, final String word, final int frequency,
+ final ArrayList<WeightedString> bigramStrings,
+ final ArrayList<WeightedString> shortcuts, final boolean isNotAWord,
+ final boolean isBlackListEntry)
+ throws IOException, UnsupportedFormatException {
+ final ArrayList<PendingAttribute> bigrams = new ArrayList<PendingAttribute>();
+ if (bigramStrings != null) {
+ for (final WeightedString bigram : bigramStrings) {
+ int position = getTerminalPosition(buffer, bigram.mWord);
+ if (position == FormatSpec.NOT_VALID_WORD) {
+ // TODO: figure out what is the correct thing to do here.
+ } else {
+ bigrams.add(new PendingAttribute(bigram.mFrequency, position));
+ }
+ }
+ }
+
+ final boolean isTerminal = true;
+ final boolean hasBigrams = !bigrams.isEmpty();
+ final boolean hasShortcuts = shortcuts != null && !shortcuts.isEmpty();
+
+ // find the insert position of the word.
+ if (buffer.position() != 0) buffer.position(0);
+ final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
+
+ int wordPos = 0, address = buffer.position(), nodeOriginAddress = buffer.position();
+ final int[] codePoints = FusionDictionary.getCodePoints(word);
+ final int wordLen = codePoints.length;
+
+ for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
+ if (wordPos >= wordLen) break;
+ nodeOriginAddress = buffer.position();
+ int nodeParentAddress = -1;
final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);
- groupOffset += BinaryDictInputOutput.getGroupCountSize(charGroupCount);
+ boolean foundNextGroup = false;
for (int i = 0; i < charGroupCount; ++i) {
- final int charGroupPos = buffer.position();
+ address = buffer.position();
final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
buffer.position(), header.mFormatOptions);
- boolean same = true;
- for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
- p < currentInfo.mCharacters.length;
- ++p, j = word.offsetByCodePoints(j, 1)) {
- if (wordPos + p >= wordLen
- || word.codePointAt(j) != currentInfo.mCharacters[p]) {
- same = false;
+ final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags,
+ header.mFormatOptions);
+ if (isMovedGroup) continue;
+ nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS)
+ ? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address;
+ boolean matched = true;
+ for (int p = 0; p < currentInfo.mCharacters.length; ++p) {
+ if (wordPos + p >= wordLen) {
+ /*
+ * splitting
+ * before
+ * abcd - ef
+ *
+ * insert "abc"
+ *
+ * after
+ * abc - d - ef
+ */
+ final int newNodeAddress = buffer.limit();
+ final int flags = BinaryDictInputOutput.makeCharGroupFlags(p > 1,
+ isTerminal, 0, hasShortcuts, hasBigrams, false /* isNotAWord */,
+ false /* isBlackListEntry */, header.mFormatOptions);
+ int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, flags,
+ frequency, nodeParentAddress, shortcuts, bigrams, destination,
+ buffer, nodeOriginAddress, address, header.mFormatOptions);
+
+ final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p,
+ currentInfo.mCharacters.length);
+ if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
+ updateParentAddresses(buffer, currentInfo.mChildrenAddress,
+ newNodeAddress + written + 1, header.mFormatOptions);
+ }
+ final CharGroupInfo newInfo2 = new CharGroupInfo(
+ newNodeAddress + written + 1, -1 /* endAddress */,
+ currentInfo.mFlags, characters2, currentInfo.mFrequency,
+ newNodeAddress + 1, currentInfo.mChildrenAddress,
+ currentInfo.mShortcutTargets, currentInfo.mBigrams);
+ writeNode(destination, new CharGroupInfo[] { newInfo2 });
+ return;
+ } else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) {
+ if (p > 0) {
+ /*
+ * splitting
+ * before
+ * ab - cd
+ *
+ * insert "ac"
+ *
+ * after
+ * a - b - cd
+ * |
+ * - c
+ */
+
+ final int newNodeAddress = buffer.limit();
+ final int childrenAddress = currentInfo.mChildrenAddress;
+
+ // move prefix
+ final int prefixFlags = BinaryDictInputOutput.makeCharGroupFlags(p > 1,
+ false /* isTerminal */, 0 /* childrenAddressSize*/,
+ false /* hasShortcut */, false /* hasBigrams */,
+ false /* isNotAWord */, false /* isBlackListEntry */,
+ header.mFormatOptions);
+ int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p,
+ prefixFlags, -1 /* frequency */, nodeParentAddress, null, null,
+ destination, buffer, nodeOriginAddress, address,
+ header.mFormatOptions);
+
+ final int[] suffixCharacters = Arrays.copyOfRange(
+ currentInfo.mCharacters, p, currentInfo.mCharacters.length);
+ if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
+ updateParentAddresses(buffer, currentInfo.mChildrenAddress,
+ newNodeAddress + written + 1, header.mFormatOptions);
+ }
+ final int suffixFlags = BinaryDictInputOutput.makeCharGroupFlags(
+ suffixCharacters.length > 1,
+ (currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0,
+ 0 /* childrenAddressSize */,
+ (currentInfo.mFlags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)
+ != 0,
+ (currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0,
+ isNotAWord, isBlackListEntry, header.mFormatOptions);
+ final CharGroupInfo suffixInfo = new CharGroupInfo(
+ newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags,
+ suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1,
+ currentInfo.mChildrenAddress, currentInfo.mShortcutTargets,
+ currentInfo.mBigrams);
+ written += computeGroupSize(suffixInfo, header.mFormatOptions) + 1;
+
+ final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p,
+ codePoints.length);
+ final int flags = BinaryDictInputOutput.makeCharGroupFlags(
+ newCharacters.length > 1, isTerminal,
+ 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
+ isNotAWord, isBlackListEntry, header.mFormatOptions);
+ final CharGroupInfo newInfo = new CharGroupInfo(
+ newNodeAddress + written, -1 /* endAddress */, flags,
+ newCharacters, frequency, newNodeAddress + 1,
+ FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams);
+ writeNode(destination, new CharGroupInfo[] { suffixInfo, newInfo });
+ return;
+ }
+ matched = false;
break;
}
}
- if (same) {
+ if (matched) {
if (wordPos + currentInfo.mCharacters.length == wordLen) {
- if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) {
- return FormatSpec.NOT_VALID_WORD;
- } else {
- return charGroupPos;
- }
+ // the word exists in the dictionary.
+ // only update group.
+ final int newNodeAddress = buffer.limit();
+ final boolean hasMultipleChars = currentInfo.mCharacters.length > 1;
+ final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars,
+ isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
+ isNotAWord, isBlackListEntry, header.mFormatOptions);
+ final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1,
+ -1 /* endAddress */, flags, currentInfo.mCharacters, frequency,
+ nodeParentAddress, currentInfo.mChildrenAddress, shortcuts,
+ bigrams);
+ moveCharGroup(destination, buffer, newInfo, nodeOriginAddress, address,
+ header.mFormatOptions);
+ return;
}
wordPos += currentInfo.mCharacters.length;
if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
- return FormatSpec.NOT_VALID_WORD;
+ /*
+ * found the prefix of the word.
+ * make new node and link to the node from this group.
+ *
+ * before
+ * ab - cd
+ *
+ * insert "abcde"
+ *
+ * after
+ * ab - cd - e
+ */
+ final int newNodeAddress = buffer.limit();
+ updateChildrenAddress(buffer, address, newNodeAddress,
+ header.mFormatOptions);
+ final int newGroupAddress = newNodeAddress + 1;
+ final boolean hasMultipleChars = (wordLen - wordPos) > 1;
+ final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars,
+ isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
+ isNotAWord, isBlackListEntry, header.mFormatOptions);
+ final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen);
+ final CharGroupInfo newInfo = new CharGroupInfo(newGroupAddress, -1, flags,
+ characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS,
+ shortcuts, bigrams);
+ writeNode(destination, new CharGroupInfo[] { newInfo });
+ return;
}
buffer.position(currentInfo.mChildrenAddress);
+ foundNextGroup = true;
break;
}
- groupOffset = currentInfo.mEndAddress;
+ }
- // not found
- if (i >= charGroupCount - 1) {
- return FormatSpec.NOT_VALID_WORD;
- }
+ if (foundNextGroup) continue;
+
+ // reached the end of the array.
+ final int linkAddressPosition = buffer.position();
+ int nextLink = buffer.readUnsignedInt24();
+ if ((nextLink & MSB24) != 0) {
+ nextLink = -(nextLink & SINT24_MAX);
+ }
+ if (nextLink == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
+ /*
+ * expand this node.
+ *
+ * before
+ * ab - cd
+ *
+ * insert "abef"
+ *
+ * after
+ * ab - cd
+ * |
+ * - ef
+ */
+
+ // change the forward link address.
+ final int newNodeAddress = buffer.limit();
+ buffer.position(linkAddressPosition);
+ writeSInt24ToBuffer(buffer, newNodeAddress);
+
+ final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen);
+ final int flags = BinaryDictInputOutput.makeCharGroupFlags(characters.length > 1,
+ isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
+ isNotAWord, isBlackListEntry, header.mFormatOptions);
+ final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1,
+ -1 /* endAddress */, flags, characters, frequency, nodeParentAddress,
+ FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams);
+ writeNode(destination, new CharGroupInfo[]{ newInfo });
+ return;
+ } else {
+ depth--;
+ buffer.position(nextLink);
}
}
- return FormatSpec.NOT_VALID_WORD;
+ }
+
+ /**
+ * Find a word from the buffer.
+ *
+ * @param buffer the buffer representing the body of the dictionary file.
+ * @param word the word searched
+ * @return the found group
+ * @throws IOException
+ * @throws UnsupportedFormatException
+ */
+ @UsedForTesting
+ public static CharGroupInfo findWordFromBuffer(final FusionDictionaryBufferInterface buffer,
+ final String word) throws IOException, UnsupportedFormatException {
+ int position = getTerminalPosition(buffer, word);
+ if (position != FormatSpec.NOT_VALID_WORD) {
+ buffer.position(0);
+ final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
+ buffer.position(position);
+ return BinaryDictInputOutput.readCharGroup(buffer, position, header.mFormatOptions);
+ }
+ return null;
}
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java
index 1d3e94bb7..d1a3c7b0a 100644
--- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java
+++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java
@@ -16,6 +16,7 @@
package com.android.inputmethod.latin.makedict;
+import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
@@ -36,7 +37,6 @@ import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
-import java.util.Stack;
import java.util.TreeMap;
/**
@@ -44,7 +44,7 @@ import java.util.TreeMap;
*
* All the methods in this class are static.
*/
-public class BinaryDictInputOutput {
+public final class BinaryDictInputOutput {
private static final boolean DBG = MakedictLog.DBG;
@@ -54,6 +54,7 @@ public class BinaryDictInputOutput {
// If the number of passes exceeds this number, makedict bails with an exception on
// suspicion that a bug might be causing an infinite loop.
private static final int MAX_PASSES = 24;
+ private static final int MAX_JUMPS = 12;
public interface FusionDictionaryBufferInterface {
public int readUnsignedByte();
@@ -64,6 +65,7 @@ public class BinaryDictInputOutput {
public void position(int newPosition);
public void put(final byte b);
public int limit();
+ public int capacity();
}
public static final class ByteBufferWrapper implements FusionDictionaryBufferInterface {
@@ -75,12 +77,12 @@ public class BinaryDictInputOutput {
@Override
public int readUnsignedByte() {
- return ((int)mBuffer.get()) & 0xFF;
+ return mBuffer.get() & 0xFF;
}
@Override
public int readUnsignedShort() {
- return ((int)mBuffer.getShort()) & 0xFFFF;
+ return mBuffer.getShort() & 0xFFFF;
}
@Override
@@ -113,13 +115,17 @@ public class BinaryDictInputOutput {
public int limit() {
return mBuffer.limit();
}
+
+ @Override
+ public int capacity() {
+ return mBuffer.capacity();
+ }
}
/**
* A class grouping utility function for our specific character encoding.
*/
- private static class CharEncoding {
-
+ static final class CharEncoding {
private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;
@@ -148,7 +154,7 @@ public class BinaryDictInputOutput {
* @param character the character code.
* @return the size in binary encoded-form, either 1 or 3 bytes.
*/
- private static int getCharSize(final int character) {
+ static int getCharSize(final int character) {
// See char encoding in FusionDictionary.java
if (fitsOnOneByte(character)) return 1;
if (FormatSpec.INVALID_CHARACTER == character) return 1;
@@ -257,7 +263,7 @@ public class BinaryDictInputOutput {
* @param buffer the buffer, positioned over an encoded character.
* @return the character code.
*/
- private static int readChar(final FusionDictionaryBufferInterface buffer) {
+ static int readChar(final FusionDictionaryBufferInterface buffer) {
int character = buffer.readUnsignedByte();
if (!fitsOnOneByte(character)) {
if (FormatSpec.GROUP_CHARACTERS_TERMINATOR == character) {
@@ -271,6 +277,21 @@ public class BinaryDictInputOutput {
}
/**
+ * Compute the binary size of the character array.
+ *
+ * If only one character, this is the size of this character. If many, it's the sum of their
+ * sizes + 1 byte for the terminator.
+ *
+ * @param characters the character array
+ * @return the size of the char array, including the terminator if any
+ */
+ static int getGroupCharactersSize(final int[] characters) {
+ int size = CharEncoding.getCharArraySize(characters);
+ if (characters.length > 1) size += FormatSpec.GROUP_TERMINATOR_SIZE;
+ return size;
+ }
+
+ /**
* Compute the binary size of the character array in a group
*
* If only one character, this is the size of this character. If many, it's the sum of their
@@ -280,9 +301,7 @@ public class BinaryDictInputOutput {
* @return the size of the char array, including the terminator if any
*/
private static int getGroupCharactersSize(final CharGroup group) {
- int size = CharEncoding.getCharArraySize(group.mChars);
- if (group.hasSeveralChars()) size += FormatSpec.GROUP_TERMINATOR_SIZE;
- return size;
+ return getGroupCharactersSize(group.mChars);
}
/**
@@ -332,7 +351,7 @@ public class BinaryDictInputOutput {
* This is known in advance and does not change according to position in the file
* like address lists do.
*/
- private static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) {
+ static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) {
if (null == shortcutList) return 0;
int size = FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE;
for (final WeightedString shortcut : shortcutList) {
@@ -376,7 +395,7 @@ public class BinaryDictInputOutput {
g.mCachedSize = groupSize;
size += groupSize;
}
- if (options.mHasLinkedListNode) {
+ if (options.mSupportsDynamicUpdate) {
size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
}
node.mCachedSize = size;
@@ -390,11 +409,27 @@ public class BinaryDictInputOutput {
}
/**
- * Helper method to check whether the CharGroup has a parent address.
+ * Helper method to check whether the group is moved.
*/
- private static boolean hasParentAddress(final FormatOptions options) {
- return options.mVersion >= FormatSpec.FIRST_VERSION_WITH_PARENT_ADDRESS
- && options.mHasParentAddress;
+ public static boolean isMovedGroup(final int flags, final FormatOptions options) {
+ return options.mSupportsDynamicUpdate
+ && ((flags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) == FormatSpec.FLAG_IS_MOVED);
+ }
+
+ /**
+ * Helper method to check whether the group is deleted.
+ */
+ public static boolean isDeletedGroup(final int flags, final FormatOptions formatOptions) {
+ return formatOptions.mSupportsDynamicUpdate
+ && ((flags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) == FormatSpec.FLAG_IS_DELETED);
+ }
+
+ /**
+ * Helper method to check whether the dictionary can be updated dynamically.
+ */
+ public static boolean supportsDynamicUpdate(final FormatOptions options) {
+ return options.mVersion >= FormatSpec.FIRST_VERSION_WITH_DYNAMIC_UPDATE
+ && options.mSupportsDynamicUpdate;
}
/**
@@ -404,7 +439,7 @@ public class BinaryDictInputOutput {
* @param options file format options.
*/
private static int getGroupHeaderSize(final CharGroup group, final FormatOptions options) {
- if (hasParentAddress(options)) {
+ if (supportsDynamicUpdate(options)) {
return FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
+ getGroupCharactersSize(group);
} else {
@@ -412,6 +447,10 @@ public class BinaryDictInputOutput {
}
}
+ private static final int UINT8_MAX = 0xFF;
+ private static final int UINT16_MAX = 0xFFFF;
+ private static final int UINT24_MAX = 0xFFFFFF;
+
/**
* Compute the size, in bytes, that an address will occupy.
*
@@ -422,18 +461,23 @@ public class BinaryDictInputOutput {
* @param address the address
* @return the byte size.
*/
- private static int getByteSize(final int address) {
- assert(address < 0x1000000);
+ static int getByteSize(final int address) {
+ assert(address <= UINT24_MAX);
if (!hasChildrenAddress(address)) {
return 0;
- } else if (Math.abs(address) < 0x100) {
+ } else if (Math.abs(address) <= UINT8_MAX) {
return 1;
- } else if (Math.abs(address) < 0x10000) {
+ } else if (Math.abs(address) <= UINT16_MAX) {
return 2;
} else {
return 3;
}
}
+
+ private static final int SINT24_MAX = 0x7FFFFF;
+ private static final int MSB8 = 0x80;
+ private static final int MSB24 = 0x800000;
+
// End utility methods.
// This method is responsible for finding a nice ordering of the nodes that favors run-time
@@ -509,13 +553,19 @@ public class BinaryDictInputOutput {
}
int groupSize = getGroupHeaderSize(group, formatOptions);
if (group.isTerminal()) groupSize += FormatSpec.GROUP_FREQUENCY_SIZE;
- if (null != group.mChildren) {
+ if (null == group.mChildren && formatOptions.mSupportsDynamicUpdate) {
+ groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
+ } else if (null != group.mChildren) {
final int offsetBasePoint = groupSize + node.mCachedAddress + size;
final int offset = group.mChildren.mCachedAddress - offsetBasePoint;
// assign my address to children's parent address
group.mChildren.mCachedParentAddress = group.mCachedAddress
- group.mChildren.mCachedAddress;
- groupSize += getByteSize(offset);
+ if (formatOptions.mSupportsDynamicUpdate) {
+ groupSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
+ } else {
+ groupSize += getByteSize(offset);
+ }
}
groupSize += getShortcutListSize(group.mShortcutTargets);
if (null != group.mBigrams) {
@@ -530,7 +580,7 @@ public class BinaryDictInputOutput {
group.mCachedSize = groupSize;
size += groupSize;
}
- if (formatOptions.mHasLinkedListNode) {
+ if (formatOptions.mSupportsDynamicUpdate) {
size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
}
if (node.mCachedSize != size) {
@@ -559,7 +609,8 @@ public class BinaryDictInputOutput {
groupOffset += g.mCachedSize;
}
final int nodeSize = groupCountSize + groupOffset
- + (formatOptions.mHasLinkedListNode ? FormatSpec.FORWARD_LINK_ADDRESS_SIZE : 0);
+ + (formatOptions.mSupportsDynamicUpdate
+ ? FormatSpec.FORWARD_LINK_ADDRESS_SIZE : 0);
if (nodeSize != n.mCachedSize) {
throw new RuntimeException("Bug : Stored and computed node size differ");
}
@@ -668,49 +719,81 @@ public class BinaryDictInputOutput {
}
}
- private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress,
- final int childrenOffset) {
- byte flags = 0;
- if (group.mChars.length > 1) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS;
- if (group.mFrequency >= 0) {
- flags |= FormatSpec.FLAG_IS_TERMINAL;
- }
- if (null != group.mChildren) {
- switch (getByteSize(childrenOffset)) {
- case 1:
- flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE;
- break;
- case 2:
- flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES;
- break;
- case 3:
- flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES;
- break;
- default:
- throw new RuntimeException("Node with a strange address");
- }
- }
- if (null != group.mShortcutTargets) {
- if (DBG && 0 == group.mShortcutTargets.size()) {
- throw new RuntimeException("0-sized shortcut list must be null");
- }
- flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS;
+ /**
+ * Helper method to write a variable-size signed address to a file.
+ *
+ * @param buffer the buffer to write to.
+ * @param index the index in the buffer to write the address to.
+ * @param address the address to write.
+ * @return the size in bytes the address actually took.
+ */
+ private static int writeVariableSignedAddress(final byte[] buffer, int index,
+ final int address) {
+ if (!hasChildrenAddress(address)) {
+ buffer[index] = buffer[index + 1] = buffer[index + 2] = 0;
+ } else {
+ final int absAddress = Math.abs(address);
+ buffer[index++] = (byte)((address < 0 ? MSB8 : 0) | (0xFF & (absAddress >> 16)));
+ buffer[index++] = (byte)(0xFF & (absAddress >> 8));
+ buffer[index++] = (byte)(0xFF & absAddress);
}
- if (null != group.mBigrams) {
- if (DBG && 0 == group.mBigrams.size()) {
- throw new RuntimeException("0-sized bigram list must be null");
+ return 3;
+ }
+
+ /**
+ * Makes the flag value for a char group.
+ *
+ * @param hasMultipleChars whether the group has multiple chars.
+ * @param isTerminal whether the group is terminal.
+ * @param childrenAddressSize the size of a children address.
+ * @param hasShortcuts whether the group has shortcuts.
+ * @param hasBigrams whether the group has bigrams.
+ * @param isNotAWord whether the group is not a word.
+ * @param isBlackListEntry whether the group is a blacklist entry.
+ * @param formatOptions file format options.
+ * @return the flags
+ */
+ static int makeCharGroupFlags(final boolean hasMultipleChars, final boolean isTerminal,
+ final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams,
+ final boolean isNotAWord, final boolean isBlackListEntry,
+ final FormatOptions formatOptions) {
+ byte flags = 0;
+ if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS;
+ if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL;
+ if (formatOptions.mSupportsDynamicUpdate) {
+ flags |= FormatSpec.FLAG_IS_NOT_MOVED;
+ } else if (true) {
+ switch (childrenAddressSize) {
+ case 1:
+ flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE;
+ break;
+ case 2:
+ flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES;
+ break;
+ case 3:
+ flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES;
+ break;
+ case 0:
+ flags |= FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS;
+ break;
+ default:
+ throw new RuntimeException("Node with a strange address");
}
- flags |= FormatSpec.FLAG_HAS_BIGRAMS;
- }
- if (group.mIsNotAWord) {
- flags |= FormatSpec.FLAG_IS_NOT_A_WORD;
- }
- if (group.mIsBlacklistEntry) {
- flags |= FormatSpec.FLAG_IS_BLACKLISTED;
}
+ if (hasShortcuts) flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS;
+ if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS;
+ if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD;
+ if (isBlackListEntry) flags |= FormatSpec.FLAG_IS_BLACKLISTED;
return flags;
}
+ private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress,
+ final int childrenOffset, final FormatOptions formatOptions) {
+ return (byte) makeCharGroupFlags(group.mChars.length > 1, group.mFrequency >= 0,
+ getByteSize(childrenOffset), group.mShortcutTargets != null, group.mBigrams != null,
+ group.mIsNotAWord, group.mIsBlacklistEntry, formatOptions);
+ }
+
/**
* Makes the flag value for a bigram.
*
@@ -792,8 +875,7 @@ public class BinaryDictInputOutput {
return (options.mFrenchLigatureProcessing ? FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG : 0)
+ (options.mGermanUmlautProcessing ? FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG : 0)
+ (hasBigrams ? FormatSpec.CONTAINS_BIGRAMS_FLAG : 0)
- + (formatOptions.mHasParentAddress ? FormatSpec.HAS_PARENT_ADDRESS : 0)
- + (formatOptions.mHasLinkedListNode ? FormatSpec.HAS_LINKEDLIST_NODE : 0);
+ + (formatOptions.mSupportsDynamicUpdate ? FormatSpec.SUPPORTS_DYNAMIC_UPDATE : 0);
}
/**
@@ -803,11 +885,30 @@ public class BinaryDictInputOutput {
* @param frequency the frequency of the attribute, 0..15
* @return the flags
*/
- private static final int makeShortcutFlags(final boolean more, final int frequency) {
+ static final int makeShortcutFlags(final boolean more, final int frequency) {
return (more ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0)
+ (frequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY);
}
+ private static final int writeParentAddress(final byte[] buffer, final int index,
+ final int address, final FormatOptions formatOptions) {
+ if (supportsDynamicUpdate(formatOptions)) {
+ if (address == FormatSpec.NO_PARENT_ADDRESS) {
+ buffer[index] = buffer[index + 1] = buffer[index + 2] = 0;
+ } else {
+ final int absAddress = Math.abs(address);
+ assert(absAddress <= SINT24_MAX);
+ buffer[index] = (byte)((address < 0 ? MSB8 : 0)
+ | ((absAddress >> 16) & 0xFF));
+ buffer[index + 1] = (byte)((absAddress >> 8) & 0xFF);
+ buffer[index + 2] = (byte)(absAddress & 0xFF);
+ }
+ return index + 3;
+ } else {
+ return index;
+ }
+ }
+
/**
* Write a node to memory. The node is expected to have its final position cached.
*
@@ -822,6 +923,7 @@ public class BinaryDictInputOutput {
*/
private static int writePlacedNode(final FusionDictionary dict, byte[] buffer,
final Node node, final FormatOptions formatOptions) {
+ // TODO: Make the code in common with BinaryDictIOUtils#writeCharGroup
int index = node.mCachedAddress;
final int groupCount = node.mData.size();
@@ -854,22 +956,15 @@ public class BinaryDictInputOutput {
final int childrenOffset = null == group.mChildren
? FormatSpec.NO_CHILDREN_ADDRESS
: group.mChildren.mCachedAddress - groupAddress;
- byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset);
+ byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset, formatOptions);
buffer[index++] = flags;
- if (hasParentAddress(formatOptions)) {
- if (parentAddress == FormatSpec.NO_PARENT_ADDRESS) {
- // this node is the root node.
- buffer[index] = buffer[index + 1] = buffer[index + 2] = 0;
- } else {
- // write parent address. (version 3)
- final int actualParentAddress = Math.abs(parentAddress
- + (node.mCachedAddress - group.mCachedAddress));
- buffer[index] = (byte)((actualParentAddress >> 16) & 0xFF);
- buffer[index + 1] = (byte)((actualParentAddress >> 8) & 0xFF);
- buffer[index + 2] = (byte)(actualParentAddress & 0xFF);
- }
- index += 3;
+ if (parentAddress == FormatSpec.NO_PARENT_ADDRESS) {
+ index = writeParentAddress(buffer, index, parentAddress, formatOptions);
+ } else {
+ index = writeParentAddress(buffer, index,
+ parentAddress + (node.mCachedAddress - group.mCachedAddress),
+ formatOptions);
}
index = CharEncoding.writeCharArray(group.mChars, buffer, index);
@@ -879,7 +974,13 @@ public class BinaryDictInputOutput {
if (group.mFrequency >= 0) {
buffer[index++] = (byte) group.mFrequency;
}
- final int shift = writeVariableAddress(buffer, index, childrenOffset);
+
+ final int shift;
+ if (formatOptions.mSupportsDynamicUpdate) {
+ shift = writeVariableSignedAddress(buffer, index, childrenOffset);
+ } else {
+ shift = writeVariableAddress(buffer, index, childrenOffset);
+ }
index += shift;
groupAddress += shift;
@@ -927,7 +1028,7 @@ public class BinaryDictInputOutput {
}
}
- if (formatOptions.mHasLinkedListNode) {
+ if (formatOptions.mSupportsDynamicUpdate) {
buffer[index] = buffer[index + 1] = buffer[index + 2]
= FormatSpec.NO_FORWARD_LINK_ADDRESS;
index += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
@@ -1104,6 +1205,58 @@ public class BinaryDictInputOutput {
// Input methods: Read a binary dictionary to memory.
// readDictionaryBinary is the public entry point for them.
+ static int getChildrenAddressSize(final int optionFlags,
+ final FormatOptions formatOptions) {
+ if (formatOptions.mSupportsDynamicUpdate) return FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
+ switch (optionFlags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) {
+ case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
+ return 1;
+ case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
+ return 2;
+ case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
+ return 3;
+ case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS:
+ default:
+ return 0;
+ }
+ }
+
+ static int readChildrenAddress(final FusionDictionaryBufferInterface buffer,
+ final int optionFlags, final FormatOptions options) {
+ if (options.mSupportsDynamicUpdate) {
+ final int address = buffer.readUnsignedInt24();
+ if (address == 0) return FormatSpec.NO_CHILDREN_ADDRESS;
+ if ((address & MSB24) != 0) {
+ return -(address & SINT24_MAX);
+ } else {
+ return address;
+ }
+ }
+ int address;
+ switch (optionFlags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) {
+ case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
+ return buffer.readUnsignedByte();
+ case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
+ return buffer.readUnsignedShort();
+ case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
+ return buffer.readUnsignedInt24();
+ case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS:
+ default:
+ return FormatSpec.NO_CHILDREN_ADDRESS;
+ }
+ }
+
+ static int readParentAddress(final FusionDictionaryBufferInterface buffer,
+ final FormatOptions formatOptions) {
+ if (supportsDynamicUpdate(formatOptions)) {
+ final int parentAddress = buffer.readUnsignedInt24();
+ final int sign = ((parentAddress & MSB24) != 0) ? -1 : 1;
+ return sign * (parentAddress & SINT24_MAX);
+ } else {
+ return FormatSpec.NO_PARENT_ADDRESS;
+ }
+ }
+
private static final int[] CHARACTER_BUFFER = new int[FormatSpec.MAX_WORD_LENGTH];
public static CharGroupInfo readCharGroup(final FusionDictionaryBufferInterface buffer,
final int originalGroupAddress, final FormatOptions options) {
@@ -1111,13 +1264,9 @@ public class BinaryDictInputOutput {
final int flags = buffer.readUnsignedByte();
++addressPointer;
- final int parentAddress;
- if (hasParentAddress(options)) {
- // read the parent address. (version 3)
- parentAddress = -buffer.readUnsignedInt24();
+ final int parentAddress = readParentAddress(buffer, options);
+ if (supportsDynamicUpdate(options)) {
addressPointer += 3;
- } else {
- parentAddress = FormatSpec.NO_PARENT_ADDRESS;
}
final int characters[];
@@ -1146,25 +1295,11 @@ public class BinaryDictInputOutput {
} else {
frequency = CharGroup.NOT_A_TERMINAL;
}
- int childrenAddress = addressPointer;
- switch (flags & FormatSpec.MASK_GROUP_ADDRESS_TYPE) {
- case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
- childrenAddress += buffer.readUnsignedByte();
- addressPointer += 1;
- break;
- case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
- childrenAddress += buffer.readUnsignedShort();
- addressPointer += 2;
- break;
- case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
- childrenAddress += buffer.readUnsignedInt24();
- addressPointer += 3;
- break;
- case FormatSpec.FLAG_GROUP_ADDRESS_TYPE_NOADDRESS:
- default:
- childrenAddress = FormatSpec.NO_CHILDREN_ADDRESS;
- break;
+ int childrenAddress = readChildrenAddress(buffer, flags, options);
+ if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
+ childrenAddress += addressPointer;
}
+ addressPointer += getChildrenAddressSize(flags, options);
ArrayList<WeightedString> shortcutTargets = null;
if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) {
final int pointerBefore = buffer.position();
@@ -1182,7 +1317,8 @@ public class BinaryDictInputOutput {
ArrayList<PendingAttribute> bigrams = null;
if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
bigrams = new ArrayList<PendingAttribute>();
- while (true) {
+ int bigramCount = 0;
+ while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
final int bigramFlags = buffer.readUnsignedByte();
++addressPointer;
final int sign = 0 == (bigramFlags & FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE)
@@ -1210,6 +1346,9 @@ public class BinaryDictInputOutput {
bigramAddress));
if (0 == (bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT)) break;
}
+ if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
+ MakedictLog.d("too many bigrams in a group.");
+ }
}
return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency,
parentAddress, childrenAddress, shortcutTargets, bigrams);
@@ -1232,7 +1371,8 @@ public class BinaryDictInputOutput {
// of this method. Since it performs direct, unbuffered random access to the file and
// may be called hundreds of thousands of times, the resulting performance is not
// reasonable without some kind of cache. Thus:
- private static TreeMap<Integer, String> wordCache = new TreeMap<Integer, String>();
+ private static TreeMap<Integer, WeightedString> wordCache =
+ new TreeMap<Integer, WeightedString>();
/**
* Finds, as a string, the word at the address passed as an argument.
*
@@ -1240,18 +1380,19 @@ public class BinaryDictInputOutput {
* @param headerSize the size of the header.
* @param address the address to seek.
* @param formatOptions file format options.
- * @return the word, as a string.
+ * @return the word with its frequency, as a weighted string.
*/
- /* packages for tests */ static String getWordAtAddress(
+ /* package for tests */ static WeightedString getWordAtAddress(
final FusionDictionaryBufferInterface buffer, final int headerSize, final int address,
final FormatOptions formatOptions) {
- final String cachedString = wordCache.get(address);
+ final WeightedString cachedString = wordCache.get(address);
if (null != cachedString) return cachedString;
- final String result;
+ final WeightedString result;
final int originalPointer = buffer.position();
+ buffer.position(address);
- if (hasParentAddress(formatOptions)) {
+ if (supportsDynamicUpdate(formatOptions)) {
result = getWordAtAddressWithParentAddress(buffer, headerSize, address, formatOptions);
} else {
result = getWordAtAddressWithoutParentAddress(buffer, headerSize, address,
@@ -1263,38 +1404,53 @@ public class BinaryDictInputOutput {
return result;
}
+ // TODO: static!? This will behave erratically when used in multi-threaded code.
+ // We need to fix this
private static int[] sGetWordBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
- private static String getWordAtAddressWithParentAddress(
+ private static WeightedString getWordAtAddressWithParentAddress(
final FusionDictionaryBufferInterface buffer, final int headerSize, final int address,
final FormatOptions options) {
final StringBuilder builder = new StringBuilder();
int currentAddress = address;
int index = FormatSpec.MAX_WORD_LENGTH - 1;
+ int frequency = Integer.MIN_VALUE;
// the length of the path from the root to the leaf is limited by MAX_WORD_LENGTH
for (int count = 0; count < FormatSpec.MAX_WORD_LENGTH; ++count) {
- buffer.position(currentAddress + headerSize);
- final CharGroupInfo currentInfo = readCharGroup(buffer, currentAddress, options);
+ CharGroupInfo currentInfo;
+ int loopCounter = 0;
+ do {
+ buffer.position(currentAddress + headerSize);
+ currentInfo = readCharGroup(buffer, currentAddress, options);
+ if (isMovedGroup(currentInfo.mFlags, options)) {
+ currentAddress = currentInfo.mParentAddress + currentInfo.mOriginalAddress;
+ }
+ if (DBG && loopCounter++ > MAX_JUMPS) {
+ MakedictLog.d("Too many jumps - probably a bug");
+ }
+ } while (isMovedGroup(currentInfo.mFlags, options));
+ if (Integer.MIN_VALUE == frequency) frequency = currentInfo.mFrequency;
for (int i = 0; i < currentInfo.mCharacters.length; ++i) {
sGetWordBuffer[index--] =
currentInfo.mCharacters[currentInfo.mCharacters.length - i - 1];
}
-
if (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS) break;
currentAddress = currentInfo.mParentAddress + currentInfo.mOriginalAddress;
}
- return new String(sGetWordBuffer, index + 1, FormatSpec.MAX_WORD_LENGTH - index - 1);
+ return new WeightedString(
+ new String(sGetWordBuffer, index + 1, FormatSpec.MAX_WORD_LENGTH - index - 1),
+ frequency);
}
- private static String getWordAtAddressWithoutParentAddress(
+ private static WeightedString getWordAtAddressWithoutParentAddress(
final FusionDictionaryBufferInterface buffer, final int headerSize, final int address,
final FormatOptions options) {
buffer.position(headerSize);
final int count = readCharGroupCount(buffer);
int groupOffset = getGroupCountSize(count);
final StringBuilder builder = new StringBuilder();
- String result = null;
+ WeightedString result = null;
CharGroupInfo last = null;
for (int i = count - 1; i >= 0; --i) {
@@ -1302,7 +1458,7 @@ public class BinaryDictInputOutput {
groupOffset = info.mEndAddress;
if (info.mOriginalAddress == address) {
builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
- result = builder.toString();
+ result = new WeightedString(builder.toString(), info.mFrequency);
break; // and return
}
if (hasChildrenAddress(info.mChildrenAddress)) {
@@ -1357,14 +1513,17 @@ public class BinaryDictInputOutput {
int groupOffset = nodeHeadPosition + getGroupCountSize(count);
for (int i = count; i > 0; --i) { // Scan the array of CharGroup.
CharGroupInfo info = readCharGroup(buffer, groupOffset, options);
+ if (isMovedGroup(info.mFlags, options)) continue;
ArrayList<WeightedString> shortcutTargets = info.mShortcutTargets;
ArrayList<WeightedString> bigrams = null;
if (null != info.mBigrams) {
bigrams = new ArrayList<WeightedString>();
for (PendingAttribute bigram : info.mBigrams) {
- final String word = getWordAtAddress(
+ final WeightedString word = getWordAtAddress(
buffer, headerSize, bigram.mAddress, options);
- bigrams.add(new WeightedString(word, bigram.mFrequency));
+ final int reconstructedFrequency =
+ reconstructBigramFrequency(word.mFrequency, bigram.mFrequency);
+ bigrams.add(new WeightedString(word.mWord, reconstructedFrequency));
}
}
if (hasChildrenAddress(info.mChildrenAddress)) {
@@ -1392,7 +1551,7 @@ public class BinaryDictInputOutput {
}
// reach the end of the array.
- if (options.mHasLinkedListNode) {
+ if (options.mSupportsDynamicUpdate) {
final int nextAddress = buffer.readUnsignedInt24();
if (nextAddress >= 0 && nextAddress < buffer.limit()) {
buffer.position(nextAddress);
@@ -1400,7 +1559,7 @@ public class BinaryDictInputOutput {
break;
}
}
- } while (options.mHasLinkedListNode &&
+ } while (options.mSupportsDynamicUpdate &&
buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS);
final Node node = new Node(nodeContents);
@@ -1469,8 +1628,7 @@ public class BinaryDictInputOutput {
0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG),
0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)),
new FormatOptions(version,
- 0 != (optionsFlags & FormatSpec.HAS_PARENT_ADDRESS),
- 0 != (optionsFlags & FormatSpec.HAS_LINKEDLIST_NODE)));
+ 0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE)));
return header;
}
@@ -1500,6 +1658,7 @@ public class BinaryDictInputOutput {
* @param dict an optional dictionary to add words to, or null.
* @return the created (or merged) dictionary.
*/
+ @UsedForTesting
public static FusionDictionary readDictionaryBinary(
final FusionDictionaryBufferInterface buffer, final FusionDictionary dict)
throws IOException, UnsupportedFormatException {
@@ -1537,17 +1696,24 @@ public class BinaryDictInputOutput {
}
/**
+ * Helper method to pass a file name instead of a File object to isBinaryDictionary.
+ */
+ public static boolean isBinaryDictionary(final String filename) {
+ final File file = new File(filename);
+ return isBinaryDictionary(file);
+ }
+
+ /**
* Basic test to find out whether the file is a binary dictionary or not.
*
* Concretely this only tests the magic number.
*
- * @param filename The name of the file to test.
+ * @param file The file to test.
* @return true if it's a binary dictionary, false otherwise
*/
- public static boolean isBinaryDictionary(final String filename) {
+ public static boolean isBinaryDictionary(final File file) {
FileInputStream inStream = null;
try {
- final File file = new File(filename);
inStream = new FileInputStream(file);
final ByteBuffer buffer = inStream.getChannel().map(
FileChannel.MapMode.READ_ONLY, 0, file.length());
@@ -1582,8 +1748,7 @@ public class BinaryDictInputOutput {
final int bigramFrequency) {
final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
/ (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
- final float resultFreqFloat = (float)unigramFrequency
- + stepSize * (bigramFrequency + 1.0f);
+ final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f);
return (int)resultFreqFloat;
}
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java b/java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java
index ed9388409..8e64082e6 100644
--- a/java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java
+++ b/java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java
@@ -23,7 +23,7 @@ import java.util.ArrayList;
/**
* Raw char group info straight out of a file. This will contain numbers for addresses.
*/
-public class CharGroupInfo {
+public final class CharGroupInfo {
public final int mOriginalAddress;
public final int mEndAddress;
diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java
index adc6037bb..705f66414 100644
--- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java
+++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java
@@ -42,32 +42,43 @@ public final class FormatSpec {
* ps
*
* f |
- * o | IF HAS_LINKEDLIST_NODE (defined in the file header)
+ * o | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header)
* r | forward link address, 3byte
- * w | the address must be positive.
- * a |
- * rdlinkaddress
+ * w | 1 byte = bbbbbbbb match
+ * a | case 1xxxxxxx => -((xxxxxxx << 16) + (next byte << 8) + next byte)
+ * r | otherwise => (xxxxxxx << 16) + (next byte << 8) + next byte
+ * d |
+ * linkaddress
*/
/* Node(CharGroup) layout is as follows:
- * | addressType xx : mask with MASK_GROUP_ADDRESS_TYPE
- * 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
- * f | 01 = 1 byte : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE
- * l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES
- * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
- * g | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS
- * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL
+ * | IF !SUPPORTS_DYNAMIC_UPDATE
+ * | addressType xx : mask with MASK_GROUP_ADDRESS_TYPE
+ * | 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
+ * f | 01 = 1 byte : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE
+ * l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES
+ * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
+ * g | ELSE
+ * s | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED
+ * | This must be the same as FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
+ * | 01 = yes : FLAG_IS_MOVED
+ * | the new address is stored in the same place as the parent address
+ * | is deleted? 10 = yes : FLAG_IS_DELETED
+ * | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS
+ * | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL
* | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
* | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS
* | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD
* | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED
*
* p |
- * a | IF HAS_PARENT_ADDRESS (defined in the file header)
+ * a | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header)
* r | parent address, 3byte
- * e | the address must be negative, so the absolute value of the address is stored.
- * n |
- * taddress
+ * e | 1 byte = bbbbbbbb match
+ * n | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte)
+ * t | otherwise => (bbbbbbbb << 16) + (next byte << 8) + next byte
+ * a |
+ * ddress
*
* c | IF FLAG_HAS_MULTIPLE_CHARS
* h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers
@@ -145,17 +156,14 @@ public final class FormatSpec {
static final int MAXIMUM_SUPPORTED_VERSION = 3;
static final int NOT_A_VERSION_NUMBER = -1;
static final int FIRST_VERSION_WITH_HEADER_SIZE = 2;
- static final int FIRST_VERSION_WITH_PARENT_ADDRESS = 3;
- static final int FIRST_VERSION_WITH_LINKEDLIST_NODE = 3;
+ static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3;
// These options need to be the same numeric values as the one in the native reading code.
static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
// TODO: Make the native reading code read this variable.
- static final int HAS_PARENT_ADDRESS = 0x2;
+ static final int SUPPORTS_DYNAMIC_UPDATE = 0x2;
static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
static final int CONTAINS_BIGRAMS_FLAG = 0x8;
- // TODO: Make the native reading code read this variable.
- static final int HAS_LINKEDLIST_NODE = 0x10;
// TODO: Make this value adaptative to content data, store it in the header, and
// use it in the reading code.
@@ -164,6 +172,7 @@ public final class FormatSpec {
static final int PARENT_ADDRESS_SIZE = 3;
static final int FORWARD_LINK_ADDRESS_SIZE = 3;
+ // These flags are used only in the static dictionary.
static final int MASK_GROUP_ADDRESS_TYPE = 0xC0;
static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
@@ -178,6 +187,13 @@ public final class FormatSpec {
static final int FLAG_IS_NOT_A_WORD = 0x02;
static final int FLAG_IS_BLACKLISTED = 0x01;
+ // These flags are used only in the dynamic dictionary.
+ static final int MASK_MOVE_AND_DELETE_FLAG = 0xC0;
+ static final int FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE = 0x40;
+ static final int FLAG_IS_MOVED = 0x00 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE;
+ static final int FLAG_IS_NOT_MOVED = 0x80 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE;
+ static final int FLAG_IS_DELETED = 0x80;
+
static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
@@ -203,43 +219,33 @@ public final class FormatSpec {
static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127
static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767
+ static final int MAX_BIGRAMS_IN_A_GROUP = 10000;
static final int MAX_TERMINAL_FREQUENCY = 255;
static final int MAX_BIGRAM_FREQUENCY = 15;
+ public static final int SHORTCUT_WHITELIST_FREQUENCY = 15;
+
// This option needs to be the same numeric value as the one in binary_format.h.
static final int NOT_VALID_WORD = -99;
+ static final int SIGNED_CHILDREN_ADDRESS_SIZE = 3;
/**
* Options about file format.
*/
- public static class FormatOptions {
+ public static final class FormatOptions {
public final int mVersion;
- public final boolean mHasParentAddress;
- public final boolean mHasLinkedListNode;
+ public final boolean mSupportsDynamicUpdate;
public FormatOptions(final int version) {
this(version, false);
}
- public FormatOptions(final int version, final boolean hasParentAddress) {
- this(version, hasParentAddress, false);
- }
- public FormatOptions(final int version, final boolean hasParentAddress,
- final boolean hasLinkedListNode) {
+ public FormatOptions(final int version, final boolean supportsDynamicUpdate) {
mVersion = version;
- if (version < FIRST_VERSION_WITH_PARENT_ADDRESS && hasParentAddress) {
- throw new RuntimeException("Parent addresses are only supported with versions "
- + FIRST_VERSION_WITH_PARENT_ADDRESS + " and ulterior.");
- }
- mHasParentAddress = hasParentAddress;
-
- if (version < FIRST_VERSION_WITH_LINKEDLIST_NODE && hasLinkedListNode) {
- throw new RuntimeException("Linked list nodes are only supported with versions "
- + FIRST_VERSION_WITH_LINKEDLIST_NODE + " and ulterior.");
- }
- if (!hasParentAddress && hasLinkedListNode) {
- throw new RuntimeException("Linked list nodes need parent addresses.");
+ if (version < FIRST_VERSION_WITH_DYNAMIC_UPDATE && supportsDynamicUpdate) {
+ throw new RuntimeException("Dynamic updates are only supported with versions "
+ + FIRST_VERSION_WITH_DYNAMIC_UPDATE + " and ulterior.");
}
- mHasLinkedListNode = hasLinkedListNode;
+ mSupportsDynamicUpdate = supportsDynamicUpdate;
}
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
index 98cf308c8..b0b3777df 100644
--- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
+++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
@@ -21,6 +21,7 @@ import com.android.inputmethod.latin.Constants;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
+import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
@@ -28,8 +29,7 @@ import java.util.LinkedList;
/**
* A dictionary that can fusion heads and tails of words for more compression.
*/
-public class FusionDictionary implements Iterable<Word> {
-
+public final class FusionDictionary implements Iterable<Word> {
private static final boolean DBG = MakedictLog.DBG;
/**
@@ -40,7 +40,7 @@ public class FusionDictionary implements Iterable<Word> {
* This class also contains fields to cache size and address, to help with binary
* generation.
*/
- public static class Node {
+ public static final class Node {
ArrayList<CharGroup> mData;
// To help with binary generation
int mCachedSize = Integer.MIN_VALUE;
@@ -60,7 +60,7 @@ public class FusionDictionary implements Iterable<Word> {
*
* This represents an "attribute", that is either a bigram or a shortcut.
*/
- public static class WeightedString {
+ public static final class WeightedString {
public final String mWord;
public int mFrequency;
public WeightedString(String word, int frequency) {
@@ -94,7 +94,7 @@ public class FusionDictionary implements Iterable<Word> {
* value is the frequency of this terminal. A terminal may have non-null shortcuts and/or
* bigrams, but a non-terminal may not. Moreover, children, if present, are null.
*/
- public static class CharGroup {
+ public static final class CharGroup {
public static final int NOT_A_TERMINAL = -1;
final int mChars[];
ArrayList<WeightedString> mShortcutTargets;
@@ -142,6 +142,33 @@ public class FusionDictionary implements Iterable<Word> {
return NOT_A_TERMINAL != mFrequency;
}
+ public int getFrequency() {
+ return mFrequency;
+ }
+
+ public boolean getIsNotAWord() {
+ return mIsNotAWord;
+ }
+
+ public boolean getIsBlacklistEntry() {
+ return mIsBlacklistEntry;
+ }
+
+ public ArrayList<WeightedString> getShortcutTargets() {
+ // We don't want write permission to escape outside the package, so we return a copy
+ if (null == mShortcutTargets) return null;
+ final ArrayList<WeightedString> copyOfShortcutTargets =
+ new ArrayList<WeightedString>(mShortcutTargets);
+ return copyOfShortcutTargets;
+ }
+
+ public ArrayList<WeightedString> getBigrams() {
+ // We don't want write permission to escape outside the package, so we return a copy
+ if (null == mBigrams) return null;
+ final ArrayList<WeightedString> copyOfBigrams = new ArrayList<WeightedString>(mBigrams);
+ return copyOfBigrams;
+ }
+
public boolean hasSeveralChars() {
assert(mChars.length > 0);
return 1 < mChars.length;
@@ -250,10 +277,8 @@ public class FusionDictionary implements Iterable<Word> {
/**
* Options global to the dictionary.
- *
- * There are no options at the moment, so this class is empty.
*/
- public static class DictionaryOptions {
+ public static final class DictionaryOptions {
public final boolean mGermanUmlautProcessing;
public final boolean mFrenchLigatureProcessing;
public final HashMap<String, String> mAttributes;
@@ -263,6 +288,43 @@ public class FusionDictionary implements Iterable<Word> {
mGermanUmlautProcessing = germanUmlautProcessing;
mFrenchLigatureProcessing = frenchLigatureProcessing;
}
+ @Override
+ public String toString() { // Convenience method
+ return toString(0, false);
+ }
+ public String toString(final int indentCount, final boolean plumbing) {
+ final StringBuilder indent = new StringBuilder();
+ if (plumbing) {
+ indent.append("H:");
+ } else {
+ for (int i = 0; i < indentCount; ++i) {
+ indent.append(" ");
+ }
+ }
+ final StringBuilder s = new StringBuilder();
+ for (final String optionKey : mAttributes.keySet()) {
+ s.append(indent);
+ s.append(optionKey);
+ s.append(" = ");
+ if ("date".equals(optionKey) && !plumbing) {
+ // Date needs a number of milliseconds, but the dictionary contains seconds
+ s.append(new Date(
+ 1000 * Long.parseLong(mAttributes.get(optionKey))).toString());
+ } else {
+ s.append(mAttributes.get(optionKey));
+ }
+ s.append("\n");
+ }
+ if (mGermanUmlautProcessing) {
+ s.append(indent);
+ s.append("Needs German umlaut processing\n");
+ }
+ if (mFrenchLigatureProcessing) {
+ s.append(indent);
+ s.append("Needs French ligature processing\n");
+ }
+ return s.toString();
+ }
}
public final DictionaryOptions mOptions;
@@ -280,7 +342,7 @@ public class FusionDictionary implements Iterable<Word> {
/**
* Helper method to convert a String to an int array.
*/
- static private int[] getCodePoints(final String word) {
+ static int[] getCodePoints(final String word) {
// TODO: this is a copy-paste of the contents of StringUtils.toCodePointArray,
// which is not visible from the makedict package. Factor this code.
final char[] characters = word.toCharArray();
@@ -359,6 +421,10 @@ public class FusionDictionary implements Iterable<Word> {
if (charGroup2 == null) {
add(getCodePoints(word2), 0, null, false /* isNotAWord */,
false /* isBlacklistEntry */);
+ // The chargroup for the first word may have moved by the above insertion,
+ // if word1 and word2 share a common stem that happens not to have been
+ // a cutting point until now. In this case, we need to refresh charGroup.
+ charGroup = findWordInTree(mRoot, word1);
}
charGroup.addBigram(word2, frequency);
} else {
@@ -511,7 +577,7 @@ public class FusionDictionary implements Iterable<Word> {
* is ignored.
* This comparator imposes orderings that are inconsistent with equals.
*/
- static private class CharGroupComparator implements java.util.Comparator<CharGroup> {
+ static private final class CharGroupComparator implements java.util.Comparator<CharGroup> {
@Override
public int compare(CharGroup c1, CharGroup c2) {
if (c1.mChars[0] == c2.mChars[0]) return 0;
@@ -746,9 +812,8 @@ public class FusionDictionary implements Iterable<Word> {
*
* This is purely for convenience.
*/
- public static class DictionaryIterator implements Iterator<Word> {
-
- private static class Position {
+ public static final class DictionaryIterator implements Iterator<Word> {
+ private static final class Position {
public Iterator<CharGroup> pos;
public int length;
public Position(ArrayList<CharGroup> groups) {
diff --git a/java/src/com/android/inputmethod/latin/makedict/MakedictLog.java b/java/src/com/android/inputmethod/latin/makedict/MakedictLog.java
index 3f0cd0796..6c6b00b6a 100644
--- a/java/src/com/android/inputmethod/latin/makedict/MakedictLog.java
+++ b/java/src/com/android/inputmethod/latin/makedict/MakedictLog.java
@@ -21,7 +21,7 @@ import android.util.Log;
/**
* Wrapper to redirect log events to the right output medium.
*/
-public class MakedictLog {
+public final class MakedictLog {
public static final boolean DBG = false;
private static final String TAG = MakedictLog.class.getSimpleName();
diff --git a/java/src/com/android/inputmethod/latin/makedict/PendingAttribute.java b/java/src/com/android/inputmethod/latin/makedict/PendingAttribute.java
index 5b41d27f2..5bb24da74 100644
--- a/java/src/com/android/inputmethod/latin/makedict/PendingAttribute.java
+++ b/java/src/com/android/inputmethod/latin/makedict/PendingAttribute.java
@@ -22,7 +22,7 @@ package com.android.inputmethod.latin.makedict;
* An attribute is either a bigram or a shortcut.
* All instances of this class are always immutable.
*/
-public class PendingAttribute {
+public final class PendingAttribute {
public final int mFrequency;
public final int mAddress;
public PendingAttribute(final int frequency, final int address) {
diff --git a/java/src/com/android/inputmethod/latin/makedict/UnsupportedFormatException.java b/java/src/com/android/inputmethod/latin/makedict/UnsupportedFormatException.java
index bd42fb8fa..dbb2ea870 100644
--- a/java/src/com/android/inputmethod/latin/makedict/UnsupportedFormatException.java
+++ b/java/src/com/android/inputmethod/latin/makedict/UnsupportedFormatException.java
@@ -19,7 +19,7 @@ package com.android.inputmethod.latin.makedict;
/**
* Simple exception thrown when a file format is not recognized.
*/
-public class UnsupportedFormatException extends Exception {
+public final class UnsupportedFormatException extends Exception {
public UnsupportedFormatException(String description) {
super(description);
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/Word.java b/java/src/com/android/inputmethod/latin/makedict/Word.java
index 4683ef154..4c4f18f1a 100644
--- a/java/src/com/android/inputmethod/latin/makedict/Word.java
+++ b/java/src/com/android/inputmethod/latin/makedict/Word.java
@@ -26,7 +26,7 @@ import java.util.Arrays;
*
* This is chiefly used to iterate a dictionary.
*/
-public class Word implements Comparable<Word> {
+public final class Word implements Comparable<Word> {
public final String mWord;
public final int mFrequency;
public final ArrayList<WeightedString> mShortcutTargets;