diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict')
12 files changed, 1217 insertions, 309 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java index ceb8fa81f..665c7a27c 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java @@ -342,13 +342,11 @@ public final class BinaryDictDecoderUtils { * @param formatOptions file format options. * @return the word with its frequency, as a weighted string. */ - /* package for tests */ static WeightedString getWordAtPosition( - final Ver3DictDecoder dictDecoder, final int headerSize, final int pos, - final FormatOptions formatOptions) { - final DictBuffer dictBuffer = dictDecoder.getDictBuffer(); + /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder, + final int headerSize, final int pos, final FormatOptions formatOptions) { final WeightedString result; - final int originalPos = dictBuffer.position(); - dictBuffer.position(pos); + final int originalPos = dictDecoder.getPosition(); + dictDecoder.setPosition(pos); if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) { result = getWordAtPositionWithParentAddress(dictDecoder, pos, formatOptions); @@ -357,14 +355,13 @@ public final class BinaryDictDecoderUtils { formatOptions); } - dictBuffer.position(originalPos); + dictDecoder.setPosition(originalPos); return result; } @SuppressWarnings("unused") - private static WeightedString getWordAtPositionWithParentAddress( - final Ver3DictDecoder dictDecoder, final int pos, final FormatOptions options) { - final DictBuffer dictBuffer = dictDecoder.getDictBuffer(); + private static WeightedString getWordAtPositionWithParentAddress(final DictDecoder dictDecoder, + final int pos, final FormatOptions options) { int currentPos = pos; int frequency = Integer.MIN_VALUE; final StringBuilder builder = new StringBuilder(); @@ -373,7 +370,7 @@ public final class BinaryDictDecoderUtils { PtNodeInfo currentInfo; int loopCounter = 0; do { - dictBuffer.position(currentPos); + dictDecoder.setPosition(currentPos); currentInfo = dictDecoder.readPtNode(currentPos, options); if (BinaryDictIOUtils.isMovedPtNode(currentInfo.mFlags, options)) { currentPos = currentInfo.mParentAddress + currentInfo.mOriginalAddress; @@ -392,11 +389,10 @@ public final class BinaryDictDecoderUtils { } private static WeightedString getWordAtPositionWithoutParentAddress( - final Ver3DictDecoder dictDecoder, final int headerSize, final int pos, + final DictDecoder dictDecoder, final int headerSize, final int pos, final FormatOptions options) { - final DictBuffer dictBuffer = dictDecoder.getDictBuffer(); - dictBuffer.position(headerSize); - final int count = readPtNodeCount(dictBuffer); + dictDecoder.setPosition(headerSize); + final int count = dictDecoder.readPtNodeCount(); int groupPos = headerSize + BinaryDictIOUtils.getPtNodeCountSize(count); final StringBuilder builder = new StringBuilder(); WeightedString result = null; @@ -414,8 +410,8 @@ public final class BinaryDictDecoderUtils { if (info.mChildrenAddress > pos) { if (null == last) continue; builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); - dictBuffer.position(last.mChildrenAddress); - i = readPtNodeCount(dictBuffer); + dictDecoder.setPosition(last.mChildrenAddress); + i = dictDecoder.readPtNodeCount(); groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); last = null; continue; @@ -424,8 +420,8 @@ public final class BinaryDictDecoderUtils { } if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) { builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); - dictBuffer.position(last.mChildrenAddress); - i = readPtNodeCount(dictBuffer); + dictDecoder.setPosition(last.mChildrenAddress); + i = dictDecoder.readPtNodeCount(); groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); last = null; continue; @@ -449,17 +445,16 @@ public final class BinaryDictDecoderUtils { * @param options file format options. * @return the read node array with all his children already read. */ - private static PtNodeArray readNodeArray(final Ver3DictDecoder dictDecoder, + private static PtNodeArray readNodeArray(final DictDecoder dictDecoder, final int headerSize, final Map<Integer, PtNodeArray> reverseNodeArrayMap, final Map<Integer, PtNode> reversePtNodeMap, final FormatOptions options) throws IOException { - final DictBuffer dictBuffer = dictDecoder.getDictBuffer(); final ArrayList<PtNode> nodeArrayContents = new ArrayList<PtNode>(); - final int nodeArrayOriginPos = dictBuffer.position(); + final int nodeArrayOriginPos = dictDecoder.getPosition(); do { // Scan the linked-list node. - final int nodeArrayHeadPos = dictBuffer.position(); - final int count = readPtNodeCount(dictBuffer); + final int nodeArrayHeadPos = dictDecoder.getPosition(); + final int count = dictDecoder.readPtNodeCount(); int groupOffsetPos = nodeArrayHeadPos + BinaryDictIOUtils.getPtNodeCountSize(count); for (int i = count; i > 0; --i) { // Scan the array of PtNode. PtNodeInfo info = dictDecoder.readPtNode(groupOffsetPos, options); @@ -480,11 +475,11 @@ public final class BinaryDictDecoderUtils { if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) { PtNodeArray children = reverseNodeArrayMap.get(info.mChildrenAddress); if (null == children) { - final int currentPosition = dictBuffer.position(); - dictBuffer.position(info.mChildrenAddress); + final int currentPosition = dictDecoder.getPosition(); + dictDecoder.setPosition(info.mChildrenAddress); children = readNodeArray(dictDecoder, headerSize, reverseNodeArrayMap, reversePtNodeMap, options); - dictBuffer.position(currentPosition); + dictDecoder.setPosition(currentPosition); } nodeArrayContents.add( new PtNode(info.mCharacters, shortcutTargets, bigrams, @@ -503,15 +498,10 @@ public final class BinaryDictDecoderUtils { // reach the end of the array. if (options.mSupportsDynamicUpdate) { - final int nextAddress = dictBuffer.readUnsignedInt24(); - if (nextAddress >= 0 && nextAddress < dictBuffer.limit()) { - dictBuffer.position(nextAddress); - } else { - break; - } + final boolean hasValidForwardLink = dictDecoder.readAndFollowForwardLink(); + if (!hasValidForwardLink) break; } - } while (options.mSupportsDynamicUpdate && - dictBuffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS); + } while (options.mSupportsDynamicUpdate && dictDecoder.hasNextPtNodeArray()); final PtNodeArray nodeArray = new PtNodeArray(nodeArrayContents); nodeArray.mCachedAddressBeforeUpdate = nodeArrayOriginPos; @@ -560,7 +550,7 @@ public final class BinaryDictDecoderUtils { * @return the created (or merged) dictionary. */ @UsedForTesting - /* package */ static FusionDictionary readDictionaryBinary(final Ver3DictDecoder dictDecoder, + /* package */ static FusionDictionary readDictionaryBinary(final DictDecoder dictDecoder, final FusionDictionary dict) throws IOException, UnsupportedFormatException { // Read header final FileHeader fileHeader = dictDecoder.readHeader(); diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java index 5a213415a..6cc0bfb76 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java @@ -126,8 +126,14 @@ public class BinaryDictEncoderUtils { */ private static int getPtNodeMaximumSize(final PtNode ptNode, final FormatOptions options) { int size = getNodeHeaderSize(ptNode, options); - // If terminal, one byte for the frequency - if (ptNode.isTerminal()) size += FormatSpec.PTNODE_FREQUENCY_SIZE; + if (ptNode.isTerminal()) { + // If terminal, one byte for the frequency or four bytes for the terminal id. + if (options.mHasTerminalId) { + size += FormatSpec.PTNODE_TERMINAL_ID_SIZE; + } else { + size += FormatSpec.PTNODE_FREQUENCY_SIZE; + } + } size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address size += getShortcutListSize(ptNode.mShortcutTargets); if (null != ptNode.mBigrams) { @@ -198,6 +204,47 @@ public class BinaryDictEncoderUtils { } } + static int writeUIntToBuffer(final byte[] buffer, int position, final int value, + final int size) { + switch(size) { + case 4: + buffer[position++] = (byte) ((value >> 24) & 0xFF); + /* fall through */ + case 3: + buffer[position++] = (byte) ((value >> 16) & 0xFF); + /* fall through */ + case 2: + buffer[position++] = (byte) ((value >> 8) & 0xFF); + /* fall through */ + case 1: + buffer[position++] = (byte) (value & 0xFF); + break; + default: + /* nop */ + } + return position; + } + + static void writeUIntToStream(final OutputStream stream, final int value, final int size) + throws IOException { + switch(size) { + case 4: + stream.write((value >> 24) & 0xFF); + /* fall through */ + case 3: + stream.write((value >> 16) & 0xFF); + /* fall through */ + case 2: + stream.write((value >> 8) & 0xFF); + /* fall through */ + case 1: + stream.write(value & 0xFF); + break; + default: + /* nop */ + } + } + // End utility methods // This method is responsible for finding a nice ordering of the nodes that favors run-time @@ -324,7 +371,13 @@ public class BinaryDictEncoderUtils { changed = true; } int nodeSize = getNodeHeaderSize(ptNode, formatOptions); - if (ptNode.isTerminal()) nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; + if (ptNode.isTerminal()) { + if (formatOptions.mHasTerminalId) { + nodeSize += FormatSpec.PTNODE_TERMINAL_ID_SIZE; + } else { + nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; + } + } if (formatOptions.mSupportsDynamicUpdate) { nodeSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; } else if (null != ptNode.mChildren) { @@ -335,9 +388,9 @@ public class BinaryDictEncoderUtils { if (null != ptNode.mBigrams) { for (WeightedString bigram : ptNode.mBigrams) { final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, - nodeSize + size + FormatSpec.PTNODE_FLAGS_SIZE, + nodeSize + size + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE, FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord)); - nodeSize += getByteSize(offset) + FormatSpec.PTNODE_FLAGS_SIZE; + nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; } } ptNode.mCachedSize = nodeSize; @@ -725,15 +778,22 @@ public class BinaryDictEncoderUtils { final FormatOptions formatOptions) { int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate + getNodeHeaderSize(ptNode, formatOptions); - if (ptNode.mFrequency >= 0) { - positionOfChildrenPosField += FormatSpec.PTNODE_FREQUENCY_SIZE; + if (ptNode.isTerminal()) { + // A terminal node has either the terminal id or the frequency. + // If positionOfChildrenPosField is incorrect, we may crash when jumping to the children + // position. + if (formatOptions.mHasTerminalId) { + positionOfChildrenPosField += FormatSpec.PTNODE_TERMINAL_ID_SIZE; + } else { + positionOfChildrenPosField += FormatSpec.PTNODE_FREQUENCY_SIZE; + } } return null == ptNode.mChildren ? FormatSpec.NO_CHILDREN_ADDRESS : ptNode.mChildren.mCachedAddressAfterUpdate - positionOfChildrenPosField; } /** - * Write a PtNodeArray to memory. The PtNodeArray is expected to have its final position cached. + * Write a PtNodeArray. The PtNodeArray is expected to have its final position cached. * * @param dict the dictionary the node array is a part of (for relative offsets). * @param dictEncoder the dictionary encoder. @@ -741,7 +801,7 @@ public class BinaryDictEncoderUtils { * @param formatOptions file format options. */ @SuppressWarnings("unused") - /* package */ static void writePlacedNode(final FusionDictionary dict, + /* package */ static void writePlacedPtNodeArray(final FusionDictionary dict, final DictEncoder dictEncoder, final PtNodeArray ptNodeArray, final FormatOptions formatOptions) { // TODO: Make the code in common with BinaryDictIOUtils#writePtNode @@ -766,14 +826,7 @@ public class BinaryDictEncoderUtils { + FormatSpec.MAX_TERMINAL_FREQUENCY + " : " + ptNode.mFrequency); } - - dictEncoder.writePtNodeFlags(ptNode, parentPosition, formatOptions); - dictEncoder.writeParentPosition(parentPosition, ptNode, formatOptions); - dictEncoder.writeCharacters(ptNode.mChars, ptNode.hasSeveralChars()); - dictEncoder.writeFrequency(ptNode.mFrequency); - dictEncoder.writeChildrenPosition(ptNode, formatOptions); - dictEncoder.writeShortcuts(ptNode.mShortcutTargets); - dictEncoder.writeBigrams(ptNode.mBigrams, dict); + dictEncoder.writePtNode(ptNode, parentPosition, formatOptions, dict); } if (formatOptions.mSupportsDynamicUpdate) { dictEncoder.writeForwardLinkAddress(FormatSpec.NO_FORWARD_LINK_ADDRESS); @@ -849,8 +902,9 @@ public class BinaryDictEncoderUtils { * @param destination the stream to write the file header to. * @param dict the dictionary to write. * @param formatOptions file format options. + * @return the size of the header. */ - /* package */ static void writeDictionaryHeader(final OutputStream destination, + /* package */ static int writeDictionaryHeader(final OutputStream destination, final FusionDictionary dict, final FormatOptions formatOptions) throws IOException, UnsupportedFormatException { final int version = formatOptions.mVersion; @@ -899,5 +953,6 @@ public class BinaryDictEncoderUtils { destination.write(bytes); headerBuffer.close(); + return size; } } diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java index 106f02519..a282f595c 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java @@ -61,12 +61,11 @@ public final class BinaryDictIOUtils { /** * Retrieves all node arrays without recursive call. */ - private static void readUnigramsAndBigramsBinaryInner( - final Ver3DictDecoder dictDecoder, final int headerSize, - final Map<Integer, String> words, final Map<Integer, Integer> frequencies, + private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder, + final int headerSize, final Map<Integer, String> words, + final Map<Integer, Integer> frequencies, final Map<Integer, ArrayList<PendingAttribute>> bigrams, final FormatOptions formatOptions) { - final DictBuffer dictBuffer = dictDecoder.getDictBuffer(); int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1]; Stack<Position> stack = new Stack<Position>(); @@ -83,11 +82,11 @@ public final class BinaryDictIOUtils { p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength); } - if (dictBuffer.position() != p.mAddress) dictBuffer.position(p.mAddress); + if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress); if (index != p.mLength) index = p.mLength; if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) { - p.mNumOfPtNode = BinaryDictDecoderUtils.readPtNodeCount(dictBuffer); + p.mNumOfPtNode = dictDecoder.readPtNodeCount(); p.mAddress += getPtNodeCountSize(p.mNumOfPtNode); p.mPosition = 0; } @@ -114,11 +113,12 @@ public final class BinaryDictIOUtils { if (p.mPosition == p.mNumOfPtNode) { if (formatOptions.mSupportsDynamicUpdate) { - final int forwardLinkAddress = dictBuffer.readUnsignedInt24(); - if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) { + final boolean hasValidForwardLinkAddress = + dictDecoder.readAndFollowForwardLink(); + if (hasValidForwardLinkAddress && dictDecoder.hasNextPtNodeArray()) { // The node array has a forward link. p.mNumOfPtNode = Position.NOT_READ_PTNODE_COUNT; - p.mAddress = forwardLinkAddress; + p.mAddress = dictDecoder.getPosition(); } else { stack.pop(); } @@ -127,7 +127,7 @@ public final class BinaryDictIOUtils { } } else { // The Ptnode array has more PtNodes. - p.mAddress = dictBuffer.position(); + p.mAddress = dictDecoder.getPosition(); } if (!isMovedPtNode && hasChildrenAddress(info.mChildrenAddress)) { @@ -148,7 +148,7 @@ public final class BinaryDictIOUtils { * @throws IOException if the file can't be read. * @throws UnsupportedFormatException if the format of the file is not recognized. */ - /* package */ static void readUnigramsAndBigramsBinary(final Ver3DictDecoder dictDecoder, + /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder, final Map<Integer, String> words, final Map<Integer, Integer> frequencies, final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException, UnsupportedFormatException { @@ -169,11 +169,10 @@ public final class BinaryDictIOUtils { * @throws UnsupportedFormatException if the format of the file is not recognized. */ @UsedForTesting - /* package */ static int getTerminalPosition(final Ver3DictDecoder dictDecoder, + /* package */ static int getTerminalPosition(final DictDecoder dictDecoder, final String word) throws IOException, UnsupportedFormatException { - final DictBuffer dictBuffer = dictDecoder.getDictBuffer(); if (word == null) return FormatSpec.NOT_VALID_WORD; - if (dictBuffer.position() != 0) dictBuffer.position(0); + dictDecoder.setPosition(0); final FileHeader header = dictDecoder.readHeader(); int wordPos = 0; @@ -182,10 +181,10 @@ public final class BinaryDictIOUtils { if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; do { - final int ptNodeCount = BinaryDictDecoderUtils.readPtNodeCount(dictBuffer); + final int ptNodeCount = dictDecoder.readPtNodeCount(); boolean foundNextPtNode = false; for (int i = 0; i < ptNodeCount; ++i) { - final int ptNodePos = dictBuffer.position(); + final int ptNodePos = dictDecoder.getPosition(); final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos, header.mFormatOptions); final boolean isMovedNode = isMovedPtNode(currentInfo.mFlags, @@ -219,7 +218,7 @@ public final class BinaryDictIOUtils { return FormatSpec.NOT_VALID_WORD; } foundNextPtNode = true; - dictBuffer.position(currentInfo.mChildrenAddress); + dictDecoder.setPosition(currentInfo.mChildrenAddress); break; } } @@ -233,11 +232,11 @@ public final class BinaryDictIOUtils { return FormatSpec.NOT_VALID_WORD; } - final int forwardLinkAddress = dictBuffer.readUnsignedInt24(); - if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { + final boolean hasValidForwardLinkAddress = + dictDecoder.readAndFollowForwardLink(); + if (!hasValidForwardLinkAddress || !dictDecoder.hasNextPtNodeArray()) { return FormatSpec.NOT_VALID_WORD; } - dictBuffer.position(forwardLinkAddress); } while(true); } return FormatSpec.NOT_VALID_WORD; @@ -520,7 +519,7 @@ public final class BinaryDictIOUtils { final File file, final long offset, final long length) throws FileNotFoundException, IOException, UnsupportedFormatException { final byte[] buffer = new byte[HEADER_READING_BUFFER_SIZE]; - final Ver3DictDecoder dictDecoder = new Ver3DictDecoder(file, + final DictDecoder dictDecoder = FormatSpec.getDictDecoder(file, new DictDecoder.DictionaryBufferFactory() { @Override public DictBuffer getDictionaryBuffer(File file) diff --git a/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java index 11a3f0b3a..3796a466c 100644 --- a/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java @@ -17,9 +17,11 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.utils.ByteArrayDictBuffer; import java.io.File; @@ -30,13 +32,50 @@ import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.util.ArrayList; +import java.util.HashMap; import java.util.TreeMap; /** - * An interface of binary dictionary decoder. + * The base class of binary dictionary decoders. */ -public interface DictDecoder { - public FileHeader readHeader() throws IOException, UnsupportedFormatException; +public abstract class DictDecoder { + + protected FileHeader readHeader(final DictBuffer dictBuffer) + throws IOException, UnsupportedFormatException { + if (dictBuffer == null) { + openDictBuffer(); + } + + final int version = HeaderReader.readVersion(dictBuffer); + if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION + || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) { + throw new UnsupportedFormatException("Unsupported version : " + version); + } + // TODO: Remove this field. + final int optionsFlags = HeaderReader.readOptionFlags(dictBuffer); + + final int headerSize = HeaderReader.readHeaderSize(dictBuffer); + + if (headerSize < 0) { + throw new UnsupportedFormatException("header size can't be negative."); + } + + final HashMap<String, String> attributes = HeaderReader.readAttributes(dictBuffer, + headerSize); + + final FileHeader header = new FileHeader(headerSize, + new FusionDictionary.DictionaryOptions(attributes, + 0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG), + 0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)), + new FormatOptions(version, + 0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE))); + return header; + } + + /** + * Reads and returns the file header. + */ + public abstract FileHeader readHeader() throws IOException, UnsupportedFormatException; /** * Reads PtNode from nodeAddress. @@ -44,7 +83,7 @@ public interface DictDecoder { * @param formatOptions the format options. * @return PtNodeInfo. */ - public PtNodeInfo readPtNode(final int ptNodePos, final FormatOptions formatOptions); + public abstract PtNodeInfo readPtNode(final int ptNodePos, final FormatOptions formatOptions); /** * Reads a buffer and returns the memory representation of the dictionary. @@ -54,11 +93,14 @@ public interface DictDecoder { * which words from the buffer should be added. If it is null, a new dictionary is created. * * @param dict an optional dictionary to add words to, or null. + * @param deleteDictIfBroken a flag indicating whether this method should remove the broken + * dictionary or not. * @return the created (or merged) dictionary. */ @UsedForTesting - public FusionDictionary readDictionaryBinary(final FusionDictionary dict) - throws FileNotFoundException, IOException, UnsupportedFormatException; + public abstract FusionDictionary readDictionaryBinary(final FusionDictionary dict, + final boolean deleteDictIfBroken) + throws FileNotFoundException, IOException, UnsupportedFormatException; /** * Gets the address of the last PtNode of the exact matching word in the dictionary. @@ -71,7 +113,12 @@ public interface DictDecoder { */ @UsedForTesting public int getTerminalPosition(final String word) - throws IOException, UnsupportedFormatException; + throws IOException, UnsupportedFormatException { + if (!isDictBufferOpen()) { + openDictBuffer(); + } + return BinaryDictIOUtils.getTerminalPosition(this, word); + } /** * Reads unigrams and bigrams from the binary file. @@ -83,15 +130,56 @@ public interface DictDecoder { * @throws IOException if the file can't be read. * @throws UnsupportedFormatException if the format of the file is not recognized. */ + @UsedForTesting public void readUnigramsAndBigramsBinary(final TreeMap<Integer, String> words, final TreeMap<Integer, Integer> frequencies, final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams) - throws IOException, UnsupportedFormatException; + throws IOException, UnsupportedFormatException { + if (!isDictBufferOpen()) { + openDictBuffer(); + } + BinaryDictIOUtils.readUnigramsAndBigramsBinary(this, words, frequencies, bigrams); + } - // Flags for DictionaryBufferFactory. + /** + * Sets the position of the buffer to the given value. + * + * @param newPos the new position + */ + public abstract void setPosition(final int newPos); + + /** + * Gets the position of the buffer. + * + * @return the position + */ + public abstract int getPosition(); + + /** + * Reads and returns the PtNode count out of a buffer and forwards the pointer. + */ + public abstract int readPtNodeCount(); + + /** + * Reads the forward link and advances the position. + * + * @return true if this method moves the file pointer, false otherwise. + */ + public abstract boolean readAndFollowForwardLink(); + public abstract boolean hasNextPtNodeArray(); + + /** + * Opens the dictionary file and makes DictBuffer. + */ + @UsedForTesting + public abstract void openDictBuffer() throws FileNotFoundException, IOException; + @UsedForTesting + public abstract boolean isDictBufferOpen(); + + // Constants for DictionaryBufferFactory. public static final int USE_READONLY_BYTEBUFFER = 0x01000000; public static final int USE_BYTEARRAY = 0x02000000; - public static final int USE_WRITABLE_BYTEBUFFER = 0x04000000; + public static final int USE_WRITABLE_BYTEBUFFER = 0x03000000; public static final int MASK_DICTBUFFER = 0x0F000000; public interface DictionaryBufferFactory { @@ -183,4 +271,124 @@ public interface DictDecoder { return null; } } + + /** + * A utility class for reading a file header. + */ + protected static class HeaderReader { + protected static int readVersion(final DictBuffer dictBuffer) + throws IOException, UnsupportedFormatException { + return BinaryDictDecoderUtils.checkFormatVersion(dictBuffer); + } + + protected static int readOptionFlags(final DictBuffer dictBuffer) { + return dictBuffer.readUnsignedShort(); + } + + protected static int readHeaderSize(final DictBuffer dictBuffer) { + return dictBuffer.readInt(); + } + + protected static HashMap<String, String> readAttributes(final DictBuffer dictBuffer, + final int headerSize) { + final HashMap<String, String> attributes = new HashMap<String, String>(); + while (dictBuffer.position() < headerSize) { + // We can avoid an infinite loop here since dictBuffer.position() is always + // increased by calling CharEncoding.readString. + final String key = CharEncoding.readString(dictBuffer); + final String value = CharEncoding.readString(dictBuffer); + attributes.put(key, value); + } + dictBuffer.position(headerSize); + return attributes; + } + } + + /** + * A utility class for reading a PtNode. + */ + protected static class PtNodeReader { + protected static int readPtNodeOptionFlags(final DictBuffer dictBuffer) { + return dictBuffer.readUnsignedByte(); + } + + protected static int readParentAddress(final DictBuffer dictBuffer, + final FormatOptions formatOptions) { + if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) { + return BinaryDictDecoderUtils.readSInt24(dictBuffer); + } else { + return FormatSpec.NO_PARENT_ADDRESS; + } + } + + protected static int readChildrenAddress(final DictBuffer dictBuffer, final int optionFlags, + final FormatOptions formatOptions) { + if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) { + final int address = BinaryDictDecoderUtils.readSInt24(dictBuffer); + if (address == 0) return FormatSpec.NO_CHILDREN_ADDRESS; + return address; + } else { + switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: + return dictBuffer.readUnsignedByte(); + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES: + return dictBuffer.readUnsignedShort(); + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES: + return dictBuffer.readUnsignedInt24(); + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS: + default: + return FormatSpec.NO_CHILDREN_ADDRESS; + } + } + } + + // Reads shortcuts and returns the read length. + protected static int readShortcut(final DictBuffer dictBuffer, + final ArrayList<WeightedString> shortcutTargets) { + final int pointerBefore = dictBuffer.position(); + dictBuffer.readUnsignedShort(); // skip the size + while (true) { + final int targetFlags = dictBuffer.readUnsignedByte(); + final String word = CharEncoding.readString(dictBuffer); + shortcutTargets.add(new WeightedString(word, + targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); + if (0 == (targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; + } + return dictBuffer.position() - pointerBefore; + } + + protected static int readBigramAddresses(final DictBuffer dictBuffer, + final ArrayList<PendingAttribute> bigrams, final int baseAddress) { + int readLength = 0; + int bigramCount = 0; + while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + final int bigramFlags = dictBuffer.readUnsignedByte(); + ++readLength; + final int sign = 0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE) + ? 1 : -1; + int bigramAddress = baseAddress + readLength; + switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) { + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE: + bigramAddress += sign * dictBuffer.readUnsignedByte(); + readLength += 1; + break; + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES: + bigramAddress += sign * dictBuffer.readUnsignedShort(); + readLength += 2; + break; + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES: + bigramAddress += sign * dictBuffer.readUnsignedInt24(); + readLength += 3; + break; + default: + throw new RuntimeException("Has bigrams with no address"); + } + bigrams.add(new PendingAttribute( + bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, + bigramAddress)); + if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; + } + return readLength; + } + } } diff --git a/java/src/com/android/inputmethod/latin/makedict/DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/DictEncoder.java index d1589a30e..ea5d492d8 100644 --- a/java/src/com/android/inputmethod/latin/makedict/DictEncoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/DictEncoder.java @@ -18,10 +18,8 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; -import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import java.io.IOException; -import java.util.ArrayList; /** * An interface of binary dictionary encoder. @@ -33,28 +31,8 @@ public interface DictEncoder { public void setPosition(final int position); public int getPosition(); public void writePtNodeCount(final int ptNodeCount); - public void writePtNodeFlags(final PtNode ptNode, final int parentAddress, - final FormatOptions formatOptions); - public void writeParentPosition(final int parentPosition, final PtNode ptNode, - final FormatOptions formatOptions); - public void writeCharacters(final int[] characters, final boolean hasSeveralChars); - public void writeFrequency(final int frequency); - public void writeChildrenPosition(final PtNode ptNode, final FormatOptions formatOptions); - - /** - * Write a shortcut attributes list to memory. - * - * @param shortcuts the shortcut attributes list. - */ - public void writeShortcuts(final ArrayList<WeightedString> shortcuts); - - /** - * Write a bigram attributes list to memory. - * - * @param bigrams the bigram attributes list. - * @param dict the dictionary the node array is a part of (for relative offsets). - */ - public void writeBigrams(final ArrayList<WeightedString> bigrams, final FusionDictionary dict); - public void writeForwardLinkAddress(final int forwardLinkAddress); + + public void writePtNode(final PtNode ptNode, final int parentPosition, + final FormatOptions formatOptions, final FusionDictionary dict); } diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index bf35f6a8a..849bff050 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -18,8 +18,11 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.Constants; +import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory; import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; +import java.io.File; + /** * Dictionary File Format Specification. */ @@ -195,9 +198,12 @@ public final class FormatSpec { public static final int MAGIC_NUMBER = 0x9BC13AFE; static final int MINIMUM_SUPPORTED_VERSION = 2; - static final int MAXIMUM_SUPPORTED_VERSION = 3; + static final int MAXIMUM_SUPPORTED_VERSION = 4; static final int NOT_A_VERSION_NUMBER = -1; static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3; + static final int FIRST_VERSION_WITH_TERMINAL_ID = 4; + static final int VERSION3 = 3; + static final int VERSION4 = 4; // These options need to be the same numeric values as the one in the native reading code. static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; @@ -248,11 +254,20 @@ public final class FormatSpec { static final int PTNODE_TERMINATOR_SIZE = 1; static final int PTNODE_FLAGS_SIZE = 1; static final int PTNODE_FREQUENCY_SIZE = 1; + static final int PTNODE_TERMINAL_ID_SIZE = 4; static final int PTNODE_MAX_ADDRESS_SIZE = 3; static final int PTNODE_ATTRIBUTE_FLAGS_SIZE = 1; static final int PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE = 3; static final int PTNODE_SHORTCUT_LIST_SIZE_SIZE = 2; + // These values are used only by version 4 or later. + static final String TRIE_FILE_EXTENSION = ".trie"; + static final String FREQ_FILE_EXTENSION = ".freq"; + // tat = Terminal Address Table + static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; + static final int FREQUENCY_AND_FLAGS_SIZE = 2; + static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; + static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; static final int NO_PARENT_ADDRESS = 0; static final int NO_FORWARD_LINK_ADDRESS = 0; @@ -261,6 +276,7 @@ public final class FormatSpec { static final int MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT = 0x7F; // 127 static final int MAX_PTNODES_IN_A_PT_NODE_ARRAY = 0x7FFF; // 32767 static final int MAX_BIGRAMS_IN_A_PTNODE = 10000; + static final int MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE = 0xFFFF; static final int MAX_TERMINAL_FREQUENCY = 255; static final int MAX_BIGRAM_FREQUENCY = 15; @@ -284,6 +300,7 @@ public final class FormatSpec { public static final class FormatOptions { public final int mVersion; public final boolean mSupportsDynamicUpdate; + public final boolean mHasTerminalId; @UsedForTesting public FormatOptions(final int version) { this(version, false); @@ -297,6 +314,7 @@ public final class FormatSpec { + FIRST_VERSION_WITH_DYNAMIC_UPDATE + " and ulterior."); } mSupportsDynamicUpdate = supportsDynamicUpdate; + mHasTerminalId = (version >= FIRST_VERSION_WITH_TERMINAL_ID); } } @@ -307,6 +325,12 @@ public final class FormatSpec { public final int mHeaderSize; public final DictionaryOptions mDictionaryOptions; public final FormatOptions mFormatOptions; + // Note that these are corresponding definitions in native code in latinime::HeaderPolicy + // and latinime::HeaderReadWriteUtils. + public static final String SUPPORTS_DYNAMIC_UPDATE_ATTRIBUTE = "SUPPORTS_DYNAMIC_UPDATE"; + public static final String USES_FORGETTING_CURVE_ATTRIBUTE = "USES_FORGETTING_CURVE"; + public static final String ATTRIBUTE_VALUE_TRUE = "1"; + private static final String DICTIONARY_VERSION_ATTRIBUTE = "version"; private static final String DICTIONARY_LOCALE_ATTRIBUTE = "locale"; private static final String DICTIONARY_ID_ATTRIBUTE = "dictionary"; @@ -341,6 +365,36 @@ public final class FormatSpec { } } + /** + * Returns new dictionary decoder. + * + * @param dictFile the dictionary file. + * @param bufferType The type of buffer, as one of USE_* in DictDecoder. + * @return new dictionary decoder if the dictionary file exists, otherwise null. + */ + public static DictDecoder getDictDecoder(final File dictFile, final int bufferType) { + if (dictFile.isDirectory()) { + return new Ver4DictDecoder(dictFile, bufferType); + } else if (dictFile.isFile()) { + return new Ver3DictDecoder(dictFile, bufferType); + } + return null; + } + + public static DictDecoder getDictDecoder(final File dictFile, + final DictionaryBufferFactory factory) { + if (dictFile.isDirectory()) { + return new Ver4DictDecoder(dictFile, factory); + } else if (dictFile.isFile()) { + return new Ver3DictDecoder(dictFile, factory); + } + return null; + } + + public static DictDecoder getDictDecoder(final File dictFile) { + return getDictDecoder(dictFile, DictDecoder.USE_READONLY_BYTEBUFFER); + } + private FormatSpec() { // This utility class is not publicly instantiable. } diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index 3e685a3d7..be653feec 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -111,6 +111,7 @@ public final class FusionDictionary implements Iterable<Word> { ArrayList<WeightedString> mShortcutTargets; ArrayList<WeightedString> mBigrams; int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. + int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal. PtNodeArray mChildren; boolean mIsNotAWord; // Only a shortcut boolean mIsBlacklistEntry; @@ -129,6 +130,7 @@ public final class FusionDictionary implements Iterable<Word> { final boolean isNotAWord, final boolean isBlacklistEntry) { mChars = chars; mFrequency = frequency; + mTerminalId = frequency; mShortcutTargets = shortcutTargets; mBigrams = bigrams; mChildren = null; @@ -156,6 +158,10 @@ public final class FusionDictionary implements Iterable<Word> { mChildren.mData.add(n); } + public int getTerminalId() { + return mTerminalId; + } + public boolean isTerminal() { return NOT_A_TERMINAL != mFrequency; } diff --git a/java/src/com/android/inputmethod/latin/makedict/SparseTable.java b/java/src/com/android/inputmethod/latin/makedict/SparseTable.java new file mode 100644 index 000000000..0b9cf91d2 --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/SparseTable.java @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Collections; + +/** + * SparseTable is an extensible map from integer to integer. + * This holds one value for every mBlockSize keys, so it uses 1/mBlockSize'th of the full index + * memory. + */ +@UsedForTesting +public class SparseTable { + + /** + * mLookupTable is indexed by terminal ID, containing exactly one entry for every mBlockSize + * terminals. + * It contains at index i = j / mBlockSize the index in mContentsTable where the values for + * terminals with IDs j to j + mBlockSize - 1 are stored as an mBlockSize-sized integer array. + */ + private final ArrayList<Integer> mLookupTable; + private final ArrayList<Integer> mContentTable; + + private final int mBlockSize; + public static final int NOT_EXIST = -1; + + @UsedForTesting + public SparseTable(final int initialCapacity, final int blockSize) { + mBlockSize = blockSize; + final int lookupTableSize = initialCapacity / mBlockSize + + (initialCapacity % mBlockSize > 0 ? 1 : 0); + mLookupTable = new ArrayList<Integer>(Collections.nCopies(lookupTableSize, NOT_EXIST)); + mContentTable = new ArrayList<Integer>(); + } + + @UsedForTesting + public SparseTable(final int[] lookupTable, final int[] contentTable, final int blockSize) { + mBlockSize = blockSize; + mLookupTable = new ArrayList<Integer>(lookupTable.length); + for (int i = 0; i < lookupTable.length; ++i) { + mLookupTable.add(lookupTable[i]); + } + mContentTable = new ArrayList<Integer>(contentTable.length); + for (int i = 0; i < contentTable.length; ++i) { + mContentTable.add(contentTable[i]); + } + } + + /** + * Converts an byte array to an int array considering each set of 4 bytes is an int stored in + * big-endian. + * The length of byteArray must be a multiple of four. + * Otherwise, IndexOutOfBoundsException will be raised. + */ + @UsedForTesting + private static void convertByteArrayToIntegerArray(final byte[] byteArray, + final ArrayList<Integer> integerArray) { + for (int i = 0; i < byteArray.length; i += 4) { + int value = 0; + for (int j = i; j < i + 4; ++j) { + value <<= 8; + value |= byteArray[j] & 0xFF; + } + integerArray.add(value); + } + } + + @UsedForTesting + public SparseTable(final byte[] lookupTable, final byte[] contentTable, final int blockSize) { + mBlockSize = blockSize; + mLookupTable = new ArrayList<Integer>(lookupTable.length / 4); + mContentTable = new ArrayList<Integer>(contentTable.length / 4); + convertByteArrayToIntegerArray(lookupTable, mLookupTable); + convertByteArrayToIntegerArray(contentTable, mContentTable); + } + + @UsedForTesting + public int get(final int index) { + if (index < 0 || index / mBlockSize >= mLookupTable.size() + || mLookupTable.get(index / mBlockSize) == NOT_EXIST) { + return NOT_EXIST; + } + return mContentTable.get(mLookupTable.get(index / mBlockSize) + (index % mBlockSize)); + } + + @UsedForTesting + public void set(final int index, final int value) { + if (mLookupTable.get(index / mBlockSize) == NOT_EXIST) { + mLookupTable.set(index / mBlockSize, mContentTable.size()); + for (int i = 0; i < mBlockSize; ++i) { + mContentTable.add(NOT_EXIST); + } + } + mContentTable.set(mLookupTable.get(index / mBlockSize) + (index % mBlockSize), value); + } + + public void remove(final int index) { + set(index, NOT_EXIST); + } + + @UsedForTesting + public int size() { + return mLookupTable.size() * mBlockSize; + } + + @UsedForTesting + /* package */ int getContentTableSize() { + return mContentTable.size(); + } + + @UsedForTesting + /* package */ int getLookupTableSize() { + return mLookupTable.size(); + } + + public boolean contains(final int index) { + return get(index) != NOT_EXIST; + } + + @UsedForTesting + public void write(final OutputStream lookupOutStream, final OutputStream contentOutStream) + throws IOException { + for (final int index : mLookupTable) { + BinaryDictEncoderUtils.writeUIntToStream(lookupOutStream, index, 4); + } + + for (final int index : mContentTable) { + BinaryDictEncoderUtils.writeUIntToStream(contentOutStream, index, 4); + } + } +} diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java index 1a5023ef6..848277cd4 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java @@ -32,14 +32,12 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -import java.util.HashMap; -import java.util.TreeMap; /** * An implementation of DictDecoder for version 3 binary dictionary. */ @UsedForTesting -public class Ver3DictDecoder implements DictDecoder { +public class Ver3DictDecoder extends DictDecoder { private static final String TAG = Ver3DictDecoder.class.getSimpleName(); static { @@ -49,135 +47,17 @@ public class Ver3DictDecoder implements DictDecoder { // TODO: implement something sensical instead of just a phony method private static native int doNothing(); - private final static class HeaderReader { - protected static int readVersion(final DictBuffer dictBuffer) - throws IOException, UnsupportedFormatException { - return BinaryDictDecoderUtils.checkFormatVersion(dictBuffer); - } - - protected static int readOptionFlags(final DictBuffer dictBuffer) { - return dictBuffer.readUnsignedShort(); - } - - protected static int readHeaderSize(final DictBuffer dictBuffer) { - return dictBuffer.readInt(); - } - - protected static HashMap<String, String> readAttributes(final DictBuffer dictBuffer, - final int headerSize) { - final HashMap<String, String> attributes = new HashMap<String, String>(); - while (dictBuffer.position() < headerSize) { - // We can avoid an infinite loop here since dictBuffer.position() is always - // increased by calling CharEncoding.readString. - final String key = CharEncoding.readString(dictBuffer); - final String value = CharEncoding.readString(dictBuffer); - attributes.put(key, value); - } - dictBuffer.position(headerSize); - return attributes; - } - } - - private final static class PtNodeReader { - protected static int readPtNodeOptionFlags(final DictBuffer dictBuffer) { + protected static class PtNodeReader extends DictDecoder.PtNodeReader { + private static int readFrequency(final DictBuffer dictBuffer) { return dictBuffer.readUnsignedByte(); } - - protected static int readParentAddress(final DictBuffer dictBuffer, - final FormatOptions formatOptions) { - if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) { - return BinaryDictDecoderUtils.readSInt24(dictBuffer); - } else { - return FormatSpec.NO_PARENT_ADDRESS; - } - } - - protected static int readFrequency(final DictBuffer dictBuffer) { - return dictBuffer.readUnsignedByte(); - } - - protected static int readChildrenAddress(final DictBuffer dictBuffer, final int optionFlags, - final FormatOptions formatOptions) { - if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) { - final int address = BinaryDictDecoderUtils.readSInt24(dictBuffer); - if (address == 0) return FormatSpec.NO_CHILDREN_ADDRESS; - return address; - } else { - switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { - case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: - return dictBuffer.readUnsignedByte(); - case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES: - return dictBuffer.readUnsignedShort(); - case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES: - return dictBuffer.readUnsignedInt24(); - case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS: - default: - return FormatSpec.NO_CHILDREN_ADDRESS; - } - } - } - - // Reads shortcuts and returns the read length. - protected static int readShortcut(final DictBuffer dictBuffer, - final ArrayList<WeightedString> shortcutTargets) { - final int pointerBefore = dictBuffer.position(); - dictBuffer.readUnsignedShort(); // skip the size - while (true) { - final int targetFlags = dictBuffer.readUnsignedByte(); - final String word = CharEncoding.readString(dictBuffer); - shortcutTargets.add(new WeightedString(word, - targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); - if (0 == (targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; - } - return dictBuffer.position() - pointerBefore; - } - - protected static int readBigrams(final DictBuffer dictBuffer, - final ArrayList<PendingAttribute> bigrams, final int baseAddress) { - int readLength = 0; - int bigramCount = 0; - while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - final int bigramFlags = dictBuffer.readUnsignedByte(); - ++readLength; - final int sign = 0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE) - ? 1 : -1; - int bigramAddress = baseAddress + readLength; - switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) { - case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE: - bigramAddress += sign * dictBuffer.readUnsignedByte(); - readLength += 1; - break; - case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES: - bigramAddress += sign * dictBuffer.readUnsignedShort(); - readLength += 2; - break; - case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES: - final int offset = (dictBuffer.readUnsignedByte() << 16) - + dictBuffer.readUnsignedShort(); - bigramAddress += sign * offset; - readLength += 3; - break; - default: - throw new RuntimeException("Has bigrams with no address"); - } - bigrams.add(new PendingAttribute( - bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, - bigramAddress)); - if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; - } - return readLength; - } } private final File mDictionaryBinaryFile; private final DictionaryBufferFactory mBufferFactory; private DictBuffer mDictBuffer; - public Ver3DictDecoder(final File file) { - this(file, USE_READONLY_BYTEBUFFER); - } - - public Ver3DictDecoder(final File file, final int factoryFlag) { + /* package */ Ver3DictDecoder(final File file, final int factoryFlag) { mDictionaryBinaryFile = file; mDictBuffer = null; @@ -192,15 +72,21 @@ public class Ver3DictDecoder implements DictDecoder { } } - public Ver3DictDecoder(final File file, final DictionaryBufferFactory factory) { + /* package */ Ver3DictDecoder(final File file, final DictionaryBufferFactory factory) { mDictionaryBinaryFile = file; mBufferFactory = factory; } + @Override public void openDictBuffer() throws FileNotFoundException, IOException { mDictBuffer = mBufferFactory.getDictionaryBuffer(mDictionaryBinaryFile); } + @Override + public boolean isDictBufferOpen() { + return mDictBuffer != null; + } + /* package */ DictBuffer getDictBuffer() { return mDictBuffer; } @@ -216,25 +102,11 @@ public class Ver3DictDecoder implements DictDecoder { if (mDictBuffer == null) { openDictBuffer(); } - - final int version = HeaderReader.readVersion(mDictBuffer); - final int optionsFlags = HeaderReader.readOptionFlags(mDictBuffer); - - final int headerSize = HeaderReader.readHeaderSize(mDictBuffer); - - if (headerSize < 0) { - throw new UnsupportedFormatException("header size can't be negative."); + final FileHeader header = super.readHeader(mDictBuffer); + final int version = header.mFormatOptions.mVersion; + if (!(version >= 2 && version <= 3)) { + throw new UnsupportedFormatException("File header has a wrong version : " + version); } - - final HashMap<String, String> attributes = HeaderReader.readAttributes(mDictBuffer, - headerSize); - - final FileHeader header = new FileHeader(headerSize, - new FusionDictionary.DictionaryOptions(attributes, - 0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG), - 0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)), - new FormatOptions(version, - 0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE))); return header; } @@ -244,11 +116,11 @@ public class Ver3DictDecoder implements DictDecoder { public PtNodeInfo readPtNode(final int ptNodePos, final FormatOptions options) { int addressPointer = ptNodePos; final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); - ++addressPointer; + addressPointer += FormatSpec.PTNODE_FLAGS_SIZE; final int parentAddress = PtNodeReader.readParentAddress(mDictBuffer, options); if (BinaryDictIOUtils.supportsDynamicUpdate(options)) { - addressPointer += 3; + addressPointer += FormatSpec.PARENT_ADDRESS_SIZE; } final int characters[]; @@ -256,7 +128,7 @@ public class Ver3DictDecoder implements DictDecoder { int index = 0; int character = CharEncoding.readChar(mDictBuffer); addressPointer += CharEncoding.getCharSize(character); - while (-1 != character) { + while (FormatSpec.INVALID_CHARACTER != character) { // FusionDictionary is making sure that the length of the word is smaller than // MAX_WORD_LENGTH. // So we'll never write past the end of mCharacterBuffer. @@ -272,8 +144,8 @@ public class Ver3DictDecoder implements DictDecoder { } final int frequency; if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { - ++addressPointer; frequency = PtNodeReader.readFrequency(mDictBuffer); + addressPointer += FormatSpec.PTNODE_FREQUENCY_SIZE; } else { frequency = PtNode.NOT_A_TERMINAL; } @@ -294,7 +166,8 @@ public class Ver3DictDecoder implements DictDecoder { final ArrayList<PendingAttribute> bigrams; if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { bigrams = new ArrayList<PendingAttribute>(); - addressPointer += PtNodeReader.readBigrams(mDictBuffer, bigrams, addressPointer); + addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams, + addressPointer); if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { MakedictLog.d("too many bigrams in a PtNode."); } @@ -306,7 +179,8 @@ public class Ver3DictDecoder implements DictDecoder { } @Override - public FusionDictionary readDictionaryBinary(final FusionDictionary dict) + public FusionDictionary readDictionaryBinary(final FusionDictionary dict, + final boolean deleteDictIfBroken) throws FileNotFoundException, IOException, UnsupportedFormatException { if (mDictBuffer == null) { openDictBuffer(); @@ -315,13 +189,13 @@ public class Ver3DictDecoder implements DictDecoder { return BinaryDictDecoderUtils.readDictionaryBinary(this, dict); } catch (IOException e) { Log.e(TAG, "The dictionary " + mDictionaryBinaryFile.getName() + " is broken.", e); - if (!mDictionaryBinaryFile.delete()) { + if (deleteDictIfBroken && !mDictionaryBinaryFile.delete()) { Log.e(TAG, "Failed to delete the broken dictionary."); } throw e; } catch (UnsupportedFormatException e) { Log.e(TAG, "The dictionary " + mDictionaryBinaryFile.getName() + " is broken.", e); - if (!mDictionaryBinaryFile.delete()) { + if (deleteDictIfBroken && !mDictionaryBinaryFile.delete()) { Log.e(TAG, "Failed to delete the broken dictionary."); } throw e; @@ -329,22 +203,32 @@ public class Ver3DictDecoder implements DictDecoder { } @Override - public int getTerminalPosition(String word) throws IOException, UnsupportedFormatException { - if (mDictBuffer == null) { - openDictBuffer(); - } - return BinaryDictIOUtils.getTerminalPosition(this, word); + public void setPosition(int newPos) { + mDictBuffer.position(newPos); } @Override - public void readUnigramsAndBigramsBinary(final TreeMap<Integer, String> words, - final TreeMap<Integer, Integer> frequencies, - final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams) - throws IOException, UnsupportedFormatException { - if (mDictBuffer == null) { - openDictBuffer(); + public int getPosition() { + return mDictBuffer.position(); + } + + @Override + public int readPtNodeCount() { + return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer); + } + + @Override + public boolean readAndFollowForwardLink() { + final int nextAddress = mDictBuffer.readUnsignedInt24(); + if (nextAddress >= 0 && nextAddress < mDictBuffer.limit()) { + mDictBuffer.position(nextAddress); + return true; } - BinaryDictIOUtils.readUnigramsAndBigramsBinary(this, words, frequencies, bigrams); + return false; } + @Override + public boolean hasNextPtNodeArray() { + return mDictBuffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS; + } } diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java index 9385ba3a0..76f0f4052 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java @@ -68,6 +68,12 @@ public class Ver3DictEncoder implements DictEncoder { @Override public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) throws IOException, UnsupportedFormatException { + if (formatOptions.mVersion > FormatSpec.VERSION3) { + throw new UnsupportedFormatException( + "The given format options has wrong version number : " + + formatOptions.mVersion); + } + if (mOutStream == null) { openStream(); } @@ -97,7 +103,7 @@ public class Ver3DictEncoder implements DictEncoder { MakedictLog.i("Writing file..."); for (PtNodeArray nodeArray : flatNodes) { - BinaryDictEncoderUtils.writePlacedNode(dict, this, nodeArray, formatOptions); + BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, formatOptions); } if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes); mOutStream.write(mBuffer, 0, mPosition); @@ -120,26 +126,23 @@ public class Ver3DictEncoder implements DictEncoder { @Override public void writePtNodeCount(final int ptNodeCount) { final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); - if (1 == countSize) { - mBuffer[mPosition++] = (byte) ptNodeCount; - } else if (2 == countSize) { - mBuffer[mPosition++] = (byte) ((ptNodeCount >> 8) & 0xFF); - mBuffer[mPosition++] = (byte) (ptNodeCount & 0xFF); - } else { + if (countSize != 1 && countSize != 2) { throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); } + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, ptNodeCount, + countSize); } - @Override - public void writePtNodeFlags(final PtNode ptNode, final int parentAddress, + private void writePtNodeFlags(final PtNode ptNode, final int parentAddress, final FormatOptions formatOptions) { final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); - mBuffer[mPosition++] = BinaryDictEncoderUtils.makePtNodeFlags(ptNode, mPosition, - childrenPos, formatOptions); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, + BinaryDictEncoderUtils.makePtNodeFlags(ptNode, mPosition, childrenPos, + formatOptions), + FormatSpec.PTNODE_FLAGS_SIZE); } - @Override - public void writeParentPosition(final int parentPosition, final PtNode ptNode, + private void writeParentPosition(final int parentPosition, final PtNode ptNode, final FormatOptions formatOptions) { if (parentPosition == FormatSpec.NO_PARENT_ADDRESS) { mPosition = BinaryDictEncoderUtils.writeParentAddress(mBuffer, mPosition, @@ -150,23 +153,21 @@ public class Ver3DictEncoder implements DictEncoder { } } - @Override - public void writeCharacters(final int[] codePoints, final boolean hasSeveralChars) { + private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars) { mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition); if (hasSeveralChars) { mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; } } - @Override - public void writeFrequency(final int frequency) { + private void writeFrequency(final int frequency) { if (frequency >= 0) { - mBuffer[mPosition++] = (byte) frequency; + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency, + FormatSpec.PTNODE_FREQUENCY_SIZE); } } - @Override - public void writeChildrenPosition(final PtNode ptNode, final FormatOptions formatOptions) { + private void writeChildrenPosition(final PtNode ptNode, final FormatOptions formatOptions) { final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); if (formatOptions.mSupportsDynamicUpdate) { mPosition += BinaryDictEncoderUtils.writeSignedChildrenPosition(mBuffer, mPosition, @@ -177,8 +178,12 @@ public class Ver3DictEncoder implements DictEncoder { } } - @Override - public void writeShortcuts(final ArrayList<WeightedString> shortcuts) { + /** + * Write a shortcut attributes list to mBuffer. + * + * @param shortcuts the shortcut attributes list. + */ + private void writeShortcuts(final ArrayList<WeightedString> shortcuts) { if (null == shortcuts || shortcuts.isEmpty()) return; final int indexOfShortcutByteSize = mPosition; @@ -189,20 +194,27 @@ public class Ver3DictEncoder implements DictEncoder { final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( shortcutIterator.hasNext(), target.mFrequency); - mBuffer[mPosition++] = (byte)shortcutFlags; + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, shortcutFlags, + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); final int shortcutShift = CharEncoding.writeString(mBuffer, mPosition, target.mWord); mPosition += shortcutShift; } final int shortcutByteSize = mPosition - indexOfShortcutByteSize; - if (shortcutByteSize > 0xFFFF) { + if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) { throw new RuntimeException("Shortcut list too large"); } - mBuffer[indexOfShortcutByteSize] = (byte)((shortcutByteSize >> 8) & 0xFF); - mBuffer[indexOfShortcutByteSize + 1] = (byte)(shortcutByteSize & 0xFF); - } - - @Override - public void writeBigrams(final ArrayList<WeightedString> bigrams, final FusionDictionary dict) { + BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, indexOfShortcutByteSize, shortcutByteSize, + FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); + } + + /** + * Write a bigram attributes list to mBuffer. + * + * @param bigrams the bigram attributes list. + * @param dict the dictionary the node array is a part of (for relative offsets). + */ + private void writeBigrams(final ArrayList<WeightedString> bigrams, + final FusionDictionary dict) { if (bigrams == null) return; final Iterator<WeightedString> bigramIterator = bigrams.iterator(); @@ -214,9 +226,10 @@ public class Ver3DictEncoder implements DictEncoder { final int unigramFrequencyForThisWord = target.mFrequency; final int offset = addressOfBigram - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); - int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), + final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), offset, bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord); - mBuffer[mPosition++] = (byte) bigramFlags; + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags, + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, Math.abs(offset)); } @@ -224,8 +237,19 @@ public class Ver3DictEncoder implements DictEncoder { @Override public void writeForwardLinkAddress(final int forwardLinkAddress) { - mBuffer[mPosition++] = (byte) ((forwardLinkAddress >> 16) & 0xFF); - mBuffer[mPosition++] = (byte) ((forwardLinkAddress >> 8) & 0xFF); - mBuffer[mPosition++] = (byte) (forwardLinkAddress & 0xFF); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, forwardLinkAddress, + FormatSpec.FORWARD_LINK_ADDRESS_SIZE); + } + + @Override + public void writePtNode(final PtNode ptNode, final int parentPosition, + final FormatOptions formatOptions, final FusionDictionary dict) { + writePtNodeFlags(ptNode, parentPosition, formatOptions); + writeParentPosition(parentPosition, ptNode, formatOptions); + writeCharacters(ptNode.mChars, ptNode.hasSeveralChars()); + writeFrequency(ptNode.mFrequency); + writeChildrenPosition(ptNode, formatOptions); + writeShortcuts(ptNode.mShortcutTargets); + writeBigrams(ptNode.mBigrams, dict); } } diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java new file mode 100644 index 000000000..4c8ff8ea4 --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java @@ -0,0 +1,266 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; +import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; + +import android.util.Log; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * An implementation of binary dictionary decoder for version 4 binary dictionary. + */ +@UsedForTesting +public class Ver4DictDecoder extends DictDecoder { + private static final String TAG = Ver4DictDecoder.class.getSimpleName(); + + private static final int FILETYPE_TRIE = 1; + private static final int FILETYPE_FREQUENCY = 2; + private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; + + private final File mDictDirectory; + private final DictionaryBufferFactory mBufferFactory; + private DictBuffer mDictBuffer; + private DictBuffer mFrequencyBuffer; + private DictBuffer mTerminalAddressTableBuffer; + + @UsedForTesting + /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) { + mDictDirectory = dictDirectory; + mDictBuffer = mFrequencyBuffer = null; + + if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) { + mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); + } else if ((factoryFlag & MASK_DICTBUFFER) == USE_BYTEARRAY) { + mBufferFactory = new DictionaryBufferFromByteArrayFactory(); + } else if ((factoryFlag & MASK_DICTBUFFER) == USE_WRITABLE_BYTEBUFFER) { + mBufferFactory = new DictionaryBufferFromWritableByteBufferFactory(); + } else { + mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); + } + } + + @UsedForTesting + /* package */ Ver4DictDecoder(final File dictDirectory, final DictionaryBufferFactory factory) { + mDictDirectory = dictDirectory; + mBufferFactory = factory; + mDictBuffer = mFrequencyBuffer = null; + } + + private File getFile(final int fileType) { + if (fileType == FILETYPE_TRIE) { + return new File(mDictDirectory, + mDictDirectory.getName() + FormatSpec.TRIE_FILE_EXTENSION); + } else if (fileType == FILETYPE_FREQUENCY) { + return new File(mDictDirectory, + mDictDirectory.getName() + FormatSpec.FREQ_FILE_EXTENSION); + } else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) { + return new File(mDictDirectory, + mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); + } else { + throw new RuntimeException("Unsupported kind of file : " + fileType); + } + } + + @Override + public void openDictBuffer() throws FileNotFoundException, IOException { + final String filename = mDictDirectory.getName(); + mDictBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_TRIE)); + mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY)); + mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer( + getFile(FILETYPE_TERMINAL_ADDRESS_TABLE)); + } + + @Override + public boolean isDictBufferOpen() { + return mDictBuffer != null; + } + + /* package */ DictBuffer getDictBuffer() { + return mDictBuffer; + } + + @Override + public FileHeader readHeader() throws IOException, UnsupportedFormatException { + if (mDictBuffer == null) { + openDictBuffer(); + } + final FileHeader header = super.readHeader(mDictBuffer); + final int version = header.mFormatOptions.mVersion; + if (version != 4) { + throw new UnsupportedFormatException("File header has a wrong version : " + version); + } + return header; + } + + protected static class PtNodeReader extends DictDecoder.PtNodeReader { + protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) { + frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1); + return frequencyBuffer.readUnsignedByte(); + } + + protected static int readTerminalId(final DictBuffer dictBuffer) { + return dictBuffer.readInt(); + } + } + + // TODO: Make this buffer thread safe. + // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH. + private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; + @Override + public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) { + int addressPointer = ptNodePos; + final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); + addressPointer += FormatSpec.PTNODE_FLAGS_SIZE; + + final int parentAddress = PtNodeReader.readParentAddress(mDictBuffer, options); + if (BinaryDictIOUtils.supportsDynamicUpdate(options)) { + addressPointer += FormatSpec.PARENT_ADDRESS_SIZE; + } + + final int characters[]; + if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) { + int index = 0; + int character = CharEncoding.readChar(mDictBuffer); + addressPointer += CharEncoding.getCharSize(character); + while (FormatSpec.INVALID_CHARACTER != character + && index < FormatSpec.MAX_WORD_LENGTH) { + mCharacterBuffer[index++] = character; + character = CharEncoding.readChar(mDictBuffer); + addressPointer += CharEncoding.getCharSize(character); + } + characters = Arrays.copyOfRange(mCharacterBuffer, 0, index); + } else { + final int character = CharEncoding.readChar(mDictBuffer); + addressPointer += CharEncoding.getCharSize(character); + characters = new int[] { character }; + } + final int terminalId; + if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { + terminalId = PtNodeReader.readTerminalId(mDictBuffer); + addressPointer += FormatSpec.PTNODE_TERMINAL_ID_SIZE; + } else { + terminalId = PtNode.NOT_A_TERMINAL; + } + + final int frequency; + if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { + frequency = PtNodeReader.readFrequency(mFrequencyBuffer, terminalId); + } else { + frequency = PtNode.NOT_A_TERMINAL; + } + int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options); + if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { + childrenAddress += addressPointer; + } + addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); + final ArrayList<WeightedString> shortcutTargets; + if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) { + // readShortcut will add shortcuts to shortcutTargets. + shortcutTargets = new ArrayList<WeightedString>(); + addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets); + } else { + shortcutTargets = null; + } + + final ArrayList<PendingAttribute> bigrams; + if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { + bigrams = new ArrayList<PendingAttribute>(); + addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams, + addressPointer); + if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + MakedictLog.d("too many bigrams in a node."); + } + } else { + bigrams = null; + } + return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, frequency, + parentAddress, childrenAddress, shortcutTargets, bigrams); + } + + private void deleteDictFiles() { + final File[] files = mDictDirectory.listFiles(); + for (int i = 0; i < files.length; ++i) { + files[i].delete(); + } + } + + @Override + public FusionDictionary readDictionaryBinary(final FusionDictionary dict, + final boolean deleteDictIfBroken) + throws FileNotFoundException, IOException, UnsupportedFormatException { + if (mDictBuffer == null) { + openDictBuffer(); + } + try { + return BinaryDictDecoderUtils.readDictionaryBinary(this, dict); + } catch (IOException e) { + Log.e(TAG, "The dictionary " + mDictDirectory.getName() + " is broken.", e); + if (deleteDictIfBroken) { + deleteDictFiles(); + } + throw e; + } catch (UnsupportedFormatException e) { + Log.e(TAG, "The dictionary " + mDictDirectory.getName() + " is broken.", e); + if (deleteDictIfBroken) { + deleteDictFiles(); + } + throw e; + } + } + + @Override + public void setPosition(int newPos) { + mDictBuffer.position(newPos); + } + + @Override + public int getPosition() { + return mDictBuffer.position(); + } + + @Override + public int readPtNodeCount() { + return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer); + } + + @Override + public boolean readAndFollowForwardLink() { + final int nextAddress = mDictBuffer.readUnsignedInt24(); + if (nextAddress >= 0 && nextAddress < mDictBuffer.limit()) { + mDictBuffer.position(nextAddress); + return true; + } + return false; + } + + @Override + public boolean hasNextPtNodeArray() { + return mDictBuffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS; + } +} diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java new file mode 100644 index 000000000..4fb89671f --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java @@ -0,0 +1,294 @@ +/* +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Iterator; + +/** + * An implementation of DictEncoder for version 4 binary dictionary. + */ +@UsedForTesting +public class Ver4DictEncoder implements DictEncoder { + private final File mDictPlacedDir; + private byte[] mTrieBuf; + private int mTriePos; + private int mHeaderSize; + private OutputStream mTrieOutStream; + private OutputStream mFreqOutStream; + private OutputStream mTerminalAddressTableOutStream; + + @UsedForTesting + public Ver4DictEncoder(final File dictPlacedDir) { + mDictPlacedDir = dictPlacedDir; + } + + private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions) + throws FileNotFoundException, IOException { + final FileHeader header = new FileHeader(0, dictOptions, formatOptions); + final String filename = header.getId() + "." + header.getVersion(); + final File mDictDir = new File(mDictPlacedDir, filename); + final File trieFile = new File(mDictDir, filename + FormatSpec.TRIE_FILE_EXTENSION); + final File freqFile = new File(mDictDir, filename + FormatSpec.FREQ_FILE_EXTENSION); + final File terminalAddressTableFile = new File(mDictDir, + filename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); + if (!mDictDir.isDirectory()) { + if (mDictDir.exists()) mDictDir.delete(); + mDictDir.mkdirs(); + } + if (!trieFile.exists()) trieFile.createNewFile(); + if (!freqFile.exists()) freqFile.createNewFile(); + if (!terminalAddressTableFile.exists()) terminalAddressTableFile.createNewFile(); + mTrieOutStream = new FileOutputStream(trieFile); + mFreqOutStream = new FileOutputStream(freqFile); + mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile); + } + + private void close() throws IOException { + try { + if (mTrieOutStream != null) { + mTrieOutStream.close(); + } + if (mFreqOutStream != null) { + mFreqOutStream.close(); + } + if (mTerminalAddressTableOutStream != null) { + mTerminalAddressTableOutStream.close(); + } + } finally { + mTrieOutStream = null; + mFreqOutStream = null; + mTerminalAddressTableOutStream = null; + } + } + + @Override + public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) + throws IOException, UnsupportedFormatException { + if (formatOptions.mVersion != FormatSpec.VERSION4) { + throw new UnsupportedFormatException("File header has a wrong version number : " + + formatOptions.mVersion); + } + if (!mDictPlacedDir.isDirectory()) { + throw new UnsupportedFormatException("Given path is not a directory."); + } + + if (mTrieOutStream == null) { + openStreams(formatOptions, dict.mOptions); + } + + mHeaderSize = BinaryDictEncoderUtils.writeDictionaryHeader(mTrieOutStream, dict, + formatOptions); + + MakedictLog.i("Flattening the tree..."); + ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); + int terminalCount = 0; + for (final PtNodeArray array : flatNodes) { + for (final PtNode node : array.mData) { + if (node.isTerminal()) node.mTerminalId = terminalCount++; + } + } + + MakedictLog.i("Computing addresses..."); + BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, formatOptions); + if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); + + writeTerminalData(flatNodes, terminalCount); + + final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); + final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; + mTrieBuf = new byte[bufferSize]; + + MakedictLog.i("Writing file..."); + for (PtNodeArray nodeArray : flatNodes) { + BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, formatOptions); + } + if (MakedictLog.DBG) { + BinaryDictEncoderUtils.showStatistics(flatNodes); + MakedictLog.i("has " + terminalCount + " terminals."); + } + mTrieOutStream.write(mTrieBuf); + + MakedictLog.i("Done"); + close(); + } + + @Override + public void setPosition(int position) { + if (mTrieBuf == null || position < 0 || position >- mTrieBuf.length) return; + mTriePos = position; + } + + @Override + public int getPosition() { + return mTriePos; + } + + @Override + public void writePtNodeCount(int ptNodeCount) { + final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); + // ptNodeCount must fit on one byte or two bytes. + // Please see comments in FormatSpec + if (countSize != 1 && countSize != 2) { + throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize); + } + mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, ptNodeCount, + countSize); + } + + private void writePtNodeFlags(final PtNode ptNode, final int parentAddress, + final FormatOptions formatOptions) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); + mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, + BinaryDictEncoderUtils.makePtNodeFlags(ptNode, mTriePos, childrenPos, + formatOptions), + FormatSpec.PTNODE_FLAGS_SIZE); + } + + private void writeParentPosition(int parentPos, final PtNode ptNode, + final FormatOptions formatOptions) { + if (parentPos != FormatSpec.NO_PARENT_ADDRESS) { + parentPos -= ptNode.mCachedAddressAfterUpdate; + } + mTriePos = BinaryDictEncoderUtils.writeParentAddress(mTrieBuf, mTriePos, parentPos, + formatOptions); + } + + private void writeCharacters(final int[] characters, final boolean hasSeveralChars) { + mTriePos = CharEncoding.writeCharArray(characters, mTrieBuf, mTriePos); + if (hasSeveralChars) { + mTrieBuf[mTriePos++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; + } + } + + private void writeTerminalId(final int terminalId) { + mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, terminalId, + FormatSpec.PTNODE_TERMINAL_ID_SIZE); + } + + private void writeChildrenPosition(PtNode ptNode, FormatOptions formatOptions) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); + if (formatOptions.mSupportsDynamicUpdate) { + mTriePos += BinaryDictEncoderUtils.writeSignedChildrenPosition(mTrieBuf, + mTriePos, childrenPos); + } else { + mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf, + mTriePos, childrenPos); + } + } + + private void writeShortcuts(ArrayList<WeightedString> shortcuts) { + if (null == shortcuts || shortcuts.isEmpty()) return; + + final int indexOfShortcutByteSize = mTriePos; + mTriePos += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; + final Iterator<WeightedString> shortcutIterator = shortcuts.iterator(); + while (shortcutIterator.hasNext()) { + final WeightedString target = shortcutIterator.next(); + final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( + shortcutIterator.hasNext(), + target.mFrequency); + mTrieBuf[mTriePos++] = (byte)shortcutFlags; + final int shortcutShift = CharEncoding.writeString(mTrieBuf, mTriePos, + target.mWord); + mTriePos += shortcutShift; + } + final int shortcutByteSize = mTriePos - indexOfShortcutByteSize; + if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) { + throw new RuntimeException("Shortcut list too large : " + shortcutByteSize); + } + BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, indexOfShortcutByteSize, + shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); + } + + private void writeBigrams(ArrayList<WeightedString> bigrams, FusionDictionary dict) { + if (bigrams == null) return; + + final Iterator<WeightedString> bigramIterator = bigrams.iterator(); + while (bigramIterator.hasNext()) { + final WeightedString bigram = bigramIterator.next(); + final PtNode target = + FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); + final int addressOfBigram = target.mCachedAddressAfterUpdate; + final int unigramFrequencyForThisWord = target.mFrequency; + final int offset = addressOfBigram + - (mTriePos + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), + offset, bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord); + mTrieBuf[mTriePos++] = (byte) bigramFlags; + mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf, + mTriePos, Math.abs(offset)); + } + } + + @Override + public void writeForwardLinkAddress(int forwardLinkAddress) { + mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, + forwardLinkAddress, FormatSpec.FORWARD_LINK_ADDRESS_SIZE); + } + + @Override + public void writePtNode(final PtNode ptNode, final int parentPosition, + final FormatOptions formatOptions, final FusionDictionary dict) { + writePtNodeFlags(ptNode, parentPosition, formatOptions); + writeParentPosition(parentPosition, ptNode, formatOptions); + writeCharacters(ptNode.mChars, ptNode.hasSeveralChars()); + if (ptNode.isTerminal()) { + writeTerminalId(ptNode.mTerminalId); + } + writeChildrenPosition(ptNode, formatOptions); + writeShortcuts(ptNode.mShortcutTargets); + writeBigrams(ptNode.mBigrams, dict); + } + + private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes, + final int terminalCount) throws IOException { + final byte[] freqBuf = new byte[terminalCount * FormatSpec.FREQUENCY_AND_FLAGS_SIZE]; + final byte[] terminalAddressTableBuf = + new byte[terminalCount * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE]; + for (final PtNodeArray nodeArray : flatNodes) { + for (final PtNode ptNode : nodeArray.mData) { + if (ptNode.isTerminal()) { + BinaryDictEncoderUtils.writeUIntToBuffer(freqBuf, + ptNode.mTerminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE, + ptNode.mFrequency, FormatSpec.FREQUENCY_AND_FLAGS_SIZE); + BinaryDictEncoderUtils.writeUIntToBuffer(terminalAddressTableBuf, + ptNode.mTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, + ptNode.mCachedAddressAfterUpdate + mHeaderSize, + FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); + } + } + } + mFreqOutStream.write(freqBuf); + mTerminalAddressTableOutStream.write(terminalAddressTableBuf); + } +} |