diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict')
15 files changed, 549 insertions, 1569 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java index f8fa68f45..fda97dafc 100644 --- a/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java @@ -32,35 +32,36 @@ import java.util.TreeMap; * A base class of the binary dictionary decoder. */ public abstract class AbstractDictDecoder implements DictDecoder { - private static final int SUCCESS = 0; - private static final int ERROR_CANNOT_READ = 1; - private static final int ERROR_WRONG_FORMAT = 2; - - protected FileHeader readHeader(final DictBuffer headerBuffer) + protected FileHeader readHeader(final DictBuffer dictBuffer) throws IOException, UnsupportedFormatException { - if (headerBuffer == null) { + if (dictBuffer == null) { openDictBuffer(); } - final int version = HeaderReader.readVersion(headerBuffer); + final int version = HeaderReader.readVersion(dictBuffer); if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) { throw new UnsupportedFormatException("Unsupported version : " + version); } // TODO: Remove this field. - final int optionsFlags = HeaderReader.readOptionFlags(headerBuffer); - final int headerSize = HeaderReader.readHeaderSize(headerBuffer); + final int optionsFlags = HeaderReader.readOptionFlags(dictBuffer); + + final int headerSize = HeaderReader.readHeaderSize(dictBuffer); + if (headerSize < 0) { throw new UnsupportedFormatException("header size can't be negative."); } - final HashMap<String, String> attributes = HeaderReader.readAttributes(headerBuffer, + final HashMap<String, String> attributes = HeaderReader.readAttributes(dictBuffer, headerSize); final FileHeader header = new FileHeader(headerSize, - new FusionDictionary.DictionaryOptions(attributes), - new FormatOptions(version, - 0 != (optionsFlags & FormatSpec.CONTAINS_TIMESTAMP_FLAG))); + new FusionDictionary.DictionaryOptions(attributes, + 0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG), + 0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)), + new FormatOptions(version, + 0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE), + 0 != (optionsFlags & FormatSpec.CONTAINS_TIMESTAMP_FLAG))); return header; } @@ -203,25 +204,4 @@ public abstract class AbstractDictDecoder implements DictDecoder { return readLength; } } - - /** - * Check whether the header contains the expected information. This is a no-error method, - * that will return an error code and never throw a checked exception. - * @return an error code, either ERROR_* or SUCCESS. - */ - private int checkHeader() { - try { - readHeader(); - } catch (IOException e) { - return ERROR_CANNOT_READ; - } catch (UnsupportedFormatException e) { - return ERROR_WRONG_FORMAT; - } - return SUCCESS; - } - - @Override - public boolean hasValidRawBinaryDictionary() { - return checkHeader() == SUCCESS; - } } diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java index 7f0aa777f..216492b4d 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java @@ -24,9 +24,12 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.Map; import java.util.TreeMap; @@ -166,14 +169,6 @@ public final class BinaryDictDecoderUtils { return size; } - static int getCharArraySize(final int[] chars, final int start, final int end) { - int size = 0; - for (int i = start; i < end; ++i) { - size += getCharSize(chars[i]); - } - return size; - } - /** * Writes a char array to a byte buffer. * @@ -205,7 +200,8 @@ public final class BinaryDictDecoderUtils { * @param word the string to write. * @return the size written, in bytes. */ - static int writeString(final byte[] buffer, final int origin, final String word) { + static int writeString(final byte[] buffer, final int origin, + final String word) { final int length = word.length(); int index = origin; for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { @@ -227,62 +223,22 @@ public final class BinaryDictDecoderUtils { * * This will also write the terminator byte. * - * @param stream the OutputStream to write to. + * @param buffer the OutputStream to write to. * @param word the string to write. - * @return the size written, in bytes. */ - static int writeString(final OutputStream stream, final String word) throws IOException { + static void writeString(final OutputStream buffer, final String word) throws IOException { final int length = word.length(); - int written = 0; for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { final int codePoint = word.codePointAt(i); - final int charSize = getCharSize(codePoint); - if (1 == charSize) { - stream.write((byte) codePoint); - } else { - stream.write((byte) (0xFF & (codePoint >> 16))); - stream.write((byte) (0xFF & (codePoint >> 8))); - stream.write((byte) (0xFF & codePoint)); - } - written += charSize; - } - stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR); - written += FormatSpec.PTNODE_TERMINATOR_SIZE; - return written; - } - - /** - * Writes an array of code points with our character format to an OutputStream. - * - * This will also write the terminator byte. - * - * @param stream the OutputStream to write to. - * @param codePoints the array of code points - * @return the size written, in bytes. - */ - // TODO: Merge this method with writeCharArray and rename the various write* methods to - // make the difference clear. - static int writeCodePoints(final OutputStream stream, final int[] codePoints, - final int startIndex, final int endIndex) - throws IOException { - int written = 0; - for (int i = startIndex; i < endIndex; ++i) { - final int codePoint = codePoints[i]; - final int charSize = getCharSize(codePoint); - if (1 == charSize) { - stream.write((byte) codePoint); + if (1 == getCharSize(codePoint)) { + buffer.write((byte) codePoint); } else { - stream.write((byte) (0xFF & (codePoint >> 16))); - stream.write((byte) (0xFF & (codePoint >> 8))); - stream.write((byte) (0xFF & codePoint)); + buffer.write((byte) (0xFF & (codePoint >> 16))); + buffer.write((byte) (0xFF & (codePoint >> 8))); + buffer.write((byte) (0xFF & codePoint)); } - written += charSize; } - if (endIndex - startIndex > 1) { - stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR); - written += FormatSpec.PTNODE_TERMINATOR_SIZE; - } - return written; + buffer.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR); } /** @@ -330,7 +286,7 @@ public final class BinaryDictDecoderUtils { static int readChildrenAddress(final DictBuffer dictBuffer, final int optionFlags, final FormatOptions options) { - if (options.supportsDynamicUpdate()) { + if (options.mSupportsDynamicUpdate) { final int address = dictBuffer.readUnsignedInt24(); if (address == 0) return FormatSpec.NO_CHILDREN_ADDRESS; if ((address & FormatSpec.MSB24) != 0) { @@ -540,11 +496,11 @@ public final class BinaryDictDecoderUtils { } // reach the end of the array. - if (options.supportsDynamicUpdate()) { + if (options.mSupportsDynamicUpdate) { final boolean hasValidForwardLink = dictDecoder.readAndFollowForwardLink(); if (!hasValidForwardLink) break; } - } while (options.supportsDynamicUpdate() && dictDecoder.hasNextPtNodeArray()); + } while (options.mSupportsDynamicUpdate && dictDecoder.hasNextPtNodeArray()); final PtNodeArray nodeArray = new PtNodeArray(nodeArrayContents); nodeArray.mCachedAddressBeforeUpdate = nodeArrayOriginPos; @@ -600,7 +556,7 @@ public final class BinaryDictDecoderUtils { Map<Integer, PtNodeArray> reverseNodeArrayMapping = new TreeMap<Integer, PtNodeArray>(); Map<Integer, PtNode> reversePtNodeMapping = new TreeMap<Integer, PtNode>(); - final PtNodeArray root = readNodeArray(dictDecoder, fileHeader.mBodyOffset, + final PtNodeArray root = readNodeArray(dictDecoder, fileHeader.mHeaderSize, reverseNodeArrayMapping, reversePtNodeMapping, fileHeader.mFormatOptions); FusionDictionary newDict = new FusionDictionary(root, fileHeader.mDictionaryOptions); @@ -636,10 +592,32 @@ public final class BinaryDictDecoderUtils { /** * Basic test to find out whether the file is a binary dictionary or not. * + * Concretely this only tests the magic number. + * * @param file The file to test. * @return true if it's a binary dictionary, false otherwise */ public static boolean isBinaryDictionary(final File file) { - return FormatSpec.getDictDecoder(file).hasValidRawBinaryDictionary(); + FileInputStream inStream = null; + try { + inStream = new FileInputStream(file); + final ByteBuffer buffer = inStream.getChannel().map( + FileChannel.MapMode.READ_ONLY, 0, file.length()); + final int version = getFormatVersion(new ByteBufferDictBuffer(buffer)); + return (version >= FormatSpec.MINIMUM_SUPPORTED_VERSION + && version <= FormatSpec.MAXIMUM_SUPPORTED_VERSION); + } catch (FileNotFoundException e) { + return false; + } catch (IOException e) { + return false; + } finally { + if (inStream != null) { + try { + inStream.close(); + } catch (IOException e) { + // do nothing + } + } + } } } diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java index 8ba0797de..f761829de 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java @@ -17,9 +17,9 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; -import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; +import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; @@ -160,7 +160,7 @@ public class BinaryDictEncoderUtils { node.mCachedSize = nodeSize; size += nodeSize; } - if (options.supportsDynamicUpdate()) { + if (options.mSupportsDynamicUpdate) { size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE; } ptNodeArray.mCachedSize = size; @@ -245,26 +245,6 @@ public class BinaryDictEncoderUtils { } } - static void writeUIntToDictBuffer(final DictBuffer dictBuffer, final int value, - final int size) { - switch(size) { - case 4: - dictBuffer.put((byte) ((value >> 24) & 0xFF)); - /* fall through */ - case 3: - dictBuffer.put((byte) ((value >> 16) & 0xFF)); - /* fall through */ - case 2: - dictBuffer.put((byte) ((value >> 8) & 0xFF)); - /* fall through */ - case 1: - dictBuffer.put((byte) (value & 0xFF)); - break; - default: - /* nop */ - } - } - // End utility methods // This method is responsible for finding a nice ordering of the nodes that favors run-time @@ -397,7 +377,7 @@ public class BinaryDictEncoderUtils { nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; } } - if (formatOptions.supportsDynamicUpdate()) { + if (formatOptions.mSupportsDynamicUpdate) { nodeSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; } else if (null != ptNode.mChildren) { nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, @@ -417,7 +397,7 @@ public class BinaryDictEncoderUtils { ptNode.mCachedSize = nodeSize; size += nodeSize; } - if (formatOptions.supportsDynamicUpdate()) { + if (formatOptions.mSupportsDynamicUpdate) { size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE; } if (ptNodeArray.mCachedSize != size) { @@ -533,7 +513,7 @@ public class BinaryDictEncoderUtils { if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug"); } while (changesDone); - if (formatOptions.supportsDynamicUpdate()) { + if (formatOptions.mSupportsDynamicUpdate) { computeParentAddresses(flatNodes); } final PtNodeArray lastPtNodeArray = flatNodes.get(flatNodes.size() - 1); @@ -642,7 +622,7 @@ public class BinaryDictEncoderUtils { byte flags = 0; if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS; if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL; - if (formatOptions.supportsDynamicUpdate()) { + if (formatOptions.mSupportsDynamicUpdate) { flags |= FormatSpec.FLAG_IS_NOT_MOVED; } else if (true) { switch (childrenAddressSize) { @@ -710,13 +690,6 @@ public class BinaryDictEncoderUtils { + word + " is " + unigramFrequency); bigramFrequency = unigramFrequency; } - bigramFlags += getBigramFrequencyDiff(unigramFrequency, bigramFrequency) - & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY; - return bigramFlags; - } - - public static int getBigramFrequencyDiff(final int unigramFrequency, - final int bigramFrequency) { // We compute the difference between 255 (which means probability = 1) and the // unigram score. We split this into a number of discrete steps. // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15 @@ -750,15 +723,22 @@ public class BinaryDictEncoderUtils { // include this bigram in the dictionary. For now, register as 0, and live with the // small over-estimation that we get in this case. TODO: actually remove this bigram // if discretizedFrequency < 0. - return discretizedFrequency > 0 ? discretizedFrequency : 0; + final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0; + bigramFlags += finalBigramFrequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY; + return bigramFlags; } /** - * Makes the 2-byte value for options flags. Unused at the moment, and always 0. + * Makes the 2-byte value for options flags. */ - private static final int makeOptionsValue(final FormatOptions formatOptions) { - // TODO: why doesn't this handle CONTAINS_TIMESTAMP_FLAG? - return 0; + private static final int makeOptionsValue(final FusionDictionary dictionary, + final FormatOptions formatOptions) { + final DictionaryOptions options = dictionary.mOptions; + final boolean hasBigrams = dictionary.hasBigrams(); + return (options.mFrenchLigatureProcessing ? FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG : 0) + + (options.mGermanUmlautProcessing ? FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG : 0) + + (hasBigrams ? FormatSpec.CONTAINS_BIGRAMS_FLAG : 0) + + (formatOptions.mSupportsDynamicUpdate ? FormatSpec.SUPPORTS_DYNAMIC_UPDATE : 0); } /** @@ -846,7 +826,7 @@ public class BinaryDictEncoderUtils { } dictEncoder.writePtNode(ptNode, parentPosition, formatOptions, dict); } - if (formatOptions.supportsDynamicUpdate()) { + if (formatOptions.mSupportsDynamicUpdate) { dictEncoder.writeForwardLinkAddress(FormatSpec.NO_FORWARD_LINK_ADDRESS); } if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate @@ -947,7 +927,7 @@ public class BinaryDictEncoderUtils { headerBuffer.write((byte) (0xFF & version)); // Options flags - final int options = makeOptionsValue(formatOptions); + final int options = makeOptionsValue(dict, formatOptions); headerBuffer.write((byte) (0xFF & (options >> 8))); headerBuffer.write((byte) (0xFF & options)); final int headerSizeOffset = headerBuffer.size(); diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java index 640d778bb..d5516ef46 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java @@ -62,7 +62,7 @@ public final class BinaryDictIOUtils { * Retrieves all node arrays without recursive call. */ private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder, - final int bodyOffset, final Map<Integer, String> words, + final int headerSize, final Map<Integer, String> words, final Map<Integer, Integer> frequencies, final Map<Integer, ArrayList<PendingAttribute>> bigrams, final FormatOptions formatOptions) { @@ -71,7 +71,7 @@ public final class BinaryDictIOUtils { Stack<Position> stack = new Stack<Position>(); int index = 0; - Position initPos = new Position(bodyOffset, 0); + Position initPos = new Position(headerSize, 0); stack.push(initPos); while (!stack.empty()) { @@ -112,7 +112,7 @@ public final class BinaryDictIOUtils { } if (p.mPosition == p.mNumOfPtNode) { - if (formatOptions.supportsDynamicUpdate()) { + if (formatOptions.mSupportsDynamicUpdate) { final boolean hasValidForwardLinkAddress = dictDecoder.readAndFollowForwardLink(); if (hasValidForwardLinkAddress && dictDecoder.hasNextPtNodeArray()) { @@ -154,7 +154,7 @@ public final class BinaryDictIOUtils { UnsupportedFormatException { // Read header final FileHeader header = dictDecoder.readHeader(); - readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words, + readUnigramsAndBigramsBinaryInner(dictDecoder, header.mHeaderSize, words, frequencies, bigrams, header.mFormatOptions); } @@ -228,7 +228,7 @@ public final class BinaryDictIOUtils { // a forward link address that we need to consult and possibly resume // search on the next node array in the linked list. if (foundNextPtNode) break; - if (!header.mFormatOptions.supportsDynamicUpdate()) { + if (!header.mFormatOptions.mSupportsDynamicUpdate) { return FormatSpec.NOT_VALID_WORD; } @@ -245,7 +245,8 @@ public final class BinaryDictIOUtils { /** * @return the size written, in bytes. Always 3 bytes. */ - static int writeSInt24ToBuffer(final DictBuffer dictBuffer, final int value) { + static int writeSInt24ToBuffer(final DictBuffer dictBuffer, + final int value) { final int absValue = Math.abs(value); dictBuffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); dictBuffer.put((byte)((absValue >> 8) & 0xFF)); @@ -300,6 +301,35 @@ public final class BinaryDictIOUtils { } /** + * Write a string to a stream. + * + * @param destination the stream to write. + * @param word the string to be written. + * @return the size written, in bytes. + * @throws IOException + */ + private static int writeString(final OutputStream destination, final String word) + throws IOException { + int size = 0; + final int length = word.length(); + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + final int codePoint = word.codePointAt(i); + if (CharEncoding.getCharSize(codePoint) == 1) { + destination.write((byte)codePoint); + size++; + } else { + destination.write((byte)(0xFF & (codePoint >> 16))); + destination.write((byte)(0xFF & (codePoint >> 8))); + destination.write((byte)(0xFF & codePoint)); + size += 3; + } + } + destination.write((byte)FormatSpec.PTNODE_CHARACTERS_TERMINATOR); + size += FormatSpec.PTNODE_TERMINATOR_SIZE; + return size; + } + + /** * Write a PtNode to an output stream from a PtNodeInfo. * A PtNode is an in-memory representation of a node in the patricia trie. * A PtNode info is a container for low-level information about how the @@ -357,7 +387,7 @@ public final class BinaryDictIOUtils { destination.write((byte)BinaryDictEncoderUtils.makeShortcutFlags( shortcutIterator.hasNext(), target.mFrequency)); size++; - size += CharEncoding.writeString(destination, target.mWord); + size += writeString(destination, target.mWord); } } @@ -415,27 +445,6 @@ public final class BinaryDictIOUtils { } /** - * Writes a PtNodeCount to the stream. - * - * @param destination the stream to write. - * @param ptNodeCount the count. - * @return the size written in bytes. - */ - static int writePtNodeCount(final OutputStream destination, final int ptNodeCount) - throws IOException { - final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); - // the count must fit on one byte or two bytes. - // Please see comments in FormatSpec. - if (countSize != 1 && countSize != 2) { - throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize); - } - final int encodedPtNodeCount = (countSize == 2) ? - (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; - BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize); - return countSize; - } - - /** * Write a node array to the stream. * * @param destination the stream to write. @@ -445,7 +454,20 @@ public final class BinaryDictIOUtils { */ static int writeNodes(final OutputStream destination, final PtNodeInfo[] infos) throws IOException { - int size = writePtNodeCount(destination, infos.length); + int size = getPtNodeCountSize(infos.length); + switch (getPtNodeCountSize(infos.length)) { + case 1: + destination.write((byte)infos.length); + break; + case 2: + final int encodedPtNodeCount = + infos.length | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG; + destination.write((byte)(encodedPtNodeCount >> 8)); + destination.write((byte)(encodedPtNodeCount & 0xFF)); + break; + default: + throw new RuntimeException("Invalid node count size."); + } for (final PtNodeInfo info : infos) size += writePtNode(destination, info); writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS); return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; @@ -507,7 +529,7 @@ public final class BinaryDictIOUtils { * Helper method to check whether the node is moved. */ public static boolean isMovedPtNode(final int flags, final FormatOptions options) { - return options.supportsDynamicUpdate() + return options.mSupportsDynamicUpdate && ((flags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) == FormatSpec.FLAG_IS_MOVED); } @@ -516,14 +538,14 @@ public final class BinaryDictIOUtils { */ public static boolean supportsDynamicUpdate(final FormatOptions options) { return options.mVersion >= FormatSpec.FIRST_VERSION_WITH_DYNAMIC_UPDATE - && options.supportsDynamicUpdate(); + && options.mSupportsDynamicUpdate; } /** * Helper method to check whether the node is deleted. */ public static boolean isDeletedPtNode(final int flags, final FormatOptions formatOptions) { - return formatOptions.supportsDynamicUpdate() + return formatOptions.mSupportsDynamicUpdate && ((flags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) == FormatSpec.FLAG_IS_DELETED); } @@ -546,7 +568,7 @@ public final class BinaryDictIOUtils { static int getChildrenAddressSize(final int optionFlags, final FormatOptions formatOptions) { - if (formatOptions.supportsDynamicUpdate()) return FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; + if (formatOptions.mSupportsDynamicUpdate) return FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: return 1; diff --git a/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java index b4838f00f..3dbeee099 100644 --- a/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java @@ -35,7 +35,6 @@ import java.util.TreeMap; /** * An interface of binary dictionary decoders. */ -// TODO: Straighten out responsibility for the buffer's file pointer. public interface DictDecoder { /** @@ -44,7 +43,7 @@ public interface DictDecoder { public FileHeader readHeader() throws IOException, UnsupportedFormatException; /** - * Reads PtNode from ptNodePos. + * Reads PtNode from nodeAddress. * @param ptNodePos the position of PtNode. * @param formatOptions the format options. * @return PtNodeInfo. @@ -128,8 +127,7 @@ public interface DictDecoder { * Opens the dictionary file and makes DictBuffer. */ @UsedForTesting - public void openDictBuffer() throws FileNotFoundException, IOException, - UnsupportedFormatException; + public void openDictBuffer() throws FileNotFoundException, IOException; @UsedForTesting public boolean isDictBufferOpen(); @@ -230,9 +228,4 @@ public interface DictDecoder { } public void skipPtNode(final FormatOptions formatOptions); - - /** - * @return whether this decoder has a valid binary dictionary that it can decode. - */ - public boolean hasValidRawBinaryDictionary(); } diff --git a/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java index ff03190a3..28da9ffdd 100644 --- a/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java @@ -37,7 +37,7 @@ import java.util.Arrays; @UsedForTesting public final class DynamicBinaryDictIOUtils { private static final boolean DBG = false; - static final int MAX_JUMPS = 10000; + private static final int MAX_JUMPS = 10000; private DynamicBinaryDictIOUtils() { // This utility class is not publicly instantiable. @@ -61,7 +61,7 @@ public final class DynamicBinaryDictIOUtils { final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); final int originalPosition = dictBuffer.position(); dictBuffer.position(ptNodeOriginAddress); - if (!formatOptions.supportsDynamicUpdate()) { + if (!formatOptions.mSupportsDynamicUpdate) { throw new RuntimeException("this file format does not support parent addresses"); } final int flags = dictBuffer.readUnsignedByte(); @@ -102,7 +102,7 @@ public final class DynamicBinaryDictIOUtils { } if (!dictUpdater.readAndFollowForwardLink()) break; if (dictUpdater.getPosition() == FormatSpec.NO_FORWARD_LINK_ADDRESS) break; - } while (formatOptions.supportsDynamicUpdate()); + } while (formatOptions.mSupportsDynamicUpdate); dictUpdater.setPosition(originalPosition); } diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index 20ddba836..b56234f6d 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -40,8 +40,12 @@ public final class FormatSpec { * p | not used 3 bits * t | each unigram and bigram entry has a time stamp? * i | 1 bit, 1 = yes, 0 = no : CONTAINS_TIMESTAMP_FLAG - * o | - * nflags + * o | has bigrams ? 1 bit, 1 = yes, 0 = no : CONTAINS_BIGRAMS_FLAG + * n | FRENCH_LIGATURE_PROCESSING_FLAG + * f | supports dynamic updates ? 1 bit, 1 = yes, 0 = no : SUPPORTS_DYNAMIC_UPDATE + * l | GERMAN_UMLAUT_PROCESSING_FLAG + * a | + * gs * * h | * e | size of the file header, 4bytes @@ -78,36 +82,45 @@ public final class FormatSpec { * s * * f | - * o | forward link address, 3byte - * r | 1 byte = bbbbbbbb match - * w | case 1xxxxxxx => -((xxxxxxx << 16) + (next byte << 8) + next byte) - * a | otherwise => (xxxxxxx << 16) + (next byte << 8) + next byte - * r | - * dlinkaddress + * o | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header) + * r | forward link address, 3byte + * w | 1 byte = bbbbbbbb match + * a | case 1xxxxxxx => -((xxxxxxx << 16) + (next byte << 8) + next byte) + * r | otherwise => (xxxxxxx << 16) + (next byte << 8) + next byte + * d | + * linkaddress */ /* Node (FusionDictionary.PtNode) layout is as follows: - * | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED - * | This must be the same as FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES - * | 01 = yes : FLAG_IS_MOVED - * f | the new address is stored in the same place as the parent address - * l | is deleted? 10 = yes : FLAG_IS_DELETED - * a | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS - * g | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL - * s | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS + * | IF !SUPPORTS_DYNAMIC_UPDATE + * | addressType xx : mask with MASK_CHILDREN_ADDRESS_TYPE + * | 2 bits, 00 = no children : FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS + * f | 01 = 1 byte : FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE + * l | 10 = 2 bytes : FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES + * a | 11 = 3 bytes : FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES + * g | ELSE + * s | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED + * | This must be the same as FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES + * | 01 = yes : FLAG_IS_MOVED + * | the new address is stored in the same place as the parent address + * | is deleted? 10 = yes : FLAG_IS_DELETED + * | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS + * | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL + * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS * | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD * | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED * * p | - * a | parent address, 3byte - * r | 1 byte = bbbbbbbb match - * e | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte) - * n | otherwise => (bbbbbbbb << 16) + (next byte << 8) + next byte - * t | This address is relative to the head of the PtNode. - * a | If the node doesn't have a parent, this field is set to 0. + * a | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header) + * r | parent address, 3byte + * e | 1 byte = bbbbbbbb match + * n | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte) + * t | otherwise => (bbbbbbbb << 16) + (next byte << 8) + next byte + * a | This address is relative to the head of the PtNode. + * d | If the node doesn't have a parent, this field is set to 0. * d | - * dress + * ress * * c | IF FLAG_HAS_MULTIPLE_CHARS * h | char, char, char, char n * (1 or 3 bytes) : use PtNodeInfo for i/o helpers @@ -121,16 +134,23 @@ public final class FormatSpec { * e | frequency 1 byte * q | * - * c | - * h | children address, 3 bytes - * i | 1 byte = bbbbbbbb match - * l | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte) - * d | otherwise => (bbbbbbbb<<16) + (next byte << 8) + next byte - * r | if this node doesn't have children, this field is set to 0. - * e | (see BinaryDictEncoderUtils#writeVariableSignedAddress) - * n | This address is relative to the position of this field. - * a | - * ddress + * c | IF SUPPORTS_DYNAMIC_UPDATE + * h | children address, 3 bytes + * i | 1 byte = bbbbbbbb match + * l | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte) + * d | otherwise => (bbbbbbbb<<16) + (next byte << 8) + next byte + * r | if this node doesn't have children, this field is set to 0. + * e | (see BinaryDictEncoderUtils#writeVariableSignedAddress) + * n | ELSIF 00 = FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS == addressType + * a | // nothing + * d | ELSIF 01 = FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE == addressType + * d | children address, 1 byte + * r | ELSIF 10 = FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES == addressType + * e | children address, 2 bytes + * s | ELSE // 11 = FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES = addressType + * s | children address, 3 bytes + * | END + * | This address is relative to the position of this field. * * | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS * | shortcut string list @@ -179,22 +199,20 @@ public final class FormatSpec { */ public static final int MAGIC_NUMBER = 0x9BC13AFE; + static final int MINIMUM_SUPPORTED_VERSION = 2; + static final int MAXIMUM_SUPPORTED_VERSION = 4; static final int NOT_A_VERSION_NUMBER = -1; static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3; static final int FIRST_VERSION_WITH_TERMINAL_ID = 4; - - // These MUST have the same values as the relevant constants in format_utils.h. - // From version 4 on, we use version * 100 + revision as a version number. That allows - // us to change the format during development while having testing devices remove - // older files with each upgrade, while still having a readable versioning scheme. - public static final int VERSION2 = 2; - public static final int VERSION3 = 3; - public static final int VERSION4 = 400; - static final int MINIMUM_SUPPORTED_VERSION = VERSION2; - static final int MAXIMUM_SUPPORTED_VERSION = VERSION4; + static final int VERSION3 = 3; + static final int VERSION4 = 4; // These options need to be the same numeric values as the one in the native reading code. + static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; // TODO: Make the native reading code read this variable. + static final int SUPPORTS_DYNAMIC_UPDATE = 0x2; + static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; + static final int CONTAINS_BIGRAMS_FLAG = 0x8; static final int CONTAINS_TIMESTAMP_FLAG = 0x10; // TODO: Make this value adaptative to content data, store it in the header, and @@ -245,10 +263,8 @@ public final class FormatSpec { static final int PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE = 3; static final int PTNODE_SHORTCUT_LIST_SIZE_SIZE = 2; - // These values are used only by version 4 or later. They MUST match the definitions in - // ver4_dict_constants.cpp. + // These values are used only by version 4 or later. static final String TRIE_FILE_EXTENSION = ".trie"; - public static final String HEADER_FILE_EXTENSION = ".header"; static final String FREQ_FILE_EXTENSION = ".freq"; static final String UNIGRAM_TIMESTAMP_FILE_EXTENSION = ".timestamp"; // tat = Terminal Address Table @@ -262,9 +278,9 @@ public final class FormatSpec { static final int UNIGRAM_TIMESTAMP_SIZE = 4; // With the English main dictionary as of October 2013, the size of bigram address table is - // is 345KB with the block size being 16. - // This is 54% of that of full address table. - static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16; + // is 584KB with the block size being 4. + // This is 91% of that of full address table. + static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4; static final int BIGRAM_CONTENT_COUNT = 2; static final int BIGRAM_FREQ_CONTENT_INDEX = 0; static final int BIGRAM_TIMESTAMP_CONTENT_INDEX = 1; @@ -277,7 +293,7 @@ public final class FormatSpec { static final int SHORTCUT_CONTENT_COUNT = 1; static final int SHORTCUT_CONTENT_INDEX = 0; // With the English main dictionary as of October 2013, the size of shortcut address table is - // 26KB with the block size being 64. + // 29KB with the block size being 64. // This is only 4.4% of that of full address table. static final int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; static final String SHORTCUT_CONTENT_ID = "_shortcut"; @@ -315,36 +331,43 @@ public final class FormatSpec { */ public static final class FormatOptions { public final int mVersion; + public final boolean mSupportsDynamicUpdate; public final boolean mHasTerminalId; public final boolean mHasTimestamp; - @UsedForTesting public FormatOptions(final int version) { - this(version, false /* hasTimestamp */); + this(version, false); } - public FormatOptions(final int version, final boolean hasTimestamp) { + @UsedForTesting + public FormatOptions(final int version, final boolean supportsDynamicUpdate) { + this(version, supportsDynamicUpdate, false /* hasTimestamp */); + } + + public FormatOptions(final int version, final boolean supportsDynamicUpdate, + final boolean hasTimestamp) { mVersion = version; + if (version < FIRST_VERSION_WITH_DYNAMIC_UPDATE && supportsDynamicUpdate) { + throw new RuntimeException("Dynamic updates are only supported with versions " + + FIRST_VERSION_WITH_DYNAMIC_UPDATE + " and ulterior."); + } + mSupportsDynamicUpdate = supportsDynamicUpdate; mHasTerminalId = (version >= FIRST_VERSION_WITH_TERMINAL_ID); mHasTimestamp = hasTimestamp; } - - public boolean supportsDynamicUpdate() { - return mVersion >= FIRST_VERSION_WITH_DYNAMIC_UPDATE; - } } /** * Class representing file header. */ public static final class FileHeader { - public final int mBodyOffset; + public final int mHeaderSize; public final DictionaryOptions mDictionaryOptions; public final FormatOptions mFormatOptions; // Note that these are corresponding definitions in native code in latinime::HeaderPolicy // and latinime::HeaderReadWriteUtils. + public static final String SUPPORTS_DYNAMIC_UPDATE_ATTRIBUTE = "SUPPORTS_DYNAMIC_UPDATE"; public static final String USES_FORGETTING_CURVE_ATTRIBUTE = "USES_FORGETTING_CURVE"; - public static final String HAS_HISTORICAL_INFO_ATTRIBUTE = "HAS_HISTORICAL_INFO"; public static final String ATTRIBUTE_VALUE_TRUE = "1"; public static final String DICTIONARY_VERSION_ATTRIBUTE = "version"; @@ -353,18 +376,9 @@ public final class FormatSpec { private static final String DICTIONARY_DESCRIPTION_ATTRIBUTE = "description"; public FileHeader(final int headerSize, final DictionaryOptions dictionaryOptions, final FormatOptions formatOptions) { + mHeaderSize = headerSize; mDictionaryOptions = dictionaryOptions; mFormatOptions = formatOptions; - mBodyOffset = formatOptions.mVersion < VERSION4 ? headerSize : 0; - if (null == getLocaleString()) { - throw new RuntimeException("Cannot create a FileHeader without a locale"); - } - if (null == getVersion()) { - throw new RuntimeException("Cannot create a FileHeader without a version"); - } - if (null == getId()) { - throw new RuntimeException("Cannot create a FileHeader without an ID"); - } } // Helper method to get the locale as a String diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index fdf2ae7b5..3bb218bea 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -303,9 +303,14 @@ public final class FusionDictionary implements Iterable<Word> { * Options global to the dictionary. */ public static final class DictionaryOptions { + public final boolean mGermanUmlautProcessing; + public final boolean mFrenchLigatureProcessing; public final HashMap<String, String> mAttributes; - public DictionaryOptions(final HashMap<String, String> attributes) { + public DictionaryOptions(final HashMap<String, String> attributes, + final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) { mAttributes = attributes; + mGermanUmlautProcessing = germanUmlautProcessing; + mFrenchLigatureProcessing = frenchLigatureProcessing; } @Override public String toString() { // Convenience method @@ -334,6 +339,14 @@ public final class FusionDictionary implements Iterable<Word> { } s.append("\n"); } + if (mGermanUmlautProcessing) { + s.append(indent); + s.append("Needs German umlaut processing\n"); + } + if (mFrenchLigatureProcessing) { + s.append(indent); + s.append("Needs French ligature processing\n"); + } return s.toString(); } } @@ -688,6 +701,138 @@ public final class FusionDictionary implements Iterable<Word> { } /** + * Recursively count the number of nodes in a given branch of the trie. + * + * @param nodeArray the node array to count. + * @return the number of nodes in this branch. + */ + public static int countNodeArrays(final PtNodeArray nodeArray) { + int size = 1; + for (int i = nodeArray.mData.size() - 1; i >= 0; --i) { + PtNode ptNode = nodeArray.mData.get(i); + if (null != ptNode.mChildren) + size += countNodeArrays(ptNode.mChildren); + } + return size; + } + + // Recursively find out whether there are any bigrams. + // This can be pretty expensive especially if there aren't any (we return as soon + // as we find one, so it's much cheaper if there are bigrams) + private static boolean hasBigramsInternal(final PtNodeArray nodeArray) { + if (null == nodeArray) return false; + for (int i = nodeArray.mData.size() - 1; i >= 0; --i) { + PtNode ptNode = nodeArray.mData.get(i); + if (null != ptNode.mBigrams) return true; + if (hasBigramsInternal(ptNode.mChildren)) return true; + } + return false; + } + + /** + * Finds out whether there are any bigrams in this dictionary. + * + * @return true if there is any bigram, false otherwise. + */ + // TODO: this is expensive especially for large dictionaries without any bigram. + // The up side is, this is always accurate and correct and uses no memory. We should + // find a more efficient way of doing this, without compromising too much on memory + // and ease of use. + public boolean hasBigrams() { + return hasBigramsInternal(mRootNodeArray); + } + + // Historically, the tails of the words were going to be merged to save space. + // However, that would prevent the code to search for a specific address in log(n) + // time so this was abandoned. + // The code is still of interest as it does add some compression to any dictionary + // that has no need for attributes. Implementations that does not read attributes should be + // able to read a dictionary with merged tails. + // Also, the following code does support frequencies, as in, it will only merges + // tails that share the same frequency. Though it would result in the above loss of + // performance while searching by address, it is still technically possible to merge + // tails that contain attributes, but this code does not take that into account - it does + // not compare attributes and will merge terminals with different attributes regardless. + public void mergeTails() { + MakedictLog.i("Do not merge tails"); + return; + +// MakedictLog.i("Merging PtNodes. Number of PtNodes : " + countPtNodes(root)); +// MakedictLog.i("Number of PtNodes : " + countPtNodes(root)); +// +// final HashMap<String, ArrayList<PtNodeArray>> repository = +// new HashMap<String, ArrayList<PtNodeArray>>(); +// mergeTailsInner(repository, root); +// +// MakedictLog.i("Number of different pseudohashes : " + repository.size()); +// int size = 0; +// for (ArrayList<PtNodeArray> a : repository.values()) { +// size += a.size(); +// } +// MakedictLog.i("Number of nodes after merge : " + (1 + size)); +// MakedictLog.i("Recursively seen nodes : " + countNodes(root)); + } + + // The following methods are used by the deactivated mergeTails() +// private static boolean isEqual(PtNodeArray a, PtNodeArray b) { +// if (null == a && null == b) return true; +// if (null == a || null == b) return false; +// if (a.data.size() != b.data.size()) return false; +// final int size = a.data.size(); +// for (int i = size - 1; i >= 0; --i) { +// PtNode aPtNode = a.data.get(i); +// PtNode bPtNode = b.data.get(i); +// if (aPtNode.frequency != bPtNode.frequency) return false; +// if (aPtNode.alternates == null && bPtNode.alternates != null) return false; +// if (aPtNode.alternates != null && !aPtNode.equals(bPtNode.alternates)) return false; +// if (!Arrays.equals(aPtNode.chars, bPtNode.chars)) return false; +// if (!isEqual(aPtNode.children, bPtNode.children)) return false; +// } +// return true; +// } + +// static private HashMap<String, ArrayList<PtNodeArray>> mergeTailsInner( +// final HashMap<String, ArrayList<PtNodeArray>> map, final PtNodeArray nodeArray) { +// final ArrayList<PtNode> branches = nodeArray.data; +// final int nodeSize = branches.size(); +// for (int i = 0; i < nodeSize; ++i) { +// PtNode ptNode = branches.get(i); +// if (null != ptNode.children) { +// String pseudoHash = getPseudoHash(ptNode.children); +// ArrayList<PtNodeArray> similarList = map.get(pseudoHash); +// if (null == similarList) { +// similarList = new ArrayList<PtNodeArray>(); +// map.put(pseudoHash, similarList); +// } +// boolean merged = false; +// for (PtNodeArray similar : similarList) { +// if (isEqual(ptNode.children, similar)) { +// ptNode.children = similar; +// merged = true; +// break; +// } +// } +// if (!merged) { +// similarList.add(ptNode.children); +// } +// mergeTailsInner(map, ptNode.children); +// } +// } +// return map; +// } + +// private static String getPseudoHash(final PtNodeArray nodeArray) { +// StringBuilder s = new StringBuilder(); +// for (PtNode ptNode : nodeArray.data) { +// s.append(ptNode.frequency); +// for (int ch : ptNode.chars) { +// s.append(Character.toChars(ch)); +// } +// } +// return s.toString(); +// } + + /** * Iterator to walk through a dictionary. * * This is purely for convenience. diff --git a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentReader.java b/java/src/com/android/inputmethod/latin/makedict/SparseTableContentReader.java deleted file mode 100644 index 06088b651..000000000 --- a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentReader.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.inputmethod.latin.makedict; - -import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; -import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; - -/** - * An auxiliary class for reading SparseTable and data written by SparseTableContentWriter. - */ -public class SparseTableContentReader { - - /** - * An interface of a function which is passed to SparseTableContentReader.read. - */ - public interface SparseTableContentReaderInterface { - /** - * Reads data. - * - * @param buffer the DictBuffer. The position of the buffer is set to the head of data. - */ - public void read(final DictBuffer buffer); - } - - protected final int mContentCount; - protected final int mBlockSize; - protected final File mBaseDir; - protected final File mLookupTableFile; - protected final File[] mAddressTableFiles; - protected final File[] mContentFiles; - protected DictBuffer mLookupTableBuffer; - protected final DictBuffer[] mAddressTableBuffers; - private final DictBuffer[] mContentBuffers; - protected final DictionaryBufferFactory mFactory; - - /** - * Sole constructor of SparseTableContentReader. - * - * @param name the name of SparseTable. - * @param blockSize the block size of the content table. - * @param baseDir the directory which contains the files of the content table. - * @param contentFilenames the file names of content files. - * @param contentIds the ids of contents. These ids are used for a suffix of a name of - * address files and content files. - * @param factory the DictionaryBufferFactory which is used for opening the files. - */ - public SparseTableContentReader(final String name, final int blockSize, final File baseDir, - final String[] contentFilenames, final String[] contentIds, - final DictionaryBufferFactory factory) { - if (contentFilenames.length != contentIds.length) { - throw new RuntimeException("The length of contentFilenames and the length of" - + " contentIds are different " + contentFilenames.length + ", " - + contentIds.length); - } - mBlockSize = blockSize; - mBaseDir = baseDir; - mFactory = factory; - mContentCount = contentFilenames.length; - mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); - mAddressTableFiles = new File[mContentCount]; - mContentFiles = new File[mContentCount]; - for (int i = 0; i < mContentCount; ++i) { - mAddressTableFiles[i] = new File(mBaseDir, - name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]); - mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]); - } - mAddressTableBuffers = new DictBuffer[mContentCount]; - mContentBuffers = new DictBuffer[mContentCount]; - } - - public void openBuffers() throws FileNotFoundException, IOException { - mLookupTableBuffer = mFactory.getDictionaryBuffer(mLookupTableFile); - for (int i = 0; i < mContentCount; ++i) { - mAddressTableBuffers[i] = mFactory.getDictionaryBuffer(mAddressTableFiles[i]); - mContentBuffers[i] = mFactory.getDictionaryBuffer(mContentFiles[i]); - } - } - - protected void read(final int contentIndex, final int index, - final SparseTableContentReaderInterface reader) { - if (index < 0 || (index / mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES - >= mLookupTableBuffer.limit()) { - return; - } - - mLookupTableBuffer.position((index / mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES); - final int posInAddressTable = mLookupTableBuffer.readInt(); - if (posInAddressTable == SparseTable.NOT_EXIST) { - return; - } - - mAddressTableBuffers[contentIndex].position( - (posInAddressTable + index % mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES); - final int address = mAddressTableBuffers[contentIndex].readInt(); - if (address == SparseTable.NOT_EXIST) { - return; - } - - mContentBuffers[contentIndex].position(address); - reader.read(mContentBuffers[contentIndex]); - } -}
\ No newline at end of file diff --git a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentUpdater.java b/java/src/com/android/inputmethod/latin/makedict/SparseTableContentUpdater.java deleted file mode 100644 index 4518f21b9..000000000 --- a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentUpdater.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.inputmethod.latin.makedict; - -import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStream; - -/** - * An auxiliary class for updating data associated with SparseTable. - */ -public class SparseTableContentUpdater extends SparseTableContentReader { - protected OutputStream mLookupTableOutStream; - protected OutputStream[] mAddressTableOutStreams; - protected OutputStream[] mContentOutStreams; - - public SparseTableContentUpdater(final String name, final int blockSize, - final File baseDir, final String[] contentFilenames, final String[] contentIds, - final DictionaryBufferFactory factory) { - super(name, blockSize, baseDir, contentFilenames, contentIds, factory); - mAddressTableOutStreams = new OutputStream[mContentCount]; - mContentOutStreams = new OutputStream[mContentCount]; - } - - protected void openStreamsAndBuffers() throws IOException { - openBuffers(); - mLookupTableOutStream = new FileOutputStream(mLookupTableFile, true /* append */); - for (int i = 0; i < mContentCount; ++i) { - mAddressTableOutStreams[i] = new FileOutputStream(mAddressTableFiles[i], - true /* append */); - mContentOutStreams[i] = new FileOutputStream(mContentFiles[i], true /* append */); - } - } - - /** - * Set the contentIndex-th elements of contentId-th table. - * - * @param contentId the id of the content table. - * @param contentIndex the index where to set the valie. - * @param value the value to set. - */ - protected void setContentValue(final int contentId, final int contentIndex, final int value) - throws IOException { - if ((contentIndex / mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES - >= mLookupTableBuffer.limit()) { - // Need to extend the lookup table - final int currentSize = mLookupTableBuffer.limit() - / SparseTable.SIZE_OF_INT_IN_BYTES; - final int target = contentIndex / mBlockSize + 1; - for (int i = currentSize; i < target; ++i) { - BinaryDictEncoderUtils.writeUIntToStream(mLookupTableOutStream, - SparseTable.NOT_EXIST, SparseTable.SIZE_OF_INT_IN_BYTES); - } - // We need to reopen the byte buffer of the lookup table because a MappedByteBuffer in - // Java isn't expanded automatically when the underlying file is expanded. - reopenLookupTable(); - } - - mLookupTableBuffer.position((contentIndex / mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES); - int posInAddressTable = mLookupTableBuffer.readInt(); - if (posInAddressTable == SparseTable.NOT_EXIST) { - // Need to extend the address table - mLookupTableBuffer.position(mLookupTableBuffer.position() - - SparseTable.SIZE_OF_INT_IN_BYTES); - posInAddressTable = mAddressTableBuffers[0].limit() / mBlockSize; - BinaryDictEncoderUtils.writeUIntToDictBuffer(mLookupTableBuffer, - posInAddressTable, SparseTable.SIZE_OF_INT_IN_BYTES); - for (int i = 0; i < mContentCount; ++i) { - for (int j = 0; j < mBlockSize; ++j) { - BinaryDictEncoderUtils.writeUIntToStream(mAddressTableOutStreams[i], - SparseTable.NOT_EXIST, SparseTable.SIZE_OF_INT_IN_BYTES); - } - } - // We need to reopen the byte buffers of the address tables because a MappedByteBuffer - // in Java isn't expanded automatically when the underlying file is expanded. - reopenAddressTables(); - } - posInAddressTable += (contentIndex % mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES; - - mAddressTableBuffers[contentId].position(posInAddressTable); - BinaryDictEncoderUtils.writeUIntToDictBuffer(mAddressTableBuffers[contentId], - value, SparseTable.SIZE_OF_INT_IN_BYTES); - } - - private void reopenLookupTable() throws IOException { - mLookupTableOutStream.flush(); - mLookupTableBuffer = mFactory.getDictionaryBuffer(mLookupTableFile); - } - - private void reopenAddressTables() throws IOException { - for (int i = 0; i < mContentCount; ++i) { - mAddressTableOutStreams[i].flush(); - mAddressTableBuffers[i] = mFactory.getDictionaryBuffer(mAddressTableFiles[i]); - } - } - - protected void close() throws IOException { - mLookupTableOutStream.close(); - for (final OutputStream stream : mAddressTableOutStreams) { - stream.close(); - } - for (final OutputStream stream : mContentOutStreams) { - stream.close(); - } - } -} diff --git a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentWriter.java b/java/src/com/android/inputmethod/latin/makedict/SparseTableContentWriter.java deleted file mode 100644 index 49f0fd624..000000000 --- a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentWriter.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.inputmethod.latin.makedict; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStream; - -/** - * An auxiliary class for writing data associated with SparseTable to files. - */ -public class SparseTableContentWriter { - public interface SparseTableContentWriterInterface { - public void write(final OutputStream outStream) throws IOException; - } - - private final int mContentCount; - private final SparseTable mSparseTable; - private final File mLookupTableFile; - protected final File mBaseDir; - private final File[] mAddressTableFiles; - private final File[] mContentFiles; - protected final OutputStream[] mContentOutStreams; - - /** - * Sole constructor of SparseTableContentWriter. - * - * @param name the name of SparseTable. - * @param initialCapacity the initial capacity of SparseTable. - * @param blockSize the block size of the content table. - * @param baseDir the directory which contains the files of the content table. - * @param contentFilenames the file names of content files. - * @param contentIds the ids of contents. These ids are used for a suffix of a name of address - * files and content files. - */ - public SparseTableContentWriter(final String name, final int initialCapacity, - final int blockSize, final File baseDir, final String[] contentFilenames, - final String[] contentIds) { - if (contentFilenames.length != contentIds.length) { - throw new RuntimeException("The length of contentFilenames and the length of" - + " contentIds are different " + contentFilenames.length + ", " - + contentIds.length); - } - mContentCount = contentFilenames.length; - mSparseTable = new SparseTable(initialCapacity, blockSize, mContentCount); - mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); - mAddressTableFiles = new File[mContentCount]; - mContentFiles = new File[mContentCount]; - mBaseDir = baseDir; - for (int i = 0; i < mContentCount; ++i) { - mAddressTableFiles[i] = new File(mBaseDir, - name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]); - mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]); - } - mContentOutStreams = new OutputStream[mContentCount]; - } - - public void openStreams() throws FileNotFoundException { - for (int i = 0; i < mContentCount; ++i) { - mContentOutStreams[i] = new FileOutputStream(mContentFiles[i]); - } - } - - protected void write(final int contentIndex, final int index, - final SparseTableContentWriterInterface writer) throws IOException { - mSparseTable.set(contentIndex, index, (int) mContentFiles[contentIndex].length()); - writer.write(mContentOutStreams[contentIndex]); - mContentOutStreams[contentIndex].flush(); - } - - public void closeStreams() throws IOException { - mSparseTable.writeToFiles(mLookupTableFile, mAddressTableFiles); - for (int i = 0; i < mContentCount; ++i) { - mContentOutStreams[i].close(); - } - } -}
\ No newline at end of file diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java index 92eb861d6..5da34534e 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java @@ -169,7 +169,7 @@ public class Ver3DictEncoder implements DictEncoder { private void writeChildrenPosition(final PtNode ptNode, final FormatOptions formatOptions) { final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); - if (formatOptions.supportsDynamicUpdate()) { + if (formatOptions.mSupportsDynamicUpdate) { mPosition += BinaryDictEncoderUtils.writeSignedChildrenPosition(mBuffer, mPosition, childrenPos); } else { diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java index 3be62f066..734223ec2 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java @@ -40,52 +40,26 @@ import java.util.Arrays; public class Ver4DictDecoder extends AbstractDictDecoder { private static final String TAG = Ver4DictDecoder.class.getSimpleName(); - protected static final int FILETYPE_TRIE = 1; - protected static final int FILETYPE_FREQUENCY = 2; - protected static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; - protected static final int FILETYPE_BIGRAM_FREQ = 4; - protected static final int FILETYPE_SHORTCUT = 5; - protected static final int FILETYPE_HEADER = 6; - - protected final File mDictDirectory; - protected final DictionaryBufferFactory mBufferFactory; + private static final int FILETYPE_TRIE = 1; + private static final int FILETYPE_FREQUENCY = 2; + private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; + private static final int FILETYPE_BIGRAM_FREQ = 4; + private static final int FILETYPE_SHORTCUT = 5; + + private final File mDictDirectory; + private final DictionaryBufferFactory mBufferFactory; protected DictBuffer mDictBuffer; - protected DictBuffer mHeaderBuffer; - protected DictBuffer mFrequencyBuffer; - protected DictBuffer mTerminalAddressTableBuffer; - private BigramContentReader mBigramReader; - private ShortcutContentReader mShortcutReader; - - /** - * Raw PtNode info straight out of a trie file in version 4 dictionary. - */ - protected static final class Ver4PtNodeInfo { - public final int mFlags; - public final int[] mCharacters; - public final int mTerminalId; - public final int mChildrenPos; - public final int mParentPos; - public final int mNodeSize; - public int mStartIndexOfCharacters; - public int mEndIndexOfCharacters; // exclusive - - public Ver4PtNodeInfo(final int flags, final int[] characters, final int terminalId, - final int childrenPos, final int parentPos, final int nodeSize) { - mFlags = flags; - mCharacters = characters; - mTerminalId = terminalId; - mChildrenPos = childrenPos; - mParentPos = parentPos; - mNodeSize = nodeSize; - mStartIndexOfCharacters = 0; - mEndIndexOfCharacters = characters.length; - } - } + private DictBuffer mFrequencyBuffer; + private DictBuffer mTerminalAddressTableBuffer; + private DictBuffer mBigramBuffer; + private DictBuffer mShortcutBuffer; + private SparseTable mBigramAddressTable; + private SparseTable mShortcutAddressTable; @UsedForTesting /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) { mDictDirectory = dictDirectory; - mDictBuffer = mHeaderBuffer = mFrequencyBuffer = null; + mDictBuffer = mFrequencyBuffer = null; if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) { mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); @@ -102,16 +76,13 @@ public class Ver4DictDecoder extends AbstractDictDecoder { /* package */ Ver4DictDecoder(final File dictDirectory, final DictionaryBufferFactory factory) { mDictDirectory = dictDirectory; mBufferFactory = factory; - mDictBuffer = mHeaderBuffer = mFrequencyBuffer = null; + mDictBuffer = mFrequencyBuffer = null; } - protected File getFile(final int fileType) throws UnsupportedFormatException { + private File getFile(final int fileType) { if (fileType == FILETYPE_TRIE) { return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.TRIE_FILE_EXTENSION); - } else if (fileType == FILETYPE_HEADER) { - return new File(mDictDirectory, - mDictDirectory.getName() + FormatSpec.HEADER_FILE_EXTENSION); } else if (fileType == FILETYPE_FREQUENCY) { return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.FREQ_FILE_EXTENSION); @@ -127,27 +98,20 @@ public class Ver4DictDecoder extends AbstractDictDecoder { mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.SHORTCUT_CONTENT_ID); } else { - throw new UnsupportedFormatException("Unsupported kind of file : " + fileType); + throw new RuntimeException("Unsupported kind of file : " + fileType); } } @Override - public void openDictBuffer() throws FileNotFoundException, IOException, - UnsupportedFormatException { - if (!mDictDirectory.isDirectory()) { - throw new UnsupportedFormatException("Format 4 dictionary needs a directory"); - } - mHeaderBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_HEADER)); + public void openDictBuffer() throws FileNotFoundException, IOException { mDictBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_TRIE)); mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY)); mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer( getFile(FILETYPE_TERMINAL_ADDRESS_TABLE)); - mBigramReader = new BigramContentReader(mDictDirectory.getName(), - mDictDirectory, mBufferFactory, false); - mBigramReader.openBuffers(); - mShortcutReader = new ShortcutContentReader(mDictDirectory.getName(), mDictDirectory, - mBufferFactory); - mShortcutReader.openBuffers(); + mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ)); + loadBigramAddressSparseTable(); + mShortcutBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_SHORTCUT)); + loadShortcutAddressSparseTable(); } @Override @@ -155,134 +119,46 @@ public class Ver4DictDecoder extends AbstractDictDecoder { return mDictBuffer != null; } - @UsedForTesting - /* package */ DictBuffer getHeaderBuffer() { - return mHeaderBuffer; - } - - @UsedForTesting /* package */ DictBuffer getDictBuffer() { return mDictBuffer; } @Override public FileHeader readHeader() throws IOException, UnsupportedFormatException { - if (mHeaderBuffer == null) { + if (mDictBuffer == null) { openDictBuffer(); } - mHeaderBuffer.position(0); - final FileHeader header = super.readHeader(mHeaderBuffer); + final FileHeader header = super.readHeader(mDictBuffer); final int version = header.mFormatOptions.mVersion; - if (version != FormatSpec.VERSION4) { + if (version != 4) { throw new UnsupportedFormatException("File header has a wrong version : " + version); } return header; } - /** - * An auxiliary class for reading bigrams. - */ - protected static class BigramContentReader extends SparseTableContentReader { - public BigramContentReader(final String name, final File baseDir, - final DictionaryBufferFactory factory, final boolean hasTimestamp) { - super(name + FormatSpec.BIGRAM_FILE_EXTENSION, - FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir, - getContentFilenames(name, hasTimestamp), getContentIds(hasTimestamp), factory); - } - - // TODO: Consolidate this method and BigramContentWriter.getContentFilenames. - protected static String[] getContentFilenames(final String name, - final boolean hasTimestamp) { - final String[] contentFilenames; - if (hasTimestamp) { - contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION, - name + FormatSpec.BIGRAM_FILE_EXTENSION }; - } else { - contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION }; - } - return contentFilenames; - } - - // TODO: Consolidate this method and BigramContentWriter.getContentIds. - protected static String[] getContentIds(final boolean hasTimestamp) { - final String[] contentIds; - if (hasTimestamp) { - contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID, - FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID }; - } else { - contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID }; - } - return contentIds; - } - - public ArrayList<PendingAttribute> readTargetsAndFrequencies(final int terminalId, - final DictBuffer terminalAddressTableBuffer) { - final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList(); - read(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, - new SparseTableContentReaderInterface() { - @Override - public void read(final DictBuffer buffer) { - while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE, - // remaining bigram entries are ignored. - final int bigramFlags = buffer.readUnsignedByte(); - final int targetTerminalId = buffer.readUnsignedInt24(); - terminalAddressTableBuffer.position(targetTerminalId - * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); - final int targetAddress = - terminalAddressTableBuffer.readUnsignedInt24(); - bigrams.add(new PendingAttribute(bigramFlags - & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, - targetAddress)); - if (0 == (bigramFlags - & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) { - break; - } - } - if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - throw new RuntimeException("Too many bigrams in a PtNode (" - + bigrams.size() + " but max is " - + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); - } - } - }); - if (bigrams.isEmpty()) return null; - return bigrams; - } + private void loadBigramAddressSparseTable() throws IOException { + final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); + final File freqsFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + + FormatSpec.BIGRAM_FREQ_CONTENT_ID); + mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { freqsFile }, + FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE); } - /** - * An auxiliary class for reading shortcuts. - */ - protected static class ShortcutContentReader extends SparseTableContentReader { - public ShortcutContentReader(final String name, final File baseDir, - final DictionaryBufferFactory factory) { - super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, - FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir, - new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION }, - new String[] { FormatSpec.SHORTCUT_CONTENT_ID }, factory); - } - - public ArrayList<WeightedString> readShortcuts(final int terminalId) { - final ArrayList<WeightedString> shortcuts = CollectionUtils.newArrayList(); - read(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, - new SparseTableContentReaderInterface() { - @Override - public void read(final DictBuffer buffer) { - while (true) { - final int flags = buffer.readUnsignedByte(); - final String word = CharEncoding.readString(buffer); - shortcuts.add(new WeightedString(word, - flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); - if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) { - break; - } - } - } - }); - if (shortcuts.isEmpty()) return null; - return shortcuts; - } + // TODO: Let's have something like SparseTableContentsReader in this class. + private void loadShortcutAddressSparseTable() throws IOException { + final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); + final File contentFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + + FormatSpec.SHORTCUT_CONTENT_ID); + final File timestampsFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + + FormatSpec.SHORTCUT_CONTENT_ID); + mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile, + new File[] { contentFile, timestampsFile }, + FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE); } protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader { @@ -296,82 +172,102 @@ public class Ver4DictDecoder extends AbstractDictDecoder { } } - private final int[] mCharacterBufferForReadingVer4PtNodeInfo - = new int[FormatSpec.MAX_WORD_LENGTH]; + private ArrayList<WeightedString> readShortcuts(final int terminalId) { + if (mShortcutAddressTable.get(0, terminalId) == SparseTable.NOT_EXIST) return null; + + final ArrayList<WeightedString> ret = CollectionUtils.newArrayList(); + final int posOfShortcuts = mShortcutAddressTable.get(FormatSpec.SHORTCUT_CONTENT_INDEX, + terminalId); + mShortcutBuffer.position(posOfShortcuts); + while (true) { + final int flags = mShortcutBuffer.readUnsignedByte(); + final String word = CharEncoding.readString(mShortcutBuffer); + ret.add(new WeightedString(word, + flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); + if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; + } + return ret; + } - /** - * Reads PtNode from ptNodePos in the trie file and returns Ver4PtNodeInfo. - * - * @param ptNodePos the position of PtNode. - * @param options the format options. - * @return Ver4PtNodeInfo. - */ // TODO: Make this buffer thread safe. // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH. - protected Ver4PtNodeInfo readVer4PtNodeInfo(final int ptNodePos, final FormatOptions options) { - int readingPos = ptNodePos; + private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; + @Override + public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) { + int addressPointer = ptNodePos; final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); - readingPos += FormatSpec.PTNODE_FLAGS_SIZE; + addressPointer += FormatSpec.PTNODE_FLAGS_SIZE; - final int parentPos = PtNodeReader.readParentAddress(mDictBuffer, options); + final int parentAddress = PtNodeReader.readParentAddress(mDictBuffer, options); if (BinaryDictIOUtils.supportsDynamicUpdate(options)) { - readingPos += FormatSpec.PARENT_ADDRESS_SIZE; + addressPointer += FormatSpec.PARENT_ADDRESS_SIZE; } final int characters[]; if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) { int index = 0; int character = CharEncoding.readChar(mDictBuffer); - readingPos += CharEncoding.getCharSize(character); + addressPointer += CharEncoding.getCharSize(character); while (FormatSpec.INVALID_CHARACTER != character && index < FormatSpec.MAX_WORD_LENGTH) { - mCharacterBufferForReadingVer4PtNodeInfo[index++] = character; + mCharacterBuffer[index++] = character; character = CharEncoding.readChar(mDictBuffer); - readingPos += CharEncoding.getCharSize(character); + addressPointer += CharEncoding.getCharSize(character); } - characters = Arrays.copyOfRange(mCharacterBufferForReadingVer4PtNodeInfo, 0, index); + characters = Arrays.copyOfRange(mCharacterBuffer, 0, index); } else { final int character = CharEncoding.readChar(mDictBuffer); - readingPos += CharEncoding.getCharSize(character); + addressPointer += CharEncoding.getCharSize(character); characters = new int[] { character }; } final int terminalId; if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { terminalId = PtNodeReader.readTerminalId(mDictBuffer); - readingPos += FormatSpec.PTNODE_TERMINAL_ID_SIZE; + addressPointer += FormatSpec.PTNODE_TERMINAL_ID_SIZE; } else { terminalId = PtNode.NOT_A_TERMINAL; } - int childrenPos = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options); - if (childrenPos != FormatSpec.NO_CHILDREN_ADDRESS) { - childrenPos += readingPos; - } - readingPos += BinaryDictIOUtils.getChildrenAddressSize(flags, options); - - return new Ver4PtNodeInfo(flags, characters, terminalId, childrenPos, parentPos, - readingPos - ptNodePos); - } - - @Override - public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) { - final Ver4PtNodeInfo nodeInfo = readVer4PtNodeInfo(ptNodePos, options); - final int frequency; - if (0 != (FormatSpec.FLAG_IS_TERMINAL & nodeInfo.mFlags)) { - frequency = PtNodeReader.readFrequency(mFrequencyBuffer, nodeInfo.mTerminalId); + if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { + frequency = PtNodeReader.readFrequency(mFrequencyBuffer, terminalId); } else { frequency = PtNode.NOT_A_TERMINAL; } - - final ArrayList<WeightedString> shortcutTargets = mShortcutReader.readShortcuts( - nodeInfo.mTerminalId); - final ArrayList<PendingAttribute> bigrams = mBigramReader.readTargetsAndFrequencies( - nodeInfo.mTerminalId, mTerminalAddressTableBuffer); - - return new PtNodeInfo(ptNodePos, ptNodePos + nodeInfo.mNodeSize, nodeInfo.mFlags, - nodeInfo.mCharacters, frequency, nodeInfo.mParentPos, nodeInfo.mChildrenPos, - shortcutTargets, bigrams); + int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options); + if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { + childrenAddress += addressPointer; + } + addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); + final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId); + + final ArrayList<PendingAttribute> bigrams; + if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { + bigrams = new ArrayList<PendingAttribute>(); + final int posOfBigrams = mBigramAddressTable.get(0 /* contentTableIndex */, terminalId); + mBigramBuffer.position(posOfBigrams); + while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE, + // remaining bigram entries are ignored. + final int bigramFlags = mBigramBuffer.readUnsignedByte(); + final int targetTerminalId = mBigramBuffer.readUnsignedInt24(); + mTerminalAddressTableBuffer.position( + targetTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); + final int targetAddress = mTerminalAddressTableBuffer.readUnsignedInt24(); + bigrams.add(new PendingAttribute( + bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, + targetAddress)); + if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; + } + if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size() + + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); + } + } else { + bigrams = null; + } + return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, frequency, + parentAddress, childrenAddress, shortcutTargets, bigrams); } private void deleteDictFiles() { @@ -422,14 +318,10 @@ public class Ver4DictDecoder extends AbstractDictDecoder { @Override public boolean readAndFollowForwardLink() { - final int forwardLinkPos = mDictBuffer.position(); - int nextRelativePos = BinaryDictDecoderUtils.readSInt24(mDictBuffer); - if (nextRelativePos != FormatSpec.NO_FORWARD_LINK_ADDRESS) { - final int nextPos = forwardLinkPos + nextRelativePos; - if (nextPos >= 0 && nextPos < mDictBuffer.limit()) { - mDictBuffer.position(nextPos); - return true; - } + final int nextAddress = mDictBuffer.readUnsignedInt24(); + if (nextAddress >= 0 && nextAddress < mDictBuffer.limit()) { + mDictBuffer.position(nextAddress); + return true; } return false; } diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java index 8b80ebe63..8d5b48a9b 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java @@ -25,8 +25,6 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; -import com.android.inputmethod.latin.utils.CollectionUtils; -import com.android.inputmethod.latin.utils.FileUtils; import java.io.File; import java.io.FileNotFoundException; @@ -34,8 +32,6 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; import java.util.Iterator; /** @@ -46,8 +42,8 @@ public class Ver4DictEncoder implements DictEncoder { private final File mDictPlacedDir; private byte[] mTrieBuf; private int mTriePos; + private int mHeaderSize; private OutputStream mTrieOutStream; - private OutputStream mHeaderOutStream; private OutputStream mFreqOutStream; private OutputStream mUnigramTimestampOutStream; private OutputStream mTerminalAddressTableOutStream; @@ -61,6 +57,62 @@ public class Ver4DictEncoder implements DictEncoder { mDictPlacedDir = dictPlacedDir; } + private interface SparseTableContentWriterInterface { + public void write(final OutputStream outStream) throws IOException; + } + + private static class SparseTableContentWriter { + private final int mContentCount; + private final SparseTable mSparseTable; + private final File mLookupTableFile; + protected final File mBaseDir; + private final File[] mAddressTableFiles; + private final File[] mContentFiles; + protected final OutputStream[] mContentOutStreams; + + public SparseTableContentWriter(final String name, final int initialCapacity, + final int blockSize, final File baseDir, final String[] contentFilenames, + final String[] contentIds) { + if (contentFilenames.length != contentIds.length) { + throw new RuntimeException("The length of contentFilenames and the length of" + + " contentIds are different " + contentFilenames.length + ", " + + contentIds.length); + } + mContentCount = contentFilenames.length; + mSparseTable = new SparseTable(initialCapacity, blockSize, mContentCount); + mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); + mAddressTableFiles = new File[mContentCount]; + mContentFiles = new File[mContentCount]; + mBaseDir = baseDir; + for (int i = 0; i < mContentCount; ++i) { + mAddressTableFiles[i] = new File(mBaseDir, + name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]); + mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]); + } + mContentOutStreams = new OutputStream[mContentCount]; + } + + public void openStreams() throws FileNotFoundException { + for (int i = 0; i < mContentCount; ++i) { + mContentOutStreams[i] = new FileOutputStream(mContentFiles[i]); + } + } + + protected void write(final int contentIndex, final int index, + final SparseTableContentWriterInterface writer) throws IOException { + mSparseTable.set(contentIndex, index, (int) mContentFiles[contentIndex].length()); + writer.write(mContentOutStreams[contentIndex]); + mContentOutStreams[contentIndex].flush(); + } + + public void closeStreams() throws IOException { + mSparseTable.writeToFiles(mLookupTableFile, mAddressTableFiles); + for (int i = 0; i < mContentCount; ++i) { + mContentOutStreams[i].close(); + } + } + } + private static class BigramContentWriter extends SparseTableContentWriter { private final boolean mWriteTimestamp; @@ -186,21 +238,16 @@ public class Ver4DictEncoder implements DictEncoder { mBaseFilename = header.getId() + "." + header.getVersion(); mDictDir = new File(mDictPlacedDir, mBaseFilename); final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION); - final File headerFile = new File(mDictDir, - mBaseFilename + FormatSpec.HEADER_FILE_EXTENSION); final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION); final File timestampFile = new File(mDictDir, mBaseFilename + FormatSpec.UNIGRAM_TIMESTAMP_FILE_EXTENSION); final File terminalAddressTableFile = new File(mDictDir, mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); if (!mDictDir.isDirectory()) { - if (mDictDir.exists()) { - FileUtils.deleteRecursively(mDictDir); - } + if (mDictDir.exists()) mDictDir.delete(); mDictDir.mkdirs(); } mTrieOutStream = new FileOutputStream(trieFile); - mHeaderOutStream = new FileOutputStream(headerFile); mFreqOutStream = new FileOutputStream(freqFile); mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile); if (formatOptions.mHasTimestamp) { @@ -213,9 +260,6 @@ public class Ver4DictEncoder implements DictEncoder { if (mTrieOutStream != null) { mTrieOutStream.close(); } - if (mHeaderOutStream != null) { - mHeaderOutStream.close(); - } if (mFreqOutStream != null) { mFreqOutStream.close(); } @@ -227,7 +271,6 @@ public class Ver4DictEncoder implements DictEncoder { } } finally { mTrieOutStream = null; - mHeaderOutStream = null; mFreqOutStream = null; mTerminalAddressTableOutStream = null; } @@ -248,34 +291,16 @@ public class Ver4DictEncoder implements DictEncoder { openStreams(formatOptions, dict.mOptions); } - BinaryDictEncoderUtils.writeDictionaryHeader(mHeaderOutStream, dict, formatOptions); + mHeaderSize = BinaryDictEncoderUtils.writeDictionaryHeader(mTrieOutStream, dict, + formatOptions); MakedictLog.i("Flattening the tree..."); ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); int terminalCount = 0; - final ArrayList<PtNode> nodes = CollectionUtils.newArrayList(); for (final PtNodeArray array : flatNodes) { for (final PtNode node : array.mData) { - if (node.isTerminal()) { - nodes.add(node); - node.mTerminalId = terminalCount++; - } - } - } - Collections.sort(nodes, new Comparator<PtNode>() { - @Override - public int compare(final PtNode lhs, final PtNode rhs) { - if (lhs.mFrequency != rhs.mFrequency) { - return lhs.mFrequency < rhs.mFrequency ? -1 : 1; - } - if (lhs.mTerminalId < rhs.mTerminalId) return -1; - if (lhs.mTerminalId > rhs.mTerminalId) return 1; - return 0; + if (node.isTerminal()) node.mTerminalId = terminalCount++; } - }); - int count = 0; - for (final PtNode node : nodes) { - node.mTerminalId = count++; } MakedictLog.i("Computing addresses..."); @@ -312,7 +337,7 @@ public class Ver4DictEncoder implements DictEncoder { @Override public void setPosition(int position) { - if (mTrieBuf == null || position < 0 || position > mTrieBuf.length) return; + if (mTrieBuf == null || position < 0 || position >- mTrieBuf.length) return; mTriePos = position; } @@ -365,7 +390,7 @@ public class Ver4DictEncoder implements DictEncoder { private void writeChildrenPosition(PtNode ptNode, FormatOptions formatOptions) { final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); - if (formatOptions.supportsDynamicUpdate()) { + if (formatOptions.mSupportsDynamicUpdate) { mTriePos += BinaryDictEncoderUtils.writeSignedChildrenPosition(mTrieBuf, mTriePos, childrenPos); } else { @@ -432,7 +457,7 @@ public class Ver4DictEncoder implements DictEncoder { ptNode.mFrequency, FormatSpec.FREQUENCY_AND_FLAGS_SIZE); BinaryDictEncoderUtils.writeUIntToBuffer(terminalAddressTableBuf, ptNode.mTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, - ptNode.mCachedAddressAfterUpdate, + ptNode.mCachedAddressAfterUpdate + mHeaderSize, FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); } } diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java index c46bc36bb..3d8f186ba 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java @@ -17,130 +17,29 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; -import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; -import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; -import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; -import com.android.inputmethod.latin.utils.CollectionUtils; - -import android.util.Log; import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; -import java.io.OutputStream; import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; /** * An implementation of DictUpdater for version 4 binary dictionary. */ @UsedForTesting public class Ver4DictUpdater extends Ver4DictDecoder implements DictUpdater { - private static final String TAG = Ver4DictUpdater.class.getSimpleName(); - - private OutputStream mDictStream; - private final File mFrequencyFile; @UsedForTesting - public Ver4DictUpdater(final File dictDirectory, final int factoryType) - throws UnsupportedFormatException { + public Ver4DictUpdater(final File dictDirectory, final int factoryType) { // DictUpdater must have an updatable DictBuffer. super(dictDirectory, ((factoryType & MASK_DICTBUFFER) == USE_BYTEARRAY) ? USE_BYTEARRAY : USE_WRITABLE_BYTEBUFFER); - mFrequencyFile = getFile(FILETYPE_FREQUENCY); - } - - private static class BigramContentUpdater extends SparseTableContentUpdater { - public BigramContentUpdater(final String name, final File baseDir, - final boolean hasTimestamp) { - super(name + FormatSpec.BIGRAM_FILE_EXTENSION, - FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir, - BigramContentReader.getContentFilenames(name, hasTimestamp), - BigramContentReader.getContentIds(hasTimestamp), - new DictionaryBufferFromWritableByteBufferFactory()); - } - - public void insertBigramEntries(final int terminalId, final int frequency, - final ArrayList<PendingAttribute> entries) throws IOException { - if (terminalId < 0) { - throw new RuntimeException("Invalid terminal id : " + terminalId); - } - openStreamsAndBuffers(); - - if (entries == null || entries.isEmpty()) { - setContentValue(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, - SparseTable.NOT_EXIST); - return; - } - final int positionOfEntries = - (int) mContentFiles[FormatSpec.BIGRAM_FREQ_CONTENT_INDEX].length(); - setContentValue(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, positionOfEntries); - - final Iterator<PendingAttribute> bigramIterator = entries.iterator(); - while (bigramIterator.hasNext()) { - final PendingAttribute entry = bigramIterator.next(); - final int flags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), - 0 /* offset */, entry.mFrequency, frequency, "" /* word */); - BinaryDictEncoderUtils.writeUIntToStream( - mContentOutStreams[FormatSpec.BIGRAM_FREQ_CONTENT_INDEX], flags, - FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); - BinaryDictEncoderUtils.writeUIntToStream( - mContentOutStreams[FormatSpec.BIGRAM_FREQ_CONTENT_INDEX], entry.mAddress, - FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE); - } - close(); - } - } - - private static class ShortcutContentUpdater extends SparseTableContentUpdater { - public ShortcutContentUpdater(final String name, final File baseDir) { - super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, - FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir, - new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION }, - new String[] { FormatSpec.SHORTCUT_CONTENT_ID }, - new DictionaryBufferFromWritableByteBufferFactory()); - } - - public void insertShortcuts(final int terminalId, - final ArrayList<WeightedString> shortcuts) throws IOException { - if (terminalId < 0) { - throw new RuntimeException("Invalid terminal id : " + terminalId); - } - openStreamsAndBuffers(); - if (shortcuts == null || shortcuts.isEmpty()) { - setContentValue(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, - SparseTable.NOT_EXIST); - return; - } - - final int positionOfShortcuts = - (int) mContentFiles[FormatSpec.SHORTCUT_CONTENT_INDEX].length(); - setContentValue(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, positionOfShortcuts); - - final Iterator<WeightedString> shortcutIterator = shortcuts.iterator(); - while (shortcutIterator.hasNext()) { - final WeightedString target = shortcutIterator.next(); - final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( - shortcutIterator.hasNext(), target.mFrequency); - BinaryDictEncoderUtils.writeUIntToStream( - mContentOutStreams[FormatSpec.SHORTCUT_CONTENT_INDEX], shortcutFlags, - FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); - CharEncoding.writeString(mContentOutStreams[FormatSpec.SHORTCUT_CONTENT_INDEX], - target.mWord); - } - close(); - } } @Override public void deleteWord(final String word) throws IOException, UnsupportedFormatException { - if (mDictBuffer == null) { - openDictBuffer(); - readHeader(); - } + if (mDictBuffer == null) openDictBuffer(); + readHeader(); final int wordPos = getTerminalPosition(word); if (wordPos != FormatSpec.NOT_VALID_WORD) { mDictBuffer.position(wordPos); @@ -150,623 +49,11 @@ public class Ver4DictUpdater extends Ver4DictDecoder implements DictUpdater { } } - private int getNewTerminalId() { - // The size of frequency file is FormatSpec.FREQUENCY_AND_FLAGS_SIZE * number of terminals - // because each terminal always has a frequency. - // So we can get a fresh terminal id by this logic. - // CAVEAT: we are reading the file size from the disk each time: beware of race conditions, - // even on one thread. - return (int) (mFrequencyFile.length() / FormatSpec.FREQUENCY_AND_FLAGS_SIZE); - } - - private void updateParentPosIfNotMoved(final int nodePos, final int newParentPos, - final FormatOptions formatOptions) { - final int originalPos = getPosition(); - setPosition(nodePos); - final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); - if (!BinaryDictIOUtils.isMovedPtNode(flags, formatOptions)) { - final int parentOffset = newParentPos - nodePos; - BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, parentOffset); - } - setPosition(originalPos); - } - - private void updateParentPositions(final int nodeArrayPos, final int newParentPos, - final FormatOptions formatOptions) { - final int originalPos = mDictBuffer.position(); - mDictBuffer.position(nodeArrayPos); - int jumpCount = 0; - do { - final int count = readPtNodeCount(); - for (int i = 0; i < count; ++i) { - updateParentPosIfNotMoved(getPosition(), newParentPos, formatOptions); - skipPtNode(formatOptions); - } - if (!readAndFollowForwardLink()) break; - } while (jumpCount++ < DynamicBinaryDictIOUtils.MAX_JUMPS); - setPosition(originalPos); - } - - private void updateChildrenPos(final int nodePos, final int newChildrenPos, - final FormatOptions options) { - final int originalPos = getPosition(); - setPosition(nodePos); - final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); - PtNodeReader.readParentAddress(mDictBuffer, options); - BinaryDictIOUtils.skipString(mDictBuffer, - (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); - if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) PtNodeReader.readTerminalId(mDictBuffer); - final int basePos = getPosition(); - BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, newChildrenPos - basePos); - setPosition(originalPos); - } - - private void updateTerminalPosition(final int terminalId, final int position) { - if (terminalId == PtNode.NOT_A_TERMINAL - || terminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE - >= mTerminalAddressTableBuffer.limit()) return; - mTerminalAddressTableBuffer.position(terminalId - * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); - BinaryDictEncoderUtils.writeUIntToDictBuffer(mTerminalAddressTableBuffer, position, - FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); - } - - private void updateForwardLink(final int nodeArrayPos, final int newForwardLink, - final FormatOptions formatOptions) { - final int originalPos = getPosition(); - setPosition(nodeArrayPos); - int jumpCount = 0; - while (jumpCount++ < DynamicBinaryDictIOUtils.MAX_JUMPS) { - final int ptNodeCount = readPtNodeCount(); - for (int i = 0; i < ptNodeCount; ++i) { - skipPtNode(formatOptions); - } - final int forwardLinkPos = getPosition(); - if (!readAndFollowForwardLink()) { - setPosition(forwardLinkPos); - BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, newForwardLink - forwardLinkPos); - break; - } - } - setPosition(originalPos); - } - - private void markPtNodeAsMoved(final int nodePos, final int newNodePos, - final FormatOptions options) { - final int originalPos = getPosition(); - updateParentPosIfNotMoved(nodePos, newNodePos, options); - setPosition(nodePos); - final int currentFlags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); - setPosition(nodePos); - mDictBuffer.put((byte) (FormatSpec.FLAG_IS_MOVED - | (currentFlags & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG)))); - final int offset = newNodePos - nodePos; - BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, offset); - setPosition(originalPos); - } - - /** - * Writes a PtNode to an output stream from a Ver4PtNodeInfo. - * - * @param nodePos the position of the head of the PtNode. - * @param info the PtNode info to be written. - * @return the size written, in bytes. - */ - private int writePtNode(final int nodePos, final Ver4PtNodeInfo info) throws IOException { - int written = 0; - - // Write flags. - mDictStream.write((byte) (info.mFlags & 0xFF)); - written += FormatSpec.PTNODE_FLAGS_SIZE; - - // Write the parent position. - final int parentOffset = info.mParentPos == FormatSpec.NO_PARENT_ADDRESS ? - FormatSpec.NO_PARENT_ADDRESS : info.mParentPos - nodePos; - BinaryDictIOUtils.writeSInt24ToStream(mDictStream, parentOffset); - written += FormatSpec.PARENT_ADDRESS_SIZE; - - // Write a string. - if (((info.mFlags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0) - != (info.mEndIndexOfCharacters - info.mStartIndexOfCharacters > 1)) { - throw new RuntimeException("Inconsistent flags : hasMultipleChars = " - + ((info.mFlags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0) + ", length = " - + (info.mEndIndexOfCharacters - info.mStartIndexOfCharacters)); - } - written += CharEncoding.writeCodePoints(mDictStream, info.mCharacters, - info.mStartIndexOfCharacters, info.mEndIndexOfCharacters); - - // Write the terminal id. - if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) { - BinaryDictEncoderUtils.writeUIntToStream(mDictStream, info.mTerminalId, - FormatSpec.PTNODE_TERMINAL_ID_SIZE); - written += FormatSpec.PTNODE_TERMINAL_ID_SIZE; - } - - // Write the children position. - final int childrenOffset = info.mChildrenPos == FormatSpec.NO_CHILDREN_ADDRESS - ? 0 : info.mChildrenPos - (nodePos + written); - BinaryDictIOUtils.writeSInt24ToStream(mDictStream, childrenOffset); - written += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; - - return written; - } - - /** - * Helper method to split and move PtNode. - * - * @param ptNodeArrayPos the position of PtNodeArray which contains the split and moved PtNode. - * @param splittedPtNodeToMovePos the position of the split and moved PtNode. - * @param newParent the parent PtNode after splitting. - * @param newChildren the children PtNodes after splitting. - * @param newParentStartPos where to write the new parent. - * @param formatOptions the format options. - */ - private void writeSplittedPtNodes(final int ptNodeArrayPos, final int splittedPtNodeToMovePos, - final Ver4PtNodeInfo newParent, final Ver4PtNodeInfo[] newChildren, - final int newParentStartPos, - final FormatOptions formatOptions) throws IOException { - updateTerminalPosition(newParent.mTerminalId, - newParentStartPos + 1 /* size of PtNodeCount */); - int written = writePtNodeArray(newParentStartPos, new Ver4PtNodeInfo[] { newParent }, - FormatSpec.NO_FORWARD_LINK_ADDRESS); - final int childrenStartPos = newParentStartPos + written; - writePtNodeArray(childrenStartPos, newChildren, FormatSpec.NO_FORWARD_LINK_ADDRESS); - int childrenNodePos = childrenStartPos + 1 /* size of PtNodeCount */; - for (final Ver4PtNodeInfo info : newChildren) { - updateTerminalPosition(info.mTerminalId, childrenNodePos); - childrenNodePos += computePtNodeSize(info.mCharacters, info.mStartIndexOfCharacters, - info.mEndIndexOfCharacters, - (info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0); - } - - // Mark as moved. - markPtNodeAsMoved(splittedPtNodeToMovePos, newParentStartPos + 1 /* size of PtNodeCount */, - formatOptions); - updateForwardLink(ptNodeArrayPos, newParentStartPos, formatOptions); - } - - /** - * Writes a node array to the stream. - * - * @param nodeArrayPos the position of the head of the node array. - * @param infos an array of Ver4PtNodeInfo to be written. - * @return the written length in bytes. - */ - private int writePtNodeArray(final int nodeArrayPos, final Ver4PtNodeInfo[] infos, - final int forwardLink) throws IOException { - int written = BinaryDictIOUtils.writePtNodeCount(mDictStream, infos.length); - for (int i = 0; i < infos.length; ++i) { - written += writePtNode(nodeArrayPos + written, infos[i]); - } - BinaryDictIOUtils.writeSInt24ToStream(mDictStream, forwardLink); - written += FormatSpec.FORWARD_LINK_ADDRESS_SIZE; - return written; - } - - private int computePtNodeSize(final int[] codePoints, final int startIndex, final int endIndex, - final boolean isTerminal) { - return FormatSpec.PTNODE_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE - + CharEncoding.getCharArraySize(codePoints, startIndex, endIndex) - + (endIndex - startIndex > 1 ? FormatSpec.PTNODE_TERMINATOR_SIZE : 0) - + (isTerminal ? FormatSpec.PTNODE_TERMINAL_ID_SIZE : 0) - + FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; - } - - private void writeNewSinglePtNodeWithAttributes(final int[] codePoints, - final boolean hasShortcuts, final int terminalId, final boolean hasBigrams, - final boolean isNotAWord, final boolean isBlackListEntry, final int parentPos, - final FormatOptions formatOptions) throws IOException { - final int newNodeArrayPos = mDictBuffer.limit(); - final int newNodeFlags = BinaryDictEncoderUtils.makePtNodeFlags(codePoints.length > 1, - terminalId != PtNode.NOT_A_TERMINAL, FormatSpec.FLAG_IS_NOT_MOVED, hasShortcuts, - hasBigrams, isNotAWord, isBlackListEntry, formatOptions); - final Ver4PtNodeInfo info = new Ver4PtNodeInfo(newNodeFlags, codePoints, terminalId, - FormatSpec.NO_CHILDREN_ADDRESS, parentPos, 0 /* nodeSize */); - writePtNodeArray(newNodeArrayPos, new Ver4PtNodeInfo[] { info }, - FormatSpec.NO_FORWARD_LINK_ADDRESS); - } - - private int setMultipleCharsInFlags(final int currentFlags, final boolean hasMultipleChars) { - final int flags; - if (hasMultipleChars) { - flags = currentFlags | FormatSpec.FLAG_HAS_MULTIPLE_CHARS; - } else { - flags = currentFlags & (~FormatSpec.FLAG_HAS_MULTIPLE_CHARS); - } - return flags; - } - - private int setIsNotAWordInFlags(final int currentFlags, final boolean isNotAWord) { - final int flags; - if (isNotAWord) { - flags = currentFlags | FormatSpec.FLAG_IS_NOT_A_WORD; - } else { - flags = currentFlags & (~FormatSpec.FLAG_IS_NOT_A_WORD); - } - return flags; - } - - private int setIsBlackListEntryInFlags(final int currentFlags, final boolean isBlackListEntry) { - final int flags; - if (isBlackListEntry) { - flags = currentFlags | FormatSpec.FLAG_IS_BLACKLISTED; - } else { - flags = currentFlags & (~FormatSpec.FLAG_IS_BLACKLISTED); - } - return flags; - } - - /** - * Splits a PtNode. - * - * abcd - ef - * - * -> inserting "abc" - * - * abc - d - ef - * - * @param nodeArrayToSplitPos the position of PtNodeArray which contains the PtNode to split. - * @param nodeToSplitPos the position of the PtNode to split. - * @param nodeToSplitInfo the information of the PtNode to split. - * @param indexToSplit the index where to split in the code points array. - * @param parentOfNodeToSplitPos the absolute position of a parent of the node to split. - * @param newTerminalId the terminal id of the inserted node (corresponds to "d"). - * @param hasShortcuts whether the inserted word should have shortcuts. - * @param hasBigrams whether the inserted word should have bigrams. - * @param isNotAWord whether the inserted word should be not a word. - * @param isBlackListEntry whether the inserted word should be a black list entry. - * @param formatOptions the format options. - */ - private void splitOnly(final int nodeArrayToSplitPos, final int nodeToSplitPos, - final Ver4PtNodeInfo nodeToSplitInfo, final int indexToSplit, - final int parentOfNodeToSplitPos, final int newTerminalId, final boolean hasShortcuts, - final boolean hasBigrams, final boolean isNotAWord, final boolean isBlackListEntry, - final FormatOptions formatOptions) throws IOException { - final int parentNodeArrayStartPos = mDictBuffer.limit(); - final int parentNodeStartPos = parentNodeArrayStartPos + 1 /* size of PtNodeCount */; - final int parentFlags = BinaryDictEncoderUtils.makePtNodeFlags(indexToSplit > 1, - true /* isTerminal */, FormatSpec.FLAG_IS_NOT_MOVED, hasShortcuts, hasBigrams, - isNotAWord, isBlackListEntry, formatOptions); - final Ver4PtNodeInfo parentInfo = new Ver4PtNodeInfo(parentFlags, - nodeToSplitInfo.mCharacters, newTerminalId, parentNodeStartPos - + computePtNodeSize(nodeToSplitInfo.mCharacters, 0, indexToSplit, true) - + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, - parentOfNodeToSplitPos, 0 /* nodeSize */); - parentInfo.mStartIndexOfCharacters = 0; - parentInfo.mEndIndexOfCharacters = indexToSplit; - - // Write the child. - final int childrenFlags = setMultipleCharsInFlags(nodeToSplitInfo.mFlags, - nodeToSplitInfo.mCharacters.length - indexToSplit > 1); - final Ver4PtNodeInfo childrenInfo = new Ver4PtNodeInfo(childrenFlags, - nodeToSplitInfo.mCharacters, nodeToSplitInfo.mTerminalId, - nodeToSplitInfo.mChildrenPos, parentNodeStartPos, 0 /* nodeSize */); - childrenInfo.mStartIndexOfCharacters = indexToSplit; - childrenInfo.mEndIndexOfCharacters = nodeToSplitInfo.mCharacters.length; - if (nodeToSplitInfo.mChildrenPos != FormatSpec.NO_CHILDREN_ADDRESS) { - updateParentPositions(nodeToSplitInfo.mChildrenPos, - parentInfo.mChildrenPos + 1 /* size of PtNodeCount */, formatOptions); - } - - writeSplittedPtNodes(nodeArrayToSplitPos, nodeToSplitPos, parentInfo, - new Ver4PtNodeInfo[] { childrenInfo }, parentNodeArrayStartPos, formatOptions); - } - - /** - * Split and branch a PtNode. - * - * ab - cd - * - * -> inserting "ac" - * - * a - b - cd - * | - * - c - * - * @param nodeArrayToSplitPos the position of PtNodeArray which contains the PtNode to split. - * @param nodeToSplitPos the position of the PtNode to split. - * @param nodeToSplitInfo the information of the PtNode to split. - * @param indexToSplit the index where to split in the code points array. - * @param parentOfNodeToSplitPos the absolute position of parent of the node to split. - * @param newWordSuffixCodePoints the suffix of the newly inserted word (corresponds to "c"). - * @param startIndexOfNewWordSuffixCodePoints the start index in newWordSuffixCodePoints where - * the suffix starts. - * @param newTerminalId the terminal id of the inserted node (correspond to "c"). - * @param hasShortcuts whether the inserted word should have shortcuts. - * @param hasBigrams whether the inserted word should have bigrams. - * @param isNotAWord whether the inserted word should be not a word. - * @param isBlackListEntry whether the inserted word should be a black list entry. - * @param formatOptions the format options. - */ - private void splitAndBranch(final int nodeArrayToSplitPos, final int nodeToSplitPos, - final Ver4PtNodeInfo nodeToSplitInfo, final int indexToSplit, - final int parentOfNodeToSplitPos, final int[] newWordSuffixCodePoints, - final int startIndexOfNewWordSuffixCodePoints, - final int newTerminalId, - final boolean hasShortcuts, final boolean hasBigrams, final boolean isNotAWord, - final boolean isBlackListEntry, final FormatOptions formatOptions) throws IOException { - final int parentNodeArrayStartPos = mDictBuffer.limit(); - final int parentNodeStartPos = parentNodeArrayStartPos + 1 /* size of PtNodeCount */; - final int parentFlags = BinaryDictEncoderUtils.makePtNodeFlags( - indexToSplit > 1, - false /* isTerminal */, FormatSpec.FLAG_IS_NOT_MOVED, - false /* hasShortcut */, false /* hasBigrams */, - false /* isNotAWord */, false /* isBlackListEntry */, formatOptions); - final Ver4PtNodeInfo parentInfo = new Ver4PtNodeInfo(parentFlags, - nodeToSplitInfo.mCharacters, PtNode.NOT_A_TERMINAL, - parentNodeStartPos - + computePtNodeSize(nodeToSplitInfo.mCharacters, 0, indexToSplit, false) - + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, - parentOfNodeToSplitPos, 0 /* nodeSize */); - parentInfo.mStartIndexOfCharacters = 0; - parentInfo.mEndIndexOfCharacters = indexToSplit; - - final int childrenNodeArrayStartPos = parentNodeStartPos - + computePtNodeSize(nodeToSplitInfo.mCharacters, 0, indexToSplit, false) - + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; - final int firstChildrenFlags = BinaryDictEncoderUtils.makePtNodeFlags( - newWordSuffixCodePoints.length - startIndexOfNewWordSuffixCodePoints > 1, - true /* isTerminal */, FormatSpec.FLAG_IS_NOT_MOVED, hasShortcuts, hasBigrams, - isNotAWord, isBlackListEntry, formatOptions); - final Ver4PtNodeInfo firstChildrenInfo = new Ver4PtNodeInfo(firstChildrenFlags, - newWordSuffixCodePoints, newTerminalId, - FormatSpec.NO_CHILDREN_ADDRESS, parentNodeStartPos, - 0 /* nodeSize */); - firstChildrenInfo.mStartIndexOfCharacters = startIndexOfNewWordSuffixCodePoints; - firstChildrenInfo.mEndIndexOfCharacters = newWordSuffixCodePoints.length; - - final int secondChildrenStartPos = childrenNodeArrayStartPos + 1 /* size of ptNodeCount */ - + computePtNodeSize(newWordSuffixCodePoints, startIndexOfNewWordSuffixCodePoints, - newWordSuffixCodePoints.length, true /* isTerminal */); - final int secondChildrenFlags = setMultipleCharsInFlags(nodeToSplitInfo.mFlags, - nodeToSplitInfo.mCharacters.length - indexToSplit > 1); - final Ver4PtNodeInfo secondChildrenInfo = new Ver4PtNodeInfo(secondChildrenFlags, - nodeToSplitInfo.mCharacters, nodeToSplitInfo.mTerminalId, - nodeToSplitInfo.mChildrenPos, parentNodeStartPos, 0 /* nodeSize */); - secondChildrenInfo.mStartIndexOfCharacters = indexToSplit; - secondChildrenInfo.mEndIndexOfCharacters = nodeToSplitInfo.mCharacters.length; - if (nodeToSplitInfo.mChildrenPos != FormatSpec.NO_CHILDREN_ADDRESS) { - updateParentPositions(nodeToSplitInfo.mChildrenPos, secondChildrenStartPos, - formatOptions); - } - - writeSplittedPtNodes(nodeArrayToSplitPos, nodeToSplitPos, parentInfo, - new Ver4PtNodeInfo[] { firstChildrenInfo, secondChildrenInfo }, - parentNodeArrayStartPos, formatOptions); - } - - /** - * Inserts a word into the trie file and returns the position of inserted terminal node. - * If the insertion is failed, returns FormatSpec.NOT_VALID_WORD. - */ - @UsedForTesting - private int insertWordToTrie(final String word, final int newTerminalId, - final boolean isNotAWord, final boolean isBlackListEntry, final boolean hasBigrams, - final boolean hasShortcuts) throws IOException, UnsupportedFormatException { - setPosition(0); - final FileHeader header = readHeader(); - - final int[] codePoints = FusionDictionary.getCodePoints(word); - final int wordLen = codePoints.length; - - int wordPos = 0; - for (int depth = 0; depth < FormatSpec.MAX_WORD_LENGTH; /* nop */) { - final int nodeArrayPos = getPosition(); - final int ptNodeCount = readPtNodeCount(); - boolean goToChildren = false; - int parentPos = FormatSpec.NO_PARENT_ADDRESS; - for (int i = 0; i < ptNodeCount; ++i) { - final int nodePos = getPosition(); - final Ver4PtNodeInfo nodeInfo = readVer4PtNodeInfo(nodePos, header.mFormatOptions); - if (BinaryDictIOUtils.isMovedPtNode(nodeInfo.mFlags, header.mFormatOptions)) { - continue; - } - if (nodeInfo.mParentPos != FormatSpec.NO_PARENT_ADDRESS) { - parentPos = nodePos + nodeInfo.mParentPos; - } - - final boolean firstCharacterMatched = - codePoints[wordPos] == nodeInfo.mCharacters[0]; - boolean allCharactersMatched = true; - int firstDifferentCharacterIndex = -1; - for (int p = 0; p < nodeInfo.mCharacters.length; ++p) { - if (wordPos + p >= codePoints.length) break; - if (codePoints[wordPos + p] != nodeInfo.mCharacters[p]) { - if (firstDifferentCharacterIndex == -1) { - firstDifferentCharacterIndex = p; - } - allCharactersMatched = false; - } - } - - if (!firstCharacterMatched) { - // Go to the next sibling node. - continue; - } - - if (!allCharactersMatched) { - final int parentNodeArrayStartPos = mDictBuffer.limit(); - splitAndBranch(nodeArrayPos, nodePos, nodeInfo, firstDifferentCharacterIndex, - parentPos, codePoints, wordPos + firstDifferentCharacterIndex, - newTerminalId, hasShortcuts, hasBigrams, isNotAWord, - isBlackListEntry, header.mFormatOptions); - - return parentNodeArrayStartPos + computePtNodeSize(codePoints, wordPos, - wordPos + firstDifferentCharacterIndex, false) - + FormatSpec.FORWARD_LINK_ADDRESS_SIZE + 1 /* size of PtNodeCount */; - } - - if (wordLen - wordPos < nodeInfo.mCharacters.length) { - final int parentNodeArrayStartPos = mDictBuffer.limit(); - splitOnly(nodeArrayPos, nodePos, nodeInfo, wordLen - wordPos, parentPos, - newTerminalId, hasShortcuts, hasBigrams, isNotAWord, isBlackListEntry, - header.mFormatOptions); - - // Return the position of the inserted word. - return parentNodeArrayStartPos + 1 /* size of PtNodeCount */; - } - - wordPos += nodeInfo.mCharacters.length; - if (wordPos == wordLen) { - // This dictionary already contains the word. - Log.e(TAG, "Something went wrong. If the word is already contained, " - + " there is no need to insert new PtNode."); - return FormatSpec.NOT_VALID_WORD; - } - if (nodeInfo.mChildrenPos == FormatSpec.NO_CHILDREN_ADDRESS) { - // There are no children. - // We need to add a new node as a child of this node. - final int newNodeArrayPos = mDictBuffer.limit(); - final int[] newNodeCodePoints = Arrays.copyOfRange(codePoints, wordPos, - codePoints.length); - writeNewSinglePtNodeWithAttributes(newNodeCodePoints, hasShortcuts, - newTerminalId, hasBigrams, isNotAWord, isBlackListEntry, nodePos, - header.mFormatOptions); - updateChildrenPos(nodePos, newNodeArrayPos, header.mFormatOptions); - return newNodeArrayPos + 1 /* size of PtNodeCount */; - } else { - // Found the matched node. - // Go to the children of this node. - setPosition(nodeInfo.mChildrenPos); - goToChildren = true; - depth++; - break; - } - } - - if (goToChildren) continue; - if (!readAndFollowForwardLink()) { - // Add a new node that contains [wordPos, word.length()-1]. - // and update the forward link. - final int newNodeArrayPos = mDictBuffer.limit(); - final int[] newCodePoints = Arrays.copyOfRange(codePoints, wordPos, - codePoints.length); - writeNewSinglePtNodeWithAttributes(newCodePoints, hasShortcuts, newTerminalId, - hasBigrams, isNotAWord, isBlackListEntry, parentPos, header.mFormatOptions); - updateForwardLink(nodeArrayPos, newNodeArrayPos, header.mFormatOptions); - return newNodeArrayPos + 1 /* size of PtNodeCount */; - } - } - return FormatSpec.NOT_VALID_WORD; - } - - private void updateFrequency(final int terminalId, final int frequency) { - mFrequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE); - BinaryDictEncoderUtils.writeUIntToDictBuffer(mFrequencyBuffer, frequency, - FormatSpec.FREQUENCY_AND_FLAGS_SIZE); - } - - private void insertFrequency(final int frequency) throws IOException { - final OutputStream frequencyStream = new FileOutputStream(mFrequencyFile, - true /* append */); - BinaryDictEncoderUtils.writeUIntToStream(frequencyStream, frequency, - FormatSpec.FREQUENCY_AND_FLAGS_SIZE); - frequencyStream.close(); - } - - private void insertTerminalPosition(final int posOfTerminal) throws IOException, - UnsupportedFormatException { - final OutputStream terminalPosStream = new FileOutputStream( - getFile(FILETYPE_TERMINAL_ADDRESS_TABLE), true /* append */); - BinaryDictEncoderUtils.writeUIntToStream(terminalPosStream, posOfTerminal, - FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); - terminalPosStream.close(); - } - - private void insertBigrams(final int terminalId, final int frequency, - final ArrayList<PendingAttribute> bigramAddresses) - throws IOException, UnsupportedFormatException { - openDictBuffer(); - final BigramContentUpdater updater = new BigramContentUpdater(mDictDirectory.getName(), - mDictDirectory, false); - - // Convert addresses to terminal ids. - final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList(); - mDictBuffer.position(0); - final FileHeader header = readHeader(); - for (PendingAttribute attr : bigramAddresses) { - mDictBuffer.position(attr.mAddress); - final Ver4PtNodeInfo info = readVer4PtNodeInfo(attr.mAddress, header.mFormatOptions); - if (info.mTerminalId == PtNode.NOT_A_TERMINAL) { - throw new RuntimeException("We can't have a bigram target that's not a terminal."); - } - bigrams.add(new PendingAttribute(frequency, info.mTerminalId)); - } - updater.insertBigramEntries(terminalId, frequency, bigrams); - close(); - } - - private void insertShortcuts(final int terminalId, final ArrayList<WeightedString> shortcuts) - throws IOException { - final ShortcutContentUpdater updater = new ShortcutContentUpdater(mDictDirectory.getName(), - mDictDirectory); - updater.insertShortcuts(terminalId, shortcuts); - } - - private void openBuffersAndStream() throws IOException, UnsupportedFormatException { - openDictBuffer(); - mDictStream = new FileOutputStream(getFile(FILETYPE_TRIE), true /* append */); - } - - private void close() throws IOException { - if (mDictStream != null) { - mDictStream.close(); - mDictStream = null; - } - mDictBuffer = null; - mFrequencyBuffer = null; - mTerminalAddressTableBuffer = null; - } - - private void updateAttributes(final int posOfWord, final int frequency, - final ArrayList<WeightedString> bigramStrings, - final ArrayList<WeightedString> shortcuts, final boolean isNotAWord, - final boolean isBlackListEntry) throws IOException, UnsupportedFormatException { - mDictBuffer.position(0); - final FileHeader header = readHeader(); - mDictBuffer.position(posOfWord); - final Ver4PtNodeInfo info = readVer4PtNodeInfo(posOfWord, header.mFormatOptions); - final int terminalId = info.mTerminalId; - - // Update the flags. - final int newFlags = setIsNotAWordInFlags( - setIsBlackListEntryInFlags(info.mFlags, isBlackListEntry), isNotAWord); - mDictBuffer.position(posOfWord); - mDictBuffer.put((byte) newFlags); - - updateFrequency(terminalId, frequency); - insertBigrams(terminalId, frequency, - DynamicBinaryDictIOUtils.resolveBigramPositions(this, bigramStrings)); - insertShortcuts(terminalId, shortcuts); - } - - @Override @UsedForTesting + @Override public void insertWord(final String word, final int frequency, final ArrayList<WeightedString> bigramStrings, final ArrayList<WeightedString> shortcuts, final boolean isNotAWord, final boolean isBlackListEntry) throws IOException, UnsupportedFormatException { - final int newTerminalId = getNewTerminalId(); - - openBuffersAndStream(); - final int posOfWord = getTerminalPosition(word); - if (posOfWord != FormatSpec.NOT_VALID_WORD) { - // The word is already contained in the dictionary. - updateAttributes(posOfWord, frequency, bigramStrings, shortcuts, isNotAWord, - isBlackListEntry); - close(); - return; - } - - // Insert new PtNode into trie. - final int posOfTerminal = insertWordToTrie(word, newTerminalId, isNotAWord, - isBlackListEntry, bigramStrings != null && !bigramStrings.isEmpty(), - shortcuts != null && !shortcuts.isEmpty()); - insertFrequency(frequency); - insertTerminalPosition(posOfTerminal); - close(); - - insertBigrams(newTerminalId, frequency, - DynamicBinaryDictIOUtils.resolveBigramPositions(this, bigramStrings)); - insertShortcuts(newTerminalId, shortcuts); + // TODO: Implement this method. } } |