diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict')
13 files changed, 521 insertions, 196 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java index 665c7a27c..2c3d1346f 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java @@ -295,7 +295,6 @@ public final class BinaryDictDecoderUtils { return address; } } - int address; switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: return dictBuffer.readUnsignedByte(); diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java index 6cc0bfb76..b6024243f 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java @@ -278,7 +278,6 @@ public class BinaryDictEncoderUtils { // For future reference, the code to remove duplicate is a simple : list.remove(node); list.add(ptNodeArray); final ArrayList<PtNode> branches = ptNodeArray.mData; - final int nodeSize = branches.size(); for (PtNode ptNode : branches) { if (null != ptNode.mChildren) flattenTreeInner(list, ptNode.mChildren); } @@ -385,12 +384,14 @@ public class BinaryDictEncoderUtils { nodeSize + size, ptNode.mChildren)); } nodeSize += getShortcutListSize(ptNode.mShortcutTargets); - if (null != ptNode.mBigrams) { - for (WeightedString bigram : ptNode.mBigrams) { - final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, - nodeSize + size + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE, - FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord)); - nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; + if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) { + if (null != ptNode.mBigrams) { + for (WeightedString bigram : ptNode.mBigrams) { + final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, + nodeSize + size + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE, + FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord)); + nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; + } } } ptNode.mCachedSize = nodeSize; @@ -425,9 +426,6 @@ public class BinaryDictEncoderUtils { nodeCountSize + nodeArrayOffset + nodeffset; nodeffset += ptNode.mCachedSize; } - final int nodeSize = nodeCountSize + nodeffset - + (formatOptions.mSupportsDynamicUpdate - ? FormatSpec.FORWARD_LINK_ADDRESS_SIZE : 0); nodeArrayOffset += nodeArray.mCachedSize; } return nodeArrayOffset; @@ -651,8 +649,8 @@ public class BinaryDictEncoderUtils { return flags; } - /* package */ static byte makePtNodeFlags(final PtNode node, final int ptNodeAddress, - final int childrenOffset, final FormatOptions formatOptions) { + /* package */ static byte makePtNodeFlags(final PtNode node, final int childrenOffset, + final FormatOptions formatOptions) { return (byte) makePtNodeFlags(node.mChars.length > 1, node.mFrequency >= 0, getByteSize(childrenOffset), node.mShortcutTargets != null && !node.mShortcutTargets.isEmpty(), diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java index a282f595c..0f7d2f6c9 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java @@ -288,40 +288,6 @@ public final class BinaryDictIOUtils { return BinaryDictEncoderUtils.getByteSize(value); } - static void skipPtNode(final DictBuffer dictBuffer, final FormatOptions formatOptions) { - final int flags = dictBuffer.readUnsignedByte(); - BinaryDictDecoderUtils.readParentAddress(dictBuffer, formatOptions); - skipString(dictBuffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); - BinaryDictDecoderUtils.readChildrenAddress(dictBuffer, flags, formatOptions); - if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) dictBuffer.readUnsignedByte(); - if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) { - final int shortcutsSize = dictBuffer.readUnsignedShort(); - dictBuffer.position(dictBuffer.position() + shortcutsSize - - FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); - } - if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) { - int bigramCount = 0; - while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - final int bigramFlags = dictBuffer.readUnsignedByte(); - switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) { - case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE: - dictBuffer.readUnsignedByte(); - break; - case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES: - dictBuffer.readUnsignedShort(); - break; - case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES: - dictBuffer.readUnsignedInt24(); - break; - } - if ((bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT) == 0) break; - } - if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - throw new RuntimeException("Too many bigrams in a PtNode."); - } - } - } - static void skipString(final DictBuffer dictBuffer, final boolean hasMultipleChars) { if (hasMultipleChars) { diff --git a/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java index 3796a466c..e251f7df7 100644 --- a/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java @@ -391,4 +391,6 @@ public abstract class DictDecoder { return readLength; } } + + public abstract void skipPtNode(final FormatOptions formatOptions); } diff --git a/java/src/com/android/inputmethod/latin/makedict/DictUpdater.java b/java/src/com/android/inputmethod/latin/makedict/DictUpdater.java new file mode 100644 index 000000000..413d0301c --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/DictUpdater.java @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; + +import java.io.IOException; +import java.util.ArrayList; + +/** + * An interface of a binary dictionary updater. + */ +public interface DictUpdater { + + /** + * Deletes the word from the binary dictionary. + * + * @param word the word to be deleted. + */ + public void deleteWord(final String word) throws IOException, UnsupportedFormatException; + + /** + * Inserts a word into a binary dictionary. + * + * @param word the word to be inserted. + * @param frequency the frequency of the new word. + * @param bigramStrings bigram list, or null if none. + * @param shortcuts shortcut list, or null if none. + * @param isBlackListEntry whether this should be a blacklist entry. + */ + // TODO: Support batch insertion. + public void insertWord(final String word, final int frequency, + final ArrayList<WeightedString> bigramStrings, + final ArrayList<WeightedString> shortcuts, final boolean isNotAWord, + final boolean isBlackListEntry) throws IOException, UnsupportedFormatException; +} diff --git a/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java index bf3d19101..336277196 100644 --- a/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java @@ -42,44 +42,22 @@ public final class DynamicBinaryDictIOUtils { // This utility class is not publicly instantiable. } - private static int markAsDeleted(final int flags) { + /* package */ static int markAsDeleted(final int flags) { return (flags & (~FormatSpec.MASK_CHILDREN_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED; } /** - * Delete the word from the binary file. - * - * @param dictDecoder the dict decoder. - * @param word the word we delete - * @throws IOException - * @throws UnsupportedFormatException - */ - @UsedForTesting - public static void deleteWord(final Ver3DictDecoder dictDecoder, final String word) - throws IOException, UnsupportedFormatException { - final DictBuffer dictBuffer = dictDecoder.getDictBuffer(); - dictBuffer.position(0); - final FileHeader header = dictDecoder.readHeader(); - final int wordPosition = dictDecoder.getTerminalPosition(word); - if (wordPosition == FormatSpec.NOT_VALID_WORD) return; - - dictBuffer.position(wordPosition); - final int flags = dictBuffer.readUnsignedByte(); - dictBuffer.position(wordPosition); - dictBuffer.put((byte)markAsDeleted(flags)); - } - - /** * Update a parent address in a PtNode that is referred to by ptNodeOriginAddress. * - * @param dictBuffer the DictBuffer to write. + * @param dictUpdater the DictUpdater to write. * @param ptNodeOriginAddress the address of the PtNode. * @param newParentAddress the absolute address of the parent. * @param formatOptions file format options. */ - public static void updateParentAddress(final DictBuffer dictBuffer, + private static void updateParentAddress(final Ver3DictUpdater dictUpdater, final int ptNodeOriginAddress, final int newParentAddress, final FormatOptions formatOptions) { + final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); final int originalPosition = dictBuffer.position(); dictBuffer.position(ptNodeOriginAddress); if (!formatOptions.mSupportsDynamicUpdate) { @@ -104,46 +82,45 @@ public final class DynamicBinaryDictIOUtils { /** * Update parent addresses in a node array stored at ptNodeOriginAddress. * - * @param dictBuffer the DictBuffer to be modified. + * @param dictUpdater the DictUpdater to be modified. * @param ptNodeOriginAddress the address of the node array to update. * @param newParentAddress the address to be written. * @param formatOptions file format options. */ - public static void updateParentAddresses(final DictBuffer dictBuffer, + private static void updateParentAddresses(final Ver3DictUpdater dictUpdater, final int ptNodeOriginAddress, final int newParentAddress, final FormatOptions formatOptions) { - final int originalPosition = dictBuffer.position(); - dictBuffer.position(ptNodeOriginAddress); + final int originalPosition = dictUpdater.getPosition(); + dictUpdater.setPosition(ptNodeOriginAddress); do { - final int count = BinaryDictDecoderUtils.readPtNodeCount(dictBuffer); + final int count = dictUpdater.readPtNodeCount(); for (int i = 0; i < count; ++i) { - updateParentAddress(dictBuffer, dictBuffer.position(), newParentAddress, + updateParentAddress(dictUpdater, dictUpdater.getPosition(), newParentAddress, formatOptions); - BinaryDictIOUtils.skipPtNode(dictBuffer, formatOptions); + dictUpdater.skipPtNode(formatOptions); } - final int forwardLinkAddress = dictBuffer.readUnsignedInt24(); - dictBuffer.position(forwardLinkAddress); - } while (formatOptions.mSupportsDynamicUpdate - && dictBuffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS); - dictBuffer.position(originalPosition); + if (!dictUpdater.readAndFollowForwardLink()) break; + if (dictUpdater.getPosition() == FormatSpec.NO_FORWARD_LINK_ADDRESS) break; + } while (formatOptions.mSupportsDynamicUpdate); + dictUpdater.setPosition(originalPosition); } /** * Update a children address in a PtNode that is addressed by ptNodeOriginAddress. * - * @param dictBuffer the DictBuffer to write. + * @param dictUpdater the DictUpdater to write. * @param ptNodeOriginAddress the address of the PtNode. * @param newChildrenAddress the absolute address of the child. * @param formatOptions file format options. */ - public static void updateChildrenAddress(final DictBuffer dictBuffer, + private static void updateChildrenAddress(final Ver3DictUpdater dictUpdater, final int ptNodeOriginAddress, final int newChildrenAddress, final FormatOptions formatOptions) { + final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); final int originalPosition = dictBuffer.position(); dictBuffer.position(ptNodeOriginAddress); final int flags = dictBuffer.readUnsignedByte(); - final int parentAddress = BinaryDictDecoderUtils.readParentAddress(dictBuffer, - formatOptions); + BinaryDictDecoderUtils.readParentAddress(dictBuffer, formatOptions); BinaryDictIOUtils.skipString(dictBuffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) dictBuffer.readUnsignedByte(); final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS @@ -156,31 +133,33 @@ public final class DynamicBinaryDictIOUtils { * Helper method to move a PtNode to the tail of the file. */ private static int movePtNode(final OutputStream destination, - final DictBuffer dictBuffer, final PtNodeInfo info, + final Ver3DictUpdater dictUpdater, final PtNodeInfo info, final int nodeArrayOriginAddress, final int oldNodeAddress, final FormatOptions formatOptions) throws IOException { - updateParentAddress(dictBuffer, oldNodeAddress, dictBuffer.limit() + 1, formatOptions); + final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); + updateParentAddress(dictUpdater, oldNodeAddress, dictBuffer.limit() + 1, formatOptions); dictBuffer.position(oldNodeAddress); final int currentFlags = dictBuffer.readUnsignedByte(); dictBuffer.position(oldNodeAddress); dictBuffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG)))); int size = FormatSpec.PTNODE_FLAGS_SIZE; - updateForwardLink(dictBuffer, nodeArrayOriginAddress, dictBuffer.limit(), formatOptions); + updateForwardLink(dictUpdater, nodeArrayOriginAddress, dictBuffer.limit(), formatOptions); size += BinaryDictIOUtils.writeNodes(destination, new PtNodeInfo[] { info }); return size; } @SuppressWarnings("unused") - private static void updateForwardLink(final DictBuffer dictBuffer, + private static void updateForwardLink(final Ver3DictUpdater dictUpdater, final int nodeArrayOriginAddress, final int newNodeArrayAddress, final FormatOptions formatOptions) { - dictBuffer.position(nodeArrayOriginAddress); + final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); + dictUpdater.setPosition(nodeArrayOriginAddress); int jumpCount = 0; while (jumpCount++ < MAX_JUMPS) { - final int count = BinaryDictDecoderUtils.readPtNodeCount(dictBuffer); + final int count = dictUpdater.readPtNodeCount(); for (int i = 0; i < count; ++i) { - BinaryDictIOUtils.skipPtNode(dictBuffer, formatOptions); + dictUpdater.readPtNode(dictUpdater.getPosition(), formatOptions); } final int forwardLinkAddress = dictBuffer.readUnsignedInt24(); if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { @@ -208,7 +187,7 @@ public final class DynamicBinaryDictIOUtils { * @param shortcutTargets the shortcut targets for this PtNode. * @param bigrams the bigrams for this PtNode. * @param destination the stream representing the tail of the file. - * @param dictBuffer the DictBuffer representing the (constant-size) body of the file. + * @param dictUpdater the DictUpdater. * @param oldPtNodeArrayOrigin the origin of the old PtNode array this PtNode was a part of. * @param oldPtNodeOrigin the old origin where this PtNode used to be stored. * @param formatOptions format options for this dictionary. @@ -219,7 +198,7 @@ public final class DynamicBinaryDictIOUtils { final int length, final int flags, final int frequency, final int parentAddress, final ArrayList<WeightedString> shortcutTargets, final ArrayList<PendingAttribute> bigrams, final OutputStream destination, - final DictBuffer dictBuffer, final int oldPtNodeArrayOrigin, + final Ver3DictUpdater dictUpdater, final int oldPtNodeArrayOrigin, final int oldPtNodeOrigin, final FormatOptions formatOptions) throws IOException { int size = 0; final int newPtNodeOrigin = fileEndAddress + 1; @@ -232,7 +211,7 @@ public final class DynamicBinaryDictIOUtils { flags, writtenCharacters, frequency, parentAddress, fileEndAddress + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets, bigrams); - movePtNode(destination, dictBuffer, newInfo, oldPtNodeArrayOrigin, oldPtNodeOrigin, + movePtNode(destination, dictUpdater, newInfo, oldPtNodeArrayOrigin, oldPtNodeOrigin, formatOptions); return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; } @@ -240,7 +219,7 @@ public final class DynamicBinaryDictIOUtils { /** * Insert a word into a binary dictionary. * - * @param dictDecoder the dict decoder. + * @param dictUpdater the dict updater. * @param destination a stream to the underlying file, with the pointer at the end of the file. * @param word the word to insert. * @param frequency the frequency of the new word. @@ -253,17 +232,17 @@ public final class DynamicBinaryDictIOUtils { // TODO: Support batch insertion. // TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary. @UsedForTesting - public static void insertWord(final Ver3DictDecoder dictDecoder, + public static void insertWord(final Ver3DictUpdater dictUpdater, final OutputStream destination, final String word, final int frequency, final ArrayList<WeightedString> bigramStrings, final ArrayList<WeightedString> shortcuts, final boolean isNotAWord, final boolean isBlackListEntry) throws IOException, UnsupportedFormatException { final ArrayList<PendingAttribute> bigrams = new ArrayList<PendingAttribute>(); - final DictBuffer dictBuffer = dictDecoder.getDictBuffer(); + final DictBuffer dictBuffer = dictUpdater.getDictBuffer(); if (bigramStrings != null) { for (final WeightedString bigram : bigramStrings) { - int position = dictDecoder.getTerminalPosition(bigram.mWord); + int position = dictUpdater.getTerminalPosition(bigram.mWord); if (position == FormatSpec.NOT_VALID_WORD) { // TODO: figure out what is the correct thing to do here. } else { @@ -278,7 +257,7 @@ public final class DynamicBinaryDictIOUtils { // find the insert position of the word. if (dictBuffer.position() != 0) dictBuffer.position(0); - final FileHeader fileHeader = dictDecoder.readHeader(); + final FileHeader fileHeader = dictUpdater.readHeader(); int wordPos = 0, address = dictBuffer.position(), nodeOriginAddress = dictBuffer.position(); final int[] codePoints = FusionDictionary.getCodePoints(word); @@ -293,7 +272,7 @@ public final class DynamicBinaryDictIOUtils { for (int i = 0; i < ptNodeCount; ++i) { address = dictBuffer.position(); - final PtNodeInfo currentInfo = dictDecoder.readPtNode(address, + final PtNodeInfo currentInfo = dictUpdater.readPtNode(address, fileHeader.mFormatOptions); final boolean isMovedNode = BinaryDictIOUtils.isMovedPtNode(currentInfo.mFlags, fileHeader.mFormatOptions); @@ -319,12 +298,12 @@ public final class DynamicBinaryDictIOUtils { false /* isBlackListEntry */, fileHeader.mFormatOptions); int written = movePtNode(newNodeAddress, currentInfo.mCharacters, p, flags, frequency, nodeParentAddress, shortcuts, bigrams, destination, - dictBuffer, nodeOriginAddress, address, fileHeader.mFormatOptions); + dictUpdater, nodeOriginAddress, address, fileHeader.mFormatOptions); final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p, currentInfo.mCharacters.length); if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { - updateParentAddresses(dictBuffer, currentInfo.mChildrenAddress, + updateParentAddresses(dictUpdater, currentInfo.mChildrenAddress, newNodeAddress + written + 1, fileHeader.mFormatOptions); } final PtNodeInfo newInfo2 = new PtNodeInfo( @@ -360,13 +339,13 @@ public final class DynamicBinaryDictIOUtils { fileHeader.mFormatOptions); int written = movePtNode(newNodeAddress, currentInfo.mCharacters, p, prefixFlags, -1 /* frequency */, nodeParentAddress, null, null, - destination, dictBuffer, nodeOriginAddress, address, + destination, dictUpdater, nodeOriginAddress, address, fileHeader.mFormatOptions); final int[] suffixCharacters = Arrays.copyOfRange( currentInfo.mCharacters, p, currentInfo.mCharacters.length); if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { - updateParentAddresses(dictBuffer, currentInfo.mChildrenAddress, + updateParentAddresses(dictUpdater, currentInfo.mChildrenAddress, newNodeAddress + written + 1, fileHeader.mFormatOptions); } final int suffixFlags = BinaryDictEncoderUtils.makePtNodeFlags( @@ -417,7 +396,7 @@ public final class DynamicBinaryDictIOUtils { -1 /* endAddress */, flags, currentInfo.mCharacters, frequency, nodeParentAddress, currentInfo.mChildrenAddress, shortcuts, bigrams); - movePtNode(destination, dictBuffer, newInfo, nodeOriginAddress, address, + movePtNode(destination, dictUpdater, newInfo, nodeOriginAddress, address, fileHeader.mFormatOptions); return; } @@ -436,7 +415,7 @@ public final class DynamicBinaryDictIOUtils { * ab - cd - e */ final int newNodeArrayAddress = dictBuffer.limit(); - updateChildrenAddress(dictBuffer, address, newNodeArrayAddress, + updateChildrenAddress(dictUpdater, address, newNodeArrayAddress, fileHeader.mFormatOptions); final int newNodeAddress = newNodeArrayAddress + 1; final boolean hasMultipleChars = (wordLen - wordPos) > 1; diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index 849bff050..a5516bd41 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -265,8 +265,15 @@ public final class FormatSpec { static final String FREQ_FILE_EXTENSION = ".freq"; // tat = Terminal Address Table static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; + static final String BIGRAM_FILE_EXTENSION = ".bigram"; + static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup"; + static final String CONTENT_TABLE_FILE_SUFFIX = "_index"; static final int FREQUENCY_AND_FLAGS_SIZE = 2; static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; + static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4; + static final int BIGRAM_CONTENT_COUNT = 1; + static final int BIGRAM_FREQ_CONTENT_INDEX = 0; + static final String BIGRAM_FREQ_CONTENT_ID = "_freq"; static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; static final int NO_PARENT_ADDRESS = 0; @@ -331,9 +338,9 @@ public final class FormatSpec { public static final String USES_FORGETTING_CURVE_ATTRIBUTE = "USES_FORGETTING_CURVE"; public static final String ATTRIBUTE_VALUE_TRUE = "1"; - private static final String DICTIONARY_VERSION_ATTRIBUTE = "version"; - private static final String DICTIONARY_LOCALE_ATTRIBUTE = "locale"; - private static final String DICTIONARY_ID_ATTRIBUTE = "dictionary"; + public static final String DICTIONARY_VERSION_ATTRIBUTE = "version"; + public static final String DICTIONARY_LOCALE_ATTRIBUTE = "locale"; + public static final String DICTIONARY_ID_ATTRIBUTE = "dictionary"; private static final String DICTIONARY_DESCRIPTION_ATTRIBUTE = "description"; public FileHeader(final int headerSize, final DictionaryOptions dictionaryOptions, final FormatOptions formatOptions) { diff --git a/java/src/com/android/inputmethod/latin/makedict/SparseTable.java b/java/src/com/android/inputmethod/latin/makedict/SparseTable.java index 0b9cf91d2..7592a0c13 100644 --- a/java/src/com/android/inputmethod/latin/makedict/SparseTable.java +++ b/java/src/com/android/inputmethod/latin/makedict/SparseTable.java @@ -17,7 +17,11 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.utils.CollectionUtils; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; @@ -34,35 +38,39 @@ public class SparseTable { /** * mLookupTable is indexed by terminal ID, containing exactly one entry for every mBlockSize * terminals. - * It contains at index i = j / mBlockSize the index in mContentsTable where the values for - * terminals with IDs j to j + mBlockSize - 1 are stored as an mBlockSize-sized integer array. + * It contains at index i = j / mBlockSize the index in each ArrayList in mContentsTables where + * the values for terminals with IDs j to j + mBlockSize - 1 are stored as an mBlockSize-sized + * integer array. */ private final ArrayList<Integer> mLookupTable; - private final ArrayList<Integer> mContentTable; + private final ArrayList<ArrayList<Integer>> mContentTables; private final int mBlockSize; + private final int mContentTableCount; public static final int NOT_EXIST = -1; + public static final int SIZE_OF_INT_IN_BYTES = 4; @UsedForTesting - public SparseTable(final int initialCapacity, final int blockSize) { + public SparseTable(final int initialCapacity, final int blockSize, + final int contentTableCount) { mBlockSize = blockSize; final int lookupTableSize = initialCapacity / mBlockSize + (initialCapacity % mBlockSize > 0 ? 1 : 0); mLookupTable = new ArrayList<Integer>(Collections.nCopies(lookupTableSize, NOT_EXIST)); - mContentTable = new ArrayList<Integer>(); + mContentTableCount = contentTableCount; + mContentTables = CollectionUtils.newArrayList(); + for (int i = 0; i < mContentTableCount; ++i) { + mContentTables.add(new ArrayList<Integer>()); + } } @UsedForTesting - public SparseTable(final int[] lookupTable, final int[] contentTable, final int blockSize) { + public SparseTable(final ArrayList<Integer> lookupTable, + final ArrayList<ArrayList<Integer>> contentTables, final int blockSize) { mBlockSize = blockSize; - mLookupTable = new ArrayList<Integer>(lookupTable.length); - for (int i = 0; i < lookupTable.length; ++i) { - mLookupTable.add(lookupTable[i]); - } - mContentTable = new ArrayList<Integer>(contentTable.length); - for (int i = 0; i < contentTable.length; ++i) { - mContentTable.add(contentTable[i]); - } + mContentTableCount = contentTables.size(); + mLookupTable = lookupTable; + mContentTables = contentTables; } /** @@ -72,8 +80,8 @@ public class SparseTable { * Otherwise, IndexOutOfBoundsException will be raised. */ @UsedForTesting - private static void convertByteArrayToIntegerArray(final byte[] byteArray, - final ArrayList<Integer> integerArray) { + private static ArrayList<Integer> convertByteArrayToIntegerArray(final byte[] byteArray) { + final ArrayList<Integer> integerArray = new ArrayList<Integer>(byteArray.length / 4); for (int i = 0; i < byteArray.length; i += 4) { int value = 0; for (int j = i; j < i + 4; ++j) { @@ -82,39 +90,43 @@ public class SparseTable { } integerArray.add(value); } + return integerArray; } @UsedForTesting - public SparseTable(final byte[] lookupTable, final byte[] contentTable, final int blockSize) { - mBlockSize = blockSize; - mLookupTable = new ArrayList<Integer>(lookupTable.length / 4); - mContentTable = new ArrayList<Integer>(contentTable.length / 4); - convertByteArrayToIntegerArray(lookupTable, mLookupTable); - convertByteArrayToIntegerArray(contentTable, mContentTable); + public int get(final int contentTableIndex, final int index) { + if (!contains(index)) { + return NOT_EXIST; + } + return mContentTables.get(contentTableIndex).get( + mLookupTable.get(index / mBlockSize) + (index % mBlockSize)); } @UsedForTesting - public int get(final int index) { - if (index < 0 || index / mBlockSize >= mLookupTable.size() - || mLookupTable.get(index / mBlockSize) == NOT_EXIST) { - return NOT_EXIST; + public ArrayList<Integer> getAll(final int index) { + final ArrayList<Integer> ret = CollectionUtils.newArrayList(); + for (int i = 0; i < mContentTableCount; ++i) { + ret.add(get(i, index)); } - return mContentTable.get(mLookupTable.get(index / mBlockSize) + (index % mBlockSize)); + return ret; } @UsedForTesting - public void set(final int index, final int value) { + public void set(final int contentTableIndex, final int index, final int value) { if (mLookupTable.get(index / mBlockSize) == NOT_EXIST) { - mLookupTable.set(index / mBlockSize, mContentTable.size()); - for (int i = 0; i < mBlockSize; ++i) { - mContentTable.add(NOT_EXIST); + mLookupTable.set(index / mBlockSize, mContentTables.get(contentTableIndex).size()); + for (int i = 0; i < mContentTableCount; ++i) { + for (int j = 0; j < mBlockSize; ++j) { + mContentTables.get(i).add(NOT_EXIST); + } } } - mContentTable.set(mLookupTable.get(index / mBlockSize) + (index % mBlockSize), value); + mContentTables.get(contentTableIndex).set( + mLookupTable.get(index / mBlockSize) + (index % mBlockSize), value); } - public void remove(final int index) { - set(index, NOT_EXIST); + public void remove(final int indexOfContent, final int index) { + set(indexOfContent, index, NOT_EXIST); } @UsedForTesting @@ -124,7 +136,8 @@ public class SparseTable { @UsedForTesting /* package */ int getContentTableSize() { - return mContentTable.size(); + // This class always has at least one content table. + return mContentTables.get(0).size(); } @UsedForTesting @@ -133,18 +146,78 @@ public class SparseTable { } public boolean contains(final int index) { - return get(index) != NOT_EXIST; + if (index < 0 || index / mBlockSize >= mLookupTable.size() + || mLookupTable.get(index / mBlockSize) == NOT_EXIST) { + return false; + } + return true; } @UsedForTesting - public void write(final OutputStream lookupOutStream, final OutputStream contentOutStream) + public void write(final OutputStream lookupOutStream, final OutputStream[] contentOutStreams) throws IOException { + if (contentOutStreams.length != mContentTableCount) { + throw new RuntimeException(contentOutStreams.length + " streams are given, but the" + + " table has " + mContentTableCount + " content tables."); + } for (final int index : mLookupTable) { - BinaryDictEncoderUtils.writeUIntToStream(lookupOutStream, index, 4); + BinaryDictEncoderUtils.writeUIntToStream(lookupOutStream, index, SIZE_OF_INT_IN_BYTES); } - for (final int index : mContentTable) { - BinaryDictEncoderUtils.writeUIntToStream(contentOutStream, index, 4); + for (int i = 0; i < contentOutStreams.length; ++i) { + for (final int data : mContentTables.get(i)) { + BinaryDictEncoderUtils.writeUIntToStream(contentOutStreams[i], data, + SIZE_OF_INT_IN_BYTES); + } + } + } + + @UsedForTesting + public void writeToFiles(final File lookupTableFile, final File[] contentFiles) + throws IOException { + FileOutputStream lookupTableOutStream = null; + final FileOutputStream[] contentTableOutStreams = new FileOutputStream[mContentTableCount]; + try { + lookupTableOutStream = new FileOutputStream(lookupTableFile); + for (int i = 0; i < contentFiles.length; ++i) { + contentTableOutStreams[i] = new FileOutputStream(contentFiles[i]); + } + write(lookupTableOutStream, contentTableOutStreams); + } finally { + if (lookupTableOutStream != null) { + lookupTableOutStream.close(); + } + for (int i = 0; i < contentTableOutStreams.length; ++i) { + if (contentTableOutStreams[i] != null) { + contentTableOutStreams[i].close(); + } + } + } + } + + private static byte[] readFileToByteArray(final File file) throws IOException { + final byte[] contents = new byte[(int) file.length()]; + FileInputStream inStream = null; + try { + inStream = new FileInputStream(file); + inStream.read(contents); + } finally { + if (inStream != null) { + inStream.close(); + } + } + return contents; + } + + @UsedForTesting + public static SparseTable readFromFiles(final File lookupTableFile, final File[] contentFiles, + final int blockSize) throws IOException { + final ArrayList<ArrayList<Integer>> contentTables = + new ArrayList<ArrayList<Integer>>(contentFiles.length); + for (int i = 0; i < contentFiles.length; ++i) { + contentTables.add(convertByteArrayToIntegerArray(readFileToByteArray(contentFiles[i]))); } + return new SparseTable(convertByteArrayToIntegerArray(readFileToByteArray(lookupTableFile)), + contentTables, blockSize); } } diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java index 848277cd4..b87259c38 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java @@ -53,9 +53,9 @@ public class Ver3DictDecoder extends DictDecoder { } } - private final File mDictionaryBinaryFile; + protected final File mDictionaryBinaryFile; private final DictionaryBufferFactory mBufferFactory; - private DictBuffer mDictBuffer; + protected DictBuffer mDictBuffer; /* package */ Ver3DictDecoder(final File file, final int factoryFlag) { mDictionaryBinaryFile = file; @@ -169,7 +169,8 @@ public class Ver3DictDecoder extends DictDecoder { addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams, addressPointer); if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - MakedictLog.d("too many bigrams in a PtNode."); + throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size() + + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); } } else { bigrams = null; @@ -231,4 +232,40 @@ public class Ver3DictDecoder extends DictDecoder { public boolean hasNextPtNodeArray() { return mDictBuffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS; } + + @Override + public void skipPtNode(final FormatOptions formatOptions) { + final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); + PtNodeReader.readParentAddress(mDictBuffer, formatOptions); + BinaryDictIOUtils.skipString(mDictBuffer, + (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); + PtNodeReader.readChildrenAddress(mDictBuffer, flags, formatOptions); + if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) PtNodeReader.readFrequency(mDictBuffer); + if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) { + final int shortcutsSize = mDictBuffer.readUnsignedShort(); + mDictBuffer.position(mDictBuffer.position() + shortcutsSize + - FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); + } + if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) { + int bigramCount = 0; + while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + final int bigramFlags = mDictBuffer.readUnsignedByte(); + switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) { + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE: + mDictBuffer.readUnsignedByte(); + break; + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES: + mDictBuffer.readUnsignedShort(); + break; + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES: + mDictBuffer.readUnsignedInt24(); + break; + } + if ((bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT) == 0) break; + } + if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + throw new RuntimeException("Too many bigrams in a PtNode."); + } + } + } } diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java index 76f0f4052..d9e19899c 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java @@ -133,12 +133,10 @@ public class Ver3DictEncoder implements DictEncoder { countSize); } - private void writePtNodeFlags(final PtNode ptNode, final int parentAddress, - final FormatOptions formatOptions) { + private void writePtNodeFlags(final PtNode ptNode, final FormatOptions formatOptions) { final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, - BinaryDictEncoderUtils.makePtNodeFlags(ptNode, mPosition, childrenPos, - formatOptions), + BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos, formatOptions), FormatSpec.PTNODE_FLAGS_SIZE); } @@ -244,7 +242,7 @@ public class Ver3DictEncoder implements DictEncoder { @Override public void writePtNode(final PtNode ptNode, final int parentPosition, final FormatOptions formatOptions, final FusionDictionary dict) { - writePtNodeFlags(ptNode, parentPosition, formatOptions); + writePtNodeFlags(ptNode, formatOptions); writeParentPosition(parentPosition, ptNode, formatOptions); writeCharacters(ptNode.mChars, ptNode.hasSeveralChars()); writeFrequency(ptNode.mFrequency); diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver3DictUpdater.java b/java/src/com/android/inputmethod/latin/makedict/Ver3DictUpdater.java new file mode 100644 index 000000000..fa7ae310a --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/Ver3DictUpdater.java @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; + +/** + * An implementation of DictUpdater for version 3 binary dictionary. + */ +@UsedForTesting +public class Ver3DictUpdater extends Ver3DictDecoder implements DictUpdater { + private OutputStream mOutStream; + + @UsedForTesting + public Ver3DictUpdater(final File dictFile, final int factoryType) { + // DictUpdater must have an updatable DictBuffer. + super(dictFile, ((factoryType & MASK_DICTBUFFER) == USE_BYTEARRAY) + ? USE_BYTEARRAY : USE_WRITABLE_BYTEBUFFER); + mOutStream = null; + } + + private void openStreamAndBuffer() throws FileNotFoundException, IOException { + super.openDictBuffer(); + mOutStream = new FileOutputStream(mDictionaryBinaryFile, true /* append */); + } + + private void close() throws IOException { + if (mOutStream != null) { + mOutStream.close(); + mOutStream = null; + } + } + + @Override @UsedForTesting + public void deleteWord(final String word) throws IOException, UnsupportedFormatException { + if (mOutStream == null) openStreamAndBuffer(); + mDictBuffer.position(0); + super.readHeader(); + final int wordPos = getTerminalPosition(word); + if (wordPos != FormatSpec.NOT_VALID_WORD) { + mDictBuffer.position(wordPos); + final int flags = mDictBuffer.readUnsignedByte(); + mDictBuffer.position(wordPos); + mDictBuffer.put((byte) DynamicBinaryDictIOUtils.markAsDeleted(flags)); + } + close(); + } + + @Override @UsedForTesting + public void insertWord(final String word, final int frequency, + final ArrayList<WeightedString> bigramStrings, + final ArrayList<WeightedString> shortcuts, + final boolean isNotAWord, final boolean isBlackListEntry) + throws IOException, UnsupportedFormatException { + if (mOutStream == null) openStreamAndBuffer(); + DynamicBinaryDictIOUtils.insertWord(this, mOutStream, word, frequency, bigramStrings, + shortcuts, isNotAWord, isBlackListEntry); + close(); + } +} diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java index 4c8ff8ea4..5089687da 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java @@ -42,12 +42,15 @@ public class Ver4DictDecoder extends DictDecoder { private static final int FILETYPE_TRIE = 1; private static final int FILETYPE_FREQUENCY = 2; private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; + private static final int FILETYPE_BIGRAM_FREQ = 4; private final File mDictDirectory; private final DictionaryBufferFactory mBufferFactory; private DictBuffer mDictBuffer; private DictBuffer mFrequencyBuffer; private DictBuffer mTerminalAddressTableBuffer; + private DictBuffer mBigramBuffer; + private SparseTable mBigramAddressTable; @UsedForTesting /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) { @@ -82,6 +85,10 @@ public class Ver4DictDecoder extends DictDecoder { } else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) { return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); + } else if (fileType == FILETYPE_BIGRAM_FREQ) { + return new File(mDictDirectory, + mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION + + FormatSpec.BIGRAM_FREQ_CONTENT_ID); } else { throw new RuntimeException("Unsupported kind of file : " + fileType); } @@ -89,11 +96,12 @@ public class Ver4DictDecoder extends DictDecoder { @Override public void openDictBuffer() throws FileNotFoundException, IOException { - final String filename = mDictDirectory.getName(); mDictBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_TRIE)); mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY)); mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer( getFile(FILETYPE_TERMINAL_ADDRESS_TABLE)); + mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ)); + loadBigramAddressSparseTable(); } @Override @@ -118,6 +126,16 @@ public class Ver4DictDecoder extends DictDecoder { return header; } + private void loadBigramAddressSparseTable() throws IOException { + final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); + final File freqsFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + + FormatSpec.BIGRAM_FREQ_CONTENT_ID); + mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { freqsFile }, + FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE); + } + protected static class PtNodeReader extends DictDecoder.PtNodeReader { protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) { frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1); @@ -191,10 +209,24 @@ public class Ver4DictDecoder extends DictDecoder { final ArrayList<PendingAttribute> bigrams; if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { bigrams = new ArrayList<PendingAttribute>(); - addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams, - addressPointer); + final int posOfBigrams = mBigramAddressTable.get(0 /* contentTableIndex */, terminalId); + mBigramBuffer.position(posOfBigrams); + while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE, + // remaining bigram entries are ignored. + final int bigramFlags = mBigramBuffer.readUnsignedByte(); + final int targetTerminalId = mBigramBuffer.readUnsignedInt24(); + mTerminalAddressTableBuffer.position( + targetTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); + final int targetAddress = mTerminalAddressTableBuffer.readUnsignedInt24(); + bigrams.add(new PendingAttribute( + bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, + targetAddress)); + if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; + } if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - MakedictLog.d("too many bigrams in a node."); + throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size() + + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); } } else { bigrams = null; @@ -263,4 +295,14 @@ public class Ver4DictDecoder extends DictDecoder { public boolean hasNextPtNodeArray() { return mDictBuffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS; } + + @Override + public void skipPtNode(final FormatOptions formatOptions) { + final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); + PtNodeReader.readParentAddress(mDictBuffer, formatOptions); + BinaryDictIOUtils.skipString(mDictBuffer, + (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); + if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) PtNodeReader.readTerminalId(mDictBuffer); + PtNodeReader.readChildrenAddress(mDictBuffer, flags, formatOptions); + } } diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java index 4fb89671f..b38c33019 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java @@ -46,21 +46,121 @@ public class Ver4DictEncoder implements DictEncoder { private OutputStream mTrieOutStream; private OutputStream mFreqOutStream; private OutputStream mTerminalAddressTableOutStream; + private File mDictDir; + private String mBaseFilename; + private BigramContentWriter mBigramWriter; @UsedForTesting public Ver4DictEncoder(final File dictPlacedDir) { mDictPlacedDir = dictPlacedDir; } + private interface SparseTableContentWriterInterface { + public void write(final OutputStream outStream) throws IOException; + } + + private static class SparseTableContentWriter { + private final int mContentCount; + private final SparseTable mSparseTable; + private final File mLookupTableFile; + protected final File mBaseDir; + private final File[] mAddressTableFiles; + private final File[] mContentFiles; + protected final OutputStream[] mContentOutStreams; + + public SparseTableContentWriter(final String name, final int contentCount, + final int initialCapacity, final int blockSize, final File baseDir, + final String[] contentFilenames, final String[] contentIds) { + if (contentFilenames.length != contentIds.length) { + throw new RuntimeException("The length of contentFilenames and the length of" + + " contentIds are different " + contentFilenames.length + ", " + + contentIds.length); + } + mContentCount = contentCount; + mSparseTable = new SparseTable(initialCapacity, blockSize, contentCount); + mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); + mAddressTableFiles = new File[mContentCount]; + mContentFiles = new File[mContentCount]; + mBaseDir = baseDir; + for (int i = 0; i < mContentCount; ++i) { + mAddressTableFiles[i] = new File(mBaseDir, + name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]); + mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]); + } + mContentOutStreams = new OutputStream[mContentCount]; + } + + public void openStreams() throws FileNotFoundException { + for (int i = 0; i < mContentCount; ++i) { + mContentOutStreams[i] = new FileOutputStream(mContentFiles[i]); + } + } + + protected void write(final int contentIndex, final int index, + final SparseTableContentWriterInterface writer) throws IOException { + mSparseTable.set(contentIndex, index, (int) mContentFiles[contentIndex].length()); + writer.write(mContentOutStreams[contentIndex]); + mContentOutStreams[contentIndex].flush(); + } + + public void closeStreams() throws IOException { + mSparseTable.writeToFiles(mLookupTableFile, mAddressTableFiles); + for (int i = 0; i < mContentCount; ++i) { + mContentOutStreams[i].close(); + } + } + } + + private static class BigramContentWriter extends SparseTableContentWriter { + + public BigramContentWriter(final String name, final int initialCapacity, + final File baseDir) { + super(name + FormatSpec.BIGRAM_FILE_EXTENSION, FormatSpec.BIGRAM_CONTENT_COUNT, + initialCapacity, FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir, + new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION }, + new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID }); + } + + public void writeBigramsForOneWord(final int terminalId, + final Iterator<WeightedString> bigramIterator, final FusionDictionary dict) + throws IOException { + write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, + new SparseTableContentWriterInterface() { + @Override + public void write(final OutputStream outStream) throws IOException { + writeBigramsForOneWordInternal(outStream, bigramIterator, dict); + } + }); + } + + private void writeBigramsForOneWordInternal(final OutputStream outStream, + final Iterator<WeightedString> bigramIterator, final FusionDictionary dict) + throws IOException { + while (bigramIterator.hasNext()) { + final WeightedString bigram = bigramIterator.next(); + final PtNode target = + FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); + final int unigramFrequencyForThisWord = target.mFrequency; + final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags( + bigramIterator.hasNext(), 0, bigram.mFrequency, + unigramFrequencyForThisWord, bigram.mWord); + BinaryDictEncoderUtils.writeUIntToStream(outStream, bigramFlags, + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + BinaryDictEncoderUtils.writeUIntToStream(outStream, target.mTerminalId, + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE); + } + } + } + private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions) throws FileNotFoundException, IOException { final FileHeader header = new FileHeader(0, dictOptions, formatOptions); - final String filename = header.getId() + "." + header.getVersion(); - final File mDictDir = new File(mDictPlacedDir, filename); - final File trieFile = new File(mDictDir, filename + FormatSpec.TRIE_FILE_EXTENSION); - final File freqFile = new File(mDictDir, filename + FormatSpec.FREQ_FILE_EXTENSION); + mBaseFilename = header.getId() + "." + header.getVersion(); + mDictDir = new File(mDictPlacedDir, mBaseFilename); + final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION); + final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION); final File terminalAddressTableFile = new File(mDictDir, - filename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); + mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); if (!mDictDir.isDirectory()) { if (mDictDir.exists()) mDictDir.delete(); mDictDir.mkdirs(); @@ -123,6 +223,8 @@ public class Ver4DictEncoder implements DictEncoder { if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); writeTerminalData(flatNodes, terminalCount); + mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir); + writeBigrams(flatNodes, dict); final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; @@ -165,12 +267,10 @@ public class Ver4DictEncoder implements DictEncoder { countSize); } - private void writePtNodeFlags(final PtNode ptNode, final int parentAddress, - final FormatOptions formatOptions) { + private void writePtNodeFlags(final PtNode ptNode, final FormatOptions formatOptions) { final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, - BinaryDictEncoderUtils.makePtNodeFlags(ptNode, mTriePos, childrenPos, - formatOptions), + BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos, formatOptions), FormatSpec.PTNODE_FLAGS_SIZE); } @@ -215,8 +315,7 @@ public class Ver4DictEncoder implements DictEncoder { while (shortcutIterator.hasNext()) { final WeightedString target = shortcutIterator.next(); final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( - shortcutIterator.hasNext(), - target.mFrequency); + shortcutIterator.hasNext(), target.mFrequency); mTrieBuf[mTriePos++] = (byte)shortcutFlags; final int shortcutShift = CharEncoding.writeString(mTrieBuf, mTriePos, target.mWord); @@ -230,24 +329,18 @@ public class Ver4DictEncoder implements DictEncoder { shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); } - private void writeBigrams(ArrayList<WeightedString> bigrams, FusionDictionary dict) { - if (bigrams == null) return; - - final Iterator<WeightedString> bigramIterator = bigrams.iterator(); - while (bigramIterator.hasNext()) { - final WeightedString bigram = bigramIterator.next(); - final PtNode target = - FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); - final int addressOfBigram = target.mCachedAddressAfterUpdate; - final int unigramFrequencyForThisWord = target.mFrequency; - final int offset = addressOfBigram - - (mTriePos + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); - int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), - offset, bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord); - mTrieBuf[mTriePos++] = (byte) bigramFlags; - mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf, - mTriePos, Math.abs(offset)); + private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict) + throws IOException { + mBigramWriter.openStreams(); + for (final PtNodeArray nodeArray : flatNodes) { + for (final PtNode ptNode : nodeArray.mData) { + if (ptNode.mBigrams != null) { + mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId, + ptNode.mBigrams.iterator(), dict); + } + } } + mBigramWriter.closeStreams(); } @Override @@ -259,7 +352,7 @@ public class Ver4DictEncoder implements DictEncoder { @Override public void writePtNode(final PtNode ptNode, final int parentPosition, final FormatOptions formatOptions, final FusionDictionary dict) { - writePtNodeFlags(ptNode, parentPosition, formatOptions); + writePtNodeFlags(ptNode, formatOptions); writeParentPosition(parentPosition, ptNode, formatOptions); writeCharacters(ptNode.mChars, ptNode.hasSeveralChars()); if (ptNode.isTerminal()) { @@ -267,7 +360,6 @@ public class Ver4DictEncoder implements DictEncoder { } writeChildrenPosition(ptNode, formatOptions); writeShortcuts(ptNode.mShortcutTargets); - writeBigrams(ptNode.mBigrams, dict); } private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes, |