diff options
author | 2014-03-27 15:30:32 +0900 | |
---|---|---|
committer | 2014-03-27 15:30:32 +0900 | |
commit | 93cda5bb396c22f1781e390debaf75d54cf7c0dc (patch) | |
tree | 391903a9f8875520980da23f1bc5911c8fa78fc2 /tests/src/com/android | |
parent | 37b9562fd7b593c90d7ab383ec650f39a7c0f621 (diff) | |
download | latinime-93cda5bb396c22f1781e390debaf75d54cf7c0dc.tar.gz latinime-93cda5bb396c22f1781e390debaf75d54cf7c0dc.tar.xz latinime-93cda5bb396c22f1781e390debaf75d54cf7c0dc.zip |
Move code only used for dicttool and tests under tests.
Bug: 13035567
Change-Id: I13c6df013ef2b67c9bf67455d9c32d283bf9ea2e
Diffstat (limited to 'tests/src/com/android')
17 files changed, 3652 insertions, 4 deletions
diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java index 918f09067..ae2205b36 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java @@ -20,6 +20,7 @@ import android.test.AndroidTestCase; import android.test.suitebuilder.annotation.LargeTest; import android.util.Pair; +import com.android.inputmethod.latin.makedict.BinaryDictIOUtils; import com.android.inputmethod.latin.makedict.CodePointUtils; import com.android.inputmethod.latin.makedict.DictDecoder; import com.android.inputmethod.latin.makedict.DictionaryHeader; @@ -151,7 +152,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { binaryDictionary.flushWithGC(); binaryDictionary.close(); - final DictDecoder dictDecoder = FormatSpec.getDictDecoder(dictFile, 0, dictFile.length()); + final DictDecoder dictDecoder = + BinaryDictIOUtils.getDictDecoder(dictFile, 0, dictFile.length()); try { final FusionDictionary dict = dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */); diff --git a/tests/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java b/tests/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java new file mode 100644 index 000000000..bc856f113 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.TreeMap; + +/** + * A base class of the binary dictionary decoder. + */ +public abstract class AbstractDictDecoder implements DictDecoder { + private static final int SUCCESS = 0; + private static final int ERROR_CANNOT_READ = 1; + private static final int ERROR_WRONG_FORMAT = 2; + + @Override @UsedForTesting + public int getTerminalPosition(final String word) + throws IOException, UnsupportedFormatException { + if (!isDictBufferOpen()) { + openDictBuffer(); + } + return BinaryDictIOUtils.getTerminalPosition(this, word); + } + + @Override @UsedForTesting + public void readUnigramsAndBigramsBinary(final TreeMap<Integer, String> words, + final TreeMap<Integer, Integer> frequencies, + final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams) + throws IOException, UnsupportedFormatException { + if (!isDictBufferOpen()) { + openDictBuffer(); + } + BinaryDictIOUtils.readUnigramsAndBigramsBinary(this, words, frequencies, bigrams); + } + + /** + * Check whether the header contains the expected information. This is a no-error method, + * that will return an error code and never throw a checked exception. + * @return an error code, either ERROR_* or SUCCESS. + */ + private int checkHeader() { + try { + readHeader(); + } catch (IOException e) { + return ERROR_CANNOT_READ; + } catch (UnsupportedFormatException e) { + return ERROR_WRONG_FORMAT; + } + return SUCCESS; + } + + @Override + public boolean hasValidRawBinaryDictionary() { + return checkHeader() == SUCCESS; + } + + // Placeholder implementations below. These are actually unused. + @Override + public void openDictBuffer() throws FileNotFoundException, IOException, + UnsupportedFormatException { + } + + @Override + public boolean isDictBufferOpen() { + return false; + } + + @Override + public PtNodeInfo readPtNode(final int ptNodePos) { + return null; + } + + @Override + public void setPosition(int newPos) { + } + + @Override + public int getPosition() { + return 0; + } + + @Override + public int readPtNodeCount() { + return 0; + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java index 4bf61747c..f29fc21c1 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java @@ -251,7 +251,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { FusionDictionary dict = null; try { - final DictDecoder dictDecoder = FormatSpec.getDictDecoder(file, 0, file.length(), + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), bufferType); now = System.currentTimeMillis(); dict = dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */); @@ -414,7 +414,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { long now = -1, diff = -1; try { - final DictDecoder dictDecoder = FormatSpec.getDictDecoder(file, 0, file.length(), + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), bufferType); now = System.currentTimeMillis(); dictDecoder.readUnigramsAndBigramsBinary(resultWords, resultFreqs, resultBigrams); @@ -539,7 +539,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { addBigrams(dict, words, bigrams); timeWritingDictToFile(file, dict, formatOptions); - final DictDecoder dictDecoder = FormatSpec.getDictDecoder(file, 0, file.length(), + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), DictDecoder.USE_BYTEARRAY); try { dictDecoder.openDictBuffer(); diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java new file mode 100644 index 000000000..6f8b07a34 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java @@ -0,0 +1,364 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; + +/** + * Decodes binary files for a FusionDictionary. + * + * All the methods in this class are static. + * + * TODO: Move this file to makedict/internal. + * TODO: Rename this class to DictDecoderUtils. + */ +public final class BinaryDictDecoderUtils { + + private static final boolean DBG = MakedictLog.DBG; + + private BinaryDictDecoderUtils() { + // This utility class is not publicly instantiable. + } + + @UsedForTesting + public interface DictBuffer { + public int readUnsignedByte(); + public int readUnsignedShort(); + public int readUnsignedInt24(); + public int readInt(); + public int position(); + public void position(int newPosition); + @UsedForTesting + public void put(final byte b); + public int limit(); + @UsedForTesting + public int capacity(); + } + + public static final class ByteBufferDictBuffer implements DictBuffer { + private ByteBuffer mBuffer; + + public ByteBufferDictBuffer(final ByteBuffer buffer) { + mBuffer = buffer; + } + + @Override + public int readUnsignedByte() { + return mBuffer.get() & 0xFF; + } + + @Override + public int readUnsignedShort() { + return mBuffer.getShort() & 0xFFFF; + } + + @Override + public int readUnsignedInt24() { + final int retval = readUnsignedByte(); + return (retval << 16) + readUnsignedShort(); + } + + @Override + public int readInt() { + return mBuffer.getInt(); + } + + @Override + public int position() { + return mBuffer.position(); + } + + @Override + public void position(int newPos) { + mBuffer.position(newPos); + } + + @Override + public void put(final byte b) { + mBuffer.put(b); + } + + @Override + public int limit() { + return mBuffer.limit(); + } + + @Override + public int capacity() { + return mBuffer.capacity(); + } + } + + /** + * A class grouping utility function for our specific character encoding. + */ + static final class CharEncoding { + private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; + private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; + + /** + * Helper method to find out whether this code fits on one byte + */ + private static boolean fitsOnOneByte(final int character) { + return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE + && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; + } + + /** + * Compute the size of a character given its character code. + * + * Char format is: + * 1 byte = bbbbbbbb match + * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte + * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because + * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with + * 00011111 would be outside unicode. + * else: iso-latin-1 code + * This allows for the whole unicode range to be encoded, including chars outside of + * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control + * characters which should never happen anyway (and still work, but take 3 bytes). + * + * @param character the character code. + * @return the size in binary encoded-form, either 1 or 3 bytes. + */ + static int getCharSize(final int character) { + // See char encoding in FusionDictionary.java + if (fitsOnOneByte(character)) return 1; + if (FormatSpec.INVALID_CHARACTER == character) return 1; + return 3; + } + + /** + * Compute the byte size of a character array. + */ + static int getCharArraySize(final int[] chars) { + int size = 0; + for (int character : chars) size += getCharSize(character); + return size; + } + + /** + * Writes a char array to a byte buffer. + * + * @param codePoints the code point array to write. + * @param buffer the byte buffer to write to. + * @param index the index in buffer to write the character array to. + * @return the index after the last character. + */ + static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) { + for (int codePoint : codePoints) { + if (1 == getCharSize(codePoint)) { + buffer[index++] = (byte)codePoint; + } else { + buffer[index++] = (byte)(0xFF & (codePoint >> 16)); + buffer[index++] = (byte)(0xFF & (codePoint >> 8)); + buffer[index++] = (byte)(0xFF & codePoint); + } + } + return index; + } + + /** + * Writes a string with our character format to a byte buffer. + * + * This will also write the terminator byte. + * + * @param buffer the byte buffer to write to. + * @param origin the offset to write from. + * @param word the string to write. + * @return the size written, in bytes. + */ + static int writeString(final byte[] buffer, final int origin, final String word) { + final int length = word.length(); + int index = origin; + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + final int codePoint = word.codePointAt(i); + if (1 == getCharSize(codePoint)) { + buffer[index++] = (byte)codePoint; + } else { + buffer[index++] = (byte)(0xFF & (codePoint >> 16)); + buffer[index++] = (byte)(0xFF & (codePoint >> 8)); + buffer[index++] = (byte)(0xFF & codePoint); + } + } + buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; + return index - origin; + } + + /** + * Writes a string with our character format to an OutputStream. + * + * This will also write the terminator byte. + * + * @param stream the OutputStream to write to. + * @param word the string to write. + * @return the size written, in bytes. + */ + static int writeString(final OutputStream stream, final String word) throws IOException { + final int length = word.length(); + int written = 0; + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + final int codePoint = word.codePointAt(i); + final int charSize = getCharSize(codePoint); + if (1 == charSize) { + stream.write((byte) codePoint); + } else { + stream.write((byte) (0xFF & (codePoint >> 16))); + stream.write((byte) (0xFF & (codePoint >> 8))); + stream.write((byte) (0xFF & codePoint)); + } + written += charSize; + } + stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR); + written += FormatSpec.PTNODE_TERMINATOR_SIZE; + return written; + } + + /** + * Reads a string from a DictBuffer. This is the converse of the above method. + */ + static String readString(final DictBuffer dictBuffer) { + final StringBuilder s = new StringBuilder(); + int character = readChar(dictBuffer); + while (character != FormatSpec.INVALID_CHARACTER) { + s.appendCodePoint(character); + character = readChar(dictBuffer); + } + return s.toString(); + } + + /** + * Reads a character from the buffer. + * + * This follows the character format documented earlier in this source file. + * + * @param dictBuffer the buffer, positioned over an encoded character. + * @return the character code. + */ + static int readChar(final DictBuffer dictBuffer) { + int character = dictBuffer.readUnsignedByte(); + if (!fitsOnOneByte(character)) { + if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) { + return FormatSpec.INVALID_CHARACTER; + } + character <<= 16; + character += dictBuffer.readUnsignedShort(); + } + return character; + } + } + + /** + * Reads and returns the PtNode count out of a buffer and forwards the pointer. + */ + /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) { + final int msb = dictBuffer.readUnsignedByte(); + if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) { + return msb; + } else { + return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8) + + dictBuffer.readUnsignedByte(); + } + } + + /** + * Finds, as a string, the word at the position passed as an argument. + * + * @param dictDecoder the dict decoder. + * @param headerSize the size of the header. + * @param pos the position to seek. + * @return the word with its frequency, as a weighted string. + */ + @UsedForTesting + /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder, + final int headerSize, final int pos) { + final WeightedString result; + final int originalPos = dictDecoder.getPosition(); + dictDecoder.setPosition(pos); + result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos); + dictDecoder.setPosition(originalPos); + return result; + } + + private static WeightedString getWordAtPositionWithoutParentAddress( + final DictDecoder dictDecoder, final int headerSize, final int pos) { + dictDecoder.setPosition(headerSize); + final int count = dictDecoder.readPtNodeCount(); + int groupPos = dictDecoder.getPosition(); + final StringBuilder builder = new StringBuilder(); + WeightedString result = null; + + PtNodeInfo last = null; + for (int i = count - 1; i >= 0; --i) { + PtNodeInfo info = dictDecoder.readPtNode(groupPos); + groupPos = info.mEndAddress; + if (info.mOriginalAddress == pos) { + builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); + result = new WeightedString(builder.toString(), info.mProbabilityInfo); + break; // and return + } + if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) { + if (info.mChildrenAddress > pos) { + if (null == last) continue; + builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); + dictDecoder.setPosition(last.mChildrenAddress); + i = dictDecoder.readPtNodeCount(); + groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); + last = null; + continue; + } + last = info; + } + if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) { + builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); + dictDecoder.setPosition(last.mChildrenAddress); + i = dictDecoder.readPtNodeCount(); + groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); + last = null; + continue; + } + } + return result; + } + + /** + * Helper method to pass a file name instead of a File object to isBinaryDictionary. + */ + public static boolean isBinaryDictionary(final String filename) { + final File file = new File(filename); + return isBinaryDictionary(file); + } + + /** + * Basic test to find out whether the file is a binary dictionary or not. + * + * @param file The file to test. + * @return true if it's a binary dictionary, false otherwise + */ + public static boolean isBinaryDictionary(final File file) { + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length()); + if (dictDecoder == null) { + return false; + } + return dictDecoder.hasValidRawBinaryDictionary(); + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java new file mode 100644 index 000000000..39bd98bad --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java @@ -0,0 +1,881 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; + +/** + * Encodes binary files for a FusionDictionary. + * + * All the methods in this class are static. + * + * TODO: Rename this class to DictEncoderUtils. + */ +public class BinaryDictEncoderUtils { + + private static final boolean DBG = MakedictLog.DBG; + + private BinaryDictEncoderUtils() { + // This utility class is not publicly instantiable. + } + + // Arbitrary limit to how much passes we consider address size compression should + // terminate in. At the time of this writing, our largest dictionary completes + // compression in five passes. + // If the number of passes exceeds this number, makedict bails with an exception on + // suspicion that a bug might be causing an infinite loop. + private static final int MAX_PASSES = 24; + + /** + * Compute the binary size of the character array. + * + * If only one character, this is the size of this character. If many, it's the sum of their + * sizes + 1 byte for the terminator. + * + * @param characters the character array + * @return the size of the char array, including the terminator if any + */ + static int getPtNodeCharactersSize(final int[] characters) { + int size = CharEncoding.getCharArraySize(characters); + if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE; + return size; + } + + /** + * Compute the binary size of the character array in a PtNode + * + * If only one character, this is the size of this character. If many, it's the sum of their + * sizes + 1 byte for the terminator. + * + * @param ptNode the PtNode + * @return the size of the char array, including the terminator if any + */ + private static int getPtNodeCharactersSize(final PtNode ptNode) { + return getPtNodeCharactersSize(ptNode.mChars); + } + + /** + * Compute the binary size of the PtNode count for a node array. + * @param nodeArray the nodeArray + * @return the size of the PtNode count, either 1 or 2 bytes. + */ + private static int getPtNodeCountSize(final PtNodeArray nodeArray) { + return BinaryDictIOUtils.getPtNodeCountSize(nodeArray.mData.size()); + } + + /** + * Compute the size of a shortcut in bytes. + */ + private static int getShortcutSize(final WeightedString shortcut) { + int size = FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; + final String word = shortcut.mWord; + final int length = word.length(); + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + final int codePoint = word.codePointAt(i); + size += CharEncoding.getCharSize(codePoint); + } + size += FormatSpec.PTNODE_TERMINATOR_SIZE; + return size; + } + + /** + * Compute the size of a shortcut list in bytes. + * + * This is known in advance and does not change according to position in the file + * like address lists do. + */ + static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { + if (null == shortcutList || shortcutList.isEmpty()) return 0; + int size = FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; + for (final WeightedString shortcut : shortcutList) { + size += getShortcutSize(shortcut); + } + return size; + } + + /** + * Compute the maximum size of a PtNode, assuming 3-byte addresses for everything. + * + * @param ptNode the PtNode to compute the size of. + * @return the maximum size of the PtNode. + */ + private static int getPtNodeMaximumSize(final PtNode ptNode) { + int size = getNodeHeaderSize(ptNode); + if (ptNode.isTerminal()) { + // If terminal, one byte for the frequency. + size += FormatSpec.PTNODE_FREQUENCY_SIZE; + } + size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address + size += getShortcutListSize(ptNode.mShortcutTargets); + if (null != ptNode.mBigrams) { + size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE + + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE) + * ptNode.mBigrams.size(); + } + return size; + } + + /** + * Compute the maximum size of each PtNode of a PtNode array, assuming 3-byte addresses for + * everything, and caches it in the `mCachedSize' member of the nodes; deduce the size of + * the containing node array, and cache it it its 'mCachedSize' member. + * + * @param ptNodeArray the node array to compute the maximum size of. + */ + private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray) { + int size = getPtNodeCountSize(ptNodeArray); + for (PtNode node : ptNodeArray.mData) { + final int nodeSize = getPtNodeMaximumSize(node); + node.mCachedSize = nodeSize; + size += nodeSize; + } + ptNodeArray.mCachedSize = size; + } + + /** + * Compute the size of the header (flag + [parent address] + characters size) of a PtNode. + * + * @param ptNode the PtNode of which to compute the size of the header + */ + private static int getNodeHeaderSize(final PtNode ptNode) { + return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode); + } + + /** + * Compute the size, in bytes, that an address will occupy. + * + * This can be used either for children addresses (which are always positive) or for + * attribute, which may be positive or negative but + * store their sign bit separately. + * + * @param address the address + * @return the byte size. + */ + static int getByteSize(final int address) { + assert(address <= FormatSpec.UINT24_MAX); + if (!BinaryDictIOUtils.hasChildrenAddress(address)) { + return 0; + } else if (Math.abs(address) <= FormatSpec.UINT8_MAX) { + return 1; + } else if (Math.abs(address) <= FormatSpec.UINT16_MAX) { + return 2; + } else { + return 3; + } + } + + static int writeUIntToBuffer(final byte[] buffer, int position, final int value, + final int size) { + switch(size) { + case 4: + buffer[position++] = (byte) ((value >> 24) & 0xFF); + /* fall through */ + case 3: + buffer[position++] = (byte) ((value >> 16) & 0xFF); + /* fall through */ + case 2: + buffer[position++] = (byte) ((value >> 8) & 0xFF); + /* fall through */ + case 1: + buffer[position++] = (byte) (value & 0xFF); + break; + default: + /* nop */ + } + return position; + } + + static void writeUIntToStream(final OutputStream stream, final int value, final int size) + throws IOException { + switch(size) { + case 4: + stream.write((value >> 24) & 0xFF); + /* fall through */ + case 3: + stream.write((value >> 16) & 0xFF); + /* fall through */ + case 2: + stream.write((value >> 8) & 0xFF); + /* fall through */ + case 1: + stream.write(value & 0xFF); + break; + default: + /* nop */ + } + } + + @UsedForTesting + static void writeUIntToDictBuffer(final DictBuffer dictBuffer, final int value, + final int size) { + switch(size) { + case 4: + dictBuffer.put((byte) ((value >> 24) & 0xFF)); + /* fall through */ + case 3: + dictBuffer.put((byte) ((value >> 16) & 0xFF)); + /* fall through */ + case 2: + dictBuffer.put((byte) ((value >> 8) & 0xFF)); + /* fall through */ + case 1: + dictBuffer.put((byte) (value & 0xFF)); + break; + default: + /* nop */ + } + } + + // End utility methods + + // This method is responsible for finding a nice ordering of the nodes that favors run-time + // cache performance and dictionary size. + /* package for tests */ static ArrayList<PtNodeArray> flattenTree( + final PtNodeArray rootNodeArray) { + final int treeSize = FusionDictionary.countPtNodes(rootNodeArray); + MakedictLog.i("Counted nodes : " + treeSize); + final ArrayList<PtNodeArray> flatTree = new ArrayList<PtNodeArray>(treeSize); + return flattenTreeInner(flatTree, rootNodeArray); + } + + private static ArrayList<PtNodeArray> flattenTreeInner(final ArrayList<PtNodeArray> list, + final PtNodeArray ptNodeArray) { + // Removing the node is necessary if the tails are merged, because we would then + // add the same node several times when we only want it once. A number of places in + // the code also depends on any node being only once in the list. + // Merging tails can only be done if there are no attributes. Searching for attributes + // in LatinIME code depends on a total breadth-first ordering, which merging tails + // breaks. If there are no attributes, it should be fine (and reduce the file size) + // to merge tails, and removing the node from the list would be necessary. However, + // we don't merge tails because breaking the breadth-first ordering would result in + // extreme overhead at bigram lookup time (it would make the search function O(n) instead + // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty + // high). + // If no nodes are ever merged, we can't have the same node twice in the list, hence + // searching for duplicates in unnecessary. It is also very performance consuming, + // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making + // this simple list.remove operation O(n*n) overall. On Android this overhead is very + // high. + // For future reference, the code to remove duplicate is a simple : list.remove(node); + list.add(ptNodeArray); + final ArrayList<PtNode> branches = ptNodeArray.mData; + for (PtNode ptNode : branches) { + if (null != ptNode.mChildren) flattenTreeInner(list, ptNode.mChildren); + } + return list; + } + + /** + * Get the offset from a position inside a current node array to a target node array, during + * update. + * + * If the current node array is before the target node array, the target node array has not + * been updated yet, so we should return the offset from the old position of the current node + * array to the old position of the target node array. If on the other hand the target is + * before the current node array, it already has been updated, so we should return the offset + * from the new position in the current node array to the new position in the target node + * array. + * + * @param currentNodeArray node array containing the PtNode where the offset will be written + * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray + * @param targetNodeArray the target node array to get the offset to + * @return the offset to the target node array + */ + private static int getOffsetToTargetNodeArrayDuringUpdate(final PtNodeArray currentNodeArray, + final int offsetFromStartOfCurrentNodeArray, final PtNodeArray targetNodeArray) { + final boolean isTargetBeforeCurrent = (targetNodeArray.mCachedAddressBeforeUpdate + < currentNodeArray.mCachedAddressBeforeUpdate); + if (isTargetBeforeCurrent) { + return targetNodeArray.mCachedAddressAfterUpdate + - (currentNodeArray.mCachedAddressAfterUpdate + + offsetFromStartOfCurrentNodeArray); + } else { + return targetNodeArray.mCachedAddressBeforeUpdate + - (currentNodeArray.mCachedAddressBeforeUpdate + + offsetFromStartOfCurrentNodeArray); + } + } + + /** + * Get the offset from a position inside a current node array to a target PtNode, during + * update. + * + * @param currentNodeArray node array containing the PtNode where the offset will be written + * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray + * @param targetPtNode the target PtNode to get the offset to + * @return the offset to the target PtNode + */ + // TODO: is there any way to factorize this method with the one above? + private static int getOffsetToTargetPtNodeDuringUpdate(final PtNodeArray currentNodeArray, + final int offsetFromStartOfCurrentNodeArray, final PtNode targetPtNode) { + final int oldOffsetBasePoint = currentNodeArray.mCachedAddressBeforeUpdate + + offsetFromStartOfCurrentNodeArray; + final boolean isTargetBeforeCurrent = (targetPtNode.mCachedAddressBeforeUpdate + < oldOffsetBasePoint); + // If the target is before the current node array, then its address has already been + // updated. We can use the AfterUpdate member, and compare it to our own member after + // update. Otherwise, the AfterUpdate member is not updated yet, so we need to use the + // BeforeUpdate member, and of course we have to compare this to our own address before + // update. + if (isTargetBeforeCurrent) { + final int newOffsetBasePoint = currentNodeArray.mCachedAddressAfterUpdate + + offsetFromStartOfCurrentNodeArray; + return targetPtNode.mCachedAddressAfterUpdate - newOffsetBasePoint; + } else { + return targetPtNode.mCachedAddressBeforeUpdate - oldOffsetBasePoint; + } + } + + /** + * Computes the actual node array size, based on the cached addresses of the children nodes. + * + * Each node array stores its tentative address. During dictionary address computing, these + * are not final, but they can be used to compute the node array size (the node array size + * depends on the address of the children because the number of bytes necessary to store an + * address depends on its numeric value. The return value indicates whether the node array + * contents (as in, any of the addresses stored in the cache fields) have changed with + * respect to their previous value. + * + * @param ptNodeArray the node array to compute the size of. + * @param dict the dictionary in which the word/attributes are to be found. + * @return false if none of the cached addresses inside the node array changed, true otherwise. + */ + private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray, + final FusionDictionary dict) { + boolean changed = false; + int size = getPtNodeCountSize(ptNodeArray); + for (PtNode ptNode : ptNodeArray.mData) { + ptNode.mCachedAddressAfterUpdate = ptNodeArray.mCachedAddressAfterUpdate + size; + if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) { + changed = true; + } + int nodeSize = getNodeHeaderSize(ptNode); + if (ptNode.isTerminal()) { + nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; + } + if (null != ptNode.mChildren) { + nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, + nodeSize + size, ptNode.mChildren)); + } + nodeSize += getShortcutListSize(ptNode.mShortcutTargets); + if (null != ptNode.mBigrams) { + for (WeightedString bigram : ptNode.mBigrams) { + final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, + nodeSize + size + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE, + FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord)); + nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; + } + } + ptNode.mCachedSize = nodeSize; + size += nodeSize; + } + if (ptNodeArray.mCachedSize != size) { + ptNodeArray.mCachedSize = size; + changed = true; + } + return changed; + } + + /** + * Initializes the cached addresses of node arrays and their containing nodes from their size. + * + * @param flatNodes the list of node arrays. + * @return the byte size of the entire stack. + */ + private static int initializePtNodeArraysCachedAddresses( + final ArrayList<PtNodeArray> flatNodes) { + int nodeArrayOffset = 0; + for (final PtNodeArray nodeArray : flatNodes) { + nodeArray.mCachedAddressBeforeUpdate = nodeArrayOffset; + int nodeCountSize = getPtNodeCountSize(nodeArray); + int nodeffset = 0; + for (final PtNode ptNode : nodeArray.mData) { + ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate = + nodeCountSize + nodeArrayOffset + nodeffset; + nodeffset += ptNode.mCachedSize; + } + nodeArrayOffset += nodeArray.mCachedSize; + } + return nodeArrayOffset; + } + + /** + * Updates the cached addresses of node arrays after recomputing their new positions. + * + * @param flatNodes the list of node arrays. + */ + private static void updatePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes) { + for (final PtNodeArray nodeArray : flatNodes) { + nodeArray.mCachedAddressBeforeUpdate = nodeArray.mCachedAddressAfterUpdate; + for (final PtNode ptNode : nodeArray.mData) { + ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate; + } + } + } + + /** + * Compute the addresses and sizes of an ordered list of PtNode arrays. + * + * This method takes a list of PtNode arrays and will update their cached address and size + * values so that they can be written into a file. It determines the smallest size each of the + * PtNode arrays can be given the addresses of its children and attributes, and store that into + * each PtNode. + * The order of the PtNode is given by the order of the array. This method makes no effort + * to find a good order; it only mechanically computes the size this order results in. + * + * @param dict the dictionary + * @param flatNodes the ordered list of PtNode arrays + * @return the same array it was passed. The nodes have been updated for address and size. + */ + /* package */ static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict, + final ArrayList<PtNodeArray> flatNodes) { + // First get the worst possible sizes and offsets + for (final PtNodeArray n : flatNodes) { + calculatePtNodeArrayMaximumSize(n); + } + final int offset = initializePtNodeArraysCachedAddresses(flatNodes); + + MakedictLog.i("Compressing the array addresses. Original size : " + offset); + MakedictLog.i("(Recursively seen size : " + offset + ")"); + + int passes = 0; + boolean changesDone = false; + do { + changesDone = false; + int ptNodeArrayStartOffset = 0; + for (final PtNodeArray ptNodeArray : flatNodes) { + ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset; + final int oldNodeArraySize = ptNodeArray.mCachedSize; + final boolean changed = computeActualPtNodeArraySize(ptNodeArray, dict); + final int newNodeArraySize = ptNodeArray.mCachedSize; + if (oldNodeArraySize < newNodeArraySize) { + throw new RuntimeException("Increased size ?!"); + } + ptNodeArrayStartOffset += newNodeArraySize; + changesDone |= changed; + } + updatePtNodeArraysCachedAddresses(flatNodes); + ++passes; + if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug"); + } while (changesDone); + + final PtNodeArray lastPtNodeArray = flatNodes.get(flatNodes.size() - 1); + MakedictLog.i("Compression complete in " + passes + " passes."); + MakedictLog.i("After address compression : " + + (lastPtNodeArray.mCachedAddressAfterUpdate + lastPtNodeArray.mCachedSize)); + + return flatNodes; + } + + /** + * Sanity-checking method. + * + * This method checks a list of PtNode arrays for juxtaposition, that is, it will do + * nothing if each node array's cached address is actually the previous node array's address + * plus the previous node's size. + * If this is not the case, it will throw an exception. + * + * @param arrays the list of node arrays to check + */ + /* package */ static void checkFlatPtNodeArrayList(final ArrayList<PtNodeArray> arrays) { + int offset = 0; + int index = 0; + for (final PtNodeArray ptNodeArray : arrays) { + // BeforeUpdate and AfterUpdate addresses are the same here, so it does not matter + // which we use. + if (ptNodeArray.mCachedAddressAfterUpdate != offset) { + throw new RuntimeException("Wrong address for node " + index + + " : expected " + offset + ", got " + + ptNodeArray.mCachedAddressAfterUpdate); + } + ++index; + offset += ptNodeArray.mCachedSize; + } + } + + /** + * Helper method to write a children position to a file. + * + * @param buffer the buffer to write to. + * @param index the index in the buffer to write the address to. + * @param position the position to write. + * @return the size in bytes the address actually took. + */ + /* package */ static int writeChildrenPosition(final byte[] buffer, int index, + final int position) { + switch (getByteSize(position)) { + case 1: + buffer[index++] = (byte)position; + return 1; + case 2: + buffer[index++] = (byte)(0xFF & (position >> 8)); + buffer[index++] = (byte)(0xFF & position); + return 2; + case 3: + buffer[index++] = (byte)(0xFF & (position >> 16)); + buffer[index++] = (byte)(0xFF & (position >> 8)); + buffer[index++] = (byte)(0xFF & position); + return 3; + case 0: + return 0; + default: + throw new RuntimeException("Position " + position + " has a strange size"); + } + } + + /** + * Helper method to write a signed children position to a file. + * + * @param buffer the buffer to write to. + * @param index the index in the buffer to write the address to. + * @param position the position to write. + * @return the size in bytes the address actually took. + */ + /* package */ static int writeSignedChildrenPosition(final byte[] buffer, int index, + final int position) { + if (!BinaryDictIOUtils.hasChildrenAddress(position)) { + buffer[index] = buffer[index + 1] = buffer[index + 2] = 0; + } else { + final int absPosition = Math.abs(position); + buffer[index++] = + (byte)((position < 0 ? FormatSpec.MSB8 : 0) | (0xFF & (absPosition >> 16))); + buffer[index++] = (byte)(0xFF & (absPosition >> 8)); + buffer[index++] = (byte)(0xFF & absPosition); + } + return 3; + } + + /** + * Makes the flag value for a PtNode. + * + * @param hasMultipleChars whether the PtNode has multiple chars. + * @param isTerminal whether the PtNode is terminal. + * @param childrenAddressSize the size of a children address. + * @param hasShortcuts whether the PtNode has shortcuts. + * @param hasBigrams whether the PtNode has bigrams. + * @param isNotAWord whether the PtNode is not a word. + * @param isBlackListEntry whether the PtNode is a blacklist entry. + * @return the flags + */ + static int makePtNodeFlags(final boolean hasMultipleChars, final boolean isTerminal, + final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams, + final boolean isNotAWord, final boolean isBlackListEntry) { + byte flags = 0; + if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS; + if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL; + switch (childrenAddressSize) { + case 1: + flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE; + break; + case 2: + flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES; + break; + case 3: + flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES; + break; + case 0: + flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS; + break; + default: + throw new RuntimeException("Node with a strange address"); + } + if (hasShortcuts) flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS; + if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS; + if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD; + if (isBlackListEntry) flags |= FormatSpec.FLAG_IS_BLACKLISTED; + return flags; + } + + /* package */ static byte makePtNodeFlags(final PtNode node, final int childrenOffset) { + return (byte) makePtNodeFlags(node.mChars.length > 1, node.isTerminal(), + getByteSize(childrenOffset), + node.mShortcutTargets != null && !node.mShortcutTargets.isEmpty(), + node.mBigrams != null && !node.mBigrams.isEmpty(), + node.mIsNotAWord, node.mIsBlacklistEntry); + } + + /** + * Makes the flag value for a bigram. + * + * @param more whether there are more bigrams after this one. + * @param offset the offset of the bigram. + * @param bigramFrequency the frequency of the bigram, 0..255. + * @param unigramFrequency the unigram frequency of the same word, 0..255. + * @param word the second bigram, for debugging purposes + * @return the flags + */ + /* package */ static final int makeBigramFlags(final boolean more, final int offset, + int bigramFrequency, final int unigramFrequency, final String word) { + int bigramFlags = (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0) + + (offset < 0 ? FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE : 0); + switch (getByteSize(offset)) { + case 1: + bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE; + break; + case 2: + bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES; + break; + case 3: + bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES; + break; + default: + throw new RuntimeException("Strange offset size"); + } + if (unigramFrequency > bigramFrequency) { + MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word + + "\". Bigram freq is " + bigramFrequency + ", unigram freq for " + + word + " is " + unigramFrequency); + bigramFrequency = unigramFrequency; + } + bigramFlags += getBigramFrequencyDiff(unigramFrequency, bigramFrequency) + & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY; + return bigramFlags; + } + + public static int getBigramFrequencyDiff(final int unigramFrequency, + final int bigramFrequency) { + // We compute the difference between 255 (which means probability = 1) and the + // unigram score. We split this into a number of discrete steps. + // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15 + // represents an increase of 16 steps: a value of 15 will be interpreted as the median + // value of the 16th step. In all justice, if the bigram frequency is low enough to be + // rounded below the first step (which means it is less than half a step higher than the + // unigram frequency) then the unigram frequency itself is the best approximation of the + // bigram freq that we could possibly supply, hence we should *not* include this bigram + // in the file at all. + // until this is done, we'll write 0 and slightly overestimate this case. + // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step + // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to + // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the + // step size. Then we compute the start of the first step (the one where value 0 starts) + // by adding half-a-step to the unigramFrequency. From there, we compute the integer + // number of steps to the bigramFrequency. One last thing: we want our steps to include + // their lower bound and exclude their higher bound so we need to have the first step + // start at exactly 1 unit higher than floor(unigramFreq + half a step). + // Note : to reconstruct the score, the dictionary reader will need to divide + // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise to get the value of the step, + // and add (discretizedFrequency + 0.5 + 0.5) times this value to get the best + // approximation. (0.5 to get the first step start, and 0.5 to get the middle of the + // step pointed by the discretized frequency. + final float stepSize = + (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency) + / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY); + final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f); + final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize); + // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1 + // here. The best approximation would be the unigram freq itself, so we should not + // include this bigram in the dictionary. For now, register as 0, and live with the + // small over-estimation that we get in this case. TODO: actually remove this bigram + // if discretizedFrequency < 0. + return discretizedFrequency > 0 ? discretizedFrequency : 0; + } + + /** + * Makes the flag value for a shortcut. + * + * @param more whether there are more attributes after this one. + * @param frequency the frequency of the attribute, 0..15 + * @return the flags + */ + static final int makeShortcutFlags(final boolean more, final int frequency) { + return (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0) + + (frequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY); + } + + /* package */ static final int getChildrenPosition(final PtNode ptNode) { + int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate + + getNodeHeaderSize(ptNode); + if (ptNode.isTerminal()) { + // A terminal node has the frequency. + // If positionOfChildrenPosField is incorrect, we may crash when jumping to the children + // position. + positionOfChildrenPosField += FormatSpec.PTNODE_FREQUENCY_SIZE; + } + return null == ptNode.mChildren ? FormatSpec.NO_CHILDREN_ADDRESS + : ptNode.mChildren.mCachedAddressAfterUpdate - positionOfChildrenPosField; + } + + /** + * Write a PtNodeArray. The PtNodeArray is expected to have its final position cached. + * + * @param dict the dictionary the node array is a part of (for relative offsets). + * @param dictEncoder the dictionary encoder. + * @param ptNodeArray the node array to write. + */ + @SuppressWarnings("unused") + /* package */ static void writePlacedPtNodeArray(final FusionDictionary dict, + final DictEncoder dictEncoder, final PtNodeArray ptNodeArray) { + // TODO: Make the code in common with BinaryDictIOUtils#writePtNode + dictEncoder.setPosition(ptNodeArray.mCachedAddressAfterUpdate); + + final int ptNodeCount = ptNodeArray.mData.size(); + dictEncoder.writePtNodeCount(ptNodeCount); + final int parentPosition = + (ptNodeArray.mCachedParentAddress == FormatSpec.NO_PARENT_ADDRESS) + ? FormatSpec.NO_PARENT_ADDRESS + : ptNodeArray.mCachedParentAddress + ptNodeArray.mCachedAddressAfterUpdate; + for (int i = 0; i < ptNodeCount; ++i) { + final PtNode ptNode = ptNodeArray.mData.get(i); + if (dictEncoder.getPosition() != ptNode.mCachedAddressAfterUpdate) { + throw new RuntimeException("Bug: write index is not the same as the cached address " + + "of the node : " + dictEncoder.getPosition() + " <> " + + ptNode.mCachedAddressAfterUpdate); + } + // Sanity checks. + if (DBG && ptNode.getProbability() > FormatSpec.MAX_TERMINAL_FREQUENCY) { + throw new RuntimeException("A node has a frequency > " + + FormatSpec.MAX_TERMINAL_FREQUENCY + + " : " + ptNode.mProbabilityInfo.toString()); + } + dictEncoder.writePtNode(ptNode, dict); + } + if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate + + ptNodeArray.mCachedSize) { + throw new RuntimeException("Not the same size : written " + + (dictEncoder.getPosition() - ptNodeArray.mCachedAddressAfterUpdate) + + " bytes from a node that should have " + ptNodeArray.mCachedSize + " bytes"); + } + } + + /** + * Dumps a collection of useful statistics about a list of PtNode arrays. + * + * This prints purely informative stuff, like the total estimated file size, the + * number of PtNode arrays, of PtNodes, the repartition of each address size, etc + * + * @param ptNodeArrays the list of PtNode arrays. + */ + /* package */ static void showStatistics(ArrayList<PtNodeArray> ptNodeArrays) { + int firstTerminalAddress = Integer.MAX_VALUE; + int lastTerminalAddress = Integer.MIN_VALUE; + int size = 0; + int ptNodes = 0; + int maxNodes = 0; + int maxRuns = 0; + for (final PtNodeArray ptNodeArray : ptNodeArrays) { + if (maxNodes < ptNodeArray.mData.size()) maxNodes = ptNodeArray.mData.size(); + for (final PtNode ptNode : ptNodeArray.mData) { + ++ptNodes; + if (ptNode.mChars.length > maxRuns) maxRuns = ptNode.mChars.length; + if (ptNode.isTerminal()) { + if (ptNodeArray.mCachedAddressAfterUpdate < firstTerminalAddress) + firstTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate; + if (ptNodeArray.mCachedAddressAfterUpdate > lastTerminalAddress) + lastTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate; + } + } + if (ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize > size) { + size = ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize; + } + } + final int[] ptNodeCounts = new int[maxNodes + 1]; + final int[] runCounts = new int[maxRuns + 1]; + for (final PtNodeArray ptNodeArray : ptNodeArrays) { + ++ptNodeCounts[ptNodeArray.mData.size()]; + for (final PtNode ptNode : ptNodeArray.mData) { + ++runCounts[ptNode.mChars.length]; + } + } + + MakedictLog.i("Statistics:\n" + + " total file size " + size + "\n" + + " " + ptNodeArrays.size() + " node arrays\n" + + " " + ptNodes + " PtNodes (" + ((float)ptNodes / ptNodeArrays.size()) + + " PtNodes per node)\n" + + " first terminal at " + firstTerminalAddress + "\n" + + " last terminal at " + lastTerminalAddress + "\n" + + " PtNode stats : max = " + maxNodes); + for (int i = 0; i < ptNodeCounts.length; ++i) { + MakedictLog.i(" " + i + " : " + ptNodeCounts[i]); + } + MakedictLog.i(" Character run stats : max = " + maxRuns); + for (int i = 0; i < runCounts.length; ++i) { + MakedictLog.i(" " + i + " : " + runCounts[i]); + } + } + + /** + * Writes a file header to an output stream. + * + * @param destination the stream to write the file header to. + * @param dict the dictionary to write. + * @param formatOptions file format options. + * @return the size of the header. + */ + /* package */ static int writeDictionaryHeader(final OutputStream destination, + final FusionDictionary dict, final FormatOptions formatOptions) + throws IOException, UnsupportedFormatException { + final int version = formatOptions.mVersion; + if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION + || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) { + throw new UnsupportedFormatException("Requested file format version " + version + + ", but this implementation only supports versions " + + FormatSpec.MINIMUM_SUPPORTED_VERSION + " through " + + FormatSpec.MAXIMUM_SUPPORTED_VERSION); + } + + ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256); + + // The magic number in big-endian order. + // Magic number for all versions. + headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 24))); + headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 16))); + headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 8))); + headerBuffer.write((byte) (0xFF & FormatSpec.MAGIC_NUMBER)); + // Dictionary version. + headerBuffer.write((byte) (0xFF & (version >> 8))); + headerBuffer.write((byte) (0xFF & version)); + + // Options flags + // TODO: Remove this field. + final int options = 0; + headerBuffer.write((byte) (0xFF & (options >> 8))); + headerBuffer.write((byte) (0xFF & options)); + final int headerSizeOffset = headerBuffer.size(); + // Placeholder to be written later with header size. + for (int i = 0; i < 4; ++i) { + headerBuffer.write(0); + } + // Write out the options. + for (final String key : dict.mOptions.mAttributes.keySet()) { + final String value = dict.mOptions.mAttributes.get(key); + CharEncoding.writeString(headerBuffer, key); + CharEncoding.writeString(headerBuffer, value); + } + final int size = headerBuffer.size(); + final byte[] bytes = headerBuffer.toByteArray(); + // Write out the header size. + bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24)); + bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16)); + bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8)); + bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0)); + destination.write(bytes); + + headerBuffer.close(); + return size; + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java new file mode 100644 index 000000000..42a50be66 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java @@ -0,0 +1,306 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.Constants; +import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Map; +import java.util.Stack; + +public final class BinaryDictIOUtils { + private static final boolean DBG = false; + + private BinaryDictIOUtils() { + // This utility class is not publicly instantiable. + } + + /** + * Returns new dictionary decoder. + * + * @param dictFile the dictionary file. + * @param bufferType The type of buffer, as one of USE_* in DictDecoder. + * @return new dictionary decoder if the dictionary file exists, otherwise null. + */ + public static DictDecoder getDictDecoder(final File dictFile, final long offset, + final long length, final int bufferType) { + if (dictFile.isDirectory()) { + return new Ver4DictDecoder(dictFile, bufferType); + } else if (dictFile.isFile()) { + return new Ver2DictDecoder(dictFile, offset, length, bufferType); + } + return null; + } + + public static DictDecoder getDictDecoder(final File dictFile, final long offset, + final long length, final DictionaryBufferFactory factory) { + if (dictFile.isDirectory()) { + return new Ver4DictDecoder(dictFile, factory); + } else if (dictFile.isFile()) { + return new Ver2DictDecoder(dictFile, offset, length, factory); + } + return null; + } + + public static DictDecoder getDictDecoder(final File dictFile, final long offset, + final long length) { + return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER); + } + + private static final class Position { + public static final int NOT_READ_PTNODE_COUNT = -1; + + public int mAddress; + public int mNumOfPtNode; + public int mPosition; + public int mLength; + + public Position(int address, int length) { + mAddress = address; + mLength = length; + mNumOfPtNode = NOT_READ_PTNODE_COUNT; + } + } + + /** + * Retrieves all node arrays without recursive call. + */ + private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder, + final int bodyOffset, final Map<Integer, String> words, + final Map<Integer, Integer> frequencies, + final Map<Integer, ArrayList<PendingAttribute>> bigrams) { + int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1]; + + Stack<Position> stack = new Stack<Position>(); + int index = 0; + + Position initPos = new Position(bodyOffset, 0); + stack.push(initPos); + + while (!stack.empty()) { + Position p = stack.peek(); + + if (DBG) { + MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" + + p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength); + } + + if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress); + if (index != p.mLength) index = p.mLength; + + if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) { + p.mNumOfPtNode = dictDecoder.readPtNodeCount(); + p.mAddress = dictDecoder.getPosition(); + p.mPosition = 0; + } + if (p.mNumOfPtNode == 0) { + stack.pop(); + continue; + } + final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress); + for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) { + pushedChars[index++] = ptNodeInfo.mCharacters[i]; + } + p.mPosition++; + if (ptNodeInfo.isTerminal()) {// found word + words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index)); + frequencies.put( + ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability); + if (ptNodeInfo.mBigrams != null) { + bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams); + } + } + + if (p.mPosition == p.mNumOfPtNode) { + stack.pop(); + } else { + // The PtNode array has more PtNodes. + p.mAddress = dictDecoder.getPosition(); + } + + if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) { + final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index); + stack.push(childrenPos); + } + } + } + + /** + * Reads unigrams and bigrams from the binary file. + * Doesn't store a full memory representation of the dictionary. + * + * @param dictDecoder the dict decoder. + * @param words the map to store the address as a key and the word as a value. + * @param frequencies the map to store the address as a key and the frequency as a value. + * @param bigrams the map to store the address as a key and the list of address as a value. + * @throws IOException if the file can't be read. + * @throws UnsupportedFormatException if the format of the file is not recognized. + */ + /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder, + final Map<Integer, String> words, final Map<Integer, Integer> frequencies, + final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException, + UnsupportedFormatException { + // Read header + final DictionaryHeader header = dictDecoder.readHeader(); + readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words, + frequencies, bigrams); + } + + /** + * Gets the address of the last PtNode of the exact matching word in the dictionary. + * If no match is found, returns NOT_VALID_WORD. + * + * @param dictDecoder the dict decoder. + * @param word the word we search for. + * @return the address of the terminal node. + * @throws IOException if the file can't be read. + * @throws UnsupportedFormatException if the format of the file is not recognized. + */ + @UsedForTesting + /* package */ static int getTerminalPosition(final DictDecoder dictDecoder, + final String word) throws IOException, UnsupportedFormatException { + if (word == null) return FormatSpec.NOT_VALID_WORD; + dictDecoder.setPosition(0); + dictDecoder.readHeader(); + int wordPos = 0; + final int wordLen = word.codePointCount(0, word.length()); + for (int depth = 0; depth < Constants.DICTIONARY_MAX_WORD_LENGTH; ++depth) { + if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; + + do { + final int ptNodeCount = dictDecoder.readPtNodeCount(); + boolean foundNextPtNode = false; + for (int i = 0; i < ptNodeCount; ++i) { + final int ptNodePos = dictDecoder.getPosition(); + final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos); + boolean same = true; + for (int p = 0, j = word.offsetByCodePoints(0, wordPos); + p < currentInfo.mCharacters.length; + ++p, j = word.offsetByCodePoints(j, 1)) { + if (wordPos + p >= wordLen + || word.codePointAt(j) != currentInfo.mCharacters[p]) { + same = false; + break; + } + } + + if (same) { + // found the PtNode matches the word. + if (wordPos + currentInfo.mCharacters.length == wordLen) { + if (!currentInfo.isTerminal()) { + return FormatSpec.NOT_VALID_WORD; + } else { + return ptNodePos; + } + } + wordPos += currentInfo.mCharacters.length; + if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { + return FormatSpec.NOT_VALID_WORD; + } + foundNextPtNode = true; + dictDecoder.setPosition(currentInfo.mChildrenAddress); + break; + } + } + if (foundNextPtNode) break; + return FormatSpec.NOT_VALID_WORD; + } while(true); + } + return FormatSpec.NOT_VALID_WORD; + } + + /** + * Writes a PtNodeCount to the stream. + * + * @param destination the stream to write. + * @param ptNodeCount the count. + * @return the size written in bytes. + */ + @UsedForTesting + static int writePtNodeCount(final OutputStream destination, final int ptNodeCount) + throws IOException { + final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); + // the count must fit on one byte or two bytes. + // Please see comments in FormatSpec. + if (countSize != 1 && countSize != 2) { + throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize); + } + final int encodedPtNodeCount = (countSize == 2) ? + (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; + BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize); + return countSize; + } + + /** + * Helper method to hide the actual value of the no children address. + */ + public static boolean hasChildrenAddress(final int address) { + return FormatSpec.NO_CHILDREN_ADDRESS != address; + } + + /** + * Compute the binary size of the node count + * @param count the node count + * @return the size of the node count, either 1 or 2 bytes. + */ + public static int getPtNodeCountSize(final int count) { + if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) { + return 1; + } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) { + return 2; + } else { + throw new RuntimeException("Can't have more than " + + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found " + + count + ")"); + } + } + + static int getChildrenAddressSize(final int optionFlags) { + switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: + return 1; + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES: + return 2; + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES: + return 3; + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS: + default: + return 0; + } + } + + /** + * Calculate bigram frequency from compressed value + * + * @param unigramFrequency + * @param bigramFrequency compressed frequency + * @return approximate bigram frequency + */ + @UsedForTesting + public static int reconstructBigramFrequency(final int unigramFrequency, + final int bigramFrequency) { + final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency) + / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY); + final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f); + return (int)resultFreqFloat; + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/DictDecoder.java b/tests/src/com/android/inputmethod/latin/makedict/DictDecoder.java new file mode 100644 index 000000000..a3b28a702 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/DictDecoder.java @@ -0,0 +1,222 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; +import com.android.inputmethod.latin.utils.ByteArrayDictBuffer; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.ArrayList; +import java.util.TreeMap; + +/** + * An interface of binary dictionary decoders. + */ +// TODO: Straighten out responsibility for the buffer's file pointer. +public interface DictDecoder { + + /** + * Reads and returns the file header. + */ + public DictionaryHeader readHeader() throws IOException, UnsupportedFormatException; + + /** + * Reads PtNode from ptNodePos. + * @param ptNodePos the position of PtNode. + * @return PtNodeInfo. + */ + public PtNodeInfo readPtNode(final int ptNodePos); + + /** + * Reads a buffer and returns the memory representation of the dictionary. + * + * This high-level method takes a buffer and reads its contents, populating a + * FusionDictionary structure. + * + * @param deleteDictIfBroken a flag indicating whether this method should remove the broken + * dictionary or not. + * @return the created dictionary. + */ + @UsedForTesting + public FusionDictionary readDictionaryBinary(final boolean deleteDictIfBroken) + throws FileNotFoundException, IOException, UnsupportedFormatException; + + /** + * Gets the address of the last PtNode of the exact matching word in the dictionary. + * If no match is found, returns NOT_VALID_WORD. + * + * @param word the word we search for. + * @return the address of the terminal node. + * @throws IOException if the file can't be read. + * @throws UnsupportedFormatException if the format of the file is not recognized. + */ + @UsedForTesting + public int getTerminalPosition(final String word) + throws IOException, UnsupportedFormatException; + + /** + * Reads unigrams and bigrams from the binary file. + * Doesn't store a full memory representation of the dictionary. + * + * @param words the map to store the address as a key and the word as a value. + * @param frequencies the map to store the address as a key and the frequency as a value. + * @param bigrams the map to store the address as a key and the list of address as a value. + * @throws IOException if the file can't be read. + * @throws UnsupportedFormatException if the format of the file is not recognized. + */ + @UsedForTesting + public void readUnigramsAndBigramsBinary(final TreeMap<Integer, String> words, + final TreeMap<Integer, Integer> frequencies, + final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams) + throws IOException, UnsupportedFormatException; + + /** + * Sets the position of the buffer to the given value. + * + * @param newPos the new position + */ + public void setPosition(final int newPos); + + /** + * Gets the position of the buffer. + * + * @return the position + */ + public int getPosition(); + + /** + * Reads and returns the PtNode count out of a buffer and forwards the pointer. + */ + public int readPtNodeCount(); + + /** + * Opens the dictionary file and makes DictBuffer. + */ + @UsedForTesting + public void openDictBuffer() throws FileNotFoundException, IOException, + UnsupportedFormatException; + @UsedForTesting + public boolean isDictBufferOpen(); + + // Constants for DictionaryBufferFactory. + public static final int USE_READONLY_BYTEBUFFER = 0x01000000; + public static final int USE_BYTEARRAY = 0x02000000; + public static final int USE_WRITABLE_BYTEBUFFER = 0x03000000; + public static final int MASK_DICTBUFFER = 0x0F000000; + + public interface DictionaryBufferFactory { + public DictBuffer getDictionaryBuffer(final File file) + throws FileNotFoundException, IOException; + } + + /** + * Creates DictionaryBuffer using a ByteBuffer + * + * This class uses less memory than DictionaryBufferFromByteArrayFactory, + * but doesn't perform as fast. + * When operating on a big dictionary, this class is preferred. + */ + public static final class DictionaryBufferFromReadOnlyByteBufferFactory + implements DictionaryBufferFactory { + @Override + public DictBuffer getDictionaryBuffer(final File file) + throws FileNotFoundException, IOException { + FileInputStream inStream = null; + ByteBuffer buffer = null; + try { + inStream = new FileInputStream(file); + buffer = inStream.getChannel().map(FileChannel.MapMode.READ_ONLY, + 0, file.length()); + } finally { + if (inStream != null) { + inStream.close(); + } + } + if (buffer != null) { + return new BinaryDictDecoderUtils.ByteBufferDictBuffer(buffer); + } + return null; + } + } + + /** + * Creates DictionaryBuffer using a byte array + * + * This class performs faster than other classes, but consumes more memory. + * When operating on a small dictionary, this class is preferred. + */ + public static final class DictionaryBufferFromByteArrayFactory + implements DictionaryBufferFactory { + @Override + public DictBuffer getDictionaryBuffer(final File file) + throws FileNotFoundException, IOException { + FileInputStream inStream = null; + try { + inStream = new FileInputStream(file); + final byte[] array = new byte[(int) file.length()]; + inStream.read(array); + return new ByteArrayDictBuffer(array); + } finally { + if (inStream != null) { + inStream.close(); + } + } + } + } + + /** + * Creates DictionaryBuffer using a writable ByteBuffer and a RandomAccessFile. + * + * This class doesn't perform as fast as other classes, + * but this class is the only option available for destructive operations (insert or delete) + * on a dictionary. + */ + @UsedForTesting + public static final class DictionaryBufferFromWritableByteBufferFactory + implements DictionaryBufferFactory { + @Override + public DictBuffer getDictionaryBuffer(final File file) + throws FileNotFoundException, IOException { + RandomAccessFile raFile = null; + ByteBuffer buffer = null; + try { + raFile = new RandomAccessFile(file, "rw"); + buffer = raFile.getChannel().map(FileChannel.MapMode.READ_WRITE, 0, file.length()); + } finally { + if (raFile != null) { + raFile.close(); + } + } + if (buffer != null) { + return new BinaryDictDecoderUtils.ByteBufferDictBuffer(buffer); + } + return null; + } + } + + /** + * @return whether this decoder has a valid binary dictionary that it can decode. + */ + public boolean hasValidRawBinaryDictionary(); +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/DictEncoder.java b/tests/src/com/android/inputmethod/latin/makedict/DictEncoder.java new file mode 100644 index 000000000..678c5ca6b --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/DictEncoder.java @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; + +import java.io.IOException; + +/** + * An interface of binary dictionary encoder. + */ +public interface DictEncoder { + @UsedForTesting + public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) + throws IOException, UnsupportedFormatException; + + public void setPosition(final int position); + public int getPosition(); + public void writePtNodeCount(final int ptNodeCount); + public void writeForwardLinkAddress(final int forwardLinkAddress); + public void writePtNode(final PtNode ptNode, final FusionDictionary dict); +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/tests/src/com/android/inputmethod/latin/makedict/FusionDictionary.java new file mode 100644 index 000000000..f60b3af4f --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -0,0 +1,717 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.Constants; +import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedList; + +/** + * A dictionary that can fusion heads and tails of words for more compression. + */ +@UsedForTesting +public final class FusionDictionary implements Iterable<WordProperty> { + private static final boolean DBG = MakedictLog.DBG; + + private static int CHARACTER_NOT_FOUND_INDEX = -1; + + /** + * A node array of the dictionary, containing several PtNodes. + * + * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the + * real information. + * This class also contains fields to cache size and address, to help with binary + * generation. + */ + public static final class PtNodeArray { + ArrayList<PtNode> mData; + // To help with binary generation + int mCachedSize = Integer.MIN_VALUE; + // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They + // always hold the same value except between dictionary address compression, during which + // the update process needs to know about both values at the same time. Updating will + // update the AfterUpdate value, and the code will move them to BeforeUpdate before + // the next update pass. + int mCachedAddressBeforeUpdate = Integer.MIN_VALUE; + int mCachedAddressAfterUpdate = Integer.MIN_VALUE; + int mCachedParentAddress = 0; + + public PtNodeArray() { + mData = new ArrayList<PtNode>(); + } + public PtNodeArray(ArrayList<PtNode> data) { + Collections.sort(data, PTNODE_COMPARATOR); + mData = data; + } + } + + /** + * PtNode is a group of characters, with probability information, shortcut targets, bigrams, + * and children (Pt means Patricia Trie). + * + * This is the central class of the in-memory representation. A PtNode is what can + * be seen as a traditional "trie node", except it can hold several characters at the + * same time. A PtNode essentially represents one or several characters in the middle + * of the trie tree; as such, it can be a terminal, and it can have children. + * In this in-memory representation, whether the PtNode is a terminal or not is represented + * by mProbabilityInfo. The PtNode is a terminal when the mProbabilityInfo is not null and the + * PtNode is not a terminal when the mProbabilityInfo is null. A terminal may have non-null + * shortcuts and/or bigrams, but a non-terminal may not. Moreover, children, if present, + * are non-null. + */ + public static final class PtNode { + private static final int NOT_A_TERMINAL = -1; + final int mChars[]; + ArrayList<WeightedString> mShortcutTargets; + ArrayList<WeightedString> mBigrams; + // null == mProbabilityInfo indicates this is not a terminal. + ProbabilityInfo mProbabilityInfo; + int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal. + PtNodeArray mChildren; + boolean mIsNotAWord; // Only a shortcut + boolean mIsBlacklistEntry; + // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary + // generation. Before and After always hold the same value except during dictionary + // address compression, where the update process needs to know about both values at the + // same time. Updating will update the AfterUpdate value, and the code will move them + // to BeforeUpdate before the next update pass. + // The update process does not need two versions of mCachedSize. + int mCachedSize; // The size, in bytes, of this PtNode. + int mCachedAddressBeforeUpdate; // The address of this PtNode (before update) + int mCachedAddressAfterUpdate; // The address of this PtNode (after update) + + public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, + final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, + final boolean isNotAWord, final boolean isBlacklistEntry) { + mChars = chars; + mProbabilityInfo = probabilityInfo; + mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability; + mShortcutTargets = shortcutTargets; + mBigrams = bigrams; + mChildren = null; + mIsNotAWord = isNotAWord; + mIsBlacklistEntry = isBlacklistEntry; + } + + public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, + final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, + final boolean isNotAWord, final boolean isBlacklistEntry, + final PtNodeArray children) { + mChars = chars; + mProbabilityInfo = probabilityInfo; + mShortcutTargets = shortcutTargets; + mBigrams = bigrams; + mChildren = children; + mIsNotAWord = isNotAWord; + mIsBlacklistEntry = isBlacklistEntry; + } + + public void addChild(PtNode n) { + if (null == mChildren) { + mChildren = new PtNodeArray(); + } + mChildren.mData.add(n); + } + + public int getTerminalId() { + return mTerminalId; + } + + public boolean isTerminal() { + return mProbabilityInfo != null; + } + + public int getProbability() { + if (isTerminal()) { + return mProbabilityInfo.mProbability; + } else { + return NOT_A_TERMINAL; + } + } + + public boolean getIsNotAWord() { + return mIsNotAWord; + } + + public boolean getIsBlacklistEntry() { + return mIsBlacklistEntry; + } + + public ArrayList<WeightedString> getShortcutTargets() { + // We don't want write permission to escape outside the package, so we return a copy + if (null == mShortcutTargets) return null; + final ArrayList<WeightedString> copyOfShortcutTargets = + new ArrayList<WeightedString>(mShortcutTargets); + return copyOfShortcutTargets; + } + + public ArrayList<WeightedString> getBigrams() { + // We don't want write permission to escape outside the package, so we return a copy + if (null == mBigrams) return null; + final ArrayList<WeightedString> copyOfBigrams = new ArrayList<WeightedString>(mBigrams); + return copyOfBigrams; + } + + public boolean hasSeveralChars() { + assert(mChars.length > 0); + return 1 < mChars.length; + } + + /** + * Adds a word to the bigram list. Updates the probability information if the word already + * exists. + */ + public void addBigram(final String word, final ProbabilityInfo probabilityInfo) { + if (mBigrams == null) { + mBigrams = new ArrayList<WeightedString>(); + } + WeightedString bigram = getBigram(word); + if (bigram != null) { + bigram.mProbabilityInfo = probabilityInfo; + } else { + bigram = new WeightedString(word, probabilityInfo); + mBigrams.add(bigram); + } + } + + /** + * Gets the shortcut target for the given word. Returns null if the word is not in the + * shortcut list. + */ + public WeightedString getShortcut(final String word) { + // TODO: Don't do a linear search + if (mShortcutTargets != null) { + final int size = mShortcutTargets.size(); + for (int i = 0; i < size; ++i) { + WeightedString shortcut = mShortcutTargets.get(i); + if (shortcut.mWord.equals(word)) { + return shortcut; + } + } + } + return null; + } + + /** + * Gets the bigram for the given word. + * Returns null if the word is not in the bigrams list. + */ + public WeightedString getBigram(final String word) { + // TODO: Don't do a linear search + if (mBigrams != null) { + final int size = mBigrams.size(); + for (int i = 0; i < size; ++i) { + WeightedString bigram = mBigrams.get(i); + if (bigram.mWord.equals(word)) { + return bigram; + } + } + } + return null; + } + + /** + * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to + * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only + * updated if they are higher than the existing ones. + */ + private void update(final ProbabilityInfo probabilityInfo, + final ArrayList<WeightedString> shortcutTargets, + final ArrayList<WeightedString> bigrams, + final boolean isNotAWord, final boolean isBlacklistEntry) { + mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo); + if (shortcutTargets != null) { + if (mShortcutTargets == null) { + mShortcutTargets = shortcutTargets; + } else { + final int size = shortcutTargets.size(); + for (int i = 0; i < size; ++i) { + final WeightedString shortcut = shortcutTargets.get(i); + final WeightedString existingShortcut = getShortcut(shortcut.mWord); + if (existingShortcut == null) { + mShortcutTargets.add(shortcut); + } else { + existingShortcut.mProbabilityInfo = ProbabilityInfo.max( + existingShortcut.mProbabilityInfo, shortcut.mProbabilityInfo); + } + } + } + } + if (bigrams != null) { + if (mBigrams == null) { + mBigrams = bigrams; + } else { + final int size = bigrams.size(); + for (int i = 0; i < size; ++i) { + final WeightedString bigram = bigrams.get(i); + final WeightedString existingBigram = getBigram(bigram.mWord); + if (existingBigram == null) { + mBigrams.add(bigram); + } else { + existingBigram.mProbabilityInfo = ProbabilityInfo.max( + existingBigram.mProbabilityInfo, bigram.mProbabilityInfo); + } + } + } + } + mIsNotAWord = isNotAWord; + mIsBlacklistEntry = isBlacklistEntry; + } + } + + public final DictionaryOptions mOptions; + public final PtNodeArray mRootNodeArray; + + public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) { + mRootNodeArray = rootNodeArray; + mOptions = options; + } + + public void addOptionAttribute(final String key, final String value) { + mOptions.mAttributes.put(key, value); + } + + /** + * Helper method to convert a String to an int array. + */ + static int[] getCodePoints(final String word) { + // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray, + // which is not visible from the makedict package. Factor this code. + final int length = word.length(); + if (length <= 0) return new int[] {}; + final char[] characters = word.toCharArray(); + final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; + int codePoint = Character.codePointAt(characters, 0); + int dsti = 0; + for (int srci = Character.charCount(codePoint); + srci < length; srci += Character.charCount(codePoint), ++dsti) { + codePoints[dsti] = codePoint; + codePoint = Character.codePointAt(characters, srci); + } + codePoints[dsti] = codePoint; + return codePoints; + } + + /** + * Helper method to add a word as a string. + * + * This method adds a word to the dictionary with the given frequency. Optional + * lists of bigrams and shortcuts can be passed here. For each word inside, + * they will be added to the dictionary as necessary. + * + * @param word the word to add. + * @param probabilityInfo probability information of the word. + * @param shortcutTargets a list of shortcut targets for this word, or null. + * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) + */ + public void add(final String word, final ProbabilityInfo probabilityInfo, + final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { + add(getCodePoints(word), probabilityInfo, shortcutTargets, isNotAWord, + false /* isBlacklistEntry */); + } + + /** + * Helper method to add a blacklist entry as a string. + * + * @param word the word to add as a blacklist entry. + * @param shortcutTargets a list of shortcut targets for this word, or null. + * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) + */ + public void addBlacklistEntry(final String word, + final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { + add(getCodePoints(word), new ProbabilityInfo(0), shortcutTargets, isNotAWord, + true /* isBlacklistEntry */); + } + + /** + * Sanity check for a PtNode array. + * + * This method checks that all PtNodes in a node array are ordered as expected. + * If they are, nothing happens. If they aren't, an exception is thrown. + */ + private void checkStack(PtNodeArray ptNodeArray) { + ArrayList<PtNode> stack = ptNodeArray.mData; + int lastValue = -1; + for (int i = 0; i < stack.size(); ++i) { + int currentValue = stack.get(i).mChars[0]; + if (currentValue <= lastValue) + throw new RuntimeException("Invalid stack"); + else + lastValue = currentValue; + } + } + + /** + * Helper method to add a new bigram to the dictionary. + * + * @param word0 the previous word of the context + * @param word1 the next word of the context + * @param probabilityInfo the bigram probability info + */ + public void setBigram(final String word0, final String word1, + final ProbabilityInfo probabilityInfo) { + PtNode ptNode0 = findWordInTree(mRootNodeArray, word0); + if (ptNode0 != null) { + final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1); + if (ptNode1 == null) { + add(getCodePoints(word1), new ProbabilityInfo(0), null, false /* isNotAWord */, + false /* isBlacklistEntry */); + // The PtNode for the first word may have moved by the above insertion, + // if word1 and word2 share a common stem that happens not to have been + // a cutting point until now. In this case, we need to refresh ptNode. + ptNode0 = findWordInTree(mRootNodeArray, word0); + } + ptNode0.addBigram(word1, probabilityInfo); + } else { + throw new RuntimeException("First word of bigram not found " + word0); + } + } + + /** + * Add a word to this dictionary. + * + * The shortcuts, if any, have to be in the dictionary already. If they aren't, + * an exception is thrown. + * + * @param word the word, as an int array. + * @param probabilityInfo the probability information of the word. + * @param shortcutTargets an optional list of shortcut targets for this word (null if none). + * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) + * @param isBlacklistEntry true if this is a blacklisted word, false otherwise + */ + private void add(final int[] word, final ProbabilityInfo probabilityInfo, + final ArrayList<WeightedString> shortcutTargets, + final boolean isNotAWord, final boolean isBlacklistEntry) { + assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY); + if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) { + MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); + return; + } + + PtNodeArray currentNodeArray = mRootNodeArray; + int charIndex = 0; + + PtNode currentPtNode = null; + int differentCharIndex = 0; // Set by the loop to the index of the char that differs + int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]); + while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) { + currentPtNode = currentNodeArray.mData.get(nodeIndex); + differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex); + if (ARRAYS_ARE_EQUAL != differentCharIndex + && differentCharIndex < currentPtNode.mChars.length) break; + if (null == currentPtNode.mChildren) break; + charIndex += currentPtNode.mChars.length; + if (charIndex >= word.length) break; + currentNodeArray = currentPtNode.mChildren; + nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]); + } + + if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) { + // No node at this point to accept the word. Create one. + final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]); + final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length), + shortcutTargets, null /* bigrams */, probabilityInfo, isNotAWord, + isBlacklistEntry); + currentNodeArray.mData.add(insertionIndex, newPtNode); + if (DBG) checkStack(currentNodeArray); + } else { + // There is a word with a common prefix. + if (differentCharIndex == currentPtNode.mChars.length) { + if (charIndex + differentCharIndex >= word.length) { + // The new word is a prefix of an existing word, but the node on which it + // should end already exists as is. Since the old PtNode was not a terminal, + // make it one by filling in its frequency and other attributes + currentPtNode.update(probabilityInfo, shortcutTargets, null, isNotAWord, + isBlacklistEntry); + } else { + // The new word matches the full old word and extends past it. + // We only have to create a new node and add it to the end of this. + final PtNode newNode = new PtNode( + Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), + shortcutTargets, null /* bigrams */, probabilityInfo, + isNotAWord, isBlacklistEntry); + currentPtNode.mChildren = new PtNodeArray(); + currentPtNode.mChildren.mData.add(newNode); + } + } else { + if (0 == differentCharIndex) { + // Exact same word. Update the frequency if higher. This will also add the + // new shortcuts to the existing shortcut list if it already exists. + currentPtNode.update(probabilityInfo, shortcutTargets, null, + currentPtNode.mIsNotAWord && isNotAWord, + currentPtNode.mIsBlacklistEntry || isBlacklistEntry); + } else { + // Partial prefix match only. We have to replace the current node with a node + // containing the current prefix and create two new ones for the tails. + PtNodeArray newChildren = new PtNodeArray(); + final PtNode newOldWord = new PtNode( + Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex, + currentPtNode.mChars.length), currentPtNode.mShortcutTargets, + currentPtNode.mBigrams, currentPtNode.mProbabilityInfo, + currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry, + currentPtNode.mChildren); + newChildren.mData.add(newOldWord); + + final PtNode newParent; + if (charIndex + differentCharIndex >= word.length) { + newParent = new PtNode( + Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), + shortcutTargets, null /* bigrams */, probabilityInfo, + isNotAWord, isBlacklistEntry, newChildren); + } else { + newParent = new PtNode( + Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), + null /* shortcutTargets */, null /* bigrams */, + null /* probabilityInfo */, false /* isNotAWord */, + false /* isBlacklistEntry */, newChildren); + final PtNode newWord = new PtNode(Arrays.copyOfRange(word, + charIndex + differentCharIndex, word.length), + shortcutTargets, null /* bigrams */, probabilityInfo, + isNotAWord, isBlacklistEntry); + final int addIndex = word[charIndex + differentCharIndex] + > currentPtNode.mChars[differentCharIndex] ? 1 : 0; + newChildren.mData.add(addIndex, newWord); + } + currentNodeArray.mData.set(nodeIndex, newParent); + } + if (DBG) checkStack(currentNodeArray); + } + } + } + + private static int ARRAYS_ARE_EQUAL = 0; + + /** + * Custom comparison of two int arrays taken to contain character codes. + * + * This method compares the two arrays passed as an argument in a lexicographic way, + * with an offset in the dst string. + * This method does NOT test for the first character. It is taken to be equal. + * I repeat: this method starts the comparison at 1 <> dstOffset + 1. + * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the + * strings are equal. This works BECAUSE we don't look at the first character. + * + * @param src the left-hand side string of the comparison. + * @param dst the right-hand side string of the comparison. + * @param dstOffset the offset in the right-hand side string. + * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. + */ + private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) { + // We do NOT test the first char, because we come from a method that already + // tested it. + for (int i = 1; i < src.length; ++i) { + if (dstOffset + i >= dst.length) return i; + if (src[i] != dst[dstOffset + i]) return i; + } + if (dst.length > src.length) return src.length; + return ARRAYS_ARE_EQUAL; + } + + /** + * Helper class that compares and sorts two PtNodes according to their + * first element only. I repeat: ONLY the first element is considered, the rest + * is ignored. + * This comparator imposes orderings that are inconsistent with equals. + */ + static private final class PtNodeComparator implements java.util.Comparator<PtNode> { + @Override + public int compare(PtNode p1, PtNode p2) { + if (p1.mChars[0] == p2.mChars[0]) return 0; + return p1.mChars[0] < p2.mChars[0] ? -1 : 1; + } + } + final static private PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator(); + + /** + * Finds the insertion index of a character within a node array. + */ + private static int findInsertionIndex(final PtNodeArray nodeArray, int character) { + final ArrayList<PtNode> data = nodeArray.mData; + final PtNode reference = new PtNode(new int[] { character }, + null /* shortcutTargets */, null /* bigrams */, null /* probabilityInfo */, + false /* isNotAWord */, false /* isBlacklistEntry */); + int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR); + return result >= 0 ? result : -result - 1; + } + + /** + * Find the index of a char in a node array, if it exists. + * + * @param nodeArray the node array to search in. + * @param character the character to search for. + * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else. + */ + private static int findIndexOfChar(final PtNodeArray nodeArray, int character) { + final int insertionIndex = findInsertionIndex(nodeArray, character); + if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX; + return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex + : CHARACTER_NOT_FOUND_INDEX; + } + + /** + * Helper method to find a word in a given branch. + */ + @SuppressWarnings("unused") + public static PtNode findWordInTree(PtNodeArray nodeArray, final String string) { + int index = 0; + final StringBuilder checker = DBG ? new StringBuilder() : null; + final int[] codePoints = getCodePoints(string); + + PtNode currentPtNode; + do { + int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]); + if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null; + currentPtNode = nodeArray.mData.get(indexOfGroup); + + if (codePoints.length - index < currentPtNode.mChars.length) return null; + int newIndex = index; + while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) { + if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null; + newIndex++; + } + index = newIndex; + + if (DBG) { + checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length)); + } + if (index < codePoints.length) { + nodeArray = currentPtNode.mChildren; + } + } while (null != nodeArray && index < codePoints.length); + + if (index < codePoints.length) return null; + if (!currentPtNode.isTerminal()) return null; + if (DBG && !string.equals(checker.toString())) return null; + return currentPtNode; + } + + /** + * Helper method to find out whether a word is in the dict or not. + */ + public boolean hasWord(final String s) { + if (null == s || "".equals(s)) { + throw new RuntimeException("Can't search for a null or empty string"); + } + return null != findWordInTree(mRootNodeArray, s); + } + + /** + * Recursively count the number of PtNodes in a given branch of the trie. + * + * @param nodeArray the parent node. + * @return the number of PtNodes in all the branch under this node. + */ + public static int countPtNodes(final PtNodeArray nodeArray) { + final int nodeSize = nodeArray.mData.size(); + int size = nodeSize; + for (int i = nodeSize - 1; i >= 0; --i) { + PtNode ptNode = nodeArray.mData.get(i); + if (null != ptNode.mChildren) + size += countPtNodes(ptNode.mChildren); + } + return size; + } + + /** + * Iterator to walk through a dictionary. + * + * This is purely for convenience. + */ + public static final class DictionaryIterator implements Iterator<WordProperty> { + private static final class Position { + public Iterator<PtNode> pos; + public int length; + public Position(ArrayList<PtNode> ptNodes) { + pos = ptNodes.iterator(); + length = 0; + } + } + final StringBuilder mCurrentString; + final LinkedList<Position> mPositions; + + public DictionaryIterator(ArrayList<PtNode> ptRoot) { + mCurrentString = new StringBuilder(); + mPositions = new LinkedList<Position>(); + final Position rootPos = new Position(ptRoot); + mPositions.add(rootPos); + } + + @Override + public boolean hasNext() { + for (Position p : mPositions) { + if (p.pos.hasNext()) { + return true; + } + } + return false; + } + + @Override + public WordProperty next() { + Position currentPos = mPositions.getLast(); + mCurrentString.setLength(currentPos.length); + + do { + if (currentPos.pos.hasNext()) { + final PtNode currentPtNode = currentPos.pos.next(); + currentPos.length = mCurrentString.length(); + for (int i : currentPtNode.mChars) { + mCurrentString.append(Character.toChars(i)); + } + if (null != currentPtNode.mChildren) { + currentPos = new Position(currentPtNode.mChildren.mData); + currentPos.length = mCurrentString.length(); + mPositions.addLast(currentPos); + } + if (currentPtNode.isTerminal()) { + return new WordProperty(mCurrentString.toString(), + currentPtNode.mProbabilityInfo, + currentPtNode.mShortcutTargets, currentPtNode.mBigrams, + currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry); + } + } else { + mPositions.removeLast(); + currentPos = mPositions.getLast(); + mCurrentString.setLength(mPositions.getLast().length); + } + } while (true); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Unsupported yet"); + } + + } + + /** + * Method to return an iterator. + * + * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x + * and say : for (Word w : x) {} + */ + @Override + public Iterator<WordProperty> iterator() { + return new DictionaryIterator(mRootNodeArray.mData); + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/MakedictLog.java b/tests/src/com/android/inputmethod/latin/makedict/MakedictLog.java new file mode 100644 index 000000000..7eccff2b4 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/MakedictLog.java @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.android.inputmethod.latin.makedict; + +/** + * Wrapper to redirect log events to the right output medium. + */ +public class MakedictLog { + public static final boolean DBG = true; + + private static void print(String message) { + System.out.println(message); + } + + public static void d(String message) { + print(message); + } + + public static void i(String message) { + print(message); + } + + public static void w(String message) { + print(message); + } + + public static void e(String message) { + print(message); + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/PendingAttribute.java b/tests/src/com/android/inputmethod/latin/makedict/PendingAttribute.java new file mode 100644 index 000000000..70e24cc98 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/PendingAttribute.java @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +/** + * A not-yet-resolved attribute. + * + * An attribute is either a bigram or a shortcut. + * All instances of this class are always immutable. + */ +public final class PendingAttribute { + public final int mFrequency; + public final int mAddress; + public PendingAttribute(final int frequency, final int address) { + mFrequency = frequency; + mAddress = address; + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/PtNodeInfo.java b/tests/src/com/android/inputmethod/latin/makedict/PtNodeInfo.java new file mode 100644 index 000000000..862e8c101 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/PtNodeInfo.java @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import java.util.ArrayList; + +/** + * Raw PtNode info straight out of a file. This will contain numbers for addresses. + */ +public final class PtNodeInfo { + public final int mOriginalAddress; + public final int mEndAddress; + public final int mFlags; + public final int[] mCharacters; + public final ProbabilityInfo mProbabilityInfo; + public final int mChildrenAddress; + public final ArrayList<WeightedString> mShortcutTargets; + public final ArrayList<PendingAttribute> mBigrams; + + public PtNodeInfo(final int originalAddress, final int endAddress, final int flags, + final int[] characters, final ProbabilityInfo probabilityInfo, + final int childrenAddress, final ArrayList<WeightedString> shortcutTargets, + final ArrayList<PendingAttribute> bigrams) { + mOriginalAddress = originalAddress; + mEndAddress = endAddress; + mFlags = flags; + mCharacters = characters; + mProbabilityInfo = probabilityInfo; + mChildrenAddress = childrenAddress; + mShortcutTargets = shortcutTargets; + mBigrams = bigrams; + } + + public boolean isTerminal() { + return mProbabilityInfo != null; + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/Ver2DictDecoder.java b/tests/src/com/android/inputmethod/latin/makedict/Ver2DictDecoder.java new file mode 100644 index 000000000..7091c119e --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/Ver2DictDecoder.java @@ -0,0 +1,324 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.BinaryDictionary; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; +import com.android.inputmethod.latin.utils.CollectionUtils; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * An implementation of DictDecoder for version 2 binary dictionary. + */ +// TODO: Separate logics that are used only for testing. +@UsedForTesting +public class Ver2DictDecoder extends AbstractDictDecoder { + private static final String TAG = Ver2DictDecoder.class.getSimpleName(); + + /** + * A utility class for reading a PtNode. + */ + protected static class PtNodeReader { + private static ProbabilityInfo readProbabilityInfo(final DictBuffer dictBuffer) { + // Ver2 dicts don't contain historical information. + return new ProbabilityInfo(dictBuffer.readUnsignedByte()); + } + + protected static int readPtNodeOptionFlags(final DictBuffer dictBuffer) { + return dictBuffer.readUnsignedByte(); + } + + protected static int readChildrenAddress(final DictBuffer dictBuffer, + final int ptNodeFlags) { + switch (ptNodeFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: + return dictBuffer.readUnsignedByte(); + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES: + return dictBuffer.readUnsignedShort(); + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES: + return dictBuffer.readUnsignedInt24(); + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS: + default: + return FormatSpec.NO_CHILDREN_ADDRESS; + } + } + + // Reads shortcuts and returns the read length. + protected static int readShortcut(final DictBuffer dictBuffer, + final ArrayList<WeightedString> shortcutTargets) { + final int pointerBefore = dictBuffer.position(); + dictBuffer.readUnsignedShort(); // skip the size + while (true) { + final int targetFlags = dictBuffer.readUnsignedByte(); + final String word = CharEncoding.readString(dictBuffer); + shortcutTargets.add(new WeightedString(word, + targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); + if (0 == (targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; + } + return dictBuffer.position() - pointerBefore; + } + + protected static int readBigramAddresses(final DictBuffer dictBuffer, + final ArrayList<PendingAttribute> bigrams, final int baseAddress) { + int readLength = 0; + int bigramCount = 0; + while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + final int bigramFlags = dictBuffer.readUnsignedByte(); + ++readLength; + final int sign = 0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE) + ? 1 : -1; + int bigramAddress = baseAddress + readLength; + switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) { + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE: + bigramAddress += sign * dictBuffer.readUnsignedByte(); + readLength += 1; + break; + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES: + bigramAddress += sign * dictBuffer.readUnsignedShort(); + readLength += 2; + break; + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES: + bigramAddress += sign * dictBuffer.readUnsignedInt24(); + readLength += 3; + break; + default: + throw new RuntimeException("Has bigrams with no address"); + } + bigrams.add(new PendingAttribute( + bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, + bigramAddress)); + if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; + } + return readLength; + } + } + + protected final File mDictionaryBinaryFile; + protected final long mOffset; + protected final long mLength; + // TODO: Remove mBufferFactory and mDictBuffer from this class members because they are now + // used only for testing. + private final DictionaryBufferFactory mBufferFactory; + protected DictBuffer mDictBuffer; + + @UsedForTesting + /* package */ Ver2DictDecoder(final File file, final long offset, final long length, + final int factoryFlag) { + mDictionaryBinaryFile = file; + mOffset = offset; + mLength = length; + mDictBuffer = null; + if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) { + mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); + } else if ((factoryFlag & MASK_DICTBUFFER) == USE_BYTEARRAY) { + mBufferFactory = new DictionaryBufferFromByteArrayFactory(); + } else if ((factoryFlag & MASK_DICTBUFFER) == USE_WRITABLE_BYTEBUFFER) { + mBufferFactory = new DictionaryBufferFromWritableByteBufferFactory(); + } else { + mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); + } + } + + /* package */ Ver2DictDecoder(final File file, final long offset, final long length, + final DictionaryBufferFactory factory) { + mDictionaryBinaryFile = file; + mOffset = offset; + mLength = length; + mBufferFactory = factory; + } + + @Override + public void openDictBuffer() throws FileNotFoundException, IOException { + mDictBuffer = mBufferFactory.getDictionaryBuffer(mDictionaryBinaryFile); + } + + @Override + public boolean isDictBufferOpen() { + return mDictBuffer != null; + } + + /* package */ DictBuffer getDictBuffer() { + return mDictBuffer; + } + + @UsedForTesting + /* package */ DictBuffer openAndGetDictBuffer() throws FileNotFoundException, IOException { + openDictBuffer(); + return getDictBuffer(); + } + + @Override + public DictionaryHeader readHeader() throws IOException, UnsupportedFormatException { + // dictType is not being used in dicttool. Passing an empty string. + final BinaryDictionary binaryDictionary = new BinaryDictionary( + mDictionaryBinaryFile.getAbsolutePath(), mOffset, mLength, + true /* useFullEditDistance */, null /* locale */, "" /* dictType */, + false /* isUpdatable */); + final DictionaryHeader header = binaryDictionary.getHeader(); + binaryDictionary.close(); + if (header == null) { + throw new IOException("Cannot read the dictionary header."); + } + if (header.mFormatOptions.mVersion != FormatSpec.VERSION2) { + throw new UnsupportedFormatException("File header has a wrong version : " + + header.mFormatOptions.mVersion); + } + if (!isDictBufferOpen()) { + openDictBuffer(); + } + // Advance buffer reading position to the head of dictionary body. + setPosition(header.mBodyOffset); + return header; + } + + // TODO: Make this buffer multi thread safe. + private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; + @Override + public PtNodeInfo readPtNode(final int ptNodePos) { + int addressPointer = ptNodePos; + final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); + addressPointer += FormatSpec.PTNODE_FLAGS_SIZE; + final int characters[]; + if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) { + int index = 0; + int character = CharEncoding.readChar(mDictBuffer); + addressPointer += CharEncoding.getCharSize(character); + while (FormatSpec.INVALID_CHARACTER != character) { + // FusionDictionary is making sure that the length of the word is smaller than + // MAX_WORD_LENGTH. + // So we'll never write past the end of mCharacterBuffer. + mCharacterBuffer[index++] = character; + character = CharEncoding.readChar(mDictBuffer); + addressPointer += CharEncoding.getCharSize(character); + } + characters = Arrays.copyOfRange(mCharacterBuffer, 0, index); + } else { + final int character = CharEncoding.readChar(mDictBuffer); + addressPointer += CharEncoding.getCharSize(character); + characters = new int[] { character }; + } + final ProbabilityInfo probabilityInfo; + if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { + probabilityInfo = PtNodeReader.readProbabilityInfo(mDictBuffer); + addressPointer += FormatSpec.PTNODE_FREQUENCY_SIZE; + } else { + probabilityInfo = null; + } + int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags); + if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { + childrenAddress += addressPointer; + } + addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags); + final ArrayList<WeightedString> shortcutTargets; + if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) { + // readShortcut will add shortcuts to shortcutTargets. + shortcutTargets = new ArrayList<WeightedString>(); + addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets); + } else { + shortcutTargets = null; + } + + final ArrayList<PendingAttribute> bigrams; + if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { + bigrams = new ArrayList<PendingAttribute>(); + addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams, + addressPointer); + if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size() + + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); + } + } else { + bigrams = null; + } + return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, probabilityInfo, + childrenAddress, shortcutTargets, bigrams); + } + + @Override + public FusionDictionary readDictionaryBinary(final boolean deleteDictIfBroken) + throws FileNotFoundException, IOException, UnsupportedFormatException { + // dictType is not being used in dicttool. Passing an empty string. + final BinaryDictionary binaryDictionary = new BinaryDictionary( + mDictionaryBinaryFile.getAbsolutePath(), 0 /* offset */, + mDictionaryBinaryFile.length() /* length */, true /* useFullEditDistance */, + null /* locale */, "" /* dictType */, false /* isUpdatable */); + final DictionaryHeader header = readHeader(); + final FusionDictionary fusionDict = + new FusionDictionary(new FusionDictionary.PtNodeArray(), header.mDictionaryOptions); + int token = 0; + final ArrayList<WordProperty> wordProperties = CollectionUtils.newArrayList(); + do { + final BinaryDictionary.GetNextWordPropertyResult result = + binaryDictionary.getNextWordProperty(token); + final WordProperty wordProperty = result.mWordProperty; + if (wordProperty == null) { + binaryDictionary.close(); + if (deleteDictIfBroken) { + mDictionaryBinaryFile.delete(); + } + return null; + } + wordProperties.add(wordProperty); + token = result.mNextToken; + } while (token != 0); + + // Insert unigrams into the fusion dictionary. + for (final WordProperty wordProperty : wordProperties) { + if (wordProperty.mIsBlacklistEntry) { + fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets, + wordProperty.mIsNotAWord); + } else { + fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo, + wordProperty.mShortcutTargets, wordProperty.mIsNotAWord); + } + } + // Insert bigrams into the fusion dictionary. + for (final WordProperty wordProperty : wordProperties) { + if (wordProperty.mBigrams == null) { + continue; + } + final String word0 = wordProperty.mWord; + for (final WeightedString bigram : wordProperty.mBigrams) { + fusionDict.setBigram(word0, bigram.mWord, bigram.mProbabilityInfo); + } + } + binaryDictionary.close(); + return fusionDict; + } + + @Override + public void setPosition(int newPos) { + mDictBuffer.position(newPos); + } + + @Override + public int getPosition() { + return mDictBuffer.position(); + } + + @Override + public int readPtNodeCount() { + return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer); + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java b/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java new file mode 100644 index 000000000..a286190cb --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java @@ -0,0 +1,240 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Iterator; + +/** + * An implementation of DictEncoder for version 2 binary dictionary. + */ +@UsedForTesting +public class Ver2DictEncoder implements DictEncoder { + + private final File mDictFile; + private OutputStream mOutStream; + private byte[] mBuffer; + private int mPosition; + + @UsedForTesting + public Ver2DictEncoder(final File dictFile) { + mDictFile = dictFile; + mOutStream = null; + mBuffer = null; + } + + // This constructor is used only by BinaryDictOffdeviceUtilsTests. + // If you want to use this in the production code, you should consider keeping consistency of + // the interface of Ver3DictDecoder by using factory. + @UsedForTesting + public Ver2DictEncoder(final OutputStream outStream) { + mDictFile = null; + mOutStream = outStream; + } + + private void openStream() throws FileNotFoundException { + mOutStream = new FileOutputStream(mDictFile); + } + + private void close() throws IOException { + if (mOutStream != null) { + mOutStream.close(); + mOutStream = null; + } + } + + @Override + public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) + throws IOException, UnsupportedFormatException { + if (formatOptions.mVersion > FormatSpec.VERSION2) { + throw new UnsupportedFormatException( + "The given format options has wrong version number : " + + formatOptions.mVersion); + } + + if (mOutStream == null) { + openStream(); + } + BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions); + + // Addresses are limited to 3 bytes, but since addresses can be relative to each node + // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding + // the order of the PtNode arrays becomes a quite complicated problem, because though the + // dictionary itself does not have a size limit, each node array must still be within 16MB + // of all its children and parents. As long as this is ensured, the dictionary file may + // grow to any size. + + // Leave the choice of the optimal node order to the flattenTree function. + MakedictLog.i("Flattening the tree..."); + ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); + + MakedictLog.i("Computing addresses..."); + BinaryDictEncoderUtils.computeAddresses(dict, flatNodes); + MakedictLog.i("Checking PtNode array..."); + if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); + + // Create a buffer that matches the final dictionary size. + final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); + final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; + mBuffer = new byte[bufferSize]; + + MakedictLog.i("Writing file..."); + + for (PtNodeArray nodeArray : flatNodes) { + BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray); + } + if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes); + mOutStream.write(mBuffer, 0, mPosition); + + MakedictLog.i("Done"); + close(); + } + + @Override + public void setPosition(final int position) { + if (mBuffer == null || position < 0 || position >= mBuffer.length) return; + mPosition = position; + } + + @Override + public int getPosition() { + return mPosition; + } + + @Override + public void writePtNodeCount(final int ptNodeCount) { + final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); + if (countSize != 1 && countSize != 2) { + throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); + } + final int encodedPtNodeCount = (countSize == 2) ? + (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount, + countSize); + } + + private void writePtNodeFlags(final PtNode ptNode) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, + BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos), + FormatSpec.PTNODE_FLAGS_SIZE); + } + + private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars) { + mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition); + if (hasSeveralChars) { + mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; + } + } + + private void writeFrequency(final int frequency) { + if (frequency >= 0) { + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency, + FormatSpec.PTNODE_FREQUENCY_SIZE); + } + } + + private void writeChildrenPosition(final PtNode ptNode) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode); + mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, + childrenPos); + } + + /** + * Write a shortcut attributes list to mBuffer. + * + * @param shortcuts the shortcut attributes list. + */ + private void writeShortcuts(final ArrayList<WeightedString> shortcuts) { + if (null == shortcuts || shortcuts.isEmpty()) return; + + final int indexOfShortcutByteSize = mPosition; + mPosition += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; + final Iterator<WeightedString> shortcutIterator = shortcuts.iterator(); + while (shortcutIterator.hasNext()) { + final WeightedString target = shortcutIterator.next(); + final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( + shortcutIterator.hasNext(), + target.getProbability()); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, shortcutFlags, + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + final int shortcutShift = CharEncoding.writeString(mBuffer, mPosition, target.mWord); + mPosition += shortcutShift; + } + final int shortcutByteSize = mPosition - indexOfShortcutByteSize; + if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) { + throw new RuntimeException("Shortcut list too large"); + } + BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, indexOfShortcutByteSize, shortcutByteSize, + FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); + } + + /** + * Write a bigram attributes list to mBuffer. + * + * @param bigrams the bigram attributes list. + * @param dict the dictionary the node array is a part of (for relative offsets). + */ + private void writeBigrams(final ArrayList<WeightedString> bigrams, + final FusionDictionary dict) { + if (bigrams == null) return; + + final Iterator<WeightedString> bigramIterator = bigrams.iterator(); + while (bigramIterator.hasNext()) { + final WeightedString bigram = bigramIterator.next(); + final PtNode target = + FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); + final int addressOfBigram = target.mCachedAddressAfterUpdate; + final int unigramFrequencyForThisWord = target.getProbability(); + final int offset = addressOfBigram + - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), + offset, bigram.getProbability(), unigramFrequencyForThisWord, bigram.mWord); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags, + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, + Math.abs(offset)); + } + } + + @Override + public void writeForwardLinkAddress(final int forwardLinkAddress) { + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, forwardLinkAddress, + FormatSpec.FORWARD_LINK_ADDRESS_SIZE); + } + + @Override + public void writePtNode(final PtNode ptNode, final FusionDictionary dict) { + writePtNodeFlags(ptNode); + writeCharacters(ptNode.mChars, ptNode.hasSeveralChars()); + writeFrequency(ptNode.getProbability()); + writeChildrenPosition(ptNode); + writeShortcuts(ptNode.mShortcutTargets); + writeBigrams(ptNode.mBigrams, dict); + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java b/tests/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java new file mode 100644 index 000000000..f3fad7e99 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.BinaryDictionary; +import com.android.inputmethod.latin.utils.CollectionUtils; +import com.android.inputmethod.latin.utils.FileUtils; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; + +/** + * An implementation of binary dictionary decoder for version 4 binary dictionary. + */ +@UsedForTesting +public class Ver4DictDecoder extends AbstractDictDecoder { + private static final String TAG = Ver4DictDecoder.class.getSimpleName(); + + final File mDictDirectory; + + @UsedForTesting + /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) { + this(dictDirectory, null /* factory */); + } + + @UsedForTesting + /* package */ Ver4DictDecoder(final File dictDirectory, final DictionaryBufferFactory factory) { + mDictDirectory = dictDirectory; + + } + + @Override + public DictionaryHeader readHeader() throws IOException, UnsupportedFormatException { + // dictType is not being used in dicttool. Passing an empty string. + final BinaryDictionary binaryDictionary= new BinaryDictionary( + mDictDirectory.getAbsolutePath(), 0 /* offset */, 0 /* length */, + true /* useFullEditDistance */, null /* locale */, + "" /* dictType */, true /* isUpdatable */); + final DictionaryHeader header = binaryDictionary.getHeader(); + binaryDictionary.close(); + if (header == null) { + throw new IOException("Cannot read the dictionary header."); + } + return header; + } + + @Override + public FusionDictionary readDictionaryBinary(final boolean deleteDictIfBroken) + throws FileNotFoundException, IOException, UnsupportedFormatException { + // dictType is not being used in dicttool. Passing an empty string. + final BinaryDictionary binaryDictionary = new BinaryDictionary( + mDictDirectory.getAbsolutePath(), 0 /* offset */, 0 /* length */, + true /* useFullEditDistance */, null /* locale */, + "" /* dictType */, true /* isUpdatable */); + final DictionaryHeader header = readHeader(); + final FusionDictionary fusionDict = + new FusionDictionary(new FusionDictionary.PtNodeArray(), header.mDictionaryOptions); + int token = 0; + final ArrayList<WordProperty> wordProperties = CollectionUtils.newArrayList(); + do { + final BinaryDictionary.GetNextWordPropertyResult result = + binaryDictionary.getNextWordProperty(token); + final WordProperty wordProperty = result.mWordProperty; + if (wordProperty == null) { + binaryDictionary.close(); + if (deleteDictIfBroken) { + FileUtils.deleteRecursively(mDictDirectory); + } + return null; + } + wordProperties.add(wordProperty); + token = result.mNextToken; + } while (token != 0); + + // Insert unigrams into the fusion dictionary. + for (final WordProperty wordProperty : wordProperties) { + if (wordProperty.mIsBlacklistEntry) { + fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets, + wordProperty.mIsNotAWord); + } else { + fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo, + wordProperty.mShortcutTargets, wordProperty.mIsNotAWord); + } + } + // Insert bigrams into the fusion dictionary. + for (final WordProperty wordProperty : wordProperties) { + if (wordProperty.mBigrams == null) { + continue; + } + final String word0 = wordProperty.mWord; + for (final WeightedString bigram : wordProperty.mBigrams) { + fusionDict.setBigram(word0, bigram.mWord, bigram.mProbabilityInfo); + } + } + binaryDictionary.close(); + return fusionDict; + } +} diff --git a/tests/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/tests/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java new file mode 100644 index 000000000..dab9a4315 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java @@ -0,0 +1,127 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.BinaryDictionary; +import com.android.inputmethod.latin.Dictionary; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; +import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; +import com.android.inputmethod.latin.utils.LocaleUtils; + +import java.io.File; +import java.io.IOException; + +/** + * An implementation of DictEncoder for version 4 binary dictionary. + */ +@UsedForTesting +public class Ver4DictEncoder implements DictEncoder { + private final File mDictPlacedDir; + + @UsedForTesting + public Ver4DictEncoder(final File dictPlacedDir) { + mDictPlacedDir = dictPlacedDir; + } + + // TODO: This builds a FusionDictionary first and iterates it to add words to the binary + // dictionary. However, it is possible to just add words directly to the binary dictionary + // instead. + // In the long run, when we stop supporting version 2, FusionDictionary will become deprecated + // and we can remove it. Then we'll be able to just call BinaryDictionary directly. + @Override + public void writeDictionary(FusionDictionary dict, FormatOptions formatOptions) + throws IOException, UnsupportedFormatException { + if (formatOptions.mVersion != FormatSpec.VERSION4) { + throw new UnsupportedFormatException("File header has a wrong version number : " + + formatOptions.mVersion); + } + if (!mDictPlacedDir.isDirectory()) { + throw new UnsupportedFormatException("Given path is not a directory."); + } + if (!BinaryDictionaryUtils.createEmptyDictFile(mDictPlacedDir.getAbsolutePath(), + FormatSpec.VERSION4, LocaleUtils.constructLocaleFromString( + dict.mOptions.mAttributes.get(DictionaryHeader.DICTIONARY_LOCALE_KEY)), + dict.mOptions.mAttributes)) { + throw new IOException("Cannot create dictionary file : " + + mDictPlacedDir.getAbsolutePath()); + } + final BinaryDictionary binaryDict = new BinaryDictionary(mDictPlacedDir.getAbsolutePath(), + 0l, mDictPlacedDir.length(), true /* useFullEditDistance */, + LocaleUtils.constructLocaleFromString(dict.mOptions.mAttributes.get( + DictionaryHeader.DICTIONARY_LOCALE_KEY)), + Dictionary.TYPE_USER /* Dictionary type. Does not matter for us */, + true /* isUpdatable */); + if (!binaryDict.isValidDictionary()) { + // Somehow createEmptyDictFile returned true, but the file was not created correctly + throw new IOException("Cannot create dictionary file"); + } + for (final WordProperty wordProperty : dict) { + // TODO: switch to addMultipleDictionaryEntries when they support shortcuts + if (null == wordProperty.mShortcutTargets || wordProperty.mShortcutTargets.isEmpty()) { + binaryDict.addUnigramWord(wordProperty.mWord, wordProperty.getProbability(), + null /* shortcutTarget */, 0 /* shortcutProbability */, + wordProperty.mIsNotAWord, wordProperty.mIsBlacklistEntry, + 0 /* timestamp */); + } else { + for (final WeightedString shortcutTarget : wordProperty.mShortcutTargets) { + binaryDict.addUnigramWord(wordProperty.mWord, wordProperty.getProbability(), + shortcutTarget.mWord, shortcutTarget.getProbability(), + wordProperty.mIsNotAWord, wordProperty.mIsBlacklistEntry, + 0 /* timestamp */); + } + } + if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) { + binaryDict.flushWithGC(); + } + } + for (final WordProperty word0Property : dict) { + if (null == word0Property.mBigrams) continue; + for (final WeightedString word1 : word0Property.mBigrams) { + binaryDict.addBigramWords(word0Property.mWord, word1.mWord, word1.getProbability(), + 0 /* timestamp */); + if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) { + binaryDict.flushWithGC(); + } + } + } + binaryDict.flushWithGC(); + binaryDict.close(); + } + + @Override + public void setPosition(int position) { + } + + @Override + public int getPosition() { + return 0; + } + + @Override + public void writePtNodeCount(int ptNodeCount) { + } + + @Override + public void writeForwardLinkAddress(int forwardLinkAddress) { + } + + @Override + public void writePtNode(PtNode ptNode, FusionDictionary dict) { + } +} diff --git a/tests/src/com/android/inputmethod/latin/utils/ByteArrayDictBuffer.java b/tests/src/com/android/inputmethod/latin/utils/ByteArrayDictBuffer.java new file mode 100644 index 000000000..2028298f2 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/utils/ByteArrayDictBuffer.java @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.utils; + +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; + +/** + * This class provides an implementation for the FusionDictionary buffer interface that is backed + * by a simpled byte array. It allows to create a binary dictionary in memory. + */ +public final class ByteArrayDictBuffer implements DictBuffer { + private byte[] mBuffer; + private int mPosition; + + public ByteArrayDictBuffer(final byte[] buffer) { + mBuffer = buffer; + mPosition = 0; + } + + @Override + public int readUnsignedByte() { + return mBuffer[mPosition++] & 0xFF; + } + + @Override + public int readUnsignedShort() { + final int retval = readUnsignedByte(); + return (retval << 8) + readUnsignedByte(); + } + + @Override + public int readUnsignedInt24() { + final int retval = readUnsignedShort(); + return (retval << 8) + readUnsignedByte(); + } + + @Override + public int readInt() { + final int retval = readUnsignedShort(); + return (retval << 16) + readUnsignedShort(); + } + + @Override + public int position() { + return mPosition; + } + + @Override + public void position(int position) { + mPosition = position; + } + + @Override + public void put(final byte b) { + mBuffer[mPosition++] = b; + } + + @Override + public int limit() { + return mBuffer.length - 1; + } + + @Override + public int capacity() { + return mBuffer.length; + } +} |