From e276c2401e5702222b21c4dfe2a25219c2f6619f Mon Sep 17 00:00:00 2001 From: Tom Ouyang Date: Wed, 21 Mar 2012 18:10:21 +0900 Subject: Move makedict to LatinIME android keyboard. Bug: 6188977 Change-Id: I4d2ef504bb983abbda3cb52ee450cb46f58d95cf --- .../latin/makedict/BinaryDictInputOutput.java | 1208 ++++++++++++++++++++ 1 file changed, 1208 insertions(+) create mode 100644 java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java new file mode 100644 index 000000000..42dd4df34 --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -0,0 +1,1208 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; +import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.OutputStream; +import java.io.RandomAccessFile; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; + +/** + * Reads and writes binary dictionary files for a FusionDictionary. + * + * All the methods in this class are static. + */ +public class BinaryDictInputOutput { + + /* Node layout is as follows: + * | addressType xx : mask with MASK_GROUP_ADDRESS_TYPE + * 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS + * f | 01 = 1 byte : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE + * l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES + * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES + * g | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS + * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL + * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS + * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS + * | is shortcut only ? 
1 bit, 1 = yes, 0 = no : FLAG_IS_SHORTCUT_ONLY + * + * c | IF FLAG_HAS_MULTIPLE_CHARS + * h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers + * a | end 1 byte, = 0 + * r | ELSE + * s | char 1 or 3 bytes + * | END + * + * f | + * r | IF FLAG_IS_TERMINAL + * e | frequency 1 byte + * q | + * + * c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = addressType + * h | // nothing + * i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType + * l | children address, 1 byte + * d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType + * r | children address, 2 bytes + * e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = addressType + * n | children address, 3 bytes + * A | END + * d + * dress + * + * | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS + * | shortcut targets address list + * | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS + * | bigrams address list + * + * Char format is: + * 1 byte = bbbbbbbb match + * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte + * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because + * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with + * 00011111 would be outside unicode. + * else: iso-latin-1 code + * This allows for the whole unicode range to be encoded, including chars outside of + * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control + * characters which should never happen anyway (and still work, but take 3 bytes). 
+ * + * bigram and shortcut address list is: + * = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT + * | addressSign = 1 bit, : FLAG_ATTRIBUTE_OFFSET_NEGATIVE + * | 1 = must take -address, 0 = must take +address + * | xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE + * | addressFormat = 2 bits, 00 = unused : (no flag constant) + * | 01 = 1 byte : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE + * | 10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES + * | 11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES + * | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY + *
| IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat) + * | read 1 byte, add top 4 bits + * | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat) + * | read 2 bytes, add top 4 bits + * | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat + * | read 3 bytes, add top 4 bits + * | END + * | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address + * if (FLAG_ATTRIBUTE_HAS_NEXT) goto bigram_and_shortcut_address_list_is + * + */ + + private static final int VERSION_1_MAGIC_NUMBER = 0x78B1; + private static final int VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; + private static final int MINIMUM_SUPPORTED_VERSION = 1; + private static final int MAXIMUM_SUPPORTED_VERSION = 2; + private static final int NOT_A_VERSION_NUMBER = -1; + private static final int FIRST_VERSION_WITH_HEADER_SIZE = 2; + + // No options yet, reserved for future use. + private static final int OPTIONS = 0; + + // TODO: Make this value adaptive to content data, store it in the header, and + // use it in the reading code. 
// Size of the shared character buffer used when reading character groups.
private static final int MAX_WORD_LENGTH = 48;

// Character group flags byte: the two top bits give the size of the children address.
private static final int MASK_GROUP_ADDRESS_TYPE = 0xC0;
private static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
private static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
private static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
private static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;

// Set when the group holds more than one character (a terminator byte then follows them).
private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20;

// Remaining single-bit flags of the character group flags byte.
private static final int FLAG_IS_TERMINAL = 0x10;
private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
private static final int FLAG_HAS_BIGRAMS = 0x04;
private static final int FLAG_IS_SHORTCUT_ONLY = 0x02;

// Attribute (bigram / shortcut) flags byte: has-next bit, sign bit, 2-bit address size,
// and the frequency in the low 4 bits.
private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
private static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
private static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F;

// Byte that ends the character list of a multi-character group.
private static final int GROUP_CHARACTERS_TERMINATOR = 0x1F;

// Sizes, in bytes, of the fixed-size fields of a character group and of its attributes.
private static final int GROUP_TERMINATOR_SIZE = 1;
private static final int GROUP_FLAGS_SIZE = 1;
private static final int GROUP_FREQUENCY_SIZE = 1;
private static final int GROUP_MAX_ADDRESS_SIZE = 3;
private static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1;
private static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;

// Sentinels: "this group has no children" and "no character" (returned for the terminator).
private static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
private static final int INVALID_CHARACTER = -1;

// Group count limits: up to 0x7F groups fit a 1-byte count, up to 0x7FFF a 2-byte count.
private static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127
private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767

// Largest frequency a terminal may carry; it is stored on one byte.
private static final int MAX_TERMINAL_FREQUENCY = 255;

/**
 * A class grouping utility function for our specific character encoding.
+ */ + private static class CharEncoding { + + private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; + private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; + + /** + * Helper method to find out whether this code fits on one byte + */ + private static boolean fitsOnOneByte(int character) { + return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE + && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; + } + + /** + * Compute the size of a character given its character code. + * + * Char format is: + * 1 byte = bbbbbbbb match + * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte + * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because + * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with + * 00011111 would be outside unicode. + * else: iso-latin-1 code + * This allows for the whole unicode range to be encoded, including chars outside of + * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control + * characters which should never happen anyway (and still work, but take 3 bytes). + * + * @param character the character code. + * @return the size in binary encoded-form, either 1 or 3 bytes. + */ + private static int getCharSize(int character) { + // See char encoding in FusionDictionary.java + if (fitsOnOneByte(character)) return 1; + if (INVALID_CHARACTER == character) return 1; + return 3; + } + + /** + * Compute the byte size of a character array. + */ + private static int getCharArraySize(final int[] chars) { + int size = 0; + for (int character : chars) size += getCharSize(character); + return size; + } + + /** + * Writes a char array to a byte buffer. + * + * @param characters the character array to write. + * @param buffer the byte buffer to write to. + * @param index the index in buffer to write the character array to. + * @return the index after the last character. 
+ */ + private static int writeCharArray(int[] characters, byte[] buffer, int index) { + for (int character : characters) { + if (1 == getCharSize(character)) { + buffer[index++] = (byte)character; + } else { + buffer[index++] = (byte)(0xFF & (character >> 16)); + buffer[index++] = (byte)(0xFF & (character >> 8)); + buffer[index++] = (byte)(0xFF & character); + } + } + return index; + } + + /** + * Reads a character from the file. + * + * This follows the character format documented earlier in this source file. + * + * @param source the file, positioned over an encoded character. + * @return the character code. + */ + private static int readChar(RandomAccessFile source) throws IOException { + int character = source.readUnsignedByte(); + if (!fitsOnOneByte(character)) { + if (GROUP_CHARACTERS_TERMINATOR == character) + return INVALID_CHARACTER; + character <<= 16; + character += source.readUnsignedShort(); + } + return character; + } + } + + /** + * Compute the binary size of the character array in a group + * + * If only one character, this is the size of this character. If many, it's the sum of their + * sizes + 1 byte for the terminator. + * + * @param group the group + * @return the size of the char array, including the terminator if any + */ + private static int getGroupCharactersSize(CharGroup group) { + int size = CharEncoding.getCharArraySize(group.mChars); + if (group.hasSeveralChars()) size += GROUP_TERMINATOR_SIZE; + return size; + } + + /** + * Compute the binary size of the group count + * @param count the group count + * @return the size of the group count, either 1 or 2 bytes. 
+ */ + private static int getGroupCountSize(final int count) { + if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= count) { + return 1; + } else if (MAX_CHARGROUPS_IN_A_NODE >= count) { + return 2; + } else { + throw new RuntimeException("Can't have more than " + MAX_CHARGROUPS_IN_A_NODE + + " groups in a node (found " + count +")"); + } + } + + /** + * Compute the binary size of the group count for a node + * @param node the node + * @return the size of the group count, either 1 or 2 bytes. + */ + private static int getGroupCountSize(final Node node) { + return getGroupCountSize(node.mData.size()); + } + + /** + * Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything. + * + * @param group the CharGroup to compute the size of. + * @return the maximum size of the group. + */ + private static int getCharGroupMaximumSize(CharGroup group) { + int size = getGroupCharactersSize(group) + GROUP_FLAGS_SIZE; + // If terminal, one byte for the frequency + if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE; + size += GROUP_MAX_ADDRESS_SIZE; // For children address + if (null != group.mShortcutTargets) { + size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE) + * group.mShortcutTargets.size(); + } + if (null != group.mBigrams) { + size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE) + * group.mBigrams.size(); + } + return size; + } + + /** + * Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches + * it in the 'actualSize' member of the node. + * + * @param node the node to compute the maximum size of. + */ + private static void setNodeMaximumSize(Node node) { + int size = getGroupCountSize(node); + for (CharGroup g : node.mData) { + final int groupSize = getCharGroupMaximumSize(g); + g.mCachedSize = groupSize; + size += groupSize; + } + node.mCachedSize = size; + } + + /** + * Helper method to hide the actual value of the no children address. 
+ */ + private static boolean hasChildrenAddress(int address) { + return NO_CHILDREN_ADDRESS != address; + } + + /** + * Helper method to find out if a character info is a shortcut only. + */ + private static boolean isShortcutOnly(final CharGroupInfo info) { + return 0 != (info.mFlags & FLAG_IS_SHORTCUT_ONLY); + } + + /** + * Compute the size, in bytes, that an address will occupy. + * + * This can be used either for children addresses (which are always positive) or for + * attribute, which may be positive or negative but + * store their sign bit separately. + * + * @param address the address + * @return the byte size. + */ + private static int getByteSize(int address) { + assert(address < 0x1000000); + if (!hasChildrenAddress(address)) { + return 0; + } else if (Math.abs(address) < 0x100) { + return 1; + } else if (Math.abs(address) < 0x10000) { + return 2; + } else { + return 3; + } + } + // End utility methods. + + // This method is responsible for finding a nice ordering of the nodes that favors run-time + // cache performance and dictionary size. + /* package for tests */ static ArrayList flattenTree(Node root) { + final int treeSize = FusionDictionary.countCharGroups(root); + MakedictLog.i("Counted nodes : " + treeSize); + final ArrayList flatTree = new ArrayList(treeSize); + return flattenTreeInner(flatTree, root); + } + + private static ArrayList flattenTreeInner(ArrayList list, Node node) { + // Removing the node is necessary if the tails are merged, because we would then + // add the same node several times when we only want it once. A number of places in + // the code also depends on any node being only once in the list. + // Merging tails can only be done if there are no attributes. Searching for attributes + // in LatinIME code depends on a total breadth-first ordering, which merging tails + // breaks. If there are no attributes, it should be fine (and reduce the file size) + // to merge tails, and the following step would be necessary. 
+ // If eventually the code runs on Android, searching through the whole array each time + // may be a performance concern. + list.remove(node); + list.add(node); + final ArrayList branches = node.mData; + final int nodeSize = branches.size(); + for (CharGroup group : branches) { + if (null != group.mChildren) flattenTreeInner(list, group.mChildren); + } + return list; + } + + /** + * Finds the absolute address of a word in the dictionary. + * + * @param dict the dictionary in which to search. + * @param word the word we are searching for. + * @return the word address. If it is not found, an exception is thrown. + */ + private static int findAddressOfWord(final FusionDictionary dict, final String word) { + return FusionDictionary.findWordInTree(dict.mRoot, word).mCachedAddress; + } + + /** + * Computes the actual node size, based on the cached addresses of the children nodes. + * + * Each node stores its tentative address. During dictionary address computing, these + * are not final, but they can be used to compute the node size (the node size depends + * on the address of the children because the number of bytes necessary to store an + * address depends on its numeric value. + * + * @param node the node to compute the size of. + * @param dict the dictionary in which the word/attributes are to be found. 
+ */ + private static void computeActualNodeSize(Node node, FusionDictionary dict) { + int size = getGroupCountSize(node); + for (CharGroup group : node.mData) { + int groupSize = GROUP_FLAGS_SIZE + getGroupCharactersSize(group); + if (group.isTerminal()) groupSize += GROUP_FREQUENCY_SIZE; + if (null != group.mChildren) { + final int offsetBasePoint= groupSize + node.mCachedAddress + size; + final int offset = group.mChildren.mCachedAddress - offsetBasePoint; + groupSize += getByteSize(offset); + } + if (null != group.mShortcutTargets) { + for (WeightedString target : group.mShortcutTargets) { + final int offsetBasePoint = groupSize + node.mCachedAddress + size + + GROUP_FLAGS_SIZE; + final int addressOfTarget = findAddressOfWord(dict, target.mWord); + final int offset = addressOfTarget - offsetBasePoint; + groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE; + } + } + if (null != group.mBigrams) { + for (WeightedString bigram : group.mBigrams) { + final int offsetBasePoint = groupSize + node.mCachedAddress + size + + GROUP_FLAGS_SIZE; + final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); + final int offset = addressOfBigram - offsetBasePoint; + groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE; + } + } + group.mCachedSize = groupSize; + size += groupSize; + } + node.mCachedSize = size; + } + + /** + * Computes the byte size of a list of nodes and updates each node cached position. + * + * @param flatNodes the array of nodes. + * @return the byte size of the entire stack. 
+ */ + private static int stackNodes(ArrayList flatNodes) { + int nodeOffset = 0; + for (Node n : flatNodes) { + n.mCachedAddress = nodeOffset; + int groupCountSize = getGroupCountSize(n); + int groupOffset = 0; + for (CharGroup g : n.mData) { + g.mCachedAddress = groupCountSize + nodeOffset + groupOffset; + groupOffset += g.mCachedSize; + } + if (groupOffset + groupCountSize != n.mCachedSize) { + throw new RuntimeException("Bug : Stored and computed node size differ"); + } + nodeOffset += n.mCachedSize; + } + return nodeOffset; + } + + /** + * Compute the addresses and sizes of an ordered node array. + * + * This method takes a node array and will update its cached address and size values + * so that they can be written into a file. It determines the smallest size each of the + * nodes can be given the addresses of its children and attributes, and store that into + * each node. + * The order of the node is given by the order of the array. This method makes no effort + * to find a good order; it only mechanically computes the size this order results in. + * + * @param dict the dictionary + * @param flatNodes the ordered array of nodes + * @return the same array it was passed. The nodes have been updated for address and size. + */ + private static ArrayList computeAddresses(FusionDictionary dict, + ArrayList flatNodes) { + // First get the worst sizes and offsets + for (Node n : flatNodes) setNodeMaximumSize(n); + final int offset = stackNodes(flatNodes); + + MakedictLog.i("Compressing the array addresses. 
Original size : " + offset); + MakedictLog.i("(Recursively seen size : " + offset + ")"); + + int passes = 0; + boolean changesDone = false; + do { + changesDone = false; + for (Node n : flatNodes) { + final int oldNodeSize = n.mCachedSize; + computeActualNodeSize(n, dict); + final int newNodeSize = n.mCachedSize; + if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!"); + if (oldNodeSize != newNodeSize) changesDone = true; + } + stackNodes(flatNodes); + ++passes; + } while (changesDone); + + final Node lastNode = flatNodes.get(flatNodes.size() - 1); + MakedictLog.i("Compression complete in " + passes + " passes."); + MakedictLog.i("After address compression : " + + (lastNode.mCachedAddress + lastNode.mCachedSize)); + + return flatNodes; + } + + /** + * Sanity-checking method. + * + * This method checks an array of node for juxtaposition, that is, it will do + * nothing if each node's cached address is actually the previous node's address + * plus the previous node's size. + * If this is not the case, it will throw an exception. + * + * @param array the array node to check + */ + private static void checkFlatNodeArray(ArrayList array) { + int offset = 0; + int index = 0; + for (Node n : array) { + if (n.mCachedAddress != offset) { + throw new RuntimeException("Wrong address for node " + index + + " : expected " + offset + ", got " + n.mCachedAddress); + } + ++index; + offset += n.mCachedSize; + } + } + + /** + * Helper method to write a variable-size address to a file. + * + * @param buffer the buffer to write to. + * @param index the index in the buffer to write the address to. + * @param address the address to write. + * @return the size in bytes the address actually took. 
+ */ + private static int writeVariableAddress(byte[] buffer, int index, int address) { + switch (getByteSize(address)) { + case 1: + buffer[index++] = (byte)address; + return 1; + case 2: + buffer[index++] = (byte)(0xFF & (address >> 8)); + buffer[index++] = (byte)(0xFF & address); + return 2; + case 3: + buffer[index++] = (byte)(0xFF & (address >> 16)); + buffer[index++] = (byte)(0xFF & (address >> 8)); + buffer[index++] = (byte)(0xFF & address); + return 3; + case 0: + return 0; + default: + throw new RuntimeException("Address " + address + " has a strange size"); + } + } + + private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress, + final int childrenOffset) { + byte flags = 0; + if (group.mChars.length > 1) flags |= FLAG_HAS_MULTIPLE_CHARS; + if (group.mFrequency >= 0) { + flags |= FLAG_IS_TERMINAL; + } + if (null != group.mChildren) { + switch (getByteSize(childrenOffset)) { + case 1: + flags |= FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; + break; + case 2: + flags |= FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; + break; + case 3: + flags |= FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; + break; + default: + throw new RuntimeException("Node with a strange address"); + } + } + if (null != group.mShortcutTargets) { + if (0 == group.mShortcutTargets.size()) { + throw new RuntimeException("0-sized shortcut list must be null"); + } + flags |= FLAG_HAS_SHORTCUT_TARGETS; + } + if (null != group.mBigrams) { + if (0 == group.mBigrams.size()) { + throw new RuntimeException("0-sized bigram list must be null"); + } + flags |= FLAG_HAS_BIGRAMS; + } + if (group.mIsShortcutOnly) { + flags |= FLAG_IS_SHORTCUT_ONLY; + } + return flags; + } + + /** + * Makes the flag value for an attribute. + * + * @param more whether there are more attributes after this one. + * @param offset the offset of the attribute. 
+ * @param frequency the frequency of the attribute, 0..15 + * @return the flags + */ + private static final int makeAttributeFlags(final boolean more, final int offset, + final int frequency) { + int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) + + (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0); + switch (getByteSize(offset)) { + case 1: + bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + break; + case 2: + bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + break; + case 3: + bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + break; + default: + throw new RuntimeException("Strange offset size"); + } + bigramFlags += frequency & FLAG_ATTRIBUTE_FREQUENCY; + return bigramFlags; + } + + /** + * Write a node to memory. The node is expected to have its final position cached. + * + * This can be an empty map, but the more is inside the faster the lookups will be. It can + * be carried on as long as nodes do not move. + * + * @param dict the dictionary the node is a part of (for relative offsets). + * @param buffer the memory buffer to write to. + * @param node the node to write. + * @return the address of the END of the node. + */ + private static int writePlacedNode(FusionDictionary dict, byte[] buffer, Node node) { + int index = node.mCachedAddress; + + final int groupCount = node.mData.size(); + final int countSize = getGroupCountSize(node); + if (1 == countSize) { + buffer[index++] = (byte)groupCount; + } else if (2 == countSize) { + // We need to signal 2-byte size by setting the top bit of the MSB to 1, so + // we | 0x80 to do this. 
+ buffer[index++] = (byte)((groupCount >> 8) | 0x80); + buffer[index++] = (byte)(groupCount & 0xFF); + } else { + throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); + } + int groupAddress = index; + for (int i = 0; i < groupCount; ++i) { + CharGroup group = node.mData.get(i); + if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not " + + "the same as the cached address of the group"); + groupAddress += GROUP_FLAGS_SIZE + getGroupCharactersSize(group); + // Sanity checks. + if (group.mFrequency > MAX_TERMINAL_FREQUENCY) { + throw new RuntimeException("A node has a frequency > " + MAX_TERMINAL_FREQUENCY + + " : " + group.mFrequency); + } + if (group.mFrequency >= 0) groupAddress += GROUP_FREQUENCY_SIZE; + final int childrenOffset = null == group.mChildren + ? NO_CHILDREN_ADDRESS : group.mChildren.mCachedAddress - groupAddress; + byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset); + buffer[index++] = flags; + index = CharEncoding.writeCharArray(group.mChars, buffer, index); + if (group.hasSeveralChars()) { + buffer[index++] = GROUP_CHARACTERS_TERMINATOR; + } + if (group.mFrequency >= 0) { + buffer[index++] = (byte) group.mFrequency; + } + final int shift = writeVariableAddress(buffer, index, childrenOffset); + index += shift; + groupAddress += shift; + + // Write shortcuts + if (null != group.mShortcutTargets) { + final Iterator shortcutIterator = group.mShortcutTargets.iterator(); + while (shortcutIterator.hasNext()) { + final WeightedString target = (WeightedString)shortcutIterator.next(); + final int addressOfTarget = findAddressOfWord(dict, target.mWord); + ++groupAddress; + final int offset = addressOfTarget - groupAddress; + int shortcutFlags = makeAttributeFlags(shortcutIterator.hasNext(), offset, + target.mFrequency); + buffer[index++] = (byte)shortcutFlags; + final int shortcutShift = writeVariableAddress(buffer, index, Math.abs(offset)); + index += shortcutShift; + groupAddress 
+= shortcutShift; + } + } + // Write bigrams + if (null != group.mBigrams) { + final Iterator bigramIterator = group.mBigrams.iterator(); + while (bigramIterator.hasNext()) { + final WeightedString bigram = (WeightedString)bigramIterator.next(); + final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); + ++groupAddress; + final int offset = addressOfBigram - groupAddress; + int bigramFlags = makeAttributeFlags(bigramIterator.hasNext(), offset, + bigram.mFrequency); + buffer[index++] = (byte)bigramFlags; + final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset)); + index += bigramShift; + groupAddress += bigramShift; + } + } + + } + if (index != node.mCachedAddress + node.mCachedSize) throw new RuntimeException( + "Not the same size : written " + + (index - node.mCachedAddress) + " bytes out of a node that should have " + + node.mCachedSize + " bytes"); + return index; + } + + /** + * Dumps a collection of useful statistics about a node array. + * + * This prints purely informative stuff, like the total estimated file size, the + * number of nodes, of character groups, the repartition of each address size, etc + * + * @param nodes the node array. 
+ */ + private static void showStatistics(ArrayList nodes) { + int firstTerminalAddress = Integer.MAX_VALUE; + int lastTerminalAddress = Integer.MIN_VALUE; + int size = 0; + int charGroups = 0; + int maxGroups = 0; + int maxRuns = 0; + for (Node n : nodes) { + if (maxGroups < n.mData.size()) maxGroups = n.mData.size(); + for (CharGroup cg : n.mData) { + ++charGroups; + if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length; + if (cg.mFrequency >= 0) { + if (n.mCachedAddress < firstTerminalAddress) + firstTerminalAddress = n.mCachedAddress; + if (n.mCachedAddress > lastTerminalAddress) + lastTerminalAddress = n.mCachedAddress; + } + } + if (n.mCachedAddress + n.mCachedSize > size) size = n.mCachedAddress + n.mCachedSize; + } + final int[] groupCounts = new int[maxGroups + 1]; + final int[] runCounts = new int[maxRuns + 1]; + for (Node n : nodes) { + ++groupCounts[n.mData.size()]; + for (CharGroup cg : n.mData) { + ++runCounts[cg.mChars.length]; + } + } + + MakedictLog.i("Statistics:\n" + + " total file size " + size + "\n" + + " " + nodes.size() + " nodes\n" + + " " + charGroups + " groups (" + ((float)charGroups / nodes.size()) + + " groups per node)\n" + + " first terminal at " + firstTerminalAddress + "\n" + + " last terminal at " + lastTerminalAddress + "\n" + + " Group stats : max = " + maxGroups); + for (int i = 0; i < groupCounts.length; ++i) { + MakedictLog.i(" " + i + " : " + groupCounts[i]); + } + MakedictLog.i(" Character run stats : max = " + maxRuns); + for (int i = 0; i < runCounts.length; ++i) { + MakedictLog.i(" " + i + " : " + runCounts[i]); + } + } + + /** + * Dumps a FusionDictionary to a file. + * + * This is the public entry point to write a dictionary to a file. + * + * @param destination the stream to write the binary data to. + * @param dict the dictionary to write. + * @param version the version of the format to write, currently either 1 or 2. 
*/
public static void writeDictionaryBinary(final OutputStream destination,
        final FusionDictionary dict, final int version)
        throws IOException, UnsupportedFormatException {
    // Throws UnsupportedFormatException when the requested version is outside the supported
    // range, and IOException when writing to or closing the stream fails. The destination
    // stream is closed by this method once the dictionary has been written.

    // Addresses are limited to 3 bytes, so we'll just make a 16MB buffer. Since addresses
    // can be relative to each node, the structure itself is not limited to 16MB at all, but
    // I doubt this will ever be shot. If it is, deciding the order of the nodes becomes
    // a quite complicated problem, because though the dictionary itself does not have a
    // size limit, each node must still be within 16MB of all its children and parents.
    // As long as this is ensured, the dictionary file may grow to any size.
    // Anyway, to make a dictionary bigger than 16MB just increase the size of this buffer.
    final byte[] buffer = new byte[1 << 24];
    int index = 0;

    if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) {
        throw new UnsupportedFormatException("Requested file format version " + version
                + ", but this implementation only supports versions "
                + MINIMUM_SUPPORTED_VERSION + " through " + MAXIMUM_SUPPORTED_VERSION);
    }

    // The magic number in big-endian order.
    if (version >= FIRST_VERSION_WITH_HEADER_SIZE) {
        // Magic number for version 2+.
        buffer[index++] = (byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 24));
        buffer[index++] = (byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 16));
        buffer[index++] = (byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 8));
        buffer[index++] = (byte) (0xFF & VERSION_2_MAGIC_NUMBER);
        // Dictionary version.
        buffer[index++] = (byte) (0xFF & (version >> 8));
        buffer[index++] = (byte) (0xFF & version);
    } else {
        // Magic number for version 1.
        buffer[index++] = (byte) (0xFF & (VERSION_1_MAGIC_NUMBER >> 8));
        buffer[index++] = (byte) (0xFF & VERSION_1_MAGIC_NUMBER);
        // Dictionary version.
        buffer[index++] = (byte) (0xFF & version);
    }
    // Options flags
    buffer[index++] = (byte) (0xFF & (OPTIONS >> 8));
    buffer[index++] = (byte) (0xFF & OPTIONS);
    if (version >= FIRST_VERSION_WITH_HEADER_SIZE) {
        final int headerSizeOffset = index;
        index += 4; // Size of the header size
        // TODO: Write out the header contents here.
        // Write out the header size.
        buffer[headerSizeOffset] = (byte) (0xFF & (index >> 24));
        buffer[headerSizeOffset + 1] = (byte) (0xFF & (index >> 16));
        buffer[headerSizeOffset + 2] = (byte) (0xFF & (index >> 8));
        buffer[headerSizeOffset + 3] = (byte) (0xFF & (index >> 0));
    }

    // The header goes out first; the buffer is then reused from offset 0 for the nodes,
    // whose cached addresses are relative to the start of the node data.
    destination.write(buffer, 0, index);
    index = 0;

    // Leave the choice of the optimal node order to the flattenTree function.
    MakedictLog.i("Flattening the tree...");
    ArrayList<Node> flatNodes = flattenTree(dict.mRoot);

    MakedictLog.i("Computing addresses...");
    computeAddresses(dict, flatNodes);
    MakedictLog.i("Checking array...");
    checkFlatNodeArray(flatNodes);

    MakedictLog.i("Writing file...");
    int dataEndOffset = 0;
    for (Node n : flatNodes) {
        dataEndOffset = writePlacedNode(dict, buffer, n);
    }

    showStatistics(flatNodes);

    destination.write(buffer, 0, dataEndOffset);

    destination.close();
    MakedictLog.i("Done");
}


// Input methods: Read a binary dictionary to memory.
// readDictionaryBinary is the public entry point for them.
+ + static final int[] characterBuffer = new int[MAX_WORD_LENGTH]; + private static CharGroupInfo readCharGroup(RandomAccessFile source, + final int originalGroupAddress) throws IOException { + int addressPointer = originalGroupAddress; + final int flags = source.readUnsignedByte(); + ++addressPointer; + final int characters[]; + if (0 != (flags & FLAG_HAS_MULTIPLE_CHARS)) { + int index = 0; + int character = CharEncoding.readChar(source); + addressPointer += CharEncoding.getCharSize(character); + while (-1 != character) { + characterBuffer[index++] = character; + character = CharEncoding.readChar(source); + addressPointer += CharEncoding.getCharSize(character); + } + characters = Arrays.copyOfRange(characterBuffer, 0, index); + } else { + final int character = CharEncoding.readChar(source); + addressPointer += CharEncoding.getCharSize(character); + characters = new int[] { character }; + } + final int frequency; + if (0 != (FLAG_IS_TERMINAL & flags)) { + ++addressPointer; + frequency = source.readUnsignedByte(); + } else { + frequency = CharGroup.NOT_A_TERMINAL; + } + int childrenAddress = addressPointer; + switch (flags & MASK_GROUP_ADDRESS_TYPE) { + case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: + childrenAddress += source.readUnsignedByte(); + addressPointer += 1; + break; + case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: + childrenAddress += source.readUnsignedShort(); + addressPointer += 2; + break; + case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: + childrenAddress += (source.readUnsignedByte() << 16) + source.readUnsignedShort(); + addressPointer += 3; + break; + case FLAG_GROUP_ADDRESS_TYPE_NOADDRESS: + default: + childrenAddress = NO_CHILDREN_ADDRESS; + break; + } + ArrayList shortcutTargets = null; + if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) { + shortcutTargets = new ArrayList(); + while (true) { + final int targetFlags = source.readUnsignedByte(); + ++addressPointer; + final int sign = 0 == (targetFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 
1 : -1; + int targetAddress = addressPointer; + switch (targetFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + targetAddress += sign * source.readUnsignedByte(); + addressPointer += 1; + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + targetAddress += sign * source.readUnsignedShort(); + addressPointer += 2; + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + final int offset = ((source.readUnsignedByte() << 16) + + source.readUnsignedShort()); + targetAddress += sign * offset; + addressPointer += 3; + break; + default: + throw new RuntimeException("Has shortcut targets with no address"); + } + shortcutTargets.add(new PendingAttribute(targetFlags & FLAG_ATTRIBUTE_FREQUENCY, + targetAddress)); + if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; + } + } + ArrayList bigrams = null; + if (0 != (flags & FLAG_HAS_BIGRAMS)) { + bigrams = new ArrayList(); + while (true) { + final int bigramFlags = source.readUnsignedByte(); + ++addressPointer; + final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 
1 : -1; + int bigramAddress = addressPointer; + switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + bigramAddress += sign * source.readUnsignedByte(); + addressPointer += 1; + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + bigramAddress += sign * source.readUnsignedShort(); + addressPointer += 2; + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + final int offset = ((source.readUnsignedByte() << 16) + + source.readUnsignedShort()); + bigramAddress += sign * offset; + addressPointer += 3; + break; + default: + throw new RuntimeException("Has bigrams with no address"); + } + bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY, + bigramAddress)); + if (0 == (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; + } + } + return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency, + childrenAddress, shortcutTargets, bigrams); + } + + /** + * Reads and returns the char group count out of a file and forwards the pointer. + */ + private static int readCharGroupCount(RandomAccessFile source) throws IOException { + final int msb = source.readUnsignedByte(); + if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) { + return msb; + } else { + return ((MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8) + + source.readUnsignedByte(); + } + } + + /** + * Finds, as a string, the word at the address passed as an argument. + * + * @param source the file to read from. + * @param headerSize the size of the header. + * @param address the address to seek. + * @return the word, as a string. + * @throws IOException if the file can't be read. 
+ */ + private static String getWordAtAddress(RandomAccessFile source, long headerSize, + int address) throws IOException { + final long originalPointer = source.getFilePointer(); + source.seek(headerSize); + final int count = readCharGroupCount(source); + int groupOffset = getGroupCountSize(count); + final StringBuilder builder = new StringBuilder(); + String result = null; + + CharGroupInfo last = null; + for (int i = count - 1; i >= 0; --i) { + CharGroupInfo info = readCharGroup(source, groupOffset); + groupOffset = info.mEndAddress; + if (info.mOriginalAddress == address) { + builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); + result = builder.toString(); + break; // and return + } + if (hasChildrenAddress(info.mChildrenAddress)) { + if (info.mChildrenAddress > address) { + if (null == last) continue; + builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); + source.seek(last.mChildrenAddress + headerSize); + groupOffset = last.mChildrenAddress + 1; + i = source.readUnsignedByte(); + last = null; + continue; + } + last = info; + } + if (0 == i && hasChildrenAddress(last.mChildrenAddress)) { + builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); + source.seek(last.mChildrenAddress + headerSize); + groupOffset = last.mChildrenAddress + 1; + i = source.readUnsignedByte(); + last = null; + continue; + } + } + source.seek(originalPointer); + return result; + } + + /** + * Reads a single node from a binary file. + * + * This methods reads the file at the current position of its file pointer. A node is + * fully expected to start at the current position. + * This will recursively read other nodes into the structure, populating the reverse + * maps on the fly and using them to keep track of already read nodes. + * + * @param source the data file, correctly positioned at the start of a node. + * @param headerSize the size, in bytes, of the file header. 
+ * @param reverseNodeMap a mapping from addresses to already read nodes. + * @param reverseGroupMap a mapping from addresses to already read character groups. + * @return the read node with all his children already read. + */ + private static Node readNode(RandomAccessFile source, long headerSize, + Map reverseNodeMap, Map reverseGroupMap) + throws IOException { + final int nodeOrigin = (int)(source.getFilePointer() - headerSize); + final int count = readCharGroupCount(source); + final ArrayList nodeContents = new ArrayList(); + int groupOffset = nodeOrigin + getGroupCountSize(count); + for (int i = count; i > 0; --i) { + CharGroupInfo info = readCharGroup(source, groupOffset); + ArrayList shortcutTargets = null; + if (null != info.mShortcutTargets) { + shortcutTargets = new ArrayList(); + for (PendingAttribute target : info.mShortcutTargets) { + final String word = getWordAtAddress(source, headerSize, target.mAddress); + shortcutTargets.add(new WeightedString(word, target.mFrequency)); + } + } + ArrayList bigrams = null; + if (null != info.mBigrams) { + bigrams = new ArrayList(); + for (PendingAttribute bigram : info.mBigrams) { + final String word = getWordAtAddress(source, headerSize, bigram.mAddress); + bigrams.add(new WeightedString(word, bigram.mFrequency)); + } + } + if (hasChildrenAddress(info.mChildrenAddress)) { + Node children = reverseNodeMap.get(info.mChildrenAddress); + if (null == children) { + final long currentPosition = source.getFilePointer(); + source.seek(info.mChildrenAddress + headerSize); + children = readNode(source, headerSize, reverseNodeMap, reverseGroupMap); + source.seek(currentPosition); + } + nodeContents.add( + new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, + children, isShortcutOnly(info))); + } else { + nodeContents.add( + new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, + isShortcutOnly(info))); + } + groupOffset = info.mEndAddress; + } + final Node node = new 
Node(nodeContents); + node.mCachedAddress = nodeOrigin; + reverseNodeMap.put(node.mCachedAddress, node); + return node; + } + + /** + * Helper function to get the binary format version from the header. + */ + private static int getFormatVersion(final RandomAccessFile source) throws IOException { + final int magic_v1 = source.readUnsignedShort(); + if (VERSION_1_MAGIC_NUMBER == magic_v1) return source.readUnsignedByte(); + final int magic_v2 = (magic_v1 << 16) + source.readUnsignedShort(); + if (VERSION_2_MAGIC_NUMBER == magic_v2) return source.readUnsignedShort(); + return NOT_A_VERSION_NUMBER; + } + + /** + * Reads a random access file and returns the memory representation of the dictionary. + * + * This high-level method takes a binary file and reads its contents, populating a + * FusionDictionary structure. The optional dict argument is an existing dictionary to + * which words from the file should be added. If it is null, a new dictionary is created. + * + * @param source the file to read. + * @param dict an optional dictionary to add words to, or null. + * @return the created (or merged) dictionary. 
+ */ + public static FusionDictionary readDictionaryBinary(final RandomAccessFile source, + final FusionDictionary dict) throws IOException, UnsupportedFormatException { + // Check file version + final int version = getFormatVersion(source); + if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION ) { + throw new UnsupportedFormatException("This file has version " + version + + ", but this implementation does not support versions above " + + MAXIMUM_SUPPORTED_VERSION); + } + + // Read options + source.readUnsignedShort(); + + final long headerSize; + if (version < FIRST_VERSION_WITH_HEADER_SIZE) { + headerSize = source.getFilePointer(); + } else { + headerSize = (source.readUnsignedByte() << 24) + (source.readUnsignedByte() << 16) + + (source.readUnsignedByte() << 8) + source.readUnsignedByte(); + // read the header body + source.seek(headerSize); + } + + Map reverseNodeMapping = new TreeMap(); + Map reverseGroupMapping = new TreeMap(); + final Node root = readNode(source, headerSize, reverseNodeMapping, reverseGroupMapping); + + FusionDictionary newDict = new FusionDictionary(root, + new FusionDictionary.DictionaryOptions()); + if (null != dict) { + for (Word w : dict) { + newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams); + } + } + + return newDict; + } + + /** + * Basic test to find out whether the file is a binary dictionary or not. + * + * Concretely this only tests the magic number. + * + * @param filename The name of the file to test. 
+ * @return true if it's a binary dictionary, false otherwise + */ + public static boolean isBinaryDictionary(final String filename) { + try { + RandomAccessFile f = new RandomAccessFile(filename, "r"); + final int version = getFormatVersion(f); + return (version >= MINIMUM_SUPPORTED_VERSION && version <= MAXIMUM_SUPPORTED_VERSION); + } catch (FileNotFoundException e) { + return false; + } catch (IOException e) { + return false; + } + } +} -- cgit v1.2.3-83-g751a From 3bbb31f3f00e64cb68bd5877ae69d6dbccfeb519 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Tue, 27 Mar 2012 12:36:19 +0900 Subject: Change the format of the shortcuts in the binary dict. This only includes the write part of the change. The read part is coming in a different commit. Change-Id: Iabe7af6cd134462dc19245f5400719920ed31c8f --- .../latin/makedict/BinaryDictInputOutput.java | 151 +++++++++++++++------ 1 file changed, 107 insertions(+), 44 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 42dd4df34..a0059e24c 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -47,7 +47,6 @@ public class BinaryDictInputOutput { * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS - * | is shortcut only ? 
1 bit, 1 = yes, 0 = no : FLAG_IS_SHORTCUT_ONLY * * c | IF FLAG_HAS_MULTIPLE_CHARS * h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers @@ -74,7 +73,7 @@ public class BinaryDictInputOutput { * dress * * | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS - * | shortcut targets address list + * | shortcut string list * | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS * | bigrams address list * @@ -89,7 +88,7 @@ public class BinaryDictInputOutput { * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control * characters which should never happen anyway (and still work, but take 3 bytes). * - * bigram and shortcut address list is: + * bigram address list is: * = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT * | addressSign = 1 bit, : FLAG_ATTRIBUTE_OFFSET_NEGATIVE * | 1 = must take -address, 0 = must take +address @@ -107,8 +106,16 @@ public class BinaryDictInputOutput { * | read 3 bytes, add top 4 bits * | END * | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address - * if (FLAG_ATTRIBUTE_HAS_NET) goto bigram_and_shortcut_address_list_is + * if (FLAG_ATTRIBUTE_HAS_NEXT) goto bigram_and_shortcut_address_list_is * + * shortcut string list is: + * = GROUP_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes. + * = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT + * | reserved = 3 bits, must be 0 + * | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY + * = | string of characters at the char format described above, with the terminator + * | used to signal the end of the string. 
+ * if (FLAG_ATTRIBUTE_HAS_NEXT goto flags */ private static final int VERSION_1_MAGIC_NUMBER = 0x78B1; @@ -136,7 +143,6 @@ public class BinaryDictInputOutput { private static final int FLAG_IS_TERMINAL = 0x10; private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08; private static final int FLAG_HAS_BIGRAMS = 0x04; - private static final int FLAG_IS_SHORTCUT_ONLY = 0x02; private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; @@ -154,6 +160,7 @@ public class BinaryDictInputOutput { private static final int GROUP_MAX_ADDRESS_SIZE = 3; private static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1; private static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3; + private static final int GROUP_SHORTCUT_LIST_SIZE_SIZE = 2; private static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; private static final int INVALID_CHARACTER = -1; @@ -215,24 +222,52 @@ public class BinaryDictInputOutput { /** * Writes a char array to a byte buffer. * - * @param characters the character array to write. + * @param codePoints the code point array to write. * @param buffer the byte buffer to write to. * @param index the index in buffer to write the character array to. * @return the index after the last character. 
*/ - private static int writeCharArray(int[] characters, byte[] buffer, int index) { - for (int character : characters) { - if (1 == getCharSize(character)) { - buffer[index++] = (byte)character; + private static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) { + for (int codePoint : codePoints) { + if (1 == getCharSize(codePoint)) { + buffer[index++] = (byte)codePoint; } else { - buffer[index++] = (byte)(0xFF & (character >> 16)); - buffer[index++] = (byte)(0xFF & (character >> 8)); - buffer[index++] = (byte)(0xFF & character); + buffer[index++] = (byte)(0xFF & (codePoint >> 16)); + buffer[index++] = (byte)(0xFF & (codePoint >> 8)); + buffer[index++] = (byte)(0xFF & codePoint); } } return index; } + /** + * Writes a string with our character format to a byte buffer. + * + * This will also write the terminator byte. + * + * @param buffer the byte buffer to write to. + * @param origin the offset to write from. + * @param word the string to write. + * @return the size written, in bytes. + */ + private static int writeString(final byte[] buffer, final int origin, + final String word) { + final int length = word.length(); + int index = origin; + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + final int codePoint = word.codePointAt(i); + if (1 == getCharSize(codePoint)) { + buffer[index++] = (byte)codePoint; + } else { + buffer[index++] = (byte)(0xFF & (codePoint >> 16)); + buffer[index++] = (byte)(0xFF & (codePoint >> 8)); + buffer[index++] = (byte)(0xFF & codePoint); + } + } + buffer[index++] = GROUP_CHARACTERS_TERMINATOR; + return index - origin; + } + /** * Reads a character from the file. * @@ -293,6 +328,36 @@ public class BinaryDictInputOutput { return getGroupCountSize(node.mData.size()); } + /** + * Compute the size of a shortcut in bytes. 
+ */ + private static int getShortcutSize(final WeightedString shortcut) { + int size = GROUP_ATTRIBUTE_FLAGS_SIZE; + final String word = shortcut.mWord; + final int length = word.length(); + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + final int codePoint = word.codePointAt(i); + size += CharEncoding.getCharSize(codePoint); + } + size += GROUP_TERMINATOR_SIZE; + return size; + } + + /** + * Compute the size of a shortcut list in bytes. + * + * This is known in advance and does not change according to position in the file + * like address lists do. + */ + private static int getShortcutListSize(final ArrayList shortcutList) { + if (null == shortcutList) return 0; + int size = GROUP_SHORTCUT_LIST_SIZE_SIZE; + for (final WeightedString shortcut : shortcutList) { + size += getShortcutSize(shortcut); + } + return size; + } + /** * Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything. * @@ -304,10 +369,7 @@ public class BinaryDictInputOutput { // If terminal, one byte for the frequency if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE; size += GROUP_MAX_ADDRESS_SIZE; // For children address - if (null != group.mShortcutTargets) { - size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE) - * group.mShortcutTargets.size(); - } + size += getShortcutListSize(group.mShortcutTargets); if (null != group.mBigrams) { size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE) * group.mBigrams.size(); @@ -338,13 +400,6 @@ public class BinaryDictInputOutput { return NO_CHILDREN_ADDRESS != address; } - /** - * Helper method to find out if a character info is a shortcut only. - */ - private static boolean isShortcutOnly(final CharGroupInfo info) { - return 0 != (info.mFlags & FLAG_IS_SHORTCUT_ONLY); - } - /** * Compute the size, in bytes, that an address will occupy. 
* @@ -430,15 +485,7 @@ public class BinaryDictInputOutput { final int offset = group.mChildren.mCachedAddress - offsetBasePoint; groupSize += getByteSize(offset); } - if (null != group.mShortcutTargets) { - for (WeightedString target : group.mShortcutTargets) { - final int offsetBasePoint = groupSize + node.mCachedAddress + size - + GROUP_FLAGS_SIZE; - final int addressOfTarget = findAddressOfWord(dict, target.mWord); - final int offset = addressOfTarget - offsetBasePoint; - groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE; - } - } + groupSize += getShortcutListSize(group.mShortcutTargets); if (null != group.mBigrams) { for (WeightedString bigram : group.mBigrams) { final int offsetBasePoint = groupSize + node.mCachedAddress + size @@ -555,7 +602,7 @@ public class BinaryDictInputOutput { * @param address the address to write. * @return the size in bytes the address actually took. */ - private static int writeVariableAddress(byte[] buffer, int index, int address) { + private static int writeVariableAddress(final byte[] buffer, int index, final int address) { switch (getByteSize(address)) { case 1: buffer[index++] = (byte)address; @@ -610,9 +657,6 @@ public class BinaryDictInputOutput { } flags |= FLAG_HAS_BIGRAMS; } - if (group.mIsShortcutOnly) { - flags |= FLAG_IS_SHORTCUT_ONLY; - } return flags; } @@ -645,6 +689,17 @@ public class BinaryDictInputOutput { return bigramFlags; } + /** + * Makes the flag value for a shortcut. + * + * @param more whether there are more attributes after this one. + * @param frequency the frequency of the attribute, 0..15 + * @return the flags + */ + private static final int makeShortcutFlags(final boolean more, final int frequency) { + return (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) + (frequency & FLAG_ATTRIBUTE_FREQUENCY); + } + /** * Write a node to memory. The node is expected to have its final position cached. 
* @@ -675,7 +730,8 @@ public class BinaryDictInputOutput { for (int i = 0; i < groupCount; ++i) { CharGroup group = node.mData.get(i); if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not " - + "the same as the cached address of the group"); + + "the same as the cached address of the group : " + + index + " <> " + group.mCachedAddress); groupAddress += GROUP_FLAGS_SIZE + getGroupCharactersSize(group); // Sanity checks. if (group.mFrequency > MAX_TERMINAL_FREQUENCY) { @@ -700,19 +756,26 @@ public class BinaryDictInputOutput { // Write shortcuts if (null != group.mShortcutTargets) { + final int indexOfShortcutByteSize = index; + index += GROUP_SHORTCUT_LIST_SIZE_SIZE; + groupAddress += GROUP_SHORTCUT_LIST_SIZE_SIZE; final Iterator shortcutIterator = group.mShortcutTargets.iterator(); while (shortcutIterator.hasNext()) { final WeightedString target = (WeightedString)shortcutIterator.next(); - final int addressOfTarget = findAddressOfWord(dict, target.mWord); ++groupAddress; - final int offset = addressOfTarget - groupAddress; - int shortcutFlags = makeAttributeFlags(shortcutIterator.hasNext(), offset, + int shortcutFlags = makeShortcutFlags(shortcutIterator.hasNext(), target.mFrequency); buffer[index++] = (byte)shortcutFlags; - final int shortcutShift = writeVariableAddress(buffer, index, Math.abs(offset)); + final int shortcutShift = CharEncoding.writeString(buffer, index, target.mWord); index += shortcutShift; groupAddress += shortcutShift; } + final int shortcutByteSize = index - indexOfShortcutByteSize; + if (shortcutByteSize > 0xFFFF) { + throw new RuntimeException("Shortcut list too large"); + } + buffer[indexOfShortcutByteSize] = (byte)(shortcutByteSize >> 8); + buffer[indexOfShortcutByteSize + 1] = (byte)(shortcutByteSize & 0xFF); } // Write bigrams if (null != group.mBigrams) { @@ -1112,11 +1175,11 @@ public class BinaryDictInputOutput { } nodeContents.add( new CharGroup(info.mCharacters, shortcutTargets, bigrams, 
info.mFrequency, - children, isShortcutOnly(info))); + children, false)); } else { nodeContents.add( new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, - isShortcutOnly(info))); + false)); } groupOffset = info.mEndAddress; } -- cgit v1.2.3-83-g751a From 752996540ff3a6dd5b48819849c06355c4270e03 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Tue, 27 Mar 2012 17:59:30 +0900 Subject: Add read support for string shortcuts for makedict. Change-Id: I48ee4fc9ac703ad2a680b3cd848de91c415ea3c8 --- .../latin/makedict/BinaryDictInputOutput.java | 55 +++++++++------------- .../inputmethod/latin/makedict/CharGroupInfo.java | 6 ++- 2 files changed, 26 insertions(+), 35 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index a0059e24c..af7f863ee 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -268,6 +268,19 @@ public class BinaryDictInputOutput { return index - origin; } + /** + * Reads a string from a RandomAccessFile. This is the converse of the above method. + */ + private static String readString(final RandomAccessFile source) throws IOException { + final StringBuilder s = new StringBuilder(); + int character = readChar(source); + while (character != INVALID_CHARACTER) { + s.appendCodePoint(character); + character = readChar(source); + } + return s.toString(); + } + /** * Reads a character from the file. 
* @@ -995,36 +1008,19 @@ public class BinaryDictInputOutput { childrenAddress = NO_CHILDREN_ADDRESS; break; } - ArrayList shortcutTargets = null; + ArrayList shortcutTargets = null; if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) { - shortcutTargets = new ArrayList(); + final long pointerBefore = source.getFilePointer(); + shortcutTargets = new ArrayList(); + source.readUnsignedShort(); // Skip the size while (true) { final int targetFlags = source.readUnsignedByte(); - ++addressPointer; - final int sign = 0 == (targetFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1; - int targetAddress = addressPointer; - switch (targetFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { - case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: - targetAddress += sign * source.readUnsignedByte(); - addressPointer += 1; - break; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: - targetAddress += sign * source.readUnsignedShort(); - addressPointer += 2; - break; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: - final int offset = ((source.readUnsignedByte() << 16) - + source.readUnsignedShort()); - targetAddress += sign * offset; - addressPointer += 3; - break; - default: - throw new RuntimeException("Has shortcut targets with no address"); - } - shortcutTargets.add(new PendingAttribute(targetFlags & FLAG_ATTRIBUTE_FREQUENCY, - targetAddress)); + final String word = CharEncoding.readString(source); + shortcutTargets.add(new WeightedString(word, + targetFlags & FLAG_ATTRIBUTE_FREQUENCY)); if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; } + addressPointer += (source.getFilePointer() - pointerBefore); } ArrayList bigrams = null; if (0 != (flags & FLAG_HAS_BIGRAMS)) { @@ -1149,14 +1145,7 @@ public class BinaryDictInputOutput { int groupOffset = nodeOrigin + getGroupCountSize(count); for (int i = count; i > 0; --i) { CharGroupInfo info = readCharGroup(source, groupOffset); - ArrayList shortcutTargets = null; - if (null != info.mShortcutTargets) { - shortcutTargets = new ArrayList(); - for (PendingAttribute target 
: info.mShortcutTargets) { - final String word = getWordAtAddress(source, headerSize, target.mAddress); - shortcutTargets.add(new WeightedString(word, target.mFrequency)); - } - } + ArrayList shortcutTargets = info.mShortcutTargets; ArrayList bigrams = null; if (null != info.mBigrams) { bigrams = new ArrayList(); diff --git a/java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java b/java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java index 444b11732..ef7dbb251 100644 --- a/java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java +++ b/java/src/com/android/inputmethod/latin/makedict/CharGroupInfo.java @@ -16,6 +16,8 @@ package com.android.inputmethod.latin.makedict; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; + import java.util.ArrayList; /** @@ -29,12 +31,12 @@ public class CharGroupInfo { public final int[] mCharacters; public final int mFrequency; public final int mChildrenAddress; - public final ArrayList mShortcutTargets; + public final ArrayList mShortcutTargets; public final ArrayList mBigrams; public CharGroupInfo(final int originalAddress, final int endAddress, final int flags, final int[] characters, final int frequency, final int childrenAddress, - final ArrayList shortcutTargets, + final ArrayList shortcutTargets, final ArrayList bigrams) { mOriginalAddress = originalAddress; mEndAddress = endAddress; -- cgit v1.2.3-83-g751a From c734c2aca1830643d169fd292e0c9d4d9306af5a Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Tue, 3 Apr 2012 13:19:57 +0900 Subject: Add a simple way to input dictionary header attributes Just add them as an attribute to the root of the XML node. 
Bug: 6202812 Change-Id: Idf040bfebf20a72f9e4370930a85d97df593f484 --- .../latin/makedict/BinaryDictInputOutput.java | 19 ++++++++++++++++--- .../inputmethod/latin/makedict/FusionDictionary.java | 16 +++++++++++++++- .../latin/makedict/XmlDictInputOutput.java | 14 +++++++++++++- 3 files changed, 44 insertions(+), 5 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index af7f863ee..010ea6813 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -26,6 +26,7 @@ import java.io.OutputStream; import java.io.RandomAccessFile; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; @@ -922,7 +923,14 @@ public class BinaryDictInputOutput { if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { final int headerSizeOffset = index; index += 4; // Size of the header size - // TODO: Write out the header contents here. + + // Write out the options. + for (final String key : dict.mOptions.mAttributes.keySet()) { + final String value = dict.mOptions.mAttributes.get(key); + index += CharEncoding.writeString(buffer, index, key); + index += CharEncoding.writeString(buffer, index, value); + } + // Write out the header size. 
buffer[headerSizeOffset] = (byte) (0xFF & (index >> 24)); buffer[headerSizeOffset + 1] = (byte) (0xFF & (index >> 16)); @@ -1214,12 +1222,17 @@ public class BinaryDictInputOutput { source.readUnsignedShort(); final long headerSize; + final HashMap options = new HashMap(); if (version < FIRST_VERSION_WITH_HEADER_SIZE) { headerSize = source.getFilePointer(); } else { headerSize = (source.readUnsignedByte() << 24) + (source.readUnsignedByte() << 16) + (source.readUnsignedByte() << 8) + source.readUnsignedByte(); - // read the header body + while (source.getFilePointer() < headerSize) { + final String key = CharEncoding.readString(source); + final String value = CharEncoding.readString(source); + options.put(key, value); + } source.seek(headerSize); } @@ -1228,7 +1241,7 @@ public class BinaryDictInputOutput { final Node root = readNode(source, headerSize, reverseNodeMapping, reverseGroupMapping); FusionDictionary newDict = new FusionDictionary(root, - new FusionDictionary.DictionaryOptions()); + new FusionDictionary.DictionaryOptions(options)); if (null != dict) { for (Word w : dict) { newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams); diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index d3ffb47ad..99b17048d 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -19,6 +19,7 @@ package com.android.inputmethod.latin.makedict; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; @@ -250,6 +251,10 @@ public class FusionDictionary implements Iterable { * There are no options at the moment, so this class is empty. 
*/ public static class DictionaryOptions { + final HashMap mAttributes; + public DictionaryOptions(final HashMap attributes) { + mAttributes = attributes; + } } @@ -257,8 +262,13 @@ public class FusionDictionary implements Iterable { public final Node mRoot; public FusionDictionary() { - mOptions = new DictionaryOptions(); mRoot = new Node(); + mOptions = new DictionaryOptions(new HashMap()); + } + + public FusionDictionary(final HashMap attributes) { + mRoot = new Node(); + mOptions = new DictionaryOptions(attributes); } public FusionDictionary(final Node root, final DictionaryOptions options) { @@ -266,6 +276,10 @@ public class FusionDictionary implements Iterable { mOptions = options; } + public void addOptionAttribute(final String key, final String value) { + mOptions.mAttributes.put(key, value); + } + /** * Helper method to convert a String to an int array. */ diff --git a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java index 483473b3c..1d45fd25f 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java +++ b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java @@ -40,6 +40,7 @@ import org.xml.sax.helpers.DefaultHandler; */ public class XmlDictInputOutput { + private static final String ROOT_TAG = "wordlist"; private static final String WORD_TAG = "w"; private static final String BIGRAM_TAG = "bigram"; private static final String SHORTCUT_TAG = "shortcut"; @@ -96,6 +97,11 @@ public class XmlDictInputOutput { mFreq = Integer.parseInt(attrs.getValue(attrIndex)); } } + } else if (ROOT_TAG.equals(localName)) { + for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { + final String attrName = attrs.getLocalName(attrIndex); + mDictionary.mOptions.mAttributes.put(attrName, attrs.getValue(attrIndex)); + } } else { mState = UNKNOWN; } @@ -275,7 +281,13 @@ public class 
XmlDictInputOutput { set.add(word); } // TODO: use an XMLSerializer if this gets big - destination.write("\n"); + destination.write(" options = dict.mOptions.mAttributes; + for (final String key : dict.mOptions.mAttributes.keySet()) { + final String value = dict.mOptions.mAttributes.get(key); + destination.write(" " + key + "=\"" + value + "\""); + } + destination.write(">\n"); destination.write("\n"); for (Word word : set) { destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" " -- cgit v1.2.3-83-g751a From 8cf1a8d04f77aefed3a57f6994869e0f35b1a8b4 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Tue, 27 Mar 2012 21:36:52 +0900 Subject: Remove the shortcutOnly attribute which is now useless. Change-Id: Ifccdfdaf7c0066bb7728981503baceff0fedb71f --- .../latin/makedict/BinaryDictInputOutput.java | 5 +- .../latin/makedict/FusionDictionary.java | 70 ++++++---------------- .../android/inputmethod/latin/makedict/Word.java | 6 +- .../latin/makedict/XmlDictInputOutput.java | 13 +--- 4 files changed, 21 insertions(+), 73 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 010ea6813..820c0a59c 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -1172,11 +1172,10 @@ public class BinaryDictInputOutput { } nodeContents.add( new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, - children, false)); + children)); } else { nodeContents.add( - new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, - false)); + new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency)); } groupOffset = info.mEndAddress; } diff --git 
a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index 99b17048d..d8081e1f4 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -98,35 +98,24 @@ public class FusionDictionary implements Iterable { ArrayList mShortcutTargets; ArrayList mBigrams; int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. - boolean mIsShortcutOnly; // Only valid if this is a terminal. Node mChildren; // The two following members to help with binary generation int mCachedSize; int mCachedAddress; public CharGroup(final int[] chars, final ArrayList shortcutTargets, - final ArrayList bigrams, final int frequency, - final boolean isShortcutOnly) { + final ArrayList bigrams, final int frequency) { mChars = chars; mFrequency = frequency; - mIsShortcutOnly = isShortcutOnly; - if (mIsShortcutOnly && NOT_A_TERMINAL == mFrequency) { - throw new RuntimeException("A node must be a terminal to be a shortcut only"); - } mShortcutTargets = shortcutTargets; mBigrams = bigrams; mChildren = null; } public CharGroup(final int[] chars, final ArrayList shortcutTargets, - final ArrayList bigrams, final int frequency, final Node children, - final boolean isShortcutOnly) { + final ArrayList bigrams, final int frequency, final Node children) { mChars = chars; mFrequency = frequency; - mIsShortcutOnly = isShortcutOnly; - if (mIsShortcutOnly && NOT_A_TERMINAL == mFrequency) { - throw new RuntimeException("A node must be a terminal to be a shortcut only"); - } mShortcutTargets = shortcutTargets; mBigrams = bigrams; mChildren = children; @@ -205,7 +194,7 @@ public class FusionDictionary implements Iterable { * updated if they are higher than the existing ones. 
*/ public void update(int frequency, ArrayList shortcutTargets, - ArrayList bigrams, boolean isShortcutOnly) { + ArrayList bigrams) { if (frequency > mFrequency) { mFrequency = frequency; } @@ -241,7 +230,6 @@ public class FusionDictionary implements Iterable { } } } - mIsShortcutOnly = isShortcutOnly; } } @@ -304,7 +292,7 @@ public class FusionDictionary implements Iterable { for (WeightedString word : words) { final CharGroup t = findWordInTree(mRoot, word.mWord); if (null == t) { - add(getCodePoints(word.mWord), 0, null, null, false /* isShortcutOnly */); + add(getCodePoints(word.mWord), 0, null, null); } } } @@ -328,7 +316,7 @@ public class FusionDictionary implements Iterable { if (null != bigrams) { addNeutralWords(bigrams); } - add(getCodePoints(word), frequency, shortcutTargets, bigrams, false /* isShortcutOnly */); + add(getCodePoints(word), frequency, shortcutTargets, bigrams); } /** @@ -349,21 +337,6 @@ public class FusionDictionary implements Iterable { } } - /** - * Helper method to add a shortcut that should not be a dictionary word. - * - * @param word the word to add. - * @param frequency the frequency of the word, in the range [0..255]. - * @param shortcutTargets a list of shortcut targets. May not be null. - */ - public void addShortcutOnly(final String word, final int frequency, - final ArrayList shortcutTargets) { - if (null == shortcutTargets) { - throw new RuntimeException("Can't add a shortcut without targets"); - } - add(getCodePoints(word), frequency, shortcutTargets, null, true /* isShortcutOnly */); - } - /** * Helper method to add a new bigram to the dictionary. 
* @@ -377,7 +350,7 @@ public class FusionDictionary implements Iterable { final CharGroup charGroup2 = findWordInTree(mRoot, word2); if (charGroup2 == null) { // TODO: refactor with the identical code in addNeutralWords - add(getCodePoints(word2), 0, null, null, false /* isShortcutOnly */); + add(getCodePoints(word2), 0, null, null); } charGroup.addBigram(word2, frequency); } else { @@ -395,12 +368,10 @@ public class FusionDictionary implements Iterable { * @param frequency the frequency of the word, in the range [0..255]. * @param shortcutTargets an optional list of shortcut targets for this word (null if none). * @param bigrams an optional list of bigrams for this word (null if none). - * @param isShortcutOnly whether this should be a shortcut only. */ private void add(final int[] word, final int frequency, final ArrayList shortcutTargets, - final ArrayList bigrams, - final boolean isShortcutOnly) { + final ArrayList bigrams) { assert(frequency >= 0 && frequency <= 255); Node currentNode = mRoot; int charIndex = 0; @@ -425,7 +396,7 @@ public class FusionDictionary implements Iterable { final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); final CharGroup newGroup = new CharGroup( Arrays.copyOfRange(word, charIndex, word.length), - shortcutTargets, bigrams, frequency, isShortcutOnly); + shortcutTargets, bigrams, frequency); currentNode.mData.add(insertionIndex, newGroup); checkStack(currentNode); } else { @@ -435,13 +406,13 @@ public class FusionDictionary implements Iterable { // The new word is a prefix of an existing word, but the node on which it // should end already exists as is. Since the old CharNode was not a terminal, // make it one by filling in its frequency and other attributes - currentGroup.update(frequency, shortcutTargets, bigrams, isShortcutOnly); + currentGroup.update(frequency, shortcutTargets, bigrams); } else { // The new word matches the full old word and extends past it. 
// We only have to create a new node and add it to the end of this. final CharGroup newNode = new CharGroup( Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), - shortcutTargets, bigrams, frequency, isShortcutOnly); + shortcutTargets, bigrams, frequency); currentGroup.mChildren = new Node(); currentGroup.mChildren.mData.add(newNode); } @@ -449,7 +420,7 @@ public class FusionDictionary implements Iterable { if (0 == differentCharIndex) { // Exact same word. Update the frequency if higher. This will also add the // new bigrams to the existing bigram list if it already exists. - currentGroup.update(frequency, shortcutTargets, bigrams, isShortcutOnly); + currentGroup.update(frequency, shortcutTargets, bigrams); } else { // Partial prefix match only. We have to replace the current node with a node // containing the current prefix and create two new ones for the tails. @@ -457,26 +428,21 @@ public class FusionDictionary implements Iterable { final CharGroup newOldWord = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, differentCharIndex, currentGroup.mChars.length), currentGroup.mShortcutTargets, - currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren, - currentGroup.mIsShortcutOnly); + currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren); newChildren.mData.add(newOldWord); final CharGroup newParent; if (charIndex + differentCharIndex >= word.length) { newParent = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), - shortcutTargets, bigrams, frequency, newChildren, isShortcutOnly); + shortcutTargets, bigrams, frequency, newChildren); } else { - // isShortcutOnly makes no sense for non-terminal nodes. 
The following node - // is non-terminal (frequency 0 in FusionDictionary representation) so we - // pass false for isShortcutOnly newParent = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), - null, null, -1, newChildren, false /* isShortcutOnly */); + null, null, -1, newChildren); final CharGroup newWord = new CharGroup( Arrays.copyOfRange(word, charIndex + differentCharIndex, - word.length), shortcutTargets, bigrams, frequency, - isShortcutOnly); + word.length), shortcutTargets, bigrams, frequency); final int addIndex = word[charIndex + differentCharIndex] > currentGroup.mChars[differentCharIndex] ? 1 : 0; newChildren.mData.add(addIndex, newWord); @@ -534,8 +500,7 @@ public class FusionDictionary implements Iterable { */ private static int findInsertionIndex(final Node node, int character) { final ArrayList data = node.mData; - final CharGroup reference = new CharGroup(new int[] { character }, null, null, 0, - false /* isShortcutOnly */); + final CharGroup reference = new CharGroup(new int[] { character }, null, null, 0); int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); return result >= 0 ? 
result : -result - 1; } @@ -763,8 +728,7 @@ public class FusionDictionary implements Iterable { } if (currentGroup.mFrequency >= 0) return new Word(mCurrentString.toString(), currentGroup.mFrequency, - currentGroup.mShortcutTargets, currentGroup.mBigrams, - currentGroup.mIsShortcutOnly); + currentGroup.mShortcutTargets, currentGroup.mBigrams); } else { mPositions.removeLast(); currentPos = mPositions.getLast(); diff --git a/java/src/com/android/inputmethod/latin/makedict/Word.java b/java/src/com/android/inputmethod/latin/makedict/Word.java index 4e0ab1049..d07826757 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Word.java +++ b/java/src/com/android/inputmethod/latin/makedict/Word.java @@ -29,7 +29,6 @@ import java.util.Arrays; public class Word implements Comparable { final String mWord; final int mFrequency; - final boolean mIsShortcutOnly; final ArrayList mShortcutTargets; final ArrayList mBigrams; @@ -37,19 +36,17 @@ public class Word implements Comparable { public Word(final String word, final int frequency, final ArrayList shortcutTargets, - final ArrayList bigrams, final boolean isShortcutOnly) { + final ArrayList bigrams) { mWord = word; mFrequency = frequency; mShortcutTargets = shortcutTargets; mBigrams = bigrams; - mIsShortcutOnly = isShortcutOnly; } private static int computeHashCode(Word word) { return Arrays.hashCode(new Object[] { word.mWord, word.mFrequency, - word.mIsShortcutOnly, word.mShortcutTargets.hashCode(), word.mBigrams.hashCode() }); @@ -80,7 +77,6 @@ public class Word implements Comparable { if (!(o instanceof Word)) return false; Word w = (Word)o; return mFrequency == w.mFrequency && mWord.equals(w.mWord) - && mIsShortcutOnly == w.mIsShortcutOnly && mShortcutTargets.equals(w.mShortcutTargets) && mBigrams.equals(w.mBigrams); } diff --git a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java index 
1d45fd25f..c51eea5ef 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java +++ b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java @@ -46,7 +46,6 @@ public class XmlDictInputOutput { private static final String SHORTCUT_TAG = "shortcut"; private static final String FREQUENCY_ATTR = "f"; private static final String WORD_ATTR = "word"; - private static final String SHORTCUT_ONLY_ATTR = "shortcutOnly"; private static final int SHORTCUT_ONLY_DEFAULT_FREQ = 1; @@ -241,15 +240,6 @@ public class XmlDictInputOutput { new UnigramHandler(dict, shortcutHandler.getShortcutMap(), bigramHandler.getBigramMap()); parser.parse(unigrams, unigramHandler); - - final HashMap> shortcutMap = - shortcutHandler.getShortcutMap(); - for (final String shortcut : shortcutMap.keySet()) { - if (dict.hasWord(shortcut)) continue; - // TODO: list a frequency in the shortcut file and use it here, instead of - // a constant freq - dict.addShortcutOnly(shortcut, SHORTCUT_ONLY_DEFAULT_FREQ, shortcutMap.get(shortcut)); - } return dict; } @@ -291,8 +281,7 @@ public class XmlDictInputOutput { destination.write("\n"); for (Word word : set) { destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" " - + FREQUENCY_ATTR + "=\"" + word.mFrequency + "\" " + SHORTCUT_ONLY_ATTR - + "=\"" + word.mIsShortcutOnly + "\">"); + + FREQUENCY_ATTR + "=\"" + word.mFrequency + "\">"); if (null != word.mShortcutTargets) { destination.write("\n"); for (WeightedString target : word.mShortcutTargets) { -- cgit v1.2.3-83-g751a From f420df28233c26e555d203185fb292e83b94b8c3 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Fri, 6 Apr 2012 15:30:42 +0900 Subject: Add support for German umlaut and French ligatures flags Bug: 6202812 Change-Id: Ib4a7f96f6ef86c840069b15d04393f84d428c176 --- .../latin/makedict/BinaryDictInputOutput.java | 25 ++++++++++++++++------ .../latin/makedict/FusionDictionary.java | 10 ++++++--- 
.../latin/makedict/XmlDictInputOutput.java | 12 ++++++++++- .../latin/BinaryDictInputOutputTest.java | 3 ++- 4 files changed, 39 insertions(+), 11 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 820c0a59c..d22332116 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -17,6 +17,7 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; +import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.Node; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; @@ -126,8 +127,9 @@ public class BinaryDictInputOutput { private static final int NOT_A_VERSION_NUMBER = -1; private static final int FIRST_VERSION_WITH_HEADER_SIZE = 2; - // No options yet, reserved for future use. - private static final int OPTIONS = 0; + // These options need to be the same numeric values as the one in the native reading code. + private static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; + private static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; // TODO: Make this value adaptative to content data, store it in the header, and // use it in the reading code. @@ -703,6 +705,14 @@ public class BinaryDictInputOutput { return bigramFlags; } + /** + * Makes the 2-byte value for options flags. + */ + private static final int makeOptionsValue(final DictionaryOptions options) { + return (options.mFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0) + + (options.mGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0); + } + /** * Makes the flag value for a shortcut. 
* @@ -918,8 +928,9 @@ public class BinaryDictInputOutput { buffer[index++] = (byte) (0xFF & version); } // Options flags - buffer[index++] = (byte) (0xFF & (OPTIONS >> 8)); - buffer[index++] = (byte) (0xFF & OPTIONS); + final int options = makeOptionsValue(dict.mOptions); + buffer[index++] = (byte) (0xFF & (options >> 8)); + buffer[index++] = (byte) (0xFF & options); if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { final int headerSizeOffset = index; index += 4; // Size of the header size @@ -1218,7 +1229,7 @@ public class BinaryDictInputOutput { } // Read options - source.readUnsignedShort(); + final int optionsFlags = source.readUnsignedShort(); final long headerSize; final HashMap options = new HashMap(); @@ -1240,7 +1251,9 @@ public class BinaryDictInputOutput { final Node root = readNode(source, headerSize, reverseNodeMapping, reverseGroupMapping); FusionDictionary newDict = new FusionDictionary(root, - new FusionDictionary.DictionaryOptions(options)); + new FusionDictionary.DictionaryOptions(options, + 0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG), + 0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG))); if (null != dict) { for (Word w : dict) { newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams); diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index 3515287b0..40bcfc3aa 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -239,13 +239,17 @@ public class FusionDictionary implements Iterable { * There are no options at the moment, so this class is empty. 
*/ public static class DictionaryOptions { - final HashMap mAttributes; - public DictionaryOptions(final HashMap attributes) { + public final boolean mGermanUmlautProcessing; + public final boolean mFrenchLigatureProcessing; + public final HashMap mAttributes; + public DictionaryOptions(final HashMap attributes, + final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) { mAttributes = attributes; + mGermanUmlautProcessing = germanUmlautProcessing; + mFrenchLigatureProcessing = frenchLigatureProcessing; } } - public final DictionaryOptions mOptions; public final Node mRoot; diff --git a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java index 0acd1b2a9..dfc8baa4d 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java +++ b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java @@ -51,6 +51,10 @@ public class XmlDictInputOutput { private static final int SHORTCUT_ONLY_DEFAULT_FREQ = 1; + private static final String OPTIONS_KEY = "options"; + private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing"; + private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing"; + /** * SAX handler for a unigram XML file. 
*/ @@ -114,7 +118,13 @@ public class XmlDictInputOutput { final String attrName = attrs.getLocalName(attrIndex); attributes.put(attrName, attrs.getValue(attrIndex)); } - mDictionary = new FusionDictionary(new Node(), new DictionaryOptions(attributes)); + final String optionsString = attributes.get(OPTIONS_KEY); + final boolean processUmlauts = + GERMAN_UMLAUT_PROCESSING_OPTION.equals(optionsString); + final boolean processLigatures = + FRENCH_LIGATURE_PROCESSING_OPTION.equals(optionsString); + mDictionary = new FusionDictionary(new Node(), new DictionaryOptions(attributes, + processUmlauts, processLigatures)); } else { mState = UNKNOWN; } diff --git a/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java b/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java index e19c7d53b..191eb804d 100644 --- a/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java +++ b/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java @@ -41,7 +41,8 @@ public class BinaryDictInputOutputTest extends TestCase { // that it does not contain any duplicates. public void testFlattenNodes() { final FusionDictionary dict = new FusionDictionary(new Node(), - new DictionaryOptions(new HashMap())); + new DictionaryOptions(new HashMap(), + false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */)); dict.add("foo", 1, null, null); dict.add("fta", 1, null, null); dict.add("ftb", 1, null, null); -- cgit v1.2.3-83-g751a From df7ebbbd616fa5aff569d00b16cd3f85ddf2da6d Mon Sep 17 00:00:00 2001 From: Tom Ouyang Date: Wed, 18 Apr 2012 14:15:34 -0700 Subject: Change binary dictionary output buffer size to match dictionary size. 
Bug: 6355943 Change-Id: Iaab7bc16ba0dbc7bfde70b06e7bd355519838831 --- .../latin/makedict/BinaryDictInputOutput.java | 97 +++++++++++++++------- 1 file changed, 65 insertions(+), 32 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index d22332116..97df98e34 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -21,6 +21,7 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions import com.android.inputmethod.latin.makedict.FusionDictionary.Node; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; +import java.io.ByteArrayOutputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.OutputStream; @@ -271,6 +272,29 @@ public class BinaryDictInputOutput { return index - origin; } + /** + * Writes a string with our character format to a ByteArrayOutputStream. + * + * This will also write the terminator byte. + * + * @param buffer the ByteArrayOutputStream to write to. + * @param word the string to write. + */ + private static void writeString(ByteArrayOutputStream buffer, final String word) { + final int length = word.length(); + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + final int codePoint = word.codePointAt(i); + if (1 == getCharSize(codePoint)) { + buffer.write((byte) codePoint); + } else { + buffer.write((byte) (0xFF & (codePoint >> 16))); + buffer.write((byte) (0xFF & (codePoint >> 8))); + buffer.write((byte) (0xFF & codePoint)); + } + } + buffer.write(GROUP_CHARACTERS_TERMINATOR); + } + /** * Reads a string from a RandomAccessFile. This is the converse of the above method. 
*/ @@ -894,15 +918,11 @@ public class BinaryDictInputOutput { final FusionDictionary dict, final int version) throws IOException, UnsupportedFormatException { - // Addresses are limited to 3 bytes, so we'll just make a 16MB buffer. Since addresses - // can be relative to each node, the structure itself is not limited to 16MB at all, but - // I doubt this will ever be shot. If it is, deciding the order of the nodes becomes - // a quite complicated problem, because though the dictionary itself does not have a - // size limit, each node must still be within 16MB of all its children and parents. - // As long as this is ensured, the dictionary file may grow to any size. - // Anyway, to make a dictionary bigger than 16MB just increase the size of this buffer. - final byte[] buffer = new byte[1 << 24]; - int index = 0; + // Addresses are limited to 3 bytes, but since addresses can be relative to each node, the + // structure itself is not limited to 16MB. However, if it is over 16MB deciding the order + // of the nodes becomes a quite complicated problem, because though the dictionary itself + // does not have a size limit, each node must still be within 16MB of all its children and + // parents. As long as this is ensured, the dictionary file may grow to any size. if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) { throw new UnsupportedFormatException("Requested file format version " + version @@ -910,47 +930,54 @@ public class BinaryDictInputOutput { + MINIMUM_SUPPORTED_VERSION + " through " + MAXIMUM_SUPPORTED_VERSION); } + ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256); + // The magic number in big-endian order. if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { // Magic number for version 2+. 
- buffer[index++] = (byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 24)); - buffer[index++] = (byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 16)); - buffer[index++] = (byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 8)); - buffer[index++] = (byte) (0xFF & VERSION_2_MAGIC_NUMBER); + headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 24))); + headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 16))); + headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 8))); + headerBuffer.write((byte) (0xFF & VERSION_2_MAGIC_NUMBER)); // Dictionary version. - buffer[index++] = (byte) (0xFF & (version >> 8)); - buffer[index++] = (byte) (0xFF & version); + headerBuffer.write((byte) (0xFF & (version >> 8))); + headerBuffer.write((byte) (0xFF & version)); } else { // Magic number for version 1. - buffer[index++] = (byte) (0xFF & (VERSION_1_MAGIC_NUMBER >> 8)); - buffer[index++] = (byte) (0xFF & VERSION_1_MAGIC_NUMBER); + headerBuffer.write((byte) (0xFF & (VERSION_1_MAGIC_NUMBER >> 8))); + headerBuffer.write((byte) (0xFF & VERSION_1_MAGIC_NUMBER)); // Dictionary version. - buffer[index++] = (byte) (0xFF & version); + headerBuffer.write((byte) (0xFF & version)); } // Options flags final int options = makeOptionsValue(dict.mOptions); - buffer[index++] = (byte) (0xFF & (options >> 8)); - buffer[index++] = (byte) (0xFF & options); + headerBuffer.write((byte) (0xFF & (options >> 8))); + headerBuffer.write((byte) (0xFF & options)); if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { - final int headerSizeOffset = index; - index += 4; // Size of the header size - + final int headerSizeOffset = headerBuffer.size(); + // Placeholder to be written later with header size. + for (int i = 0; i < 4; ++i) { + headerBuffer.write(0); + } // Write out the options. 
for (final String key : dict.mOptions.mAttributes.keySet()) { final String value = dict.mOptions.mAttributes.get(key); - index += CharEncoding.writeString(buffer, index, key); - index += CharEncoding.writeString(buffer, index, value); + CharEncoding.writeString(headerBuffer, key); + CharEncoding.writeString(headerBuffer, value); } - + final int size = headerBuffer.size(); + final byte[] bytes = headerBuffer.toByteArray(); // Write out the header size. - buffer[headerSizeOffset] = (byte) (0xFF & (index >> 24)); - buffer[headerSizeOffset + 1] = (byte) (0xFF & (index >> 16)); - buffer[headerSizeOffset + 2] = (byte) (0xFF & (index >> 8)); - buffer[headerSizeOffset + 3] = (byte) (0xFF & (index >> 0)); + bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24)); + bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16)); + bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8)); + bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0)); + destination.write(bytes); + } else { + headerBuffer.writeTo(destination); } - destination.write(buffer, 0, index); - index = 0; + headerBuffer.close(); // Leave the choice of the optimal node order to the flattenTree function. MakedictLog.i("Flattening the tree..."); @@ -961,6 +988,12 @@ public class BinaryDictInputOutput { MakedictLog.i("Checking array..."); checkFlatNodeArray(flatNodes); + // Create a buffer that matches the final dictionary size. + final Node lastNode = flatNodes.get(flatNodes.size() - 1); + final int bufferSize =(lastNode.mCachedAddress + lastNode.mCachedSize); + final byte[] buffer = new byte[bufferSize]; + int index = 0; + MakedictLog.i("Writing file..."); int dataEndOffset = 0; for (Node n : flatNodes) { -- cgit v1.2.3-83-g751a From a64a1a46e482664dcebdf4fee0745a890d0d70dc Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Tue, 24 Apr 2012 12:13:22 +0900 Subject: Fix a bug where a node size would be seen as increasing. The core reason for this is quite shrewd. 
When a word is a bigram of itself, the corresponding chargroup will have a bigram referring to itself. When computing bigram offsets, we use cached addresses of chargroups, but we compute the size of the node as we go. Hence, a discrepancy may happen between the base offset as seen by the bigram (which uses the recomputed value) and the target offset (which uses the cached value). When this happens, the cached node address is too large. The relative offset is negative, which is expected, since it points to this very charnode whose start is a few bytes earlier. But since the cached address is too large, the offset is computed as smaller than it should be. On the next pass, the cache has been refreshed with the newly computed size and the seen offset is now correct (or at least, much closer to correct). The correct value is larger than the previously computed offset, which was too small. If it happens that it crosses the -255 or -65535 boundary, the address will be seen as needing 1 more byte than previously computed. If this is the only change in size of this node, the node will be seen as having a larger size than previously, which is unexpected. Debug code was catching this and crashing the program. So this case is very rare, but in an even rarer occurrence, it may happen that in the same node, another chargroup happens to decrease its size by the same amount. In this case, the node may be seen as having not been modified. This is probably extremely rare. If on top of this, it happens that no other node has been modified, then the file may be seen as complete, and the discrepancy left as is in the file, leading to a broken file. The probability that this happens is vanishingly low, but the bug exists, and the current debug code would not have caught this. To further catch similar bugs, this change also modifies the test that decides if the node has changed.
On grounds that all components of a node may only decrease in size with each successive pass, it's theoretically safe to assume that the same size means the node contents have not changed, but in case of a bug like the bug above where a component wrongly grows while another shrinks and both cancel each other out, the new code will catch this. Also, this change adds a check against the number of passes, to avoid infinite loops in case of a bug in the computation code. This change fixes this bug by updating the cached address of each chargroup as we go. This eliminates the discrepancy and fixes the bug. Bug: 6383103 Change-Id: Ia3f450e22c87c4c193cea8ddb157aebd5f224f01 --- .../latin/makedict/BinaryDictInputOutput.java | 30 ++++++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 97df98e34..b1bee4e65 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -174,6 +174,13 @@ public class BinaryDictInputOutput { private static final int MAX_TERMINAL_FREQUENCY = 255; + // Arbitrary limit to how much passes we consider address size compression should + // terminate in. At the time of this writing, our largest dictionary completes + // compression in five passes. + // If the number of passes exceeds this number, makedict bails with an exception on + // suspicion that a bug might be causing an infinite loop. + private static final int MAX_PASSES = 24; + /** * A class grouping utility function for our specific character encoding. */ @@ -510,14 +517,22 @@ public class BinaryDictInputOutput { * Each node stores its tentative address. 
During dictionary address computing, these * are not final, but they can be used to compute the node size (the node size depends * on the address of the children because the number of bytes necessary to store an - * address depends on its numeric value. + * address depends on its numeric value. The return value indicates whether the node + * contents (as in, any of the addresses stored in the cache fields) have changed with + * respect to their previous value. * * @param node the node to compute the size of. * @param dict the dictionary in which the word/attributes are to be found. + * @return false if none of the cached addresses inside the node changed, true otherwise. */ - private static void computeActualNodeSize(Node node, FusionDictionary dict) { + private static boolean computeActualNodeSize(Node node, FusionDictionary dict) { + boolean changed = false; int size = getGroupCountSize(node); for (CharGroup group : node.mData) { + if (group.mCachedAddress != node.mCachedAddress + size) { + changed = true; + group.mCachedAddress = node.mCachedAddress + size; + } int groupSize = GROUP_FLAGS_SIZE + getGroupCharactersSize(group); if (group.isTerminal()) groupSize += GROUP_FREQUENCY_SIZE; if (null != group.mChildren) { @@ -538,7 +553,11 @@ public class BinaryDictInputOutput { group.mCachedSize = groupSize; size += groupSize; } - node.mCachedSize = size; + if (node.mCachedSize != size) { + node.mCachedSize = size; + changed = true; + } + return changed; } /** @@ -594,13 +613,14 @@ public class BinaryDictInputOutput { changesDone = false; for (Node n : flatNodes) { final int oldNodeSize = n.mCachedSize; - computeActualNodeSize(n, dict); + final boolean changed = computeActualNodeSize(n, dict); final int newNodeSize = n.mCachedSize; if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!"); - if (oldNodeSize != newNodeSize) changesDone = true; + changesDone |= changed; } stackNodes(flatNodes); ++passes; + if (passes > MAX_PASSES) throw new 
RuntimeException("Too many passes - probably a bug"); } while (changesDone); final Node lastNode = flatNodes.get(flatNodes.size() - 1); -- cgit v1.2.3-83-g751a From 1d80a7f395290cd0e7344210bb3960f685059264 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Tue, 24 Apr 2012 12:54:44 +0900 Subject: Fix binary reading code performance. This is not the Right fix ; the Right fix would be to read the file in a buffered way. However this delivers tolerable performance for a minimal amount of code changes. We may want to skip submitting this patch, but keep it around in case we need to use the functionality until we have a good patch. Change-Id: I1ba938f82acfd9436c3701d1078ff981afdbea60 --- .../inputmethod/latin/makedict/BinaryDictInputOutput.java | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 97df98e34..4256871cc 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -1122,6 +1122,12 @@ public class BinaryDictInputOutput { } } + // The word cache here is a stopgap bandaid to help the catastrophic performance + // of this method. Since it performs direct, unbuffered random access to the file and + // may be called hundreds of thousands of times, the resulting performance is not + // reasonable without some kind of cache. Thus: + // TODO: perform buffered I/O here and in other places in the code. + private static TreeMap wordCache = new TreeMap(); /** * Finds, as a string, the word at the address passed as an argument. * @@ -1131,8 +1137,10 @@ public class BinaryDictInputOutput { * @return the word, as a string. * @throws IOException if the file can't be read. 
*/ - private static String getWordAtAddress(RandomAccessFile source, long headerSize, + private static String getWordAtAddress(final RandomAccessFile source, final long headerSize, int address) throws IOException { + final String cachedString = wordCache.get(address); + if (null != cachedString) return cachedString; final long originalPointer = source.getFilePointer(); source.seek(headerSize); final int count = readCharGroupCount(source); @@ -1171,6 +1179,7 @@ public class BinaryDictInputOutput { } } source.seek(originalPointer); + wordCache.put(address, result); return result; } -- cgit v1.2.3-83-g751a From 44c64f46a143623dd793facd889c8d6eab5e230c Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Fri, 20 Apr 2012 19:58:01 +0900 Subject: Ignore bigrams that are not also listed as unigrams This is a cherry pick of I14b67e51 on jb-dev Bug: 6340915 Change-Id: Iaa512abe1b19ca640ea201f9761fd7f1416270ed --- .../latin/ExpandableBinaryDictionary.java | 2 +- .../latin/makedict/BinaryDictInputOutput.java | 12 +++++-- .../latin/makedict/FusionDictionary.java | 41 ++++++++++------------ .../latin/makedict/XmlDictInputOutput.java | 27 ++++++++------ .../latin/BinaryDictInputOutputTest.java | 10 +++--- 5 files changed, 51 insertions(+), 41 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java index 9dcffd4e2..3d89226c0 100644 --- a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java @@ -159,7 +159,7 @@ abstract public class ExpandableBinaryDictionary extends Dictionary { // TODO: Create "cache dictionary" to cache fresh words for frequently updated dictionaries, // considering performance regression. 
protected void addWord(final String word, final int frequency) { - mFusionDictionary.add(word, frequency, null, null); + mFusionDictionary.add(word, frequency, null /* shortcutTargets */); } /** diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index cc98010fb..88da7b0d8 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -1317,8 +1317,16 @@ public class BinaryDictInputOutput { 0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG), 0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG))); if (null != dict) { - for (Word w : dict) { - newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams); + for (final Word w : dict) { + newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets); + } + for (final Word w : dict) { + // By construction a binary dictionary may not have bigrams pointing to + // words that are not also registered as unigrams so we don't have to avoid + // them explicitly here. + for (final WeightedString bigram : w.mBigrams) { + newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency); + } } } diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index 40bcfc3aa..c293b2ba4 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -286,7 +286,7 @@ public class FusionDictionary implements Iterable { for (WeightedString word : words) { final CharGroup t = findWordInTree(mRoot, word.mWord); if (null == t) { - add(getCodePoints(word.mWord), 0, null, null); + add(getCodePoints(word.mWord), 0, null); } } } @@ -305,12 +305,8 @@ public class FusionDictionary implements Iterable { * @param bigrams a list of bigrams, or null. 
*/ public void add(final String word, final int frequency, - final ArrayList shortcutTargets, - final ArrayList bigrams) { - if (null != bigrams) { - addNeutralWords(bigrams); - } - add(getCodePoints(word), frequency, shortcutTargets, bigrams); + final ArrayList shortcutTargets) { + add(getCodePoints(word), frequency, shortcutTargets); } /** @@ -344,7 +340,7 @@ public class FusionDictionary implements Iterable { final CharGroup charGroup2 = findWordInTree(mRoot, word2); if (charGroup2 == null) { // TODO: refactor with the identical code in addNeutralWords - add(getCodePoints(word2), 0, null, null); + add(getCodePoints(word2), 0, null); } charGroup.addBigram(word2, frequency); } else { @@ -355,17 +351,15 @@ public class FusionDictionary implements Iterable { /** * Add a word to this dictionary. * - * The shortcuts and bigrams, if any, have to be in the dictionary already. If they aren't, + * The shortcuts, if any, have to be in the dictionary already. If they aren't, * an exception is thrown. * * @param word the word, as an int array. * @param frequency the frequency of the word, in the range [0..255]. * @param shortcutTargets an optional list of shortcut targets for this word (null if none). - * @param bigrams an optional list of bigrams for this word (null if none). 
*/ private void add(final int[] word, final int frequency, - final ArrayList shortcutTargets, - final ArrayList bigrams) { + final ArrayList shortcutTargets) { assert(frequency >= 0 && frequency <= 255); Node currentNode = mRoot; int charIndex = 0; @@ -390,7 +384,7 @@ public class FusionDictionary implements Iterable { final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); final CharGroup newGroup = new CharGroup( Arrays.copyOfRange(word, charIndex, word.length), - shortcutTargets, bigrams, frequency); + shortcutTargets, null /* bigrams */, frequency); currentNode.mData.add(insertionIndex, newGroup); checkStack(currentNode); } else { @@ -400,21 +394,21 @@ public class FusionDictionary implements Iterable { // The new word is a prefix of an existing word, but the node on which it // should end already exists as is. Since the old CharNode was not a terminal, // make it one by filling in its frequency and other attributes - currentGroup.update(frequency, shortcutTargets, bigrams); + currentGroup.update(frequency, shortcutTargets, null); } else { // The new word matches the full old word and extends past it. // We only have to create a new node and add it to the end of this. final CharGroup newNode = new CharGroup( Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), - shortcutTargets, bigrams, frequency); + shortcutTargets, null /* bigrams */, frequency); currentGroup.mChildren = new Node(); currentGroup.mChildren.mData.add(newNode); } } else { if (0 == differentCharIndex) { // Exact same word. Update the frequency if higher. This will also add the - // new bigrams to the existing bigram list if it already exists. - currentGroup.update(frequency, shortcutTargets, bigrams); + // new shortcuts to the existing shortcut list if it already exists. + currentGroup.update(frequency, shortcutTargets, null); } else { // Partial prefix match only. 
We have to replace the current node with a node // containing the current prefix and create two new ones for the tails. @@ -429,14 +423,14 @@ public class FusionDictionary implements Iterable { if (charIndex + differentCharIndex >= word.length) { newParent = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), - shortcutTargets, bigrams, frequency, newChildren); + shortcutTargets, null /* bigrams */, frequency, newChildren); } else { newParent = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), - null, null, -1, newChildren); - final CharGroup newWord = new CharGroup( - Arrays.copyOfRange(word, charIndex + differentCharIndex, - word.length), shortcutTargets, bigrams, frequency); + null /* shortcutTargets */, null /* bigrams */, -1, newChildren); + final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word, + charIndex + differentCharIndex, word.length), + shortcutTargets, null /* bigrams */, frequency); final int addIndex = word[charIndex + differentCharIndex] > currentGroup.mChars[differentCharIndex] ? 1 : 0; newChildren.mData.add(addIndex, newWord); @@ -494,7 +488,8 @@ public class FusionDictionary implements Iterable { */ private static int findInsertionIndex(final Node node, int character) { final ArrayList data = node.mData; - final CharGroup reference = new CharGroup(new int[] { character }, null, null, 0); + final CharGroup reference = new CharGroup(new int[] { character }, + null /* shortcutTargets */, null /* bigrams */, 0); int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); return result >= 0 ? 
result : -result - 1; } diff --git a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java index d1d2a9ca4..d86719a1d 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java +++ b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java @@ -72,19 +72,15 @@ public class XmlDictInputOutput { int mFreq; // the currently read freq String mWord; // the current word final HashMap> mShortcutsMap; - final HashMap> mBigramsMap; /** * Create the handler. * * @param shortcuts the shortcuts as a map. This may be empty, but may not be null. - * @param bigrams the bigrams as a map. This may be empty, but may not be null. */ - public UnigramHandler(final HashMap> shortcuts, - final HashMap> bigrams) { + public UnigramHandler(final HashMap> shortcuts) { mDictionary = null; mShortcutsMap = shortcuts; - mBigramsMap = bigrams; mWord = ""; mState = START; mFreq = 0; @@ -94,7 +90,6 @@ public class XmlDictInputOutput { final FusionDictionary dict = mDictionary; mDictionary = null; mShortcutsMap.clear(); - mBigramsMap.clear(); mWord = ""; mState = START; mFreq = 0; @@ -143,7 +138,7 @@ public class XmlDictInputOutput { @Override public void endElement(String uri, String localName, String qName) { if (WORD == mState) { - mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), mBigramsMap.get(mWord)); + mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord)); mState = START; } } @@ -191,6 +186,7 @@ public class XmlDictInputOutput { } } + // This may return an empty map, but will never return null. public HashMap> getAssocMap() { return mAssocMap; } @@ -211,6 +207,7 @@ public class XmlDictInputOutput { BIGRAM_FREQ_ATTRIBUTE); } + // As per getAssocMap(), this never returns null. 
public HashMap> getBigramMap() { return getAssocMap(); } @@ -231,6 +228,7 @@ public class XmlDictInputOutput { TARGET_PRIORITY_ATTRIBUTE); } + // As per getAssocMap(), this never returns null. public HashMap> getShortcutMap() { return getAssocMap(); } @@ -260,10 +258,19 @@ public class XmlDictInputOutput { if (null != shortcuts) parser.parse(shortcuts, shortcutHandler); final UnigramHandler unigramHandler = - new UnigramHandler(shortcutHandler.getShortcutMap(), - bigramHandler.getBigramMap()); + new UnigramHandler(shortcutHandler.getShortcutMap()); parser.parse(unigrams, unigramHandler); - return unigramHandler.getFinalDictionary(); + final FusionDictionary dict = unigramHandler.getFinalDictionary(); + final HashMap> bigramMap = bigramHandler.getBigramMap(); + for (final String firstWord : bigramMap.keySet()) { + if (!dict.hasWord(firstWord)) continue; + final ArrayList bigramList = bigramMap.get(firstWord); + for (final WeightedString bigram : bigramList) { + if (!dict.hasWord(bigram.mWord)) continue; + dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency); + } + } + return dict; } /** diff --git a/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java b/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java index 191eb804d..24042f120 100644 --- a/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java +++ b/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java @@ -43,11 +43,11 @@ public class BinaryDictInputOutputTest extends TestCase { final FusionDictionary dict = new FusionDictionary(new Node(), new DictionaryOptions(new HashMap(), false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */)); - dict.add("foo", 1, null, null); - dict.add("fta", 1, null, null); - dict.add("ftb", 1, null, null); - dict.add("bar", 1, null, null); - dict.add("fool", 1, null, null); + dict.add("foo", 1, null); + dict.add("fta", 1, null); + dict.add("ftb", 1, 
null); + dict.add("bar", 1, null); + dict.add("fool", 1, null); final ArrayList result = BinaryDictInputOutput.flattenTree(dict.mRoot); assertEquals(4, result.size()); while (!result.isEmpty()) { -- cgit v1.2.3-83-g751a From 20a6dea1cabfd8822824f7dca828d898e5b91cbc Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Wed, 25 Apr 2012 18:49:31 +0900 Subject: Add a flag for bigram presence in the header This is a cherry-pick of Icb602762 onto jb-dev. Bug: 6355745 Change-Id: Icb602762bb0d81472f024fa491571062ec1fc4e9 --- .../latin/makedict/BinaryDictInputOutput.java | 10 +++++--- .../latin/makedict/FusionDictionary.java | 28 +++++++++++++++++++++- 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 88da7b0d8..d82d503c4 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -131,6 +131,7 @@ public class BinaryDictInputOutput { // These options need to be the same numeric values as the one in the native reading code. private static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; private static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; + private static final int CONTAINS_BIGRAMS_FLAG = 0x8; // TODO: Make this value adaptative to content data, store it in the header, and // use it in the reading code. @@ -752,9 +753,12 @@ public class BinaryDictInputOutput { /** * Makes the 2-byte value for options flags. 
*/ - private static final int makeOptionsValue(final DictionaryOptions options) { + private static final int makeOptionsValue(final FusionDictionary dictionary) { + final DictionaryOptions options = dictionary.mOptions; + final boolean hasBigrams = dictionary.hasBigrams(); return (options.mFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0) - + (options.mGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0); + + (options.mGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0) + + (hasBigrams ? CONTAINS_BIGRAMS_FLAG : 0); } /** @@ -970,7 +974,7 @@ public class BinaryDictInputOutput { headerBuffer.write((byte) (0xFF & version)); } // Options flags - final int options = makeOptionsValue(dict.mOptions); + final int options = makeOptionsValue(dict); headerBuffer.write((byte) (0xFF & (options >> 8))); headerBuffer.write((byte) (0xFF & options)); if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index c293b2ba4..b08702e47 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -563,7 +563,7 @@ public class FusionDictionary implements Iterable { * Recursively count the number of nodes in a given branch of the trie. * * @param node the node to count. - * @result the number of nodes in this branch. + * @return the number of nodes in this branch. */ public static int countNodes(final Node node) { int size = 1; @@ -575,6 +575,32 @@ public class FusionDictionary implements Iterable { return size; } + // Recursively find out whether there are any bigrams. 
+ // This can be pretty expensive especially if there aren't any (we return as soon + // as we find one, so it's much cheaper if there are bigrams) + private static boolean hasBigramsInternal(final Node node) { + if (null == node) return false; + for (int i = node.mData.size() - 1; i >= 0; --i) { + CharGroup group = node.mData.get(i); + if (null != group.mBigrams) return true; + if (hasBigramsInternal(group.mChildren)) return true; + } + return false; + } + + /** + * Finds out whether there are any bigrams in this dictionary. + * + * @return true if there is any bigram, false otherwise. + */ + // TODO: this is expensive especially for large dictionaries without any bigram. + // The up side is, this is always accurate and correct and uses no memory. We should + // find a more efficient way of doing this, without compromising too much on memory + // and ease of use. + public boolean hasBigrams() { + return hasBigramsInternal(mRoot); + } + // Historically, the tails of the words were going to be merged to save space. // However, that would prevent the code to search for a specific address in log(n) // time so this was abandoned. -- cgit v1.2.3-83-g751a From 4455fe2c894f8aabaf2b3105b72f9193226d4aba Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Fri, 11 May 2012 19:28:05 +0900 Subject: Refactor a method Rename it, rename parameters, and add a parameter that will be necessary soon. Also, rescale the bigram frequency as necessary. 
Bug: 6313806 Change-Id: I192543cfb6ab6bccda4a1a53c8e67fbf50a257b0 --- .../latin/makedict/BinaryDictInputOutput.java | 24 +++++++++++++--------- .../latin/makedict/XmlDictInputOutput.java | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index d82d503c4..4845a33a5 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -722,15 +722,16 @@ public class BinaryDictInputOutput { } /** - * Makes the flag value for an attribute. + * Makes the flag value for a bigram. * - * @param more whether there are more attributes after this one. - * @param offset the offset of the attribute. - * @param frequency the frequency of the attribute, 0..15 + * @param more whether there are more bigrams after this one. + * @param offset the offset of the bigram. + * @param bigramFrequency the frequency of the bigram, 0..15. + * @param unigramFrequency the unigram frequency of the same word. * @return the flags */ - private static final int makeAttributeFlags(final boolean more, final int offset, - final int frequency) { + private static final int makeBigramFlags(final boolean more, final int offset, + final int bigramFrequency, final int unigramFrequency) { int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) + (offset < 0 ? 
FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0); switch (getByteSize(offset)) { @@ -746,7 +747,7 @@ public class BinaryDictInputOutput { default: throw new RuntimeException("Strange offset size"); } - bigramFlags += frequency & FLAG_ATTRIBUTE_FREQUENCY; + bigramFlags += bigramFrequency & FLAG_ATTRIBUTE_FREQUENCY; return bigramFlags; } @@ -854,11 +855,14 @@ public class BinaryDictInputOutput { final Iterator bigramIterator = group.mBigrams.iterator(); while (bigramIterator.hasNext()) { final WeightedString bigram = (WeightedString)bigramIterator.next(); - final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); + final CharGroup target = + FusionDictionary.findWordInTree(dict.mRoot, bigram.mWord); + final int addressOfBigram = target.mCachedAddress; + final int unigramFrequencyForThisWord = target.mFrequency; ++groupAddress; final int offset = addressOfBigram - groupAddress; - int bigramFlags = makeAttributeFlags(bigramIterator.hasNext(), offset, - bigram.mFrequency); + int bigramFlags = makeBigramFlags(bigramIterator.hasNext(), offset, + bigram.mFrequency, unigramFrequencyForThisWord); buffer[index++] = (byte)bigramFlags; final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset)); index += bigramShift; diff --git a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java index d86719a1d..52f124dfb 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java +++ b/tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java @@ -154,7 +154,7 @@ public class XmlDictInputOutput { // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX private final static int XML_MAX = 256; // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX - private final static int MEMORY_MAX = 16; + private final static int MEMORY_MAX = 256; private final static 
int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX; private String mSrc; -- cgit v1.2.3-83-g751a From f7346de94a902b0d0675a85425e68eba96cece7e Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Fri, 11 May 2012 19:58:21 +0900 Subject: Write the bigram frequency following the new formula This also tests for bigram frequency against unigram frequency Bug: 6313806 Bug: 6028348 Change-Id: If7faa3559fee9f2496890f0bc0e081279e100854 --- .../latin/makedict/BinaryDictInputOutput.java | 26 +++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 4845a33a5..3c818cc56 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -174,6 +174,7 @@ public class BinaryDictInputOutput { private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767 private static final int MAX_TERMINAL_FREQUENCY = 255; + private static final int MAX_BIGRAM_FREQUENCY = 15; // Arbitrary limit to how much passes we consider address size compression should // terminate in. At the time of this writing, our largest dictionary completes @@ -726,12 +727,13 @@ public class BinaryDictInputOutput { * * @param more whether there are more bigrams after this one. * @param offset the offset of the bigram. - * @param bigramFrequency the frequency of the bigram, 0..15. - * @param unigramFrequency the unigram frequency of the same word. + * @param bigramFrequency the frequency of the bigram, 0..255. + * @param unigramFrequency the unigram frequency of the same word, 0..255. 
+ * @param word the second bigram, for debugging purposes * @return the flags */ private static final int makeBigramFlags(final boolean more, final int offset, - final int bigramFrequency, final int unigramFrequency) { + int bigramFrequency, final int unigramFrequency, final String word) { int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) + (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0); switch (getByteSize(offset)) { @@ -747,7 +749,21 @@ public class BinaryDictInputOutput { default: throw new RuntimeException("Strange offset size"); } - bigramFlags += bigramFrequency & FLAG_ATTRIBUTE_FREQUENCY; + if (unigramFrequency > bigramFrequency) { + MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word + + "\". Bigram freq is " + bigramFrequency + ", unigram freq for " + + word + " is " + unigramFrequency); + bigramFrequency = unigramFrequency; + } + // We compute the difference between 255 (which means probability = 1) and the + // unigram score. We split this into discrete 16 steps, and this is the value + // we store into the 4 bits of the bigrams frequency. + final float bigramRatio = (float)(bigramFrequency - unigramFrequency) + / (MAX_TERMINAL_FREQUENCY - unigramFrequency); + // TODO: if the bigram freq is very close to the unigram frequency, we don't want + // to include the bigram in the binary dictionary at all. 
+ final int discretizedFrequency = Math.round(bigramRatio * MAX_BIGRAM_FREQUENCY); + bigramFlags += discretizedFrequency & FLAG_ATTRIBUTE_FREQUENCY; return bigramFlags; } @@ -862,7 +878,7 @@ public class BinaryDictInputOutput { ++groupAddress; final int offset = addressOfBigram - groupAddress; int bigramFlags = makeBigramFlags(bigramIterator.hasNext(), offset, - bigram.mFrequency, unigramFrequencyForThisWord); + bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord); buffer[index++] = (byte)bigramFlags; final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset)); index += bigramShift; -- cgit v1.2.3-83-g751a From 3b1b72ac4d8975d24a3176dd1b5a39b5fead71a8 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Fri, 11 May 2012 22:56:50 +0900 Subject: More optimizations We don't merge tails anyway, and we can't do it any more because that would break the bigram lookup algorithm. The speedup is about 20%, and possibly double this if there are no bigrams. Bug: 6394357 Change-Id: I9eec11dda9000451706d280f120404a2acbea304 --- .../inputmethod/latin/makedict/BinaryDictInputOutput.java | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 3c818cc56..bb1042324 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -489,10 +489,17 @@ public class BinaryDictInputOutput { // Merging tails can only be done if there are no attributes. Searching for attributes // in LatinIME code depends on a total breadth-first ordering, which merging tails // breaks. 
If there are no attributes, it should be fine (and reduce the file size) - // to merge tails, and the following step would be necessary. - // If eventually the code runs on Android, searching through the whole array each time - // may be a performance concern. - list.remove(node); + // to merge tails, and removing the node from the list would be necessary. However, + // we don't merge tails because breaking the breadth-first ordering would result in + // extreme overhead at bigram lookup time (it would make the search function O(n) instead + // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty + // high). + // If no nodes are ever merged, we can't have the same node twice in the list, hence + // searching for duplicates in unnecessary. It is also very performance consuming, + // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making + // this simple list.remove operation O(n*n) overall. On Android this overhead is very + // high. 
+ // For future reference, the code to remove duplicate is a simple : list.remove(node); list.add(node); final ArrayList branches = node.mData; final int nodeSize = branches.size(); -- cgit v1.2.3-83-g751a From 4df5b43df8f4b29fbfab9180cffe5742f8b5f512 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Fri, 11 May 2012 23:03:01 +0900 Subject: Small optimizations Bug: 6394357 Change-Id: I00ba1b5ab3d527b3768e28090c758ddd1629f281 --- .../inputmethod/latin/makedict/BinaryDictInputOutput.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index bb1042324..522573d07 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -40,6 +40,8 @@ import java.util.TreeMap; */ public class BinaryDictInputOutput { + final static boolean DBG = MakedictLog.DBG; + /* Node layout is as follows: * | addressType xx : mask with MASK_GROUP_ADDRESS_TYPE * 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS @@ -715,13 +717,13 @@ public class BinaryDictInputOutput { } } if (null != group.mShortcutTargets) { - if (0 == group.mShortcutTargets.size()) { + if (DBG && 0 == group.mShortcutTargets.size()) { throw new RuntimeException("0-sized shortcut list must be null"); } flags |= FLAG_HAS_SHORTCUT_TARGETS; } if (null != group.mBigrams) { - if (0 == group.mBigrams.size()) { + if (DBG && 0 == group.mBigrams.size()) { throw new RuntimeException("0-sized bigram list must be null"); } flags |= FLAG_HAS_BIGRAMS; @@ -830,7 +832,7 @@ public class BinaryDictInputOutput { + index + " <> " + group.mCachedAddress); groupAddress += GROUP_FLAGS_SIZE + getGroupCharactersSize(group); // Sanity checks. 
- if (group.mFrequency > MAX_TERMINAL_FREQUENCY) { + if (DBG && group.mFrequency > MAX_TERMINAL_FREQUENCY) { throw new RuntimeException("A node has a frequency > " + MAX_TERMINAL_FREQUENCY + " : " + group.mFrequency); } @@ -1037,7 +1039,7 @@ public class BinaryDictInputOutput { MakedictLog.i("Computing addresses..."); computeAddresses(dict, flatNodes); MakedictLog.i("Checking array..."); - checkFlatNodeArray(flatNodes); + if (DBG) checkFlatNodeArray(flatNodes); // Create a buffer that matches the final dictionary size. final Node lastNode = flatNodes.get(flatNodes.size() - 1); -- cgit v1.2.3-83-g751a From 76319c6931becbe2994226a0e52925fc77bd0c92 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Fri, 11 May 2012 23:04:24 +0900 Subject: Small optimization Performance gain is < 2% Bug: 6394357 Change-Id: I2b7da946788cf11d1a491efd20fb2bd2333c23d1 --- .../com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 522573d07..830fbf07e 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -1053,7 +1053,7 @@ public class BinaryDictInputOutput { dataEndOffset = writePlacedNode(dict, buffer, n); } - showStatistics(flatNodes); + if (DBG) showStatistics(flatNodes); destination.write(buffer, 0, dataEndOffset); -- cgit v1.2.3-83-g751a From 418b34379733aa7f3d31729090797c747c8a43a8 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Fri, 11 May 2012 21:49:55 +0900 Subject: Use a formula packing more information into 4 bits field Bug: 6313806 Change-Id: Id0779bd69afae0bb4a4a285340c1eb306544663a --- .../latin/makedict/BinaryDictInputOutput.java | 41 
+++++++++++++++++----- 1 file changed, 33 insertions(+), 8 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 830fbf07e..563f8a99b 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -765,14 +765,39 @@ public class BinaryDictInputOutput { bigramFrequency = unigramFrequency; } // We compute the difference between 255 (which means probability = 1) and the - // unigram score. We split this into discrete 16 steps, and this is the value - // we store into the 4 bits of the bigrams frequency. - final float bigramRatio = (float)(bigramFrequency - unigramFrequency) - / (MAX_TERMINAL_FREQUENCY - unigramFrequency); - // TODO: if the bigram freq is very close to the unigram frequency, we don't want - // to include the bigram in the binary dictionary at all. - final int discretizedFrequency = Math.round(bigramRatio * MAX_BIGRAM_FREQUENCY); - bigramFlags += discretizedFrequency & FLAG_ATTRIBUTE_FREQUENCY; + // unigram score. We split this into a number of discrete steps. + // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15 + // represents an increase of 16 steps: a value of 15 will be interpreted as the median + // value of the 16th step. In all justice, if the bigram frequency is low enough to be + // rounded below the first step (which means it is less than half a step higher than the + // unigram frequency) then the unigram frequency itself is the best approximation of the + // bigram freq that we could possibly supply, hence we should *not* include this bigram + // in the file at all. + // until this is done, we'll write 0 and slightly overestimate this case. 
+ // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step + // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to + // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the + // step size. Then we compute the start of the first step (the one where value 0 starts) + // by adding half-a-step to the unigramFrequency. From there, we compute the integer + // number of steps to the bigramFrequency. One last thing: we want our steps to include + // their lower bound and exclude their higher bound so we need to have the first step + // start at exactly 1 unit higher than floor(unigramFreq + half a step). + // Note : to reconstruct the score, the dictionary reader will need to divide + // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise, and add + // (discretizedFrequency + 0.5) times this value to get the median value of the step, + // which is the best approximation. This is how we get the most precise result with + // only four bits. + final double stepSize = + (double)(MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5 + MAX_BIGRAM_FREQUENCY); + final double firstStepStart = 1 + unigramFrequency + (stepSize / 2.0); + final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize); + // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1 + // here. The best approximation would be the unigram freq itself, so we should not + // include this bigram in the dictionary. For now, register as 0, and live with the + // small over-estimation that we get in this case. TODO: actually remove this bigram + // if discretizedFrequency < 0. + final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0; + bigramFlags += finalBigramFrequency & FLAG_ATTRIBUTE_FREQUENCY; return bigramFlags; } -- cgit v1.2.3-83-g751a From 93ebf74bae44728e0d5f7e738ea28376187a876e Mon Sep 17 00:00:00 2001 From: "Tadashi G. 
Takaoka" Date: Fri, 25 May 2012 19:04:54 +0900 Subject: Clean up some compiler warnings Change-Id: I604da15e65fc3cf807ec4033df4e4cd5ef0196fc --- java/src/com/android/inputmethod/keyboard/PointerTracker.java | 6 ------ java/src/com/android/inputmethod/latin/ContactsDictionary.java | 1 + java/src/com/android/inputmethod/latin/Dictionary.java | 2 +- java/src/com/android/inputmethod/latin/DictionaryFactory.java | 1 - java/src/com/android/inputmethod/latin/ExpandableDictionary.java | 3 ++- java/src/com/android/inputmethod/latin/ResearchLogger.java | 2 +- java/src/com/android/inputmethod/latin/UserDictionary.java | 1 + .../android/inputmethod/latin/makedict/BinaryDictInputOutput.java | 8 ++++---- .../com/android/inputmethod/latin/makedict/FusionDictionary.java | 8 +++++--- .../inputmethod/latin/spellcheck/AndroidSpellCheckerService.java | 4 ---- 10 files changed, 15 insertions(+), 21 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/keyboard/PointerTracker.java b/java/src/com/android/inputmethod/keyboard/PointerTracker.java index 6ad854d1b..59f53fc21 100644 --- a/java/src/com/android/inputmethod/keyboard/PointerTracker.java +++ b/java/src/com/android/inputmethod/keyboard/PointerTracker.java @@ -452,12 +452,6 @@ public class PointerTracker { return newKey; } - private Key onUpKey(int x, int y, long eventTime) { - mUpTime = eventTime; - mCurrentKey = null; - return onMoveKeyInternal(x, y); - } - public void processMotionEvent(int action, int x, int y, long eventTime, KeyEventHandler handler) { switch (action) { diff --git a/java/src/com/android/inputmethod/latin/ContactsDictionary.java b/java/src/com/android/inputmethod/latin/ContactsDictionary.java index 2f3395245..c9b8d6eb1 100644 --- a/java/src/com/android/inputmethod/latin/ContactsDictionary.java +++ b/java/src/com/android/inputmethod/latin/ContactsDictionary.java @@ -34,6 +34,7 @@ import 
com.android.inputmethod.keyboard.Keyboard; * * @deprecated Use {@link ContactsBinaryDictionary}. */ +@Deprecated public class ContactsDictionary extends ExpandableDictionary { private static final String[] PROJECTION = { diff --git a/java/src/com/android/inputmethod/latin/Dictionary.java b/java/src/com/android/inputmethod/latin/Dictionary.java index 1ec678f7f..231e9ab81 100644 --- a/java/src/com/android/inputmethod/latin/Dictionary.java +++ b/java/src/com/android/inputmethod/latin/Dictionary.java @@ -33,7 +33,7 @@ public abstract class Dictionary { /** * Interface to be implemented by classes requesting words to be fetched from the dictionary. - * @see #getWords(WordComposer, WordCallback, ProximityInfo) + * @see #getWords(WordComposer, CharSequence, WordCallback, ProximityInfo) */ public interface WordCallback { /** diff --git a/java/src/com/android/inputmethod/latin/DictionaryFactory.java b/java/src/com/android/inputmethod/latin/DictionaryFactory.java index 4cd1b3883..a22d73af7 100644 --- a/java/src/com/android/inputmethod/latin/DictionaryFactory.java +++ b/java/src/com/android/inputmethod/latin/DictionaryFactory.java @@ -89,7 +89,6 @@ public class DictionaryFactory { /** * Initializes a dictionary from a raw resource file * @param context application context for reading resources - * @param resId the resource containing the raw binary dictionary * @param locale the locale to use for the resource * @return an initialized instance of BinaryDictionary */ diff --git a/java/src/com/android/inputmethod/latin/ExpandableDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableDictionary.java index dd9c57e0c..6c457afd2 100644 --- a/java/src/com/android/inputmethod/latin/ExpandableDictionary.java +++ b/java/src/com/android/inputmethod/latin/ExpandableDictionary.java @@ -309,7 +309,8 @@ public class ExpandableDictionary extends Dictionary { * @param word the word to insert, as an array of code points * @param depth the depth of the node in the tree * @param 
finalFreq the frequency for this word - * @return whether there is still space for more words. {@see Dictionary.WordCallback#addWord}. + * @return whether there is still space for more words. + * @see Dictionary.WordCallback#addWord(char[], int, int, int, int, int) */ private boolean addWordAndShortcutsFromNode(final Node node, final char[] word, final int depth, final int finalFreq, final WordCallback callback) { diff --git a/java/src/com/android/inputmethod/latin/ResearchLogger.java b/java/src/com/android/inputmethod/latin/ResearchLogger.java index aa979a66f..66d6d58b1 100644 --- a/java/src/com/android/inputmethod/latin/ResearchLogger.java +++ b/java/src/com/android/inputmethod/latin/ResearchLogger.java @@ -54,7 +54,7 @@ import java.util.Map; * This class logs operations on the IME keyboard, including what the user has typed. * Data is stored locally in a file in app-specific storage. * - * This functionality is off by default. See {@link ProductionFlag.IS_EXPERIMENTAL}. + * This functionality is off by default. See {@link ProductionFlag#IS_EXPERIMENTAL}. */ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChangeListener { private static final String TAG = ResearchLogger.class.getSimpleName(); diff --git a/java/src/com/android/inputmethod/latin/UserDictionary.java b/java/src/com/android/inputmethod/latin/UserDictionary.java index 81e2fdce4..c1efadd44 100644 --- a/java/src/com/android/inputmethod/latin/UserDictionary.java +++ b/java/src/com/android/inputmethod/latin/UserDictionary.java @@ -35,6 +35,7 @@ import java.util.Arrays; * * @deprecated Use {@link UserBinaryDictionary}. 
*/ +@Deprecated public class UserDictionary extends ExpandableDictionary { // TODO: use Words.SHORTCUT when it's public in the SDK diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 563f8a99b..89c59f809 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -882,9 +882,9 @@ public class BinaryDictInputOutput { final int indexOfShortcutByteSize = index; index += GROUP_SHORTCUT_LIST_SIZE_SIZE; groupAddress += GROUP_SHORTCUT_LIST_SIZE_SIZE; - final Iterator shortcutIterator = group.mShortcutTargets.iterator(); + final Iterator shortcutIterator = group.mShortcutTargets.iterator(); while (shortcutIterator.hasNext()) { - final WeightedString target = (WeightedString)shortcutIterator.next(); + final WeightedString target = shortcutIterator.next(); ++groupAddress; int shortcutFlags = makeShortcutFlags(shortcutIterator.hasNext(), target.mFrequency); @@ -902,9 +902,9 @@ public class BinaryDictInputOutput { } // Write bigrams if (null != group.mBigrams) { - final Iterator bigramIterator = group.mBigrams.iterator(); + final Iterator bigramIterator = group.mBigrams.iterator(); while (bigramIterator.hasNext()) { - final WeightedString bigram = (WeightedString)bigramIterator.next(); + final WeightedString bigram = bigramIterator.next(); final CharGroup target = FusionDictionary.findWordInTree(dict.mRoot, bigram.mWord); final int addressOfBigram = target.mCachedAddress; diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index c467ef7d4..8b53c9427 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -296,7 +296,6 @@ public class 
FusionDictionary implements Iterable { * @param word the word to add. * @param frequency the frequency of the word, in the range [0..255]. * @param shortcutTargets a list of shortcut targets for this word, or null. - * @param bigrams a list of bigrams, or null. */ public void add(final String word, final int frequency, final ArrayList shortcutTargets) { @@ -435,6 +434,8 @@ public class FusionDictionary implements Iterable { } } + private static int ARRAYS_ARE_EQUAL = 0; + /** * Custom comparison of two int arrays taken to contain character codes. * @@ -450,7 +451,6 @@ public class FusionDictionary implements Iterable { * @param dstOffset the offset in the right-hand side string. * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. */ - private static int ARRAYS_ARE_EQUAL = 0; private static int compareArrays(final int[] src, final int[] dst, int dstOffset) { // We do NOT test the first char, because we come from a method that already // tested it. @@ -469,6 +469,7 @@ public class FusionDictionary implements Iterable { * This comparator imposes orderings that are inconsistent with equals. */ static private class CharGroupComparator implements java.util.Comparator { + @Override public int compare(CharGroup c1, CharGroup c2) { if (c1.mChars[0] == c2.mChars[0]) return 0; return c1.mChars[0] < c2.mChars[0] ? -1 : 1; @@ -487,6 +488,8 @@ public class FusionDictionary implements Iterable { return result >= 0 ? result : -result - 1; } + private static int CHARACTER_NOT_FOUND = -1; + /** * Find the index of a char in a node, if it exists. * @@ -494,7 +497,6 @@ public class FusionDictionary implements Iterable { * @param character the character to search for. * @return the position of the character if it's there, or CHARACTER_NOT_FOUND = -1 else. 
*/ - private static int CHARACTER_NOT_FOUND = -1; private static int findIndexOfChar(final Node node, int character) { final int insertionIndex = findInsertionIndex(node, character); if (node.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND; diff --git a/java/src/com/android/inputmethod/latin/spellcheck/AndroidSpellCheckerService.java b/java/src/com/android/inputmethod/latin/spellcheck/AndroidSpellCheckerService.java index aa3250185..0e3bf8011 100644 --- a/java/src/com/android/inputmethod/latin/spellcheck/AndroidSpellCheckerService.java +++ b/java/src/com/android/inputmethod/latin/spellcheck/AndroidSpellCheckerService.java @@ -499,10 +499,6 @@ public class AndroidSpellCheckerService extends SpellCheckerService } mUnigramSuggestionsInfoCache.put(query, new SuggestionsParams(suggestions, flags)); } - - public void remove(String key) { - mUnigramSuggestionsInfoCache.remove(key); - } } AndroidSpellCheckerSession(final AndroidSpellCheckerService service) { -- cgit v1.2.3-83-g751a From 7214617622fce8f3fea6620e782c16336260a2a3 Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Fri, 8 Jun 2012 16:00:02 +0900 Subject: Remove a slew of Eclipse warnings. 
Change-Id: I03236386aea13fbd4fb8eaeee18e0008aa136502 --- java/src/com/android/inputmethod/keyboard/KeyboardView.java | 1 + java/src/com/android/inputmethod/keyboard/internal/KeyboardState.java | 2 -- java/src/com/android/inputmethod/latin/ExpandableDictionary.java | 4 +++- java/src/com/android/inputmethod/latin/LatinIME.java | 1 + java/src/com/android/inputmethod/latin/ResearchLogger.java | 4 ++-- java/src/com/android/inputmethod/latin/SettingsValues.java | 1 - java/src/com/android/inputmethod/latin/UserHistoryDictionary.java | 2 +- .../android/inputmethod/latin/UserHistoryForgettingCurveUtils.java | 4 ++-- java/src/com/android/inputmethod/latin/Utils.java | 2 -- .../com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java | 2 +- 10 files changed, 11 insertions(+), 12 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/keyboard/KeyboardView.java b/java/src/com/android/inputmethod/keyboard/KeyboardView.java index 51a0f537f..18e01fb49 100644 --- a/java/src/com/android/inputmethod/keyboard/KeyboardView.java +++ b/java/src/com/android/inputmethod/keyboard/KeyboardView.java @@ -873,6 +873,7 @@ public class KeyboardView extends View implements PointerTracker.DrawingProxy { keyPreview, ViewLayoutUtils.newLayoutParam(mPreviewPlacer, 0, 0)); } + @SuppressWarnings("deprecation") // setBackgroundDrawable is replaced by setBackground in API16 @Override public void showKeyPreview(PointerTracker tracker) { if (!mShowKeyPreviewPopup) return; diff --git a/java/src/com/android/inputmethod/keyboard/internal/KeyboardState.java b/java/src/com/android/inputmethod/keyboard/internal/KeyboardState.java index 5aa9a0887..4ab6832c3 100644 --- a/java/src/com/android/inputmethod/keyboard/internal/KeyboardState.java +++ b/java/src/com/android/inputmethod/keyboard/internal/KeyboardState.java @@ -21,8 +21,6 @@ import android.util.Log; import com.android.inputmethod.keyboard.Keyboard; import 
com.android.inputmethod.latin.Constants; -import com.android.inputmethod.latin.ResearchLogger; -import com.android.inputmethod.latin.define.ProductionFlag; /** * Keyboard state machine. diff --git a/java/src/com/android/inputmethod/latin/ExpandableDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableDictionary.java index 34a92fd30..4a5471c85 100644 --- a/java/src/com/android/inputmethod/latin/ExpandableDictionary.java +++ b/java/src/com/android/inputmethod/latin/ExpandableDictionary.java @@ -514,8 +514,10 @@ public class ExpandableDictionary extends Dictionary { /** * Adds bigrams to the in-memory trie structure that is being used to retrieve any word + * @param word1 the first word of this bigram + * @param word2 the second word of this bigram * @param frequency frequency for this bigram - * @param addFrequency if true, it adds to current frequency, else it overwrites the old value + * @param fcp an instance of ForgettingCurveParams to use for decay policy * @return returns the final bigram frequency */ private int setBigramAndGetFrequency( diff --git a/java/src/com/android/inputmethod/latin/LatinIME.java b/java/src/com/android/inputmethod/latin/LatinIME.java index ae9e197a1..f5025e54a 100644 --- a/java/src/com/android/inputmethod/latin/LatinIME.java +++ b/java/src/com/android/inputmethod/latin/LatinIME.java @@ -747,6 +747,7 @@ public class LatinIME extends InputMethodService implements KeyboardActionListen if (TRACE) Debug.startMethodTracing("/data/trace/latinime"); } + @Override public void onTargetApplicationKnown(final ApplicationInfo info) { mTargetApplicationInfo = info; } diff --git a/java/src/com/android/inputmethod/latin/ResearchLogger.java b/java/src/com/android/inputmethod/latin/ResearchLogger.java index bb003f766..a7e7738d8 100644 --- a/java/src/com/android/inputmethod/latin/ResearchLogger.java +++ b/java/src/com/android/inputmethod/latin/ResearchLogger.java @@ -101,13 +101,13 @@ public class ResearchLogger implements 
SharedPreferences.OnSharedPreferenceChang private static class NullOutputStream extends OutputStream { /** {@inheritDoc} */ @Override - public void write(byte[] buffer, int offset, int count) throws IOException { + public void write(byte[] buffer, int offset, int count) { // nop } /** {@inheritDoc} */ @Override - public void write(byte[] buffer) throws IOException { + public void write(byte[] buffer) { // nop } diff --git a/java/src/com/android/inputmethod/latin/SettingsValues.java b/java/src/com/android/inputmethod/latin/SettingsValues.java index 4aae6a85e..dfe207cf2 100644 --- a/java/src/com/android/inputmethod/latin/SettingsValues.java +++ b/java/src/com/android/inputmethod/latin/SettingsValues.java @@ -29,7 +29,6 @@ import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; -import java.util.Map; /** * When you call the constructor of this class, you may want to change the current system locale by diff --git a/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java b/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java index 9c54e0b81..10f92d29e 100644 --- a/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java +++ b/java/src/com/android/inputmethod/latin/UserHistoryDictionary.java @@ -440,7 +440,7 @@ public class UserHistoryDictionary extends ExpandableDictionary { if (nw != null) { final ForgettingCurveParams fcp = nw.getFcParams(); final byte prevFc = word1Bigrams.get(word2); - final byte fc = (byte)fcp.getFc(); + final byte fc = fcp.getFc(); final boolean isValid = fcp.isValid(); if (prevFc > 0 && prevFc == fc) { // No need to update since we found no changes for this entry. 
diff --git a/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java b/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java index e5516dc62..3ae1bd336 100644 --- a/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java +++ b/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java @@ -50,7 +50,7 @@ public class UserHistoryForgettingCurveUtils { } private ForgettingCurveParams(long now, boolean isValid) { - this((int)pushCount((byte)0, isValid), now, now, isValid); + this(pushCount((byte)0, isValid), now, now, isValid); } /** This constructor is called when the user history bigram dictionary is being restored. */ @@ -201,7 +201,7 @@ public class UserHistoryForgettingCurveUtils { for (int i = 0; i < FC_LEVEL_MAX; ++i) { final double initialFreq; if (i >= 2) { - initialFreq = (double)FC_FREQ_MAX; + initialFreq = FC_FREQ_MAX; } else if (i == 1) { initialFreq = (double)FC_FREQ_MAX / 2; } else if (i == 0) { diff --git a/java/src/com/android/inputmethod/latin/Utils.java b/java/src/com/android/inputmethod/latin/Utils.java index 4178955bc..903b5a357 100644 --- a/java/src/com/android/inputmethod/latin/Utils.java +++ b/java/src/com/android/inputmethod/latin/Utils.java @@ -44,10 +44,8 @@ import java.io.IOException; import java.io.PrintWriter; import java.nio.channels.FileChannel; import java.text.SimpleDateFormat; -import java.util.Collections; import java.util.Date; import java.util.HashMap; -import java.util.Map; public class Utils { private Utils() { diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 89c59f809..0c5d41a5c 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -788,7 +788,7 @@ public class BinaryDictInputOutput { // which is the best 
approximation. This is how we get the most precise result with // only four bits. final double stepSize = - (double)(MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5 + MAX_BIGRAM_FREQUENCY); + (MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5 + MAX_BIGRAM_FREQUENCY); final double firstStepStart = 1 + unigramFrequency + (stepSize / 2.0); final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize); // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1 -- cgit v1.2.3-83-g751a From d10c473347c7e21c383c56786c9eb96fd6513a5c Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Fri, 8 Jun 2012 17:05:28 +0900 Subject: Small performance tweak Change-Id: Icd540742073d49d12e70b2d8bd99aaf7ccb5802d --- .../inputmethod/latin/UserHistoryForgettingCurveUtils.java | 10 +++++----- .../inputmethod/latin/makedict/BinaryDictInputOutput.java | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java') diff --git a/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java b/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java index 3ae1bd336..6e71885cc 100644 --- a/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java +++ b/java/src/com/android/inputmethod/latin/UserHistoryForgettingCurveUtils.java @@ -199,20 +199,20 @@ public class UserHistoryForgettingCurveUtils { public static final int[][] SCORE_TABLE = new int[FC_LEVEL_MAX][ELAPSED_TIME_MAX + 1]; static { for (int i = 0; i < FC_LEVEL_MAX; ++i) { - final double initialFreq; + final float initialFreq; if (i >= 2) { initialFreq = FC_FREQ_MAX; } else if (i == 1) { - initialFreq = (double)FC_FREQ_MAX / 2; + initialFreq = FC_FREQ_MAX / 2; } else if (i == 0) { - initialFreq = (double)FC_FREQ_MAX / 4; + initialFreq = FC_FREQ_MAX / 4; } else { continue; } for (int j = 0; j < ELAPSED_TIME_MAX; ++j) { - final double elapsedHour = j * 
ELAPSED_TIME_INTERVAL_HOURS; + final float elapsedHours = j * ELAPSED_TIME_INTERVAL_HOURS; final double freq = - initialFreq * Math.pow(initialFreq, elapsedHour / HALF_LIFE_HOURS); + initialFreq * Math.pow(initialFreq, elapsedHours / HALF_LIFE_HOURS); final int intFreq = Math.min(FC_FREQ_MAX, Math.max(0, (int)freq)); SCORE_TABLE[i][j] = intFreq; } diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index 0c5d41a5c..2c3eee74c 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -787,9 +787,9 @@ public class BinaryDictInputOutput { // (discretizedFrequency + 0.5) times this value to get the median value of the step, // which is the best approximation. This is how we get the most precise result with // only four bits. - final double stepSize = - (MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5 + MAX_BIGRAM_FREQUENCY); - final double firstStepStart = 1 + unigramFrequency + (stepSize / 2.0); + final float stepSize = + (MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5f + MAX_BIGRAM_FREQUENCY); + final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f); final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize); // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1 // here. The best approximation would be the unigram freq itself, so we should not -- cgit v1.2.3-83-g751a