diff options
Diffstat (limited to 'tools')
8 files changed, 426 insertions, 109 deletions
diff --git a/tools/Android.mk b/tools/Android.mk index 8f1acc55a..91b2fbbb0 100644 --- a/tools/Android.mk +++ b/tools/Android.mk @@ -12,6 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -LOCAL_PATH := $(call my-dir) - -include $(call all-makefiles-under,$(LOCAL_PATH)) +include $(call all-subdir-makefiles) diff --git a/tools/makedict/src/com/android/inputmethod/latin/BinaryDictInputOutput.java b/tools/makedict/src/com/android/inputmethod/latin/BinaryDictInputOutput.java index 92f402d3e..7aadc677b 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/BinaryDictInputOutput.java +++ b/tools/makedict/src/com/android/inputmethod/latin/BinaryDictInputOutput.java @@ -26,6 +26,7 @@ import java.io.OutputStream; import java.io.RandomAccessFile; import java.util.ArrayList; import java.util.Arrays; +import java.util.Iterator; import java.util.Map; import java.util.TreeMap; @@ -44,8 +45,9 @@ public class BinaryDictInputOutput { * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES * g | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL - * | reserved 1 bit, 1 = yes, 0 = no + * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS + * | is shortcut only ? 1 bit, 1 = yes, 0 = no : FLAG_IS_SHORTCUT_ONLY * * c | IF FLAG_HAS_MULTIPLE_CHARS * h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers @@ -71,6 +73,8 @@ public class BinaryDictInputOutput { * d * dress * + * | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS + * | shortcut targets address list * | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS * | bigrams address list * @@ -126,7 +130,9 @@ public class BinaryDictInputOutput { private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20; private static final int FLAG_IS_TERMINAL = 0x10; + private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08; private static final int FLAG_HAS_BIGRAMS = 0x04; + private static final int FLAG_IS_SHORTCUT_ONLY = 0x02; private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; @@ -138,7 +144,6 @@ public class BinaryDictInputOutput { private static final int GROUP_CHARACTERS_TERMINATOR = 0x1F; - private static final int GROUP_COUNT_SIZE = 1; private static final int GROUP_TERMINATOR_SIZE = 1; private static final int GROUP_FLAGS_SIZE = 1; private static final int GROUP_FREQUENCY_SIZE = 1; @@ -149,9 +154,8 @@ public class BinaryDictInputOutput { private static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; private static final int INVALID_CHARACTER = -1; - // Limiting to 127 for upward compatibility - // TODO: implement a scheme to be able to shoot 256 chargroups in a node - private static final int MAX_CHARGROUPS_IN_A_NODE = 127; + private static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127 + private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767 private static final int MAX_TERMINAL_FREQUENCY = 255; @@ -261,6 +265,31 @@ public class BinaryDictInputOutput { } /** + * Compute the binary size of the group count + * @param count the group count + * @return the size of the group count, either 1 or 2 bytes. + */ + private static int getGroupCountSize(final int count) { + if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= count) { + return 1; + } else if (MAX_CHARGROUPS_IN_A_NODE >= count) { + return 2; + } else { + throw new RuntimeException("Can't have more than " + MAX_CHARGROUPS_IN_A_NODE + + " groups in a node (found " + count +")"); + } + } + + /** + * Compute the binary size of the group count for a node + * @param node the node + * @return the size of the group count, either 1 or 2 bytes. + */ + private static int getGroupCountSize(final Node node) { + return getGroupCountSize(node.mData.size()); + } + + /** * Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything. * * @param group the CharGroup to compute the size of. @@ -271,10 +300,13 @@ public class BinaryDictInputOutput { // If terminal, one byte for the frequency if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE; size += GROUP_MAX_ADDRESS_SIZE; // For children address + if (null != group.mShortcutTargets) { + size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE) + * group.mShortcutTargets.size(); + } if (null != group.mBigrams) { - for (WeightedString bigram : group.mBigrams) { - size += GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE; - } + size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE) + * group.mBigrams.size(); } return size; } @@ -286,7 +318,7 @@ public class BinaryDictInputOutput { * @param node the node to compute the maximum size of. */ private static void setNodeMaximumSize(Node node) { - int size = GROUP_COUNT_SIZE; + int size = getGroupCountSize(node); for (CharGroup g : node.mData) { final int groupSize = getCharGroupMaximumSize(g); g.mCachedSize = groupSize; @@ -303,6 +335,13 @@ public class BinaryDictInputOutput { } /** + * Helper method to find out if a character info is a shortcut only. + */ + private static boolean isShortcutOnly(final CharGroupInfo info) { + return 0 != (info.mFlags & FLAG_IS_SHORTCUT_ONLY); + } + + /** * Compute the size, in bytes, that an address will occupy. * * This can be used either for children addresses (which are always positive) or for @@ -378,7 +417,7 @@ public class BinaryDictInputOutput { * @param dict the dictionary in which the word/attributes are to be found. */ private static void computeActualNodeSize(Node node, FusionDictionary dict) { - int size = GROUP_COUNT_SIZE; + int size = getGroupCountSize(node); for (CharGroup group : node.mData) { int groupSize = GROUP_FLAGS_SIZE + getGroupCharactersSize(group); if (group.isTerminal()) groupSize += GROUP_FREQUENCY_SIZE; @@ -387,6 +426,15 @@ public class BinaryDictInputOutput { final int offset = group.mChildren.mCachedAddress - offsetBasePoint; groupSize += getByteSize(offset); } + if (null != group.mShortcutTargets) { + for (WeightedString target : group.mShortcutTargets) { + final int offsetBasePoint = groupSize + node.mCachedAddress + size + + GROUP_FLAGS_SIZE; + final int addressOfTarget = findAddressOfWord(dict, target.mWord); + final int offset = addressOfTarget - offsetBasePoint; + groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE; + } + } if (null != group.mBigrams) { for (WeightedString bigram : group.mBigrams) { final int offsetBasePoint = groupSize + node.mCachedAddress + size @@ -412,12 +460,13 @@ public class BinaryDictInputOutput { int nodeOffset = 0; for (Node n : flatNodes) { n.mCachedAddress = nodeOffset; + int groupCountSize = getGroupCountSize(n); int groupOffset = 0; for (CharGroup g : n.mData) { - g.mCachedAddress = GROUP_COUNT_SIZE + nodeOffset + groupOffset; + g.mCachedAddress = groupCountSize + nodeOffset + groupOffset; groupOffset += g.mCachedSize; } - if (groupOffset + GROUP_COUNT_SIZE != n.mCachedSize) { + if (groupOffset + groupCountSize != n.mCachedSize) { throw new RuntimeException("Bug : Stored and computed node size differ"); } nodeOffset += n.mCachedSize; @@ -545,7 +594,21 @@ public class BinaryDictInputOutput { throw new RuntimeException("Node with a strange address"); } } - if (null != group.mBigrams) flags |= FLAG_HAS_BIGRAMS; + if (null != group.mShortcutTargets) { + if (0 == group.mShortcutTargets.size()) { + throw new RuntimeException("0-sized shortcut list must be null"); + } + flags |= FLAG_HAS_SHORTCUT_TARGETS; + } + if (null != group.mBigrams) { + if (0 == group.mBigrams.size()) { + throw new RuntimeException("0-sized bigram list must be null"); + } + flags |= FLAG_HAS_BIGRAMS; + } + if (group.mIsShortcutOnly) { + flags |= FLAG_IS_SHORTCUT_ONLY; + } return flags; } @@ -592,13 +655,20 @@ public class BinaryDictInputOutput { private static int writePlacedNode(FusionDictionary dict, byte[] buffer, Node node) { int index = node.mCachedAddress; - final int size = node.mData.size(); - if (size > MAX_CHARGROUPS_IN_A_NODE) - throw new RuntimeException("A node has a group count over 127 (" + size + ")."); - - buffer[index++] = (byte)size; + final int groupCount = node.mData.size(); + final int countSize = getGroupCountSize(node); + if (1 == countSize) { + buffer[index++] = (byte)groupCount; + } else if (2 == countSize) { + // We need to signal 2-byte size by setting the top bit of the MSB to 1, so + // we | 0x80 to do this. + buffer[index++] = (byte)((groupCount >> 8) | 0x80); + buffer[index++] = (byte)(groupCount & 0xFF); + } else { + throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); + } int groupAddress = index; - for (int i = 0; i < size; ++i) { + for (int i = 0; i < groupCount; ++i) { CharGroup group = node.mData.get(i); if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not " + "the same as the cached address of the group"); @@ -624,20 +694,36 @@ public class BinaryDictInputOutput { index += shift; groupAddress += shift; + // Write shortcuts + if (null != group.mShortcutTargets) { + final Iterator shortcutIterator = group.mShortcutTargets.iterator(); + while (shortcutIterator.hasNext()) { + final WeightedString target = (WeightedString)shortcutIterator.next(); + final int addressOfTarget = findAddressOfWord(dict, target.mWord); + ++groupAddress; + final int offset = addressOfTarget - groupAddress; + int shortcutFlags = makeAttributeFlags(shortcutIterator.hasNext(), offset, + target.mFrequency); + buffer[index++] = (byte)shortcutFlags; + final int shortcutShift = writeVariableAddress(buffer, index, Math.abs(offset)); + index += shortcutShift; + groupAddress += shortcutShift; + } + } // Write bigrams if (null != group.mBigrams) { - int remainingBigrams = group.mBigrams.size(); - for (WeightedString bigram : group.mBigrams) { - boolean more = remainingBigrams > 1; + final Iterator bigramIterator = group.mBigrams.iterator(); + while (bigramIterator.hasNext()) { + final WeightedString bigram = (WeightedString)bigramIterator.next(); final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); ++groupAddress; final int offset = addressOfBigram - groupAddress; - int bigramFlags = makeAttributeFlags(more, offset, bigram.mFrequency); + int bigramFlags = makeAttributeFlags(bigramIterator.hasNext(), offset, + bigram.mFrequency); buffer[index++] = (byte)bigramFlags; final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset)); index += bigramShift; groupAddress += bigramShift; - --remainingBigrams; } } @@ -814,14 +900,43 @@ public class BinaryDictInputOutput { childrenAddress = NO_CHILDREN_ADDRESS; break; } + ArrayList<PendingAttribute> shortcutTargets = null; + if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) { + shortcutTargets = new ArrayList<PendingAttribute>(); + while (true) { + final int targetFlags = source.readUnsignedByte(); + ++addressPointer; + final int sign = 0 == (targetFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1; + int targetAddress = addressPointer; + switch (targetFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + targetAddress += sign * source.readUnsignedByte(); + addressPointer += 1; + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + targetAddress += sign * source.readUnsignedShort(); + addressPointer += 2; + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + final int offset = ((source.readUnsignedByte() << 16) + + source.readUnsignedShort()); + targetAddress += sign * offset; + addressPointer += 3; + break; + default: + throw new RuntimeException("Has shortcut targets with no address"); + } + shortcutTargets.add(new PendingAttribute(targetFlags & FLAG_ATTRIBUTE_FREQUENCY, + targetAddress)); + if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; + } + } ArrayList<PendingAttribute> bigrams = null; if (0 != (flags & FLAG_HAS_BIGRAMS)) { bigrams = new ArrayList<PendingAttribute>(); - boolean more = true; - while (more) { - int bigramFlags = source.readUnsignedByte(); + while (true) { + final int bigramFlags = source.readUnsignedByte(); ++addressPointer; - more = (0 != (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)); final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1; int bigramAddress = addressPointer; switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { @@ -840,14 +955,28 @@ public class BinaryDictInputOutput { addressPointer += 3; break; default: - throw new RuntimeException("Has attribute with no address"); + throw new RuntimeException("Has bigrams with no address"); } bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY, bigramAddress)); + if (0 == (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; } } return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency, - childrenAddress, bigrams); + childrenAddress, shortcutTargets, bigrams); + } + + /** + * Reads and returns the char group count out of a file and forwards the pointer. + */ + private static int readCharGroupCount(RandomAccessFile source) throws IOException { + final int msb = source.readUnsignedByte(); + if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) { + return msb; + } else { + return ((MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8) + + source.readUnsignedByte(); + } } /** @@ -863,8 +992,8 @@ public class BinaryDictInputOutput { int address) throws IOException { final long originalPointer = source.getFilePointer(); source.seek(headerSize); - final int count = source.readUnsignedByte(); - int groupOffset = 1; // 1 for the group count + final int count = readCharGroupCount(source); + int groupOffset = getGroupCountSize(count); final StringBuilder builder = new StringBuilder(); String result = null; @@ -920,11 +1049,19 @@ public class BinaryDictInputOutput { Map<Integer, Node> reverseNodeMap, Map<Integer, CharGroup> reverseGroupMap) throws IOException { final int nodeOrigin = (int)(source.getFilePointer() - headerSize); - final int count = source.readUnsignedByte(); + final int count = readCharGroupCount(source); final ArrayList<CharGroup> nodeContents = new ArrayList<CharGroup>(); - int groupOffset = nodeOrigin + 1; // 1 byte for the group count + int groupOffset = nodeOrigin + getGroupCountSize(count); for (int i = count; i > 0; --i) { CharGroupInfo info = readCharGroup(source, groupOffset); + ArrayList<WeightedString> shortcutTargets = null; + if (null != info.mShortcutTargets) { + shortcutTargets = new ArrayList<WeightedString>(); + for (PendingAttribute target : info.mShortcutTargets) { + final String word = getWordAtAddress(source, headerSize, target.mAddress); + shortcutTargets.add(new WeightedString(word, target.mFrequency)); + } + } ArrayList<WeightedString> bigrams = null; if (null != info.mBigrams) { bigrams = new ArrayList<WeightedString>(); @@ -942,11 +1079,12 @@ public class BinaryDictInputOutput { source.seek(currentPosition); } nodeContents.add( - new CharGroup(info.mCharacters, bigrams, info.mFrequency, - children)); + new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, + children, isShortcutOnly(info))); } else { nodeContents.add( - new CharGroup(info.mCharacters, bigrams, info.mFrequency)); + new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, + isShortcutOnly(info))); } groupOffset = info.mEndAddress; } @@ -996,7 +1134,7 @@ public class BinaryDictInputOutput { new FusionDictionary.DictionaryOptions()); if (null != dict) { for (Word w : dict) { - newDict.add(w.mWord, w.mFrequency, w.mBigrams); + newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams); } } diff --git a/tools/makedict/src/com/android/inputmethod/latin/CharGroupInfo.java b/tools/makedict/src/com/android/inputmethod/latin/CharGroupInfo.java index 6badfd13a..759cd452d 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/CharGroupInfo.java +++ b/tools/makedict/src/com/android/inputmethod/latin/CharGroupInfo.java @@ -29,10 +29,12 @@ public class CharGroupInfo { public final int[] mCharacters; public final int mFrequency; public final int mChildrenAddress; + public final ArrayList<PendingAttribute> mShortcutTargets; public final ArrayList<PendingAttribute> mBigrams; public CharGroupInfo(final int originalAddress, final int endAddress, final int flags, final int[] characters, final int frequency, final int childrenAddress, + final ArrayList<PendingAttribute> shortcutTargets, final ArrayList<PendingAttribute> bigrams) { mOriginalAddress = originalAddress; mEndAddress = endAddress; @@ -40,6 +42,7 @@ public class CharGroupInfo { mCharacters = characters; mFrequency = frequency; mChildrenAddress = childrenAddress; + mShortcutTargets = shortcutTargets; mBigrams = bigrams; } } diff --git a/tools/makedict/src/com/android/inputmethod/latin/DictionaryMaker.java b/tools/makedict/src/com/android/inputmethod/latin/DictionaryMaker.java index 1ba01075e..2fcd5750a 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/DictionaryMaker.java +++ b/tools/makedict/src/com/android/inputmethod/latin/DictionaryMaker.java @@ -39,11 +39,13 @@ public class DictionaryMaker { private final static String OPTION_VERSION_2 = "-2"; private final static String OPTION_INPUT_SOURCE = "-s"; private final static String OPTION_INPUT_BIGRAM_XML = "-b"; + private final static String OPTION_INPUT_SHORTCUT_XML = "-c"; private final static String OPTION_OUTPUT_BINARY = "-d"; private final static String OPTION_OUTPUT_XML = "-x"; private final static String OPTION_HELP = "-h"; public final String mInputBinary; public final String mInputUnigramXml; + public final String mInputShortcutXml; public final String mInputBigramXml; public final String mOutputBinary; public final String mOutputXml; @@ -72,8 +74,9 @@ public class DictionaryMaker { private void displayHelp() { MakedictLog.i("Usage: makedict " - + "[-s <unigrams.xml> [-b <bigrams.xml>] | -s <binary input>] " - + " [-d <binary output>] [-x <xml output>] [-2]\n" + + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts.xml>] " + + "| -s <binary input>] " + + "[-d <binary output>] [-x <xml output>] [-2]\n" + "\n" + " Converts a source dictionary file to one or several outputs.\n" + " Source can be an XML file, with an optional XML bigrams file, or a\n" @@ -90,6 +93,7 @@ public class DictionaryMaker { } String inputBinary = null; String inputUnigramXml = null; + String inputShortcutXml = null; String inputBigramXml = null; String outputBinary = null; String outputXml = null; @@ -105,7 +109,8 @@ public class DictionaryMaker { } else { // All these options need an argument if (args.isEmpty()) { - throw new RuntimeException("Option " + arg + " requires an argument"); + throw new IllegalArgumentException("Option " + arg + " is unknown or " + + "requires an argument"); } String filename = args.get(0); args.remove(0); @@ -115,12 +120,16 @@ public class DictionaryMaker { } else { inputUnigramXml = filename; } + } else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) { + inputShortcutXml = filename; } else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) { inputBigramXml = filename; } else if (OPTION_OUTPUT_BINARY.equals(arg)) { outputBinary = filename; } else if (OPTION_OUTPUT_XML.equals(arg)) { outputXml = filename; + } else { + throw new IllegalArgumentException("Unknown option : " + arg); } } } else { @@ -133,13 +142,14 @@ public class DictionaryMaker { } else if (null == outputBinary) { outputBinary = arg; } else { - throw new RuntimeException("Several output binary files specified"); + throw new IllegalArgumentException("Several output binary files specified"); } } } mInputBinary = inputBinary; mInputUnigramXml = inputUnigramXml; + mInputShortcutXml = inputShortcutXml; mInputBigramXml = inputBigramXml; mOutputBinary = outputBinary; mOutputXml = outputXml; @@ -167,7 +177,7 @@ public class DictionaryMaker { if (null != args.mInputBinary) { return readBinaryFile(args.mInputBinary); } else if (null != args.mInputUnigramXml) { - return readXmlFile(args.mInputUnigramXml, args.mInputBigramXml); + return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml); } else { throw new RuntimeException("No input file specified"); } @@ -192,6 +202,7 @@ public class DictionaryMaker { * Read a dictionary from a unigram XML file, and optionally a bigram XML file. * * @param unigramXmlFilename the name of the unigram XML file. May not be null. + * @param shortcutXmlFilename the name of the shortcut XML file, or null if there is none. * @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams. * @return the read dictionary. * @throws FileNotFoundException if one of the files can't be found @@ -200,12 +211,14 @@ public class DictionaryMaker { * @throws ParserConfigurationException if the system can't create a SAX parser */ private static FusionDictionary readXmlFile(final String unigramXmlFilename, - final String bigramXmlFilename) throws FileNotFoundException, SAXException, - IOException, ParserConfigurationException { + final String shortcutXmlFilename, final String bigramXmlFilename) + throws FileNotFoundException, SAXException, IOException, ParserConfigurationException { final FileInputStream unigrams = new FileInputStream(new File(unigramXmlFilename)); + final FileInputStream shortcuts = null == shortcutXmlFilename ? null : + new FileInputStream(new File(shortcutXmlFilename)); final FileInputStream bigrams = null == bigramXmlFilename ? null : new FileInputStream(new File(bigramXmlFilename)); - return XmlDictInputOutput.readDictionaryXml(unigrams, bigrams); + return XmlDictInputOutput.readDictionaryXml(unigrams, shortcuts, bigrams); } /** diff --git a/tools/makedict/src/com/android/inputmethod/latin/FusionDictionary.java b/tools/makedict/src/com/android/inputmethod/latin/FusionDictionary.java index f6220eea2..918b1ca4b 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/FusionDictionary.java +++ b/tools/makedict/src/com/android/inputmethod/latin/FusionDictionary.java @@ -68,7 +68,7 @@ public class FusionDictionary implements Iterable<Word> { } /** - * A group of characters, with a frequency, shortcuts, bigrams, and children. + * A group of characters, with a frequency, shortcut targets, bigrams, and children. * * This is the central class of the in-memory representation. A CharGroup is what can * be seen as a traditional "trie node", except it can hold several characters at the @@ -82,25 +82,39 @@ public class FusionDictionary implements Iterable<Word> { public static class CharGroup { public static final int NOT_A_TERMINAL = -1; final int mChars[]; + final ArrayList<WeightedString> mShortcutTargets; final ArrayList<WeightedString> mBigrams; final int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. + final boolean mIsShortcutOnly; // Only valid if this is a terminal. Node mChildren; // The two following members to help with binary generation int mCachedSize; int mCachedAddress; - public CharGroup(final int[] chars, - final ArrayList<WeightedString> bigrams, final int frequency) { + public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, + final ArrayList<WeightedString> bigrams, final int frequency, + final boolean isShortcutOnly) { mChars = chars; mFrequency = frequency; + mIsShortcutOnly = isShortcutOnly; + if (mIsShortcutOnly && NOT_A_TERMINAL == mFrequency) { + throw new RuntimeException("A node must be a terminal to be a shortcut only"); + } + mShortcutTargets = shortcutTargets; mBigrams = bigrams; mChildren = null; } - public CharGroup(final int[] chars, - final ArrayList<WeightedString> bigrams, final int frequency, final Node children) { + public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, + final ArrayList<WeightedString> bigrams, final int frequency, final Node children, + final boolean isShortcutOnly) { mChars = chars; mFrequency = frequency; + mIsShortcutOnly = isShortcutOnly; + if (mIsShortcutOnly && NOT_A_TERMINAL == mFrequency) { + throw new RuntimeException("A node must be a terminal to be a shortcut only"); + } + mShortcutTargets = shortcutTargets; mBigrams = bigrams; mChildren = children; } @@ -157,6 +171,24 @@ public class FusionDictionary implements Iterable<Word> { } /** + * Helper method to add all words in a list as 0-frequency entries + * + * These words are added when shortcuts targets or bigrams are not found in the dictionary + * yet. The same words may be added later with an actual frequency - this is handled by + * the private version of add(). + */ + private void addNeutralWords(final ArrayList<WeightedString> words) { + if (null != words) { + for (WeightedString word : words) { + final CharGroup t = findWordInTree(mRoot, word.mWord); + if (null == t) { + add(getCodePoints(word.mWord), 0, null, null, false /* isShortcutOnly */); + } + } + } + } + + /** * Helper method to add a word as a string. * * This method adds a word to the dictionary with the given frequency. Optional @@ -165,18 +197,19 @@ public class FusionDictionary implements Iterable<Word> { * * @param word the word to add. * @param frequency the frequency of the word, in the range [0..255]. + * @param shortcutTargets a list of shortcut targets for this word, or null. * @param bigrams a list of bigrams, or null. */ - public void add(String word, int frequency, ArrayList<WeightedString> bigrams) { + public void add(final String word, final int frequency, + final ArrayList<WeightedString> shortcutTargets, + final ArrayList<WeightedString> bigrams) { + if (null != shortcutTargets) { + addNeutralWords(shortcutTargets); + } if (null != bigrams) { - for (WeightedString bigram : bigrams) { - final CharGroup t = findWordInTree(mRoot, bigram.mWord); - if (null == t) { - add(getCodePoints(bigram.mWord), 0, null); - } - } + addNeutralWords(bigrams); } - add(getCodePoints(word), frequency, bigrams); + add(getCodePoints(word), frequency, shortcutTargets, bigrams, false /* isShortcutOnly */); } /** @@ -198,16 +231,37 @@ public class FusionDictionary implements Iterable<Word> { } /** + * Helper method to add a shortcut that should not be a dictionary word. + * + * @param word the word to add. + * @param frequency the frequency of the word, in the range [0..255]. + * @param shortcutTargets a list of shortcut targets. May not be null. + */ + public void addShortcutOnly(final String word, final int frequency, + final ArrayList<WeightedString> shortcutTargets) { + if (null == shortcutTargets) { + throw new RuntimeException("Can't add a shortcut without targets"); + } + addNeutralWords(shortcutTargets); + add(getCodePoints(word), frequency, shortcutTargets, null, true /* isShortcutOnly */); + } + + /** * Add a word to this dictionary. * - * The bigrams, if any, have to be in the dictionary already. If they aren't, + * The shortcuts and bigrams, if any, have to be in the dictionary already. If they aren't, * an exception is thrown. * * @param word the word, as an int array. * @param frequency the frequency of the word, in the range [0..255]. + * @param shortcutTargets an optional list of shortcut targets for this word (null if none). * @param bigrams an optional list of bigrams for this word (null if none). + * @param isShortcutOnly whether this should be a shortcut only. */ - private void add(int[] word, int frequency, ArrayList<WeightedString> bigrams) { + private void add(final int[] word, final int frequency, + final ArrayList<WeightedString> shortcutTargets, + final ArrayList<WeightedString> bigrams, + final boolean isShortcutOnly) { assert(frequency >= 0 && frequency <= 255); Node currentNode = mRoot; int charIndex = 0; @@ -231,7 +285,8 @@ public class FusionDictionary implements Iterable<Word> { // No node at this point to accept the word. Create one. final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); final CharGroup newGroup = new CharGroup( - Arrays.copyOfRange(word, charIndex, word.length), bigrams, frequency); + Arrays.copyOfRange(word, charIndex, word.length), + shortcutTargets, bigrams, frequency, isShortcutOnly); currentNode.mData.add(insertionIndex, newGroup); checkStack(currentNode); } else { @@ -245,7 +300,8 @@ public class FusionDictionary implements Iterable<Word> { + new String(word, 0, word.length)); } else { final CharGroup newNode = new CharGroup(currentGroup.mChars, - bigrams, frequency, currentGroup.mChildren); + shortcutTargets, bigrams, frequency, currentGroup.mChildren, + isShortcutOnly); currentNode.mData.set(nodeIndex, newNode); checkStack(currentNode); } @@ -254,13 +310,13 @@ public class FusionDictionary implements Iterable<Word> { // We only have to create a new node and add it to the end of this. final CharGroup newNode = new CharGroup( Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), - bigrams, frequency); + shortcutTargets, bigrams, frequency, isShortcutOnly); currentGroup.mChildren = new Node(); currentGroup.mChildren.mData.add(newNode); } } else { if (0 == differentCharIndex) { - // Exact same word. Check the frequency is 0 or -1, and update. + // Exact same word. Check the frequency is 0 or NOT_A_TERMINAL, and update. if (0 != frequency) { if (0 < currentGroup.mFrequency) { throw new RuntimeException("This word already exists with frequency " @@ -268,7 +324,9 @@ public class FusionDictionary implements Iterable<Word> { + new String(word, 0, word.length)); } final CharGroup newGroup = new CharGroup(word, - currentGroup.mBigrams, frequency, currentGroup.mChildren); + currentGroup.mShortcutTargets, currentGroup.mBigrams, + frequency, currentGroup.mChildren, + currentGroup.mIsShortcutOnly && isShortcutOnly); currentNode.mData.set(nodeIndex, newGroup); } } else { @@ -277,22 +335,27 @@ public class FusionDictionary implements Iterable<Word> { Node newChildren = new Node(); final CharGroup newOldWord = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, differentCharIndex, - currentGroup.mChars.length), - currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren); + currentGroup.mChars.length), currentGroup.mShortcutTargets, + currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren, + currentGroup.mIsShortcutOnly); newChildren.mData.add(newOldWord); final CharGroup newParent; if (charIndex + differentCharIndex >= word.length) { newParent = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), - bigrams, frequency, newChildren); + shortcutTargets, bigrams, frequency, newChildren, isShortcutOnly); } else { + // isShortcutOnly makes no sense for non-terminal nodes. The following node + // is non-terminal (frequency 0 in FusionDictionary representation) so we + // pass false for isShortcutOnly newParent = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), - null, -1, newChildren); + null, null, -1, newChildren, false /* isShortcutOnly */); final CharGroup newWord = new CharGroup( Arrays.copyOfRange(word, charIndex + differentCharIndex, - word.length), bigrams, frequency); + word.length), shortcutTargets, bigrams, frequency, + isShortcutOnly); final int addIndex = word[charIndex + differentCharIndex] > currentGroup.mChars[differentCharIndex] ? 1 : 0; newChildren.mData.add(addIndex, newWord); @@ -355,7 +418,8 @@ public class FusionDictionary implements Iterable<Word> { */ private static int findInsertionIndex(final Node node, int character) { final List data = node.mData; - final CharGroup reference = new CharGroup(new int[] { character }, null, 0); + final CharGroup reference = new CharGroup(new int[] { character }, null, null, 0, + false /* isShortcutOnly */); int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); return result >= 0 ? result : -result - 1; } @@ -399,6 +463,16 @@ public class FusionDictionary implements Iterable<Word> { } /** + * Helper method to find out whether a word is in the dict or not. + */ + public boolean hasWord(final String s) { + if (null == s || "".equals(s)) { + throw new RuntimeException("Can't search for a null or empty string"); + } + return null != findWordInTree(mRoot, s); + } + + /** * Recursively count the number of character groups in a given branch of the trie. * * @param node the parent node. @@ -573,7 +647,8 @@ public class FusionDictionary implements Iterable<Word> { } if (currentGroup.mFrequency >= 0) return new Word(mCurrentString.toString(), currentGroup.mFrequency, - currentGroup.mBigrams); + currentGroup.mShortcutTargets, currentGroup.mBigrams, + currentGroup.mIsShortcutOnly); } else { mPositions.removeLast(); currentPos = mPositions.getLast(); diff --git a/tools/makedict/src/com/android/inputmethod/latin/Word.java b/tools/makedict/src/com/android/inputmethod/latin/Word.java index 916165a41..cf6116f91 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/Word.java +++ b/tools/makedict/src/com/android/inputmethod/latin/Word.java @@ -28,12 +28,18 @@ import java.util.ArrayList; public class Word implements Comparable<Word> { final String mWord; final int mFrequency; + final boolean mIsShortcutOnly; + final ArrayList<WeightedString> mShortcutTargets; final ArrayList<WeightedString> mBigrams; - public Word(String word, int frequency, ArrayList<WeightedString> bigrams) { + public Word(final String word, final int frequency, + final ArrayList<WeightedString> shortcutTargets, + final ArrayList<WeightedString> bigrams, final boolean isShortcutOnly) { mWord = word; mFrequency = frequency; + mShortcutTargets = shortcutTargets; mBigrams = bigrams; + mIsShortcutOnly = isShortcutOnly; } /** @@ -60,6 +66,7 @@ public class Word implements Comparable<Word> { if (!(o instanceof Word)) return false; Word w = (Word)o; return mFrequency == w.mFrequency && mWord.equals(w.mWord) + && mShortcutTargets.equals(w.mShortcutTargets) && mBigrams.equals(w.mBigrams); } } diff --git a/tools/makedict/src/com/android/inputmethod/latin/XmlDictInputOutput.java b/tools/makedict/src/com/android/inputmethod/latin/XmlDictInputOutput.java index 35a7b51d6..77c536668 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/XmlDictInputOutput.java +++ b/tools/makedict/src/com/android/inputmethod/latin/XmlDictInputOutput.java @@ -42,8 +42,12 @@ public class XmlDictInputOutput { private static final String WORD_TAG = "w"; private static final String BIGRAM_TAG = "bigram"; + private static final String SHORTCUT_TAG = "shortcut"; private static final String FREQUENCY_ATTR = "f"; private static final String WORD_ATTR = "word"; + private static final String SHORTCUT_ONLY_ATTR = "shortcutOnly"; + + private static final int SHORTCUT_ONLY_DEFAULT_FREQ = 1; /** * SAX handler for a unigram XML file. @@ -61,6 +65,7 @@ public class XmlDictInputOutput { int mState; // the state of the parser int mFreq; // the currently read freq String mWord; // the current word + final HashMap<String, ArrayList<WeightedString>> mShortcutsMap; final HashMap<String, ArrayList<WeightedString>> mBigramsMap; /** @@ -69,9 +74,11 @@ public class XmlDictInputOutput { * @param dict the dictionary to construct. * @param bigrams the bigrams as a map. This may be empty, but may not be null. */ - public UnigramHandler(FusionDictionary dict, - HashMap<String, ArrayList<WeightedString>> bigrams) { + public UnigramHandler(final FusionDictionary dict, + final HashMap<String, ArrayList<WeightedString>> shortcuts, + final HashMap<String, ArrayList<WeightedString>> bigrams) { mDictionary = dict; + mShortcutsMap = shortcuts; mBigramsMap = bigrams; mWord = ""; mState = START; @@ -107,47 +114,96 @@ public class XmlDictInputOutput { @Override public void endElement(String uri, String localName, String qName) { if (WORD == mState) { - mDictionary.add(mWord, mFreq, mBigramsMap.get(mWord)); + mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), mBigramsMap.get(mWord)); mState = START; } } } + static private class AssociativeListHandler extends DefaultHandler { + private final String SRC_TAG; + private final String SRC_ATTRIBUTE; + private final String DST_TAG; + private final String DST_ATTRIBUTE; + private final String DST_FREQ; + + // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX + private final static int XML_MAX = 256; + // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX + private final static int MEMORY_MAX = 16; + private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX; + + private String mSrc; + private final HashMap<String, ArrayList<WeightedString>> mAssocMap; + + public AssociativeListHandler(final String srcTag, final String srcAttribute, + final String dstTag, final String dstAttribute, final String dstFreq) { + SRC_TAG = srcTag; + SRC_ATTRIBUTE = srcAttribute; + DST_TAG = dstTag; + DST_ATTRIBUTE = dstAttribute; + DST_FREQ = dstFreq; + mSrc = null; + mAssocMap = new HashMap<String, ArrayList<WeightedString>>(); + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes attrs) { + if (SRC_TAG.equals(localName)) { + mSrc = attrs.getValue(uri, SRC_ATTRIBUTE); + } else if (DST_TAG.equals(localName)) { + String dst = attrs.getValue(uri, DST_ATTRIBUTE); + int freq = Integer.parseInt(attrs.getValue(uri, DST_FREQ)); + WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO); + ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc); + if (null == bigramList) bigramList = new ArrayList<WeightedString>(); + bigramList.add(bigram); + mAssocMap.put(mSrc, bigramList); + } + } + + public HashMap<String, ArrayList<WeightedString>> getAssocMap() { + return mAssocMap; + } + } + /** * SAX handler for a bigram XML file. */ - static private class BigramHandler extends DefaultHandler { + static private class BigramHandler extends AssociativeListHandler { private final static String BIGRAM_W1_TAG = "bi"; private final static String BIGRAM_W2_TAG = "w"; private final static String BIGRAM_W1_ATTRIBUTE = "w1"; private final static String BIGRAM_W2_ATTRIBUTE = "w2"; private final static String BIGRAM_FREQ_ATTRIBUTE = "p"; - String mW1; - final HashMap<String, ArrayList<WeightedString>> mBigramsMap; - public BigramHandler() { - mW1 = null; - mBigramsMap = new HashMap<String, ArrayList<WeightedString>>(); + super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE, + BIGRAM_FREQ_ATTRIBUTE); } - @Override - public void startElement(String uri, String localName, String qName, Attributes attrs) { - if (BIGRAM_W1_TAG.equals(localName)) { - mW1 = attrs.getValue(uri, BIGRAM_W1_ATTRIBUTE); - } else if (BIGRAM_W2_TAG.equals(localName)) { - String w2 = attrs.getValue(uri, BIGRAM_W2_ATTRIBUTE); - int freq = Integer.parseInt(attrs.getValue(uri, BIGRAM_FREQ_ATTRIBUTE)); - WeightedString bigram = new WeightedString(w2, freq / 8); - ArrayList<WeightedString> bigramList = mBigramsMap.get(mW1); - if (null == bigramList) bigramList = new ArrayList<WeightedString>(); - bigramList.add(bigram); - mBigramsMap.put(mW1, bigramList); - } + public HashMap<String, ArrayList<WeightedString>> getBigramMap() { + return getAssocMap(); } + } - public HashMap<String, ArrayList<WeightedString>> getBigramMap() { - return mBigramsMap; + /** + * SAX handler for a shortcut XML file. + */ + static private class ShortcutHandler extends AssociativeListHandler { + private final static String ENTRY_TAG = "entry"; + private final static String ENTRY_ATTRIBUTE = "shortcut"; + private final static String TARGET_TAG = "target"; + private final static String REPLACEMENT_ATTRIBUTE = "replacement"; + private final static String TARGET_PRIORITY_ATTRIBUTE = "priority"; + + public ShortcutHandler() { + super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE, + TARGET_PRIORITY_ATTRIBUTE); + } + + public HashMap<String, ArrayList<WeightedString>> getShortcutMap() { + return getAssocMap(); } } @@ -158,9 +214,12 @@ public class XmlDictInputOutput { * representation. * * @param unigrams the file to read the data from. + * @param shortcuts the file to read the shortcuts from, or null. + * @param bigrams the file to read the bigrams from, or null. * @return the in-memory representation of the dictionary. */ - public static FusionDictionary readDictionaryXml(InputStream unigrams, InputStream bigrams) + public static FusionDictionary readDictionaryXml(final InputStream unigrams, + final InputStream shortcuts, final InputStream bigrams) throws SAXException, IOException, ParserConfigurationException { final SAXParserFactory factory = SAXParserFactory.newInstance(); factory.setNamespaceAware(true); @@ -168,10 +227,23 @@ public class XmlDictInputOutput { final BigramHandler bigramHandler = new BigramHandler(); if (null != bigrams) parser.parse(bigrams, bigramHandler); + final ShortcutHandler shortcutHandler = new ShortcutHandler(); + if (null != shortcuts) parser.parse(shortcuts, shortcutHandler); + final FusionDictionary dict = new FusionDictionary(); final UnigramHandler unigramHandler = - new UnigramHandler(dict, bigramHandler.getBigramMap()); + new UnigramHandler(dict, shortcutHandler.getShortcutMap(), + bigramHandler.getBigramMap()); parser.parse(unigrams, unigramHandler); + + final HashMap<String, ArrayList<WeightedString>> shortcutMap = + shortcutHandler.getShortcutMap(); + for (final String shortcut : shortcutMap.keySet()) { + if (dict.hasWord(shortcut)) continue; + // TODO: list a frequency in the shortcut file and use it here, instead of + // a constant freq + dict.addShortcutOnly(shortcut, SHORTCUT_ONLY_DEFAULT_FREQ, shortcutMap.get(shortcut)); + } return dict; } @@ -204,9 +276,20 @@ public class XmlDictInputOutput { } // TODO: use an XMLSerializer if this gets big destination.write("<wordlist format=\"2\">\n"); + destination.write("<!-- Warning: there is no code to read this format yet. -->\n"); for (Word word : set) { destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" " - + FREQUENCY_ATTR + "=\"" + word.mFrequency + "\">"); + + FREQUENCY_ATTR + "=\"" + word.mFrequency + "\" " + SHORTCUT_ONLY_ATTR + + "=\"" + word.mIsShortcutOnly + "\">"); + if (null != word.mShortcutTargets) { + destination.write("\n"); + for (WeightedString target : word.mShortcutTargets) { + destination.write(" <" + SHORTCUT_TAG + " " + FREQUENCY_ATTR + "=\"" + + target.mFrequency + "\">" + target.mWord + "</" + SHORTCUT_TAG + + ">\n"); + } + destination.write(" "); + } if (null != word.mBigrams) { destination.write("\n"); for (WeightedString bigram : word.mBigrams) { diff --git a/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java b/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java index 79cf14b2b..6ac046bbf 100644 --- a/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java +++ b/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java @@ -39,11 +39,11 @@ public class BinaryDictInputOutputTest extends TestCase { // that it does not contain any duplicates. public void testFlattenNodes() { final FusionDictionary dict = new FusionDictionary(); - dict.add("foo", 1, null); - dict.add("fta", 1, null); - dict.add("ftb", 1, null); - dict.add("bar", 1, null); - dict.add("fool", 1, null); + dict.add("foo", 1, null, null); + dict.add("fta", 1, null, null); + dict.add("ftb", 1, null, null); + dict.add("bar", 1, null, null); + dict.add("fool", 1, null, null); final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot); assertEquals(4, result.size()); while (!result.isEmpty()) { |