diff options
Diffstat (limited to 'tools')
17 files changed, 773 insertions, 60 deletions
diff --git a/tools/dicttool/Android.mk b/tools/dicttool/Android.mk index 5bd836a01..159c1c160 100644 --- a/tools/dicttool/Android.mk +++ b/tools/dicttool/Android.mk @@ -16,14 +16,18 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) -LATINIME_CORE_SOURCE_DIRECTORY := ../../java/src/com/android/inputmethod/latin +LATINIME_BASE_SOURCE_DIRECTORY := ../../java/src/com/android/inputmethod +LATINIME_CORE_SOURCE_DIRECTORY := $(LATINIME_BASE_SOURCE_DIRECTORY)/latin +LATINIME_ANNOTATIONS_SOURCE_DIRECTORY := $(LATINIME_BASE_SOURCE_DIRECTORY)/annotations MAKEDICT_CORE_SOURCE_DIRECTORY := $(LATINIME_CORE_SOURCE_DIRECTORY)/makedict LOCAL_MAIN_SRC_FILES := $(call all-java-files-under,$(MAKEDICT_CORE_SOURCE_DIRECTORY)) LOCAL_TOOL_SRC_FILES := $(call all-java-files-under,src) +LOCAL_ANNOTATIONS_SRC_FILES := $(call all-java-files-under,$(LATINIME_ANNOTATIONS_SOURCE_DIRECTORY)) LOCAL_SRC_FILES := $(LOCAL_TOOL_SRC_FILES) \ $(filter-out $(addprefix %/, $(notdir $(LOCAL_TOOL_SRC_FILES))), $(LOCAL_MAIN_SRC_FILES)) \ $(call all-java-files-under,tests) \ + $(LOCAL_ANNOTATIONS_SRC_FILES) \ $(LATINIME_CORE_SOURCE_DIRECTORY)/Constants.java LOCAL_JAR_MANIFEST := etc/manifest.txt diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/AdditionalCommandList.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/AdditionalCommandList.java index 8d4eb751b..8d4eb751b 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/AdditionalCommandList.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/AdditionalCommandList.java diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java new file mode 100644 index 000000000..7a2fde8a0 --- /dev/null +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; + +import java.io.File; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.ArrayList; + +/** + * Class grouping utilities for offline dictionary making. + * + * Those should not be used on-device, essentially because they are quite + * liberal about I/O and performance. + */ +public final class BinaryDictOffdeviceUtils { + // Prefix and suffix are arbitrary, the values do not really matter + private final static String PREFIX = "dicttool"; + private final static String SUFFIX = ".tmp"; + + public final static String COMPRESSION = "compressed"; + public final static String ENCRYPTION = "encrypted"; + + public static class DecoderChainSpec { + ArrayList<String> mDecoderSpec = new ArrayList<String>(); + File mFile; + public DecoderChainSpec addStep(final String stepDescription) { + mDecoderSpec.add(stepDescription); + return this; + } + public String describeChain() { + final StringBuilder s = new StringBuilder("raw"); + for (final String step : mDecoderSpec) { + s.append(" > "); + s.append(step); + } + return s.toString(); + } + } + + public static void copy(final InputStream input, final OutputStream output) throws IOException { + final byte[] buffer = new byte[1000]; + final BufferedInputStream in = new BufferedInputStream(input); + final BufferedOutputStream out = new BufferedOutputStream(output); + for (int readBytes = in.read(buffer); readBytes >= 0; readBytes = in.read(buffer)) + output.write(buffer, 0, readBytes); + in.close(); + out.close(); + } + + /** + * Returns a decrypted/uncompressed binary dictionary. + * + * This will decrypt/uncompress any number of times as necessary until it finds the binary + * dictionary signature, and copy the decoded file to a temporary place. + * If this is not a binary dictionary, the method returns null. + */ + public static DecoderChainSpec getRawBinaryDictionaryOrNull(final File src) { + return getRawBinaryDictionaryOrNullInternal(new DecoderChainSpec(), src); + } + + private static DecoderChainSpec getRawBinaryDictionaryOrNullInternal( + final DecoderChainSpec spec, final File src) { + // TODO: arrange for the intermediary files to be deleted + if (BinaryDictInputOutput.isBinaryDictionary(src)) { + spec.mFile = src; + return spec; + } + // It's not a raw dictionary - try to see if it's compressed. + final File uncompressedFile = tryGetUncompressedFile(src); + if (null != uncompressedFile) { + final DecoderChainSpec newSpec = + getRawBinaryDictionaryOrNullInternal(spec, uncompressedFile); + if (null == newSpec) return null; + return newSpec.addStep(COMPRESSION); + } + // It's not a compressed either - try to see if it's crypted. + final File decryptedFile = tryGetDecryptedFile(src); + if (null != decryptedFile) { + final DecoderChainSpec newSpec = + getRawBinaryDictionaryOrNullInternal(spec, decryptedFile); + if (null == newSpec) return null; + return newSpec.addStep(ENCRYPTION); + } + return null; + } + + /* Try to uncompress the file passed as an argument. + * + * If the file can be uncompressed, the uncompressed version is returned. Otherwise, null + * is returned. + */ + private static File tryGetUncompressedFile(final File src) { + try { + final File dst = File.createTempFile(PREFIX, SUFFIX); + final FileOutputStream dstStream = new FileOutputStream(dst); + copy(Compress.getUncompressedStream(new BufferedInputStream(new FileInputStream(src))), + new BufferedOutputStream(dstStream)); // #copy() closes the streams + return dst; + } catch (IOException e) { + // Could not uncompress the file: presumably the file is simply not a compressed file + return null; + } + } + + /* Try to decrypt the file passed as an argument. + * + * If the file can be decrypted, the decrypted version is returned. Otherwise, null + * is returned. + */ + private static File tryGetDecryptedFile(final File src) { + try { + final File dst = File.createTempFile(PREFIX, SUFFIX); + final FileOutputStream dstStream = new FileOutputStream(dst); + copy(Crypt.getDecryptedStream(new BufferedInputStream(new FileInputStream(src))), + dstStream); // #copy() closes the streams + return dst; + } catch (IOException e) { + // Could not uncompress the file: presumably the file is simply not a compressed file + return null; + } + } +} diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java new file mode 100644 index 000000000..cd04d18bb --- /dev/null +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java @@ -0,0 +1,240 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.FormatSpec; +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; +import com.android.inputmethod.latin.makedict.Word; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Writer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.TreeSet; + +/** + * Reads and writes combined format for a FusionDictionary. + * + * All functions in this class are static. + */ +public class CombinedInputOutput { + + private static final String DICTIONARY_TAG = "dictionary"; + private static final String BIGRAM_TAG = "bigram"; + private static final String SHORTCUT_TAG = "shortcut"; + private static final String FREQUENCY_TAG = "f"; + private static final String WORD_TAG = "word"; + private static final String NOT_A_WORD_TAG = "not_a_word"; + private static final String WHITELIST_TAG = "whitelist"; + private static final String OPTIONS_TAG = "options"; + private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing"; + private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing"; + + /** + * Basic test to find out whether the file is in the combined format or not. + * + * Concretely this only tests the header line. + * + * @param filename The name of the file to test. + * @return true if the file is in the combined format, false otherwise + */ + public static boolean isCombinedDictionary(final String filename) { + BufferedReader reader = null; + try { + reader = new BufferedReader(new FileReader(new File(filename))); + final String firstLine = reader.readLine(); + return firstLine.matches("^" + DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*"); + } catch (FileNotFoundException e) { + return false; + } catch (IOException e) { + return false; + } finally { + if (reader != null) { + try { + reader.close(); + } catch (IOException e) { + // do nothing + } + } + } + } + + /** + * Reads a dictionary from a combined format file. + * + * This is the public method that will read a combined file and return the corresponding memory + * representation. + * + * @param source the file to read the data from. + * @return the in-memory representation of the dictionary. + */ + public static FusionDictionary readDictionaryCombined(final InputStream source) + throws IOException { + final BufferedReader reader = new BufferedReader(new InputStreamReader(source, "UTF-8")); + final String headerLine = reader.readLine(); + final String header[] = headerLine.split(","); + final HashMap<String, String> attributes = new HashMap<String, String>(); + for (String item : header) { + final String keyValue[] = item.split("="); + if (2 != keyValue.length) { + throw new RuntimeException("Wrong header format : " + headerLine); + } + attributes.put(keyValue[0], keyValue[1]); + } + + final boolean processUmlauts = + GERMAN_UMLAUT_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG)); + final boolean processLigatures = + FRENCH_LIGATURE_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG)); + attributes.remove(OPTIONS_TAG); + final FusionDictionary dict = new FusionDictionary(new Node(), new DictionaryOptions( + attributes, processUmlauts, processLigatures)); + + String line; + String word = null; + int freq = 0; + boolean isNotAWord = false; + ArrayList<WeightedString> bigrams = new ArrayList<WeightedString>(); + ArrayList<WeightedString> shortcuts = new ArrayList<WeightedString>(); + while (null != (line = reader.readLine())) { + final String args[] = line.trim().split(","); + if (args[0].matches(WORD_TAG + "=.*")) { + if (null != word) { + dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord); + for (WeightedString s : bigrams) { + dict.setBigram(word, s.mWord, s.mFrequency); + } + } + if (!shortcuts.isEmpty()) shortcuts = new ArrayList<WeightedString>(); + if (!bigrams.isEmpty()) bigrams = new ArrayList<WeightedString>(); + isNotAWord = false; + for (String param : args) { + final String params[] = param.split("=", 2); + if (2 != params.length) throw new RuntimeException("Wrong format : " + line); + if (WORD_TAG.equals(params[0])) { + word = params[1]; + } else if (FREQUENCY_TAG.equals(params[0])) { + freq = Integer.parseInt(params[1]); + } else if (NOT_A_WORD_TAG.equals(params[0])) { + isNotAWord = "true".equals(params[1]); + } + } + } else if (args[0].matches(SHORTCUT_TAG + "=.*")) { + String shortcut = null; + int shortcutFreq = 0; + for (String param : args) { + final String params[] = param.split("=", 2); + if (2 != params.length) throw new RuntimeException("Wrong format : " + line); + if (SHORTCUT_TAG.equals(params[0])) { + shortcut = params[1]; + } else if (FREQUENCY_TAG.equals(params[0])) { + shortcutFreq = WHITELIST_TAG.equals(params[1]) + ? FormatSpec.SHORTCUT_WHITELIST_FREQUENCY + : Integer.parseInt(params[1]); + } + } + if (null != shortcut) { + shortcuts.add(new WeightedString(shortcut, shortcutFreq)); + } else { + throw new RuntimeException("Wrong format : " + line); + } + } else if (args[0].matches(BIGRAM_TAG + "=.*")) { + String secondWordOfBigram = null; + int bigramFreq = 0; + for (String param : args) { + final String params[] = param.split("=", 2); + if (2 != params.length) throw new RuntimeException("Wrong format : " + line); + if (BIGRAM_TAG.equals(params[0])) { + secondWordOfBigram = params[1]; + } else if (FREQUENCY_TAG.equals(params[0])) { + bigramFreq = Integer.parseInt(params[1]); + } + } + if (null != secondWordOfBigram) { + bigrams.add(new WeightedString(secondWordOfBigram, bigramFreq)); + } else { + throw new RuntimeException("Wrong format : " + line); + } + } + } + if (null != word) { + dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord); + for (WeightedString s : bigrams) { + dict.setBigram(word, s.mWord, s.mFrequency); + } + } + + return dict; + } + + /** + * Writes a dictionary to a combined file. + * + * @param destination a destination stream to write to. + * @param dict the dictionary to write. + */ + public static void writeDictionaryCombined(Writer destination, FusionDictionary dict) + throws IOException { + final TreeSet<Word> set = new TreeSet<Word>(); + for (Word word : dict) { + set.add(word); // This for ordering by frequency, then by asciibetic order + } + final HashMap<String, String> options = dict.mOptions.mAttributes; + destination.write(DICTIONARY_TAG + "="); + if (options.containsKey(DICTIONARY_TAG)) { + destination.write(options.get(DICTIONARY_TAG)); + options.remove(DICTIONARY_TAG); + } + if (dict.mOptions.mGermanUmlautProcessing) { + destination.write("," + OPTIONS_TAG + "=" + GERMAN_UMLAUT_PROCESSING_OPTION); + } else if (dict.mOptions.mFrenchLigatureProcessing) { + destination.write("," + OPTIONS_TAG + "=" + FRENCH_LIGATURE_PROCESSING_OPTION); + } + for (final String key : dict.mOptions.mAttributes.keySet()) { + final String value = dict.mOptions.mAttributes.get(key); + destination.write("," + key + "=" + value); + } + destination.write("\n"); + for (Word word : set) { + destination.write("\t" + WORD_TAG + "=" + word.mWord + "," + + FREQUENCY_TAG + "=" + word.mFrequency + + (word.mIsNotAWord ? "," + NOT_A_WORD_TAG + "=true\n" : "\n")); + if (null != word.mShortcutTargets) { + for (WeightedString target : word.mShortcutTargets) { + destination.write("\t\t" + SHORTCUT_TAG + "=" + target.mWord + "," + + FREQUENCY_TAG + "=" + target.mFrequency + "\n"); + } + } + if (null != word.mBigrams) { + for (WeightedString bigram : word.mBigrams) { + destination.write("\t\t" + BIGRAM_TAG + "=" + bigram.mWord + "," + + FREQUENCY_TAG + "=" + bigram.mFrequency + "\n"); + } + } + } + destination.close(); + } +} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/CommandList.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CommandList.java index d16b069fe..d16b069fe 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/CommandList.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CommandList.java diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/Compress.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Compress.java index 3cb0a12c4..072de5c01 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/Compress.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Compress.java @@ -16,6 +16,8 @@ package com.android.inputmethod.latin.dicttool; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; @@ -27,23 +29,15 @@ import java.util.zip.GZIPOutputStream; public class Compress { - private static OutputStream getCompressedStream(final OutputStream out) + public static OutputStream getCompressedStream(final OutputStream out) throws java.io.IOException { return new GZIPOutputStream(out); } - private static InputStream getUncompressedStream(final InputStream in) throws IOException { + public static InputStream getUncompressedStream(final InputStream in) throws IOException { return new GZIPInputStream(in); } - public static void copy(final InputStream input, final OutputStream output) throws IOException { - final byte[] buffer = new byte[1000]; - for (int readBytes = input.read(buffer); readBytes >= 0; readBytes = input.read(buffer)) - output.write(buffer, 0, readBytes); - input.close(); - output.close(); - } - static public class Compressor extends Dicttool.Command { public static final String COMMAND = "compress"; public static final String STDIN_OR_STDOUT = "-"; @@ -63,10 +57,10 @@ public class Compress { final String inFilename = mArgs.length >= 1 ? mArgs[0] : STDIN_OR_STDOUT; final String outFilename = mArgs.length >= 2 ? mArgs[1] : STDIN_OR_STDOUT; final InputStream input = inFilename.equals(STDIN_OR_STDOUT) ? System.in - : new FileInputStream(new File(inFilename)); + : new BufferedInputStream(new FileInputStream(new File(inFilename))); final OutputStream output = outFilename.equals(STDIN_OR_STDOUT) ? System.out - : new FileOutputStream(new File(outFilename)); - copy(input, new GZIPOutputStream(output)); + : new BufferedOutputStream(new FileOutputStream(new File(outFilename))); + BinaryDictOffdeviceUtils.copy(input, new GZIPOutputStream(output)); } } @@ -89,10 +83,10 @@ public class Compress { final String inFilename = mArgs.length >= 1 ? mArgs[0] : STDIN_OR_STDOUT; final String outFilename = mArgs.length >= 2 ? mArgs[1] : STDIN_OR_STDOUT; final InputStream input = inFilename.equals(STDIN_OR_STDOUT) ? System.in - : new FileInputStream(new File(inFilename)); + : new BufferedInputStream(new FileInputStream(new File(inFilename))); final OutputStream output = outFilename.equals(STDIN_OR_STDOUT) ? System.out - : new FileOutputStream(new File(outFilename)); - copy(new GZIPInputStream(input), output); + : new BufferedOutputStream(new FileOutputStream(new File(outFilename))); + BinaryDictOffdeviceUtils.copy(new GZIPInputStream(input), output); } } } diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/Info.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Crypt.java index e59261706..10a7301d7 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/Info.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Crypt.java @@ -1,4 +1,4 @@ -/** +/* * Copyright (C) 2012 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); you may not @@ -16,21 +16,17 @@ package com.android.inputmethod.latin.dicttool; -public class Info extends Dicttool.Command { - public static final String COMMAND = "info"; +import java.io.InputStream; +import java.io.OutputStream; - public Info() { +public class Crypt { + public static OutputStream getCryptedStream(final OutputStream out) { + // Encryption is not supported + return out; } - public String getHelp() { - return "info <filename>: prints various information about a dictionary file"; - } - - public void run() { - // TODO: implement this - if (mArgs.length < 1) { - throw new RuntimeException("Not enough arguments for command " + COMMAND); - } - System.out.println("Not implemented yet"); + public static InputStream getDecryptedStream(final InputStream in) { + // Decryption is not supported + return in; } } diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java index 4f8874985..cc890f60c 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java @@ -22,6 +22,7 @@ import com.android.inputmethod.latin.makedict.FusionDictionary; import com.android.inputmethod.latin.makedict.MakedictLog; import com.android.inputmethod.latin.makedict.UnsupportedFormatException; +import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -51,13 +52,16 @@ public class DictionaryMaker { private static final String OPTION_INPUT_SHORTCUT_XML = "-c"; private static final String OPTION_OUTPUT_BINARY = "-d"; private static final String OPTION_OUTPUT_XML = "-x"; + private static final String OPTION_OUTPUT_COMBINED = "-o"; private static final String OPTION_HELP = "-h"; public final String mInputBinary; + public final String mInputCombined; public final String mInputUnigramXml; public final String mInputShortcutXml; public final String mInputBigramXml; public final String mOutputBinary; public final String mOutputXml; + public final String mOutputCombined; public final int mOutputBinaryFormatVersion; private void checkIntegrity() throws IOException { @@ -65,28 +69,38 @@ public class DictionaryMaker { checkHasAtLeastOneOutput(); checkNotSameFile(mInputBinary, mOutputBinary); checkNotSameFile(mInputBinary, mOutputXml); + checkNotSameFile(mInputCombined, mOutputBinary); + checkNotSameFile(mInputCombined, mOutputXml); checkNotSameFile(mInputUnigramXml, mOutputBinary); checkNotSameFile(mInputUnigramXml, mOutputXml); + checkNotSameFile(mInputUnigramXml, mOutputCombined); checkNotSameFile(mInputShortcutXml, mOutputBinary); checkNotSameFile(mInputShortcutXml, mOutputXml); + checkNotSameFile(mInputShortcutXml, mOutputCombined); checkNotSameFile(mInputBigramXml, mOutputBinary); checkNotSameFile(mInputBigramXml, mOutputXml); + checkNotSameFile(mInputBigramXml, mOutputCombined); checkNotSameFile(mOutputBinary, mOutputXml); + checkNotSameFile(mOutputBinary, mOutputCombined); + checkNotSameFile(mOutputXml, mOutputCombined); } private void checkHasExactlyOneInput() { - if (null == mInputUnigramXml && null == mInputBinary) { + if (null == mInputUnigramXml && null == mInputBinary && null == mInputCombined) { throw new RuntimeException("No input file specified"); - } else if (null != mInputUnigramXml && null != mInputBinary) { - throw new RuntimeException("Both input XML and binary specified"); - } else if (null != mInputBinary && null != mInputBigramXml) { - throw new RuntimeException("Cannot specify a binary input and a separate bigram " - + "file"); + } else if ((null != mInputUnigramXml && null != mInputBinary) + || (null != mInputUnigramXml && null != mInputCombined) + || (null != mInputBinary && null != mInputCombined)) { + throw new RuntimeException("Several input files specified"); + } else if ((null != mInputBinary || null != mInputCombined) + && (null != mInputBigramXml || null != mInputShortcutXml)) { + throw new RuntimeException("Separate bigrams/shortcut files are only supported" + + " with XML input (other formats include bigrams and shortcuts already)"); } } private void checkHasAtLeastOneOutput() { - if (null == mOutputBinary && null == mOutputXml) { + if (null == mOutputBinary && null == mOutputXml && null == mOutputCombined) { throw new RuntimeException("No output specified"); } } @@ -110,17 +124,16 @@ public class DictionaryMaker { public static String getHelp() { return "Usage: makedict " + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>] " + + "| [-s <combined format input]" + "| [-s <binary input>] [-d <binary output>] [-x <xml output>] " + + " [-o <combined output>]" + "[-1] [-2] [-3]\n" + "\n" + " Converts a source dictionary file to one or several outputs.\n" + " Source can be an XML file, with an optional XML bigrams file, or a\n" + " binary dictionary file.\n" - + " Binary version 1 (Ice Cream Sandwich), 2 (Jelly Bean), 3 and XML outputs\n" - + " are supported. All three can be output at the same time, but the same\n" - + " output format cannot be specified several times. The behavior is\n" - + " unspecified if the same file is specified for input and output, or for\n" - + " several outputs."; + + " Binary version 1 (Ice Cream Sandwich), 2 (Jelly Bean), 3, XML and\n" + + " combined format outputs are supported."; } public Arguments(String[] argsArray) throws IOException { @@ -129,11 +142,13 @@ public class DictionaryMaker { displayHelp(); } String inputBinary = null; + String inputCombined = null; String inputUnigramXml = null; String inputShortcutXml = null; String inputBigramXml = null; String outputBinary = null; String outputXml = null; + String outputCombined = null; int outputBinaryFormatVersion = 2; // the default version is 2. while (!args.isEmpty()) { @@ -157,10 +172,15 @@ public class DictionaryMaker { String filename = args.get(0); args.remove(0); if (OPTION_INPUT_SOURCE.equals(arg)) { - if (BinaryDictInputOutput.isBinaryDictionary(filename)) { + if (XmlDictInputOutput.isXmlUnigramDictionary(filename)) { + inputUnigramXml = filename; + } else if (CombinedInputOutput.isCombinedDictionary(filename)) { + inputCombined = filename; + } else if (BinaryDictInputOutput.isBinaryDictionary(filename)) { inputBinary = filename; } else { - inputUnigramXml = filename; + throw new IllegalArgumentException( + "Unknown format for file " + filename); } } else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) { inputShortcutXml = filename; @@ -170,6 +190,8 @@ public class DictionaryMaker { outputBinary = filename; } else if (OPTION_OUTPUT_XML.equals(arg)) { outputXml = filename; + } else if (OPTION_OUTPUT_COMBINED.equals(arg)) { + outputCombined = filename; } else { throw new IllegalArgumentException("Unknown option : " + arg); } @@ -178,6 +200,8 @@ public class DictionaryMaker { if (null == inputBinary && null == inputUnigramXml) { if (BinaryDictInputOutput.isBinaryDictionary(arg)) { inputBinary = arg; + } else if (CombinedInputOutput.isCombinedDictionary(arg)) { + inputCombined = arg; } else { inputUnigramXml = arg; } @@ -190,11 +214,13 @@ public class DictionaryMaker { } mInputBinary = inputBinary; + mInputCombined = inputCombined; mInputUnigramXml = inputUnigramXml; mInputShortcutXml = inputShortcutXml; mInputBigramXml = inputBigramXml; mOutputBinary = outputBinary; mOutputXml = outputXml; + mOutputCombined = outputCombined; mOutputBinaryFormatVersion = outputBinaryFormatVersion; checkIntegrity(); } @@ -219,6 +245,8 @@ public class DictionaryMaker { SAXException, FileNotFoundException { if (null != args.mInputBinary) { return readBinaryFile(args.mInputBinary); + } else if (null != args.mInputCombined) { + return readCombinedFile(args.mInputCombined); } else if (null != args.mInputUnigramXml) { return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml); } else { @@ -258,6 +286,32 @@ public class DictionaryMaker { } /** + * Read a dictionary from the name of a combined file. + * + * @param combinedFilename the name of the file in the combined format. + * @return the read dictionary. + * @throws FileNotFoundException if the file can't be found + * @throws IOException if the input file can't be read + */ + private static FusionDictionary readCombinedFile(final String combinedFilename) + throws FileNotFoundException, IOException { + FileInputStream inStream = null; + try { + final File file = new File(combinedFilename); + inStream = new FileInputStream(file); + return CombinedInputOutput.readDictionaryCombined(inStream); + } finally { + if (null != inStream) { + try { + inStream.close(); + } catch (IOException e) { + // do nothing + } + } + } + } + + /** * Read a dictionary from a unigram XML file, and optionally a bigram XML file. * * @param unigramXmlFilename the name of the unigram XML file. May not be null. @@ -298,6 +352,9 @@ public class DictionaryMaker { if (null != args.mOutputXml) { writeXmlDictionary(args.mOutputXml, dict); } + if (null != args.mOutputCombined) { + writeCombinedDictionary(args.mOutputCombined, dict); + } } /** @@ -328,6 +385,21 @@ public class DictionaryMaker { */ private static void writeXmlDictionary(final String outputFilename, final FusionDictionary dict) throws FileNotFoundException, IOException { - XmlDictInputOutput.writeDictionaryXml(new FileWriter(outputFilename), dict); + XmlDictInputOutput.writeDictionaryXml(new BufferedWriter(new FileWriter(outputFilename)), + dict); + } + + /** + * Write the dictionary in the combined format to the specified filename. + * + * @param outputFilename the name of the file to write to. + * @param dict the dictionary to write. + * @throws FileNotFoundException if the output file can't be created. + * @throws IOException if the output file can't be written to. + */ + private static void writeCombinedDictionary(final String outputFilename, + final FusionDictionary dict) throws FileNotFoundException, IOException { + CombinedInputOutput.writeDictionaryCombined( + new BufferedWriter(new FileWriter(outputFilename)), dict); } } diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/Dicttool.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Dicttool.java index bf417fb5a..75ce104e0 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/Dicttool.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Dicttool.java @@ -80,6 +80,7 @@ public class Dicttool { } catch (Exception e) { System.out.println("Exception while processing command " + command.getClass().getSimpleName() + " : " + e); + e.printStackTrace(); return; } } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java new file mode 100644 index 000000000..be4b2b881 --- /dev/null +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java @@ -0,0 +1,131 @@ +/** + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtils.DecoderChainSpec; +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; +import com.android.inputmethod.latin.makedict.FormatSpec; +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; +import com.android.inputmethod.latin.makedict.UnsupportedFormatException; +import com.android.inputmethod.latin.makedict.Word; + +import org.xml.sax.SAXException; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; + +import javax.xml.parsers.ParserConfigurationException; + +public class Info extends Dicttool.Command { + public static final String COMMAND = "info"; + + public Info() { + } + + @Override + public String getHelp() { + return COMMAND + "<filename>: prints various information about a dictionary file"; + } + + private static void crash(final String filename, final Exception e) { + throw new RuntimeException("Can't read file " + filename, e); + } + + private static FusionDictionary getDictionary(final String filename) { + final File file = new File(filename); + System.out.println("Dictionary : " + file.getAbsolutePath()); + System.out.println("Size : " + file.length() + " bytes"); + try { + if (XmlDictInputOutput.isXmlUnigramDictionary(filename)) { + System.out.println("Format : XML unigram list"); + return XmlDictInputOutput.readDictionaryXml( + new BufferedInputStream(new FileInputStream(file)), + null /* shortcuts */, null /* bigrams */); + } else if (CombinedInputOutput.isCombinedDictionary(filename)) { + System.out.println("Format : Combined format"); + return CombinedInputOutput.readDictionaryCombined( + new BufferedInputStream(new FileInputStream(file))); + } else { + final DecoderChainSpec decodedSpec = + BinaryDictOffdeviceUtils.getRawBinaryDictionaryOrNull(file); + if (null == decodedSpec) { + crash(filename, new RuntimeException( + filename + " does not seem to be a dictionary file")); + } + final FileInputStream inStream = new FileInputStream(decodedSpec.mFile); + final ByteBuffer buffer = inStream.getChannel().map( + FileChannel.MapMode.READ_ONLY, 0, decodedSpec.mFile.length()); + System.out.println("Format : Binary dictionary format"); + System.out.println("Packaging : " + decodedSpec.describeChain()); + System.out.println("Uncompressed size : " + decodedSpec.mFile.length()); + return BinaryDictInputOutput.readDictionaryBinary( + new BinaryDictInputOutput.ByteBufferWrapper(buffer), null); + } + } catch (IOException e) { + crash(filename, e); + } catch (SAXException e) { + crash(filename, e); + } catch (ParserConfigurationException e) { + crash(filename, e); + } catch (UnsupportedFormatException e) { + crash(filename, e); + } + return null; + } + + private static void showInfo(final FusionDictionary dict) { + System.out.println("Header attributes :"); + System.out.print(dict.mOptions.toString(2)); + int wordCount = 0; + int bigramCount = 0; + int shortcutCount = 0; + int whitelistCount = 0; + for (final Word w : dict) { + ++wordCount; + if (null != w.mBigrams) { + bigramCount += w.mBigrams.size(); + } + if (null != w.mShortcutTargets) { + shortcutCount += w.mShortcutTargets.size(); + for (WeightedString shortcutTarget : w.mShortcutTargets) { + if (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY == shortcutTarget.mFrequency) { + ++whitelistCount; + } + } + } + } + System.out.println("Words in the dictionary : " + wordCount); + System.out.println("Bigram count : " + bigramCount); + System.out.println("Shortcuts : " + shortcutCount + " (out of which " + whitelistCount + + " whitelist entries)"); + } + + @Override + public void run() { + if (mArgs.length < 1) { + throw new RuntimeException("Not enough arguments for command " + COMMAND); + } + final String filename = mArgs[0]; + final FusionDictionary dict = getDictionary(filename); + showInfo(dict); + } +} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/Makedict.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java index c004cfbe4..c004cfbe4 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/Makedict.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/XmlDictInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java index 252c3d655..d8d94a13c 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/XmlDictInputOutput.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java @@ -22,6 +22,10 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.Node; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.makedict.Word; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.Writer; @@ -264,6 +268,35 @@ public class XmlDictInputOutput { } /** + * Basic test to find out whether the file is in the unigram XML format or not. + * + * Concretely this only tests the header line. + * + * @param filename The name of the file to test. + * @return true if the file is in the unigram XML format, false otherwise + */ + public static boolean isXmlUnigramDictionary(final String filename) { + BufferedReader reader = null; + try { + reader = new BufferedReader(new FileReader(new File(filename))); + final String firstLine = reader.readLine(); + return firstLine.matches("^\\s*<wordlist .*>\\s*$"); + } catch (FileNotFoundException e) { + return false; + } catch (IOException e) { + return false; + } finally { + if (reader != null) { + try { + reader.close(); + } catch (IOException e) { + // do nothing + } + } + } + } + + /** * Reads a dictionary from an XML file. * * This is the public method that will parse an XML file and return the corresponding memory diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/MakedictLog.java b/tools/dicttool/src/com/android/inputmethod/latin/makedict/MakedictLog.java index 7eccff2b4..7eccff2b4 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/MakedictLog.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/makedict/MakedictLog.java diff --git a/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java b/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java new file mode 100644 index 000000000..7a686e556 --- /dev/null +++ b/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import com.android.inputmethod.latin.makedict.UnsupportedFormatException; + +import junit.framework.TestCase; + +import java.io.File; +import java.io.BufferedOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.ArrayList; +import java.util.HashMap; + +/** + * Unit tests for BinaryDictOffdeviceUtilsTests + */ +public class BinaryDictOffdeviceUtilsTests extends TestCase { + private static final int TEST_FREQ = 37; // Some arbitrary value unlikely to happen by chance + + public void testGetRawDictWorks() throws IOException, UnsupportedFormatException { + // Create a thrice-compressed dictionary file. + final FusionDictionary dict = new FusionDictionary(new Node(), + new DictionaryOptions(new HashMap<String, String>(), + false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */)); + dict.add("foo", TEST_FREQ, null, false /* isNotAWord */); + dict.add("fta", 1, null, false /* isNotAWord */); + dict.add("ftb", 1, null, false /* isNotAWord */); + dict.add("bar", 1, null, false /* isNotAWord */); + dict.add("fool", 1, null, false /* isNotAWord */); + + final File dst = File.createTempFile("testGetRawDict", ".tmp"); + final OutputStream out = Compress.getCompressedStream( + Compress.getCompressedStream( + Compress.getCompressedStream( + new BufferedOutputStream(new FileOutputStream(dst))))); + + BinaryDictInputOutput.writeDictionaryBinary(out, dict, new FormatOptions(2, false)); + + // Test for an actually compressed dictionary and its contents + final BinaryDictOffdeviceUtils.DecoderChainSpec decodeSpec = + BinaryDictOffdeviceUtils.getRawBinaryDictionaryOrNull(dst); + for (final String step : decodeSpec.mDecoderSpec) { + assertEquals("Wrong decode spec", BinaryDictOffdeviceUtils.COMPRESSION, step); + } + assertEquals("Wrong decode spec", 3, decodeSpec.mDecoderSpec.size()); + final FileInputStream inStream = new FileInputStream(decodeSpec.mFile); + final ByteBuffer buffer = inStream.getChannel().map( + FileChannel.MapMode.READ_ONLY, 0, decodeSpec.mFile.length()); + final FusionDictionary resultDict = BinaryDictInputOutput.readDictionaryBinary( + new BinaryDictInputOutput.ByteBufferWrapper(buffer), + null /* dict : an optional dictionary to add words to, or null */); + assertEquals("Dictionary can't be read back correctly", + resultDict.findWordInTree(resultDict.mRoot, "foo").getFrequency(), TEST_FREQ); + } + + public void testGetRawDictFails() throws IOException { + // Randomly create some 4k file containing garbage + final File dst = File.createTempFile("testGetRawDict", ".tmp"); + final OutputStream out = new BufferedOutputStream(new FileOutputStream(dst)); + for (int i = 0; i < 1024; ++i) { + out.write(0x12345678); + } + out.close(); + + // Test that a random data file actually fails + assertNull("Wrongly identified data file", + BinaryDictOffdeviceUtils.getRawBinaryDictionaryOrNull(dst)); + + final File gzDst = File.createTempFile("testGetRawDict", ".tmp"); + final OutputStream gzOut = + Compress.getCompressedStream(new BufferedOutputStream(new FileOutputStream(gzDst))); + for (int i = 0; i < 1024; ++i) { + gzOut.write(0x12345678); + } + gzOut.close(); + + // Test that a compressed random data file actually fails + assertNull("Wrongly identified data file", + BinaryDictOffdeviceUtils.getRawBinaryDictionaryOrNull(gzDst)); + } +} diff --git a/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictInputOutputTest.java b/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictInputOutputTest.java index 88589b815..096902879 100644 --- a/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictInputOutputTest.java +++ b/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictInputOutputTest.java @@ -19,24 +19,15 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import junit.framework.TestCase; + import java.util.ArrayList; import java.util.HashMap; -import junit.framework.TestCase; - /** * Unit tests for BinaryDictInputOutput. */ public class BinaryDictInputOutputTest extends TestCase { - - public void setUp() throws Exception { - super.setUp(); - } - - public void tearDown() throws Exception { - super.tearDown(); - } - // Test the flattened array contains the expected number of nodes, and // that it does not contain any duplicates. public void testFlattenNodes() { @@ -55,5 +46,4 @@ public class BinaryDictInputOutputTest extends TestCase { assertFalse("Flattened array contained the same node twice", result.contains(n)); } } - } diff --git a/tools/dicttool/tests/etc/test-dicttool.sh b/tools/dicttool/tests/etc/test-dicttool.sh index 8834611cd..0f3ed6d62 100755 --- a/tools/dicttool/tests/etc/test-dicttool.sh +++ b/tools/dicttool/tests/etc/test-dicttool.sh @@ -13,4 +13,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -java -classpath ${ANDROID_HOST_OUT}/framework/junit.jar:${ANDROID_HOST_OUT}/../common/obj/JAVA_LIBRARIES/dicttool_intermediates/classes junit.textui.TestRunner com.android.inputmethod.latin.makedict.BinaryDictInputOutputTest +java -classpath ${ANDROID_HOST_OUT}/framework/junit.jar:${ANDROID_HOST_OUT}/framework/dicttool_aosp.jar junit.textui.TestRunner com.android.inputmethod.latin.makedict.BinaryDictInputOutputTest +java -classpath ${ANDROID_HOST_OUT}/framework/junit.jar:${ANDROID_HOST_OUT}/framework/dicttool_aosp.jar junit.textui.TestRunner com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtilsTests diff --git a/tools/maketext/res/com/android/inputmethod/keyboard/internal/KeyboardTextsSet.tmpl b/tools/maketext/res/com/android/inputmethod/keyboard/internal/KeyboardTextsSet.tmpl index 774094cd7..15aea9084 100644 --- a/tools/maketext/res/com/android/inputmethod/keyboard/internal/KeyboardTextsSet.tmpl +++ b/tools/maketext/res/com/android/inputmethod/keyboard/internal/KeyboardTextsSet.tmpl @@ -19,6 +19,7 @@ package com.android.inputmethod.keyboard.internal; import android.content.Context; import android.content.res.Resources; +import com.android.inputmethod.annotations.VisibleForTesting; import com.android.inputmethod.latin.CollectionUtils; import com.android.inputmethod.latin.R; @@ -64,7 +65,7 @@ public final class KeyboardTextsSet { loadStringResourcesInternal(context, RESOURCE_NAMES, R.string.english_ime_name); } - /* package for test */ + @VisibleForTesting void loadStringResourcesInternal(Context context, final String[] resourceNames, int referenceId) { final Resources res = context.getResources(); |