diff options
Diffstat (limited to 'tools/dicttool')
19 files changed, 1242 insertions, 114 deletions
diff --git a/tools/dicttool/Android.mk b/tools/dicttool/Android.mk index 5bd836a01..666887a2e 100644 --- a/tools/dicttool/Android.mk +++ b/tools/dicttool/Android.mk @@ -16,19 +16,27 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) -LATINIME_CORE_SOURCE_DIRECTORY := ../../java/src/com/android/inputmethod/latin +LATINIME_BASE_SOURCE_DIRECTORY := ../../java/src/com/android/inputmethod +LATINIME_CORE_SOURCE_DIRECTORY := $(LATINIME_BASE_SOURCE_DIRECTORY)/latin +LATINIME_ANNOTATIONS_SOURCE_DIRECTORY := $(LATINIME_BASE_SOURCE_DIRECTORY)/annotations MAKEDICT_CORE_SOURCE_DIRECTORY := $(LATINIME_CORE_SOURCE_DIRECTORY)/makedict -LOCAL_MAIN_SRC_FILES := $(call all-java-files-under,$(MAKEDICT_CORE_SOURCE_DIRECTORY)) -LOCAL_TOOL_SRC_FILES := $(call all-java-files-under,src) +LOCAL_MAIN_SRC_FILES := $(call all-java-files-under, $(MAKEDICT_CORE_SOURCE_DIRECTORY)) +LOCAL_TOOL_SRC_FILES := $(call all-java-files-under, src) +LOCAL_ANNOTATIONS_SRC_FILES := \ + $(call all-java-files-under, $(LATINIME_ANNOTATIONS_SOURCE_DIRECTORY)) LOCAL_SRC_FILES := $(LOCAL_TOOL_SRC_FILES) \ $(filter-out $(addprefix %/, $(notdir $(LOCAL_TOOL_SRC_FILES))), $(LOCAL_MAIN_SRC_FILES)) \ - $(call all-java-files-under,tests) \ + $(LOCAL_ANNOTATIONS_SRC_FILES) \ $(LATINIME_CORE_SOURCE_DIRECTORY)/Constants.java +ifeq ($(DICTTOOL_UNITTEST), true) + LOCAL_SRC_FILES += $(call all-java-files-under, tests) + LOCAL_JAVA_LIBRARIES := junit +endif + LOCAL_JAR_MANIFEST := etc/manifest.txt LOCAL_MODULE := dicttool_aosp -LOCAL_JAVA_LIBRARIES := junit include $(BUILD_HOST_JAVA_LIBRARY) include $(LOCAL_PATH)/etc/Android.mk diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/AdditionalCommandList.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/AdditionalCommandList.java deleted file mode 100644 index 8d4eb751b..000000000 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/AdditionalCommandList.java +++ /dev/null @@ -1,22 +0,0 @@ -/** - * Copyright (C) 2012 The Android Open Source 
Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ - -package com.android.inputmethod.latin.dicttool; - -public class AdditionalCommandList { - public static void populate() { - } -} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/Info.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/Info.java deleted file mode 100644 index e59261706..000000000 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/Info.java +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Copyright (C) 2012 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. 
- */ - -package com.android.inputmethod.latin.dicttool; - -public class Info extends Dicttool.Command { - public static final String COMMAND = "info"; - - public Info() { - } - - public String getHelp() { - return "info <filename>: prints various information about a dictionary file"; - } - - public void run() { - // TODO: implement this - if (mArgs.length < 1) { - throw new RuntimeException("Not enough arguments for command " + COMMAND); - } - System.out.println("Not implemented yet"); - } -} diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java new file mode 100644 index 000000000..c2c77d61a --- /dev/null +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java @@ -0,0 +1,211 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.UnsupportedFormatException; + +import org.xml.sax.SAXException; + +import java.io.File; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.ArrayList; + +import javax.xml.parsers.ParserConfigurationException; + +/** + * Class grouping utilities for offline dictionary making. + * + * Those should not be used on-device, essentially because they are quite + * liberal about I/O and performance. + */ +public final class BinaryDictOffdeviceUtils { + // Prefix and suffix are arbitrary, the values do not really matter + private final static String PREFIX = "dicttool"; + private final static String SUFFIX = ".tmp"; + + public final static String COMPRESSION = "compressed"; + public final static String ENCRYPTION = "encrypted"; + + private final static int MAX_DECODE_DEPTH = 8; + + public static class DecoderChainSpec { + ArrayList<String> mDecoderSpec = new ArrayList<String>(); + File mFile; + public DecoderChainSpec addStep(final String stepDescription) { + mDecoderSpec.add(stepDescription); + return this; + } + public String describeChain() { + final StringBuilder s = new StringBuilder("raw"); + for (final String step : mDecoderSpec) { + s.append(" > "); + s.append(step); + } + return s.toString(); + } + } + + public static void copy(final InputStream input, final OutputStream output) throws IOException { + final byte[] buffer = new byte[1000]; + final BufferedInputStream in = new BufferedInputStream(input); + final BufferedOutputStream out = new BufferedOutputStream(output); + 
for (int readBytes = in.read(buffer); readBytes >= 0; readBytes = in.read(buffer)) + output.write(buffer, 0, readBytes); + in.close(); + out.close(); + } + + /** + * Returns a decrypted/uncompressed binary dictionary. + * + * This will decrypt/uncompress any number of times as necessary until it finds the binary + * dictionary signature, and copy the decoded file to a temporary place. + * If this is not a binary dictionary, the method returns null. + */ + public static DecoderChainSpec getRawBinaryDictionaryOrNull(final File src) { + return getRawBinaryDictionaryOrNullInternal(new DecoderChainSpec(), src, 0); + } + + private static DecoderChainSpec getRawBinaryDictionaryOrNullInternal( + final DecoderChainSpec spec, final File src, final int depth) { + // Unfortunately the decoding scheme we use can consider any data to be encrypted + // and will product some output, meaning it's not possible to reliably detect encrypted + // data. Thus, some non-dictionary files (especially small) ones may successfully decrypt + // over and over, ending in a stack overflow. Hence we limit the depth at which we try + // decoding the file. + if (depth > MAX_DECODE_DEPTH) return null; + if (BinaryDictInputOutput.isBinaryDictionary(src)) { + spec.mFile = src; + return spec; + } + // It's not a raw dictionary - try to see if it's compressed. + final File uncompressedFile = tryGetUncompressedFile(src); + if (null != uncompressedFile) { + final DecoderChainSpec newSpec = + getRawBinaryDictionaryOrNullInternal(spec, uncompressedFile, depth + 1); + if (null == newSpec) return null; + return newSpec.addStep(COMPRESSION); + } + // It's not a compressed either - try to see if it's crypted. 
+ final File decryptedFile = tryGetDecryptedFile(src); + if (null != decryptedFile) { + final DecoderChainSpec newSpec = + getRawBinaryDictionaryOrNullInternal(spec, decryptedFile, depth + 1); + if (null == newSpec) return null; + return newSpec.addStep(ENCRYPTION); + } + return null; + } + + /* Try to uncompress the file passed as an argument. + * + * If the file can be uncompressed, the uncompressed version is returned. Otherwise, null + * is returned. + */ + private static File tryGetUncompressedFile(final File src) { + try { + final File dst = File.createTempFile(PREFIX, SUFFIX); + dst.deleteOnExit(); + final FileOutputStream dstStream = new FileOutputStream(dst); + copy(Compress.getUncompressedStream(new BufferedInputStream(new FileInputStream(src))), + new BufferedOutputStream(dstStream)); // #copy() closes the streams + return dst; + } catch (IOException e) { + // Could not uncompress the file: presumably the file is simply not a compressed file + return null; + } + } + + /* Try to decrypt the file passed as an argument. + * + * If the file can be decrypted, the decrypted version is returned. Otherwise, null + * is returned. 
+ */ + private static File tryGetDecryptedFile(final File src) { + try { + final File dst = File.createTempFile(PREFIX, SUFFIX); + dst.deleteOnExit(); + final FileOutputStream dstStream = new FileOutputStream(dst); + copy(Crypt.getDecryptedStream(new BufferedInputStream(new FileInputStream(src))), + dstStream); // #copy() closes the streams + return dst; + } catch (IOException e) { + // Could not decrypt the file: presumably the file is simply not a crypted file + return null; + } + } + + static void crash(final String filename, final Exception e) { + throw new RuntimeException("Can't read file " + filename, e); + } + + static FusionDictionary getDictionary(final String filename, final boolean report) { + final File file = new File(filename); + if (report) { + System.out.println("Dictionary : " + file.getAbsolutePath()); + System.out.println("Size : " + file.length() + " bytes"); + } + try { + if (XmlDictInputOutput.isXmlUnigramDictionary(filename)) { + if (report) System.out.println("Format : XML unigram list"); + return XmlDictInputOutput.readDictionaryXml( + new BufferedInputStream(new FileInputStream(file)), + null /* shortcuts */, null /* bigrams */); + } else if (CombinedInputOutput.isCombinedDictionary(filename)) { + if (report) System.out.println("Format : Combined format"); + return CombinedInputOutput.readDictionaryCombined( + new BufferedInputStream(new FileInputStream(file))); + } else { + final DecoderChainSpec decodedSpec = getRawBinaryDictionaryOrNull(file); + if (null == decodedSpec) { + crash(filename, new RuntimeException( + filename + " does not seem to be a dictionary file")); + } else { + final FileInputStream inStream = new FileInputStream(decodedSpec.mFile); + final ByteBuffer buffer = inStream.getChannel().map( + FileChannel.MapMode.READ_ONLY, 0, decodedSpec.mFile.length()); + if (report) { + System.out.println("Format : Binary dictionary format"); + System.out.println("Packaging : " + decodedSpec.describeChain()); + 
System.out.println("Uncompressed size : " + decodedSpec.mFile.length()); + } + return BinaryDictInputOutput.readDictionaryBinary( + new BinaryDictInputOutput.ByteBufferWrapper(buffer), null); + } + } + } catch (IOException e) { + crash(filename, e); + } catch (SAXException e) { + crash(filename, e); + } catch (ParserConfigurationException e) { + crash(filename, e); + } catch (UnsupportedFormatException e) { + crash(filename, e); + } + return null; + } +} diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java new file mode 100644 index 000000000..092ee767f --- /dev/null +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java @@ -0,0 +1,248 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.FormatSpec; +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; +import com.android.inputmethod.latin.makedict.Word; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Writer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.TreeSet; + +/** + * Reads and writes combined format for a FusionDictionary. + * + * All functions in this class are static. + */ +public class CombinedInputOutput { + + private static final String DICTIONARY_TAG = "dictionary"; + private static final String BIGRAM_TAG = "bigram"; + private static final String SHORTCUT_TAG = "shortcut"; + private static final String FREQUENCY_TAG = "f"; + private static final String WORD_TAG = "word"; + private static final String NOT_A_WORD_TAG = "not_a_word"; + private static final String WHITELIST_TAG = "whitelist"; + private static final String OPTIONS_TAG = "options"; + private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing"; + private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing"; + private static final String COMMENT_LINE_STARTER = "#"; + + /** + * Basic test to find out whether the file is in the combined format or not. + * + * Concretely this only tests the header line. + * + * @param filename The name of the file to test. 
+ * @return true if the file is in the combined format, false otherwise + */ + public static boolean isCombinedDictionary(final String filename) { + BufferedReader reader = null; + try { + reader = new BufferedReader(new FileReader(new File(filename))); + String firstLine = reader.readLine(); + while (firstLine.startsWith(COMMENT_LINE_STARTER)) { + firstLine = reader.readLine(); + } + return firstLine.matches("^" + DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*"); + } catch (FileNotFoundException e) { + return false; + } catch (IOException e) { + return false; + } finally { + if (reader != null) { + try { + reader.close(); + } catch (IOException e) { + // do nothing + } + } + } + } + + /** + * Reads a dictionary from a combined format file. + * + * This is the public method that will read a combined file and return the corresponding memory + * representation. + * + * @param source the file to read the data from. + * @return the in-memory representation of the dictionary. + */ + public static FusionDictionary readDictionaryCombined(final InputStream source) + throws IOException { + final BufferedReader reader = new BufferedReader(new InputStreamReader(source, "UTF-8")); + String headerLine = reader.readLine(); + while (headerLine.startsWith(COMMENT_LINE_STARTER)) { + headerLine = reader.readLine(); + } + final String header[] = headerLine.split(","); + final HashMap<String, String> attributes = new HashMap<String, String>(); + for (String item : header) { + final String keyValue[] = item.split("="); + if (2 != keyValue.length) { + throw new RuntimeException("Wrong header format : " + headerLine); + } + attributes.put(keyValue[0], keyValue[1]); + } + + final boolean processUmlauts = + GERMAN_UMLAUT_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG)); + final boolean processLigatures = + FRENCH_LIGATURE_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG)); + attributes.remove(OPTIONS_TAG); + final FusionDictionary dict = new FusionDictionary(new Node(), new 
DictionaryOptions( + attributes, processUmlauts, processLigatures)); + + String line; + String word = null; + int freq = 0; + boolean isNotAWord = false; + ArrayList<WeightedString> bigrams = new ArrayList<WeightedString>(); + ArrayList<WeightedString> shortcuts = new ArrayList<WeightedString>(); + while (null != (line = reader.readLine())) { + if (line.startsWith(COMMENT_LINE_STARTER)) continue; + final String args[] = line.trim().split(","); + if (args[0].matches(WORD_TAG + "=.*")) { + if (null != word) { + dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord); + for (WeightedString s : bigrams) { + dict.setBigram(word, s.mWord, s.mFrequency); + } + } + if (!shortcuts.isEmpty()) shortcuts = new ArrayList<WeightedString>(); + if (!bigrams.isEmpty()) bigrams = new ArrayList<WeightedString>(); + isNotAWord = false; + for (String param : args) { + final String params[] = param.split("=", 2); + if (2 != params.length) throw new RuntimeException("Wrong format : " + line); + if (WORD_TAG.equals(params[0])) { + word = params[1]; + } else if (FREQUENCY_TAG.equals(params[0])) { + freq = Integer.parseInt(params[1]); + } else if (NOT_A_WORD_TAG.equals(params[0])) { + isNotAWord = "true".equals(params[1]); + } + } + } else if (args[0].matches(SHORTCUT_TAG + "=.*")) { + String shortcut = null; + int shortcutFreq = 0; + for (String param : args) { + final String params[] = param.split("=", 2); + if (2 != params.length) throw new RuntimeException("Wrong format : " + line); + if (SHORTCUT_TAG.equals(params[0])) { + shortcut = params[1]; + } else if (FREQUENCY_TAG.equals(params[0])) { + shortcutFreq = WHITELIST_TAG.equals(params[1]) + ? 
FormatSpec.SHORTCUT_WHITELIST_FREQUENCY + : Integer.parseInt(params[1]); + } + } + if (null != shortcut) { + shortcuts.add(new WeightedString(shortcut, shortcutFreq)); + } else { + throw new RuntimeException("Wrong format : " + line); + } + } else if (args[0].matches(BIGRAM_TAG + "=.*")) { + String secondWordOfBigram = null; + int bigramFreq = 0; + for (String param : args) { + final String params[] = param.split("=", 2); + if (2 != params.length) throw new RuntimeException("Wrong format : " + line); + if (BIGRAM_TAG.equals(params[0])) { + secondWordOfBigram = params[1]; + } else if (FREQUENCY_TAG.equals(params[0])) { + bigramFreq = Integer.parseInt(params[1]); + } + } + if (null != secondWordOfBigram) { + bigrams.add(new WeightedString(secondWordOfBigram, bigramFreq)); + } else { + throw new RuntimeException("Wrong format : " + line); + } + } + } + if (null != word) { + dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord); + for (WeightedString s : bigrams) { + dict.setBigram(word, s.mWord, s.mFrequency); + } + } + + return dict; + } + + /** + * Writes a dictionary to a combined file. + * + * @param destination a destination stream to write to. + * @param dict the dictionary to write. 
+ */ + public static void writeDictionaryCombined(Writer destination, FusionDictionary dict) + throws IOException { + final TreeSet<Word> set = new TreeSet<Word>(); + for (Word word : dict) { + set.add(word); // This for ordering by frequency, then by asciibetic order + } + final HashMap<String, String> options = dict.mOptions.mAttributes; + destination.write(DICTIONARY_TAG + "="); + if (options.containsKey(DICTIONARY_TAG)) { + destination.write(options.get(DICTIONARY_TAG)); + options.remove(DICTIONARY_TAG); + } + if (dict.mOptions.mGermanUmlautProcessing) { + destination.write("," + OPTIONS_TAG + "=" + GERMAN_UMLAUT_PROCESSING_OPTION); + } else if (dict.mOptions.mFrenchLigatureProcessing) { + destination.write("," + OPTIONS_TAG + "=" + FRENCH_LIGATURE_PROCESSING_OPTION); + } + for (final String key : dict.mOptions.mAttributes.keySet()) { + final String value = dict.mOptions.mAttributes.get(key); + destination.write("," + key + "=" + value); + } + destination.write("\n"); + for (Word word : set) { + destination.write(" " + WORD_TAG + "=" + word.mWord + "," + + FREQUENCY_TAG + "=" + word.mFrequency + + (word.mIsNotAWord ? 
"," + NOT_A_WORD_TAG + "=true\n" : "\n")); + if (null != word.mShortcutTargets) { + for (WeightedString target : word.mShortcutTargets) { + destination.write(" " + SHORTCUT_TAG + "=" + target.mWord + "," + + FREQUENCY_TAG + "=" + target.mFrequency + "\n"); + } + } + if (null != word.mBigrams) { + for (WeightedString bigram : word.mBigrams) { + destination.write(" " + BIGRAM_TAG + "=" + bigram.mWord + "," + + FREQUENCY_TAG + "=" + bigram.mFrequency + "\n"); + } + } + } + destination.close(); + } +} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/CommandList.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CommandList.java index d16b069fe..0e0095bd6 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/CommandList.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CommandList.java @@ -19,8 +19,13 @@ package com.android.inputmethod.latin.dicttool; public class CommandList { public static void populate() { Dicttool.addCommand("info", Info.class); + Dicttool.addCommand("diff", Diff.class); Dicttool.addCommand("compress", Compress.Compressor.class); Dicttool.addCommand("uncompress", Compress.Uncompressor.class); + Dicttool.addCommand("encrypt", Crypt.Encrypter.class); + Dicttool.addCommand("decrypt", Crypt.Decrypter.class); + Dicttool.addCommand("package", Package.Packager.class); + Dicttool.addCommand("unpackage", Package.Unpackager.class); Dicttool.addCommand("makedict", Makedict.class); } } diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/Compress.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Compress.java index 3cb0a12c4..b7f48b522 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/Compress.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Compress.java @@ -16,6 +16,8 @@ package com.android.inputmethod.latin.dicttool; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; import java.io.File; import 
java.io.FileInputStream; import java.io.FileOutputStream; @@ -26,24 +28,19 @@ import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; public class Compress { + private Compress() { + // This container class is not publicly instantiable. + } - private static OutputStream getCompressedStream(final OutputStream out) + public static OutputStream getCompressedStream(final OutputStream out) throws java.io.IOException { return new GZIPOutputStream(out); } - private static InputStream getUncompressedStream(final InputStream in) throws IOException { + public static InputStream getUncompressedStream(final InputStream in) throws IOException { return new GZIPInputStream(in); } - public static void copy(final InputStream input, final OutputStream output) throws IOException { - final byte[] buffer = new byte[1000]; - for (int readBytes = input.read(buffer); readBytes >= 0; readBytes = input.read(buffer)) - output.write(buffer, 0, readBytes); - input.close(); - output.close(); - } - static public class Compressor extends Dicttool.Command { public static final String COMMAND = "compress"; public static final String STDIN_OR_STDOUT = "-"; @@ -51,11 +48,13 @@ public class Compress { public Compressor() { } + @Override public String getHelp() { return COMMAND + " <src_filename> <dst_filename>: " + "Compresses a file using gzip compression"; } + @Override public void run() throws IOException { if (mArgs.length > 2) { throw new RuntimeException("Too many arguments for command " + COMMAND); @@ -63,10 +62,10 @@ public class Compress { final String inFilename = mArgs.length >= 1 ? mArgs[0] : STDIN_OR_STDOUT; final String outFilename = mArgs.length >= 2 ? mArgs[1] : STDIN_OR_STDOUT; final InputStream input = inFilename.equals(STDIN_OR_STDOUT) ? System.in - : new FileInputStream(new File(inFilename)); + : new BufferedInputStream(new FileInputStream(new File(inFilename))); final OutputStream output = outFilename.equals(STDIN_OR_STDOUT) ? 
System.out - : new FileOutputStream(new File(outFilename)); - copy(input, new GZIPOutputStream(output)); + : new BufferedOutputStream(new FileOutputStream(new File(outFilename))); + BinaryDictOffdeviceUtils.copy(input, new GZIPOutputStream(output)); } } @@ -77,11 +76,13 @@ public class Compress { public Uncompressor() { } + @Override public String getHelp() { return COMMAND + " <src_filename> <dst_filename>: " + "Uncompresses a file compressed with gzip compression"; } + @Override public void run() throws IOException { if (mArgs.length > 2) { throw new RuntimeException("Too many arguments for command " + COMMAND); @@ -89,10 +90,10 @@ public class Compress { final String inFilename = mArgs.length >= 1 ? mArgs[0] : STDIN_OR_STDOUT; final String outFilename = mArgs.length >= 2 ? mArgs[1] : STDIN_OR_STDOUT; final InputStream input = inFilename.equals(STDIN_OR_STDOUT) ? System.in - : new FileInputStream(new File(inFilename)); + : new BufferedInputStream(new FileInputStream(new File(inFilename))); final OutputStream output = outFilename.equals(STDIN_OR_STDOUT) ? System.out - : new FileOutputStream(new File(outFilename)); - copy(new GZIPInputStream(input), output); + : new BufferedOutputStream(new FileOutputStream(new File(outFilename))); + BinaryDictOffdeviceUtils.copy(new GZIPInputStream(input), output); } } } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Crypt.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Crypt.java new file mode 100644 index 000000000..f8990231e --- /dev/null +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Crypt.java @@ -0,0 +1,66 @@ +/** + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.android.inputmethod.latin.dicttool; + +import java.io.InputStream; +import java.io.OutputStream; + +public class Crypt { + private Crypt() { + // This container class is not publicly instantiable. + } + + public static OutputStream getCryptedStream(final OutputStream out) { + // Encryption is not supported + return out; + } + + public static InputStream getDecryptedStream(final InputStream in) { + // Decryption is not supported + return in; + } + + static public class Encrypter extends Dicttool.Command { + public static final String COMMAND = "encrypt"; + + public Encrypter() { + } + + public String getHelp() { + return COMMAND + " <src_filename> <dst_filename>: Encrypts a file"; + } + + public void run() { + throw new UnsupportedOperationException(); + } + } + + static public class Decrypter extends Dicttool.Command { + public static final String COMMAND = "decrypt"; + + public Decrypter() { + } + + public String getHelp() { + return COMMAND + " <src_filename> <dst_filename>: Decrypts a file"; + } + + public void run() { + throw new UnsupportedOperationException(); + } + } +} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java index 4f8874985..cc890f60c 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java @@ -22,6 +22,7 @@ import com.android.inputmethod.latin.makedict.FusionDictionary; import 
com.android.inputmethod.latin.makedict.MakedictLog; import com.android.inputmethod.latin.makedict.UnsupportedFormatException; +import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -51,13 +52,16 @@ public class DictionaryMaker { private static final String OPTION_INPUT_SHORTCUT_XML = "-c"; private static final String OPTION_OUTPUT_BINARY = "-d"; private static final String OPTION_OUTPUT_XML = "-x"; + private static final String OPTION_OUTPUT_COMBINED = "-o"; private static final String OPTION_HELP = "-h"; public final String mInputBinary; + public final String mInputCombined; public final String mInputUnigramXml; public final String mInputShortcutXml; public final String mInputBigramXml; public final String mOutputBinary; public final String mOutputXml; + public final String mOutputCombined; public final int mOutputBinaryFormatVersion; private void checkIntegrity() throws IOException { @@ -65,28 +69,38 @@ public class DictionaryMaker { checkHasAtLeastOneOutput(); checkNotSameFile(mInputBinary, mOutputBinary); checkNotSameFile(mInputBinary, mOutputXml); + checkNotSameFile(mInputCombined, mOutputBinary); + checkNotSameFile(mInputCombined, mOutputXml); checkNotSameFile(mInputUnigramXml, mOutputBinary); checkNotSameFile(mInputUnigramXml, mOutputXml); + checkNotSameFile(mInputUnigramXml, mOutputCombined); checkNotSameFile(mInputShortcutXml, mOutputBinary); checkNotSameFile(mInputShortcutXml, mOutputXml); + checkNotSameFile(mInputShortcutXml, mOutputCombined); checkNotSameFile(mInputBigramXml, mOutputBinary); checkNotSameFile(mInputBigramXml, mOutputXml); + checkNotSameFile(mInputBigramXml, mOutputCombined); checkNotSameFile(mOutputBinary, mOutputXml); + checkNotSameFile(mOutputBinary, mOutputCombined); + checkNotSameFile(mOutputXml, mOutputCombined); } private void checkHasExactlyOneInput() { - if (null == mInputUnigramXml && null == mInputBinary) { + if (null == mInputUnigramXml && null == 
mInputBinary && null == mInputCombined) { throw new RuntimeException("No input file specified"); - } else if (null != mInputUnigramXml && null != mInputBinary) { - throw new RuntimeException("Both input XML and binary specified"); - } else if (null != mInputBinary && null != mInputBigramXml) { - throw new RuntimeException("Cannot specify a binary input and a separate bigram " - + "file"); + } else if ((null != mInputUnigramXml && null != mInputBinary) + || (null != mInputUnigramXml && null != mInputCombined) + || (null != mInputBinary && null != mInputCombined)) { + throw new RuntimeException("Several input files specified"); + } else if ((null != mInputBinary || null != mInputCombined) + && (null != mInputBigramXml || null != mInputShortcutXml)) { + throw new RuntimeException("Separate bigrams/shortcut files are only supported" + + " with XML input (other formats include bigrams and shortcuts already)"); } } private void checkHasAtLeastOneOutput() { - if (null == mOutputBinary && null == mOutputXml) { + if (null == mOutputBinary && null == mOutputXml && null == mOutputCombined) { throw new RuntimeException("No output specified"); } } @@ -110,17 +124,16 @@ public class DictionaryMaker { public static String getHelp() { return "Usage: makedict " + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>] " + + "| [-s <combined format input]" + "| [-s <binary input>] [-d <binary output>] [-x <xml output>] " + + " [-o <combined output>]" + "[-1] [-2] [-3]\n" + "\n" + " Converts a source dictionary file to one or several outputs.\n" + " Source can be an XML file, with an optional XML bigrams file, or a\n" + " binary dictionary file.\n" - + " Binary version 1 (Ice Cream Sandwich), 2 (Jelly Bean), 3 and XML outputs\n" - + " are supported. All three can be output at the same time, but the same\n" - + " output format cannot be specified several times. 
The behavior is\n" - + " unspecified if the same file is specified for input and output, or for\n" - + " several outputs."; + + " Binary version 1 (Ice Cream Sandwich), 2 (Jelly Bean), 3, XML and\n" + + " combined format outputs are supported."; } public Arguments(String[] argsArray) throws IOException { @@ -129,11 +142,13 @@ public class DictionaryMaker { displayHelp(); } String inputBinary = null; + String inputCombined = null; String inputUnigramXml = null; String inputShortcutXml = null; String inputBigramXml = null; String outputBinary = null; String outputXml = null; + String outputCombined = null; int outputBinaryFormatVersion = 2; // the default version is 2. while (!args.isEmpty()) { @@ -157,10 +172,15 @@ public class DictionaryMaker { String filename = args.get(0); args.remove(0); if (OPTION_INPUT_SOURCE.equals(arg)) { - if (BinaryDictInputOutput.isBinaryDictionary(filename)) { + if (XmlDictInputOutput.isXmlUnigramDictionary(filename)) { + inputUnigramXml = filename; + } else if (CombinedInputOutput.isCombinedDictionary(filename)) { + inputCombined = filename; + } else if (BinaryDictInputOutput.isBinaryDictionary(filename)) { inputBinary = filename; } else { - inputUnigramXml = filename; + throw new IllegalArgumentException( + "Unknown format for file " + filename); } } else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) { inputShortcutXml = filename; @@ -170,6 +190,8 @@ public class DictionaryMaker { outputBinary = filename; } else if (OPTION_OUTPUT_XML.equals(arg)) { outputXml = filename; + } else if (OPTION_OUTPUT_COMBINED.equals(arg)) { + outputCombined = filename; } else { throw new IllegalArgumentException("Unknown option : " + arg); } @@ -178,6 +200,8 @@ public class DictionaryMaker { if (null == inputBinary && null == inputUnigramXml) { if (BinaryDictInputOutput.isBinaryDictionary(arg)) { inputBinary = arg; + } else if (CombinedInputOutput.isCombinedDictionary(arg)) { + inputCombined = arg; } else { inputUnigramXml = arg; } @@ -190,11 +214,13 @@ 
public class DictionaryMaker { } mInputBinary = inputBinary; + mInputCombined = inputCombined; mInputUnigramXml = inputUnigramXml; mInputShortcutXml = inputShortcutXml; mInputBigramXml = inputBigramXml; mOutputBinary = outputBinary; mOutputXml = outputXml; + mOutputCombined = outputCombined; mOutputBinaryFormatVersion = outputBinaryFormatVersion; checkIntegrity(); } @@ -219,6 +245,8 @@ public class DictionaryMaker { SAXException, FileNotFoundException { if (null != args.mInputBinary) { return readBinaryFile(args.mInputBinary); + } else if (null != args.mInputCombined) { + return readCombinedFile(args.mInputCombined); } else if (null != args.mInputUnigramXml) { return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml); } else { @@ -258,6 +286,32 @@ public class DictionaryMaker { } /** + * Read a dictionary from the name of a combined file. + * + * @param combinedFilename the name of the file in the combined format. + * @return the read dictionary. + * @throws FileNotFoundException if the file can't be found + * @throws IOException if the input file can't be read + */ + private static FusionDictionary readCombinedFile(final String combinedFilename) + throws FileNotFoundException, IOException { + FileInputStream inStream = null; + try { + final File file = new File(combinedFilename); + inStream = new FileInputStream(file); + return CombinedInputOutput.readDictionaryCombined(inStream); + } finally { + if (null != inStream) { + try { + inStream.close(); + } catch (IOException e) { + // do nothing + } + } + } + } + + /** * Read a dictionary from a unigram XML file, and optionally a bigram XML file. * * @param unigramXmlFilename the name of the unigram XML file. May not be null. 
@@ -298,6 +352,9 @@ public class DictionaryMaker { if (null != args.mOutputXml) { writeXmlDictionary(args.mOutputXml, dict); } + if (null != args.mOutputCombined) { + writeCombinedDictionary(args.mOutputCombined, dict); + } } /** @@ -328,6 +385,21 @@ public class DictionaryMaker { */ private static void writeXmlDictionary(final String outputFilename, final FusionDictionary dict) throws FileNotFoundException, IOException { - XmlDictInputOutput.writeDictionaryXml(new FileWriter(outputFilename), dict); + XmlDictInputOutput.writeDictionaryXml(new BufferedWriter(new FileWriter(outputFilename)), + dict); + } + + /** + * Write the dictionary in the combined format to the specified filename. + * + * @param outputFilename the name of the file to write to. + * @param dict the dictionary to write. + * @throws FileNotFoundException if the output file can't be created. + * @throws IOException if the output file can't be written to. + */ + private static void writeCombinedDictionary(final String outputFilename, + final FusionDictionary dict) throws FileNotFoundException, IOException { + CombinedInputOutput.writeDictionaryCombined( + new BufferedWriter(new FileWriter(outputFilename)), dict); } } diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/Dicttool.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Dicttool.java index bf417fb5a..7b311c3ec 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/Dicttool.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Dicttool.java @@ -33,7 +33,6 @@ public class Dicttool { new HashMap<String, Class<? extends Command>>(); static { CommandList.populate(); - AdditionalCommandList.populate(); } public static void addCommand(final String commandName, final Class<? 
extends Command> cls) { sCommands.put(commandName, cls); @@ -80,6 +79,7 @@ public class Dicttool { } catch (Exception e) { System.out.println("Exception while processing command " + command.getClass().getSimpleName() + " : " + e); + e.printStackTrace(); return; } } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java new file mode 100644 index 000000000..5c3e87e10 --- /dev/null +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java @@ -0,0 +1,214 @@ +/** + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; +import com.android.inputmethod.latin.makedict.Word; + +import java.util.Arrays; +import java.util.ArrayList; +import java.util.HashMap; + +public class Diff extends Dicttool.Command { + public static final String COMMAND = "diff"; + + public Diff() { + } + + @Override + public String getHelp() { + return COMMAND + " [-p] <dict> <dict> : shows differences between two dictionaries.\n" + + " If -p (plumbing) option is given, produce output suitable for a script"; + } + + @Override + public void run() { + if (mArgs.length < 2) { + throw new RuntimeException("Not enough arguments for command " + COMMAND); + } + final boolean plumbing; + if ("-p".equals(mArgs[0])) { + plumbing = true; + mArgs = Arrays.copyOfRange(mArgs, 1, mArgs.length); + if (mArgs.length != 2) { // There should be only 2 arguments left + throw new RuntimeException("Wrong number of arguments for command " + COMMAND); + } + } else { + plumbing = false; + } + final FusionDictionary dict0 = + BinaryDictOffdeviceUtils.getDictionary(mArgs[0], false /* report */); + if (null == dict0) throw new RuntimeException("Can't read dictionary " + mArgs[0]); + final FusionDictionary dict1 = + BinaryDictOffdeviceUtils.getDictionary(mArgs[1], false /* report */); + if (null == dict1) throw new RuntimeException("Can't read dictionary " + mArgs[1]); + if (!plumbing) { + System.out.println("Header :"); + diffHeaders(dict0, dict1); + if (languageDiffers(dict0, dict1)) { + // We only check for the language here. 
The rationale is that one may meaningfully + // diff a en_US with a en_GB dictionary, but someone who diffs a de dict with a + // pt_BR dict is almost certainly only interested in header-level diff, and the word + // diff would be very large, meaningless, and annoying. + return; + } + System.out.println("Body :"); + } + diffWords(dict0, dict1); + } + + private static boolean languageDiffers(final FusionDictionary dict0, + final FusionDictionary dict1) { + // If either of the dictionaries have no locale, assume it's okay + if (null == dict0.mOptions.mAttributes.get("locale")) return false; + if (null == dict1.mOptions.mAttributes.get("locale")) return false; + final String dict0Lang = dict0.mOptions.mAttributes.get("locale").split("_", 3)[0]; + final String dict1Lang = dict1.mOptions.mAttributes.get("locale").split("_", 3)[0]; + return !dict0Lang.equals(dict1Lang); + } + + private static void diffHeaders(final FusionDictionary dict0, final FusionDictionary dict1) { + boolean hasDifferences = false; + if (dict0.mOptions.mFrenchLigatureProcessing != dict1.mOptions.mFrenchLigatureProcessing) { + System.out.println(" French ligature processing : " + + dict0.mOptions.mFrenchLigatureProcessing + " <=> " + + dict1.mOptions.mFrenchLigatureProcessing); + hasDifferences = true; + } + if (dict0.mOptions.mGermanUmlautProcessing != dict1.mOptions.mGermanUmlautProcessing) { + System.out.println(" German umlaut processing : " + + dict0.mOptions.mGermanUmlautProcessing + " <=> " + + dict1.mOptions.mGermanUmlautProcessing); + hasDifferences = true; + } + final HashMap<String, String> options1 = + new HashMap<String, String>(dict1.mOptions.mAttributes); + for (final String optionKey : dict0.mOptions.mAttributes.keySet()) { + if (!dict0.mOptions.mAttributes.get(optionKey).equals( + dict1.mOptions.mAttributes.get(optionKey))) { + System.out.println(" " + optionKey + " : " + + dict0.mOptions.mAttributes.get(optionKey) + " <=> " + + dict1.mOptions.mAttributes.get(optionKey)); + 
hasDifferences = true; + } + options1.remove(optionKey); + } + for (final String optionKey : options1.keySet()) { + System.out.println(" " + optionKey + " : null <=> " + options1.get(optionKey)); + hasDifferences = true; + } + if (!hasDifferences) { + System.out.println(" No differences"); + } + } + + private static void diffWords(final FusionDictionary dict0, final FusionDictionary dict1) { + boolean hasDifferences = false; + for (final Word word0 : dict0) { + final CharGroup word1 = FusionDictionary.findWordInTree(dict1.mRoot, word0.mWord); + if (null == word1) { + // This word is not in dict1 + System.out.println("Deleted: " + word0.mWord + " " + word0.mFrequency); + hasDifferences = true; + } else { + // We found the word. Compare frequencies, shortcuts, bigrams + if (word0.mFrequency != word1.getFrequency()) { + System.out.println("Freq changed: " + word0.mWord + " " + word0.mFrequency + + " -> " + word1.getFrequency()); + hasDifferences = true; + } + if (word0.mIsNotAWord != word1.getIsNotAWord()) { + System.out.println("Not a word: " + word0.mWord + " " + word0.mIsNotAWord + + " -> " + word1.getIsNotAWord()); + hasDifferences = true; + } + if (word0.mIsBlacklistEntry != word1.getIsBlacklistEntry()) { + System.out.println("Blacklist: " + word0.mWord + " " + word0.mIsBlacklistEntry + + " -> " + word1.getIsBlacklistEntry()); + hasDifferences = true; + } + hasDifferences |= hasAttributesDifferencesAndPrintThemIfAny(word0.mWord, + "Bigram", word0.mBigrams, word1.getBigrams()); + hasDifferences |= hasAttributesDifferencesAndPrintThemIfAny(word0.mWord, + "Shortcut", word0.mShortcutTargets, word1.getShortcutTargets()); + } + } + for (final Word word1 : dict1) { + final CharGroup word0 = FusionDictionary.findWordInTree(dict0.mRoot, word1.mWord); + if (null == word0) { + // This word is not in dict0 + System.out.println("Added: " + word1.mWord + " " + word1.mFrequency); + hasDifferences = true; + } + } + if (!hasDifferences) { + System.out.println(" No differences"); 
+ } + } + + private static boolean hasAttributesDifferencesAndPrintThemIfAny(final String word, + final String type, final ArrayList<WeightedString> list0, + final ArrayList<WeightedString> list1) { + if (null == list1) { + if (null == list0) return false; + for (final WeightedString attribute0 : list0) { + System.out.println(type + " removed: " + word + " " + attribute0.mWord + " " + + attribute0.mFrequency); + } + return true; + } + boolean hasDifferences = false; + if (null != list0) { + for (final WeightedString attribute0 : list0) { + // The following tests with #equals(). The WeightedString#equals() method returns + // true if both the string and the frequency are the same. + if (!list1.contains(attribute0)) { + hasDifferences = true; + // Search for a word with the same string but a different frequency + boolean foundString = false; + for (final WeightedString attribute1 : list1) { + if (attribute0.mWord.equals(attribute1.mWord)) { + System.out.println(type + " freq changed: " + word + " " + + attribute0.mWord + " " + attribute0.mFrequency + " -> " + + attribute1.mFrequency); + list1.remove(attribute1); + foundString = true; + break; + } + } + if (!foundString) { + // We come here if we haven't found any matching string. + System.out.println(type + " removed: " + word + " " + attribute0.mWord + " " + + attribute0.mFrequency); + } + } else { + list1.remove(attribute0); + } + } + } + // We removed any matching word that we found, so now list1 only contains words that + // are not included in list0. 
+ for (final WeightedString attribute1 : list1) { + hasDifferences = true; + System.out.println(type + " added: " + word + " " + attribute1.mWord + " " + + attribute1.mFrequency); + } + return hasDifferences; + } +} diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java new file mode 100644 index 000000000..f2894544f --- /dev/null +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java @@ -0,0 +1,128 @@ +/** + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.FormatSpec; +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; +import com.android.inputmethod.latin.makedict.Word; + +import java.util.Arrays; +import java.util.ArrayList; + +public class Info extends Dicttool.Command { + public static final String COMMAND = "info"; + + public Info() { + } + + @Override + public String getHelp() { + return COMMAND + " <filename>: prints various information about a dictionary file"; + } + + private static void showInfo(final FusionDictionary dict, final boolean plumbing) { + System.out.println("Header attributes :"); + System.out.print(dict.mOptions.toString(2, plumbing)); + int wordCount = 0; + int bigramCount = 0; + int shortcutCount = 0; + int whitelistCount = 0; + for (final Word w : dict) { + ++wordCount; + if (null != w.mBigrams) { + bigramCount += w.mBigrams.size(); + } + if (null != w.mShortcutTargets) { + shortcutCount += w.mShortcutTargets.size(); + for (WeightedString shortcutTarget : w.mShortcutTargets) { + if (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY == shortcutTarget.mFrequency) { + ++whitelistCount; + } + } + } + } + System.out.println("Words in the dictionary : " + wordCount); + System.out.println("Bigram count : " + bigramCount); + System.out.println("Shortcuts : " + shortcutCount + " (out of which " + whitelistCount + + " whitelist entries)"); + } + + private static void showWordInfo(final FusionDictionary dict, final String word, + final boolean plumbing) { + final CharGroup group = FusionDictionary.findWordInTree(dict.mRoot, word); + if (null == group) { + System.out.println(word + " is not in the dictionary"); + return; + } + System.out.println("Word: " + word); + System.out.println(" Freq: " + group.getFrequency()); + if 
(group.getIsNotAWord()) { + System.out.println(" Is not a word"); + } + if (group.getIsBlacklistEntry()) { + System.out.println(" Is a blacklist entry"); + } + final ArrayList<WeightedString> shortcutTargets = group.getShortcutTargets(); + if (null == shortcutTargets || shortcutTargets.isEmpty()) { + System.out.println(" No shortcuts"); + } else { + for (final WeightedString shortcutTarget : shortcutTargets) { + System.out.println(" Shortcut target: " + shortcutTarget.mWord + " (" + + (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY == shortcutTarget.mFrequency + ? "whitelist" : shortcutTarget.mFrequency) + ")"); + } + } + final ArrayList<WeightedString> bigrams = group.getBigrams(); + if (null == bigrams || bigrams.isEmpty()) { + System.out.println(" No bigrams"); + } else { + for (final WeightedString bigram : bigrams) { + System.out.println(" Bigram: " + bigram.mWord + " (" + bigram.mFrequency + ")"); + } + } + } + + @Override + public void run() { + if (mArgs.length < 1) { + throw new RuntimeException("Not enough arguments for command " + COMMAND); + } + final boolean plumbing; + if ("-p".equals(mArgs[0])) { + plumbing = true; + mArgs = Arrays.copyOfRange(mArgs, 1, mArgs.length); + if (mArgs.length != 1) { // There should be only 1 argument left + throw new RuntimeException("Wrong number of arguments for command " + COMMAND); + } + } else { + plumbing = false; + } + final String filename = mArgs[0]; + final boolean hasWordArguments = (1 == mArgs.length); + final FusionDictionary dict = BinaryDictOffdeviceUtils.getDictionary(filename, + hasWordArguments /* report */); + if (hasWordArguments) { + showInfo(dict, plumbing); + } else { + for (int i = 1; i < mArgs.length; ++i) { + showWordInfo(dict, mArgs[i], plumbing); + } + } + } +} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/Makedict.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java index c004cfbe4..808e1d4c8 100644 --- 
a/tools/dicttool/src/android/inputmethod/latin/dicttool/Makedict.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Makedict.java @@ -29,10 +29,12 @@ public class Makedict extends Dicttool.Command { public Makedict() { } + @Override public String getHelp() { return DictionaryMaker.Arguments.getHelp(); } + @Override public void run() throws FileNotFoundException, IOException, ParserConfigurationException, SAXException, UnsupportedFormatException { DictionaryMaker.main(mArgs); diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Package.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Package.java new file mode 100644 index 000000000..b29480764 --- /dev/null +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Package.java @@ -0,0 +1,92 @@ +/** + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.android.inputmethod.latin.dicttool; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.io.IOException; +import java.io.OutputStream; + +public class Package { + private Package() { + // This container class is not publicly instantiable. 
+ } + + static public class Packager extends Dicttool.Command { + public static final String COMMAND = "package"; + private final static String PREFIX = "dicttool"; + private final static String SUFFIX = ".tmp"; + + public Packager() { + } + + public String getHelp() { + return COMMAND + " <src_filename> <dst_filename>: Package a file for distribution"; + } + + public void run() throws IOException { + if (mArgs.length != 2) { + throw new RuntimeException("Too many/too few arguments for command " + COMMAND); + } + final File intermediateFile = File.createTempFile(PREFIX, SUFFIX); + try { + final Compress.Compressor compressCommand = new Compress.Compressor(); + compressCommand.setArgs(new String[] { mArgs[0], intermediateFile.getPath() }); + compressCommand.run(); + final Crypt.Encrypter cryptCommand = new Crypt.Encrypter(); + cryptCommand.setArgs(new String[] { intermediateFile.getPath(), mArgs[1] }); + cryptCommand.run(); + } finally { + intermediateFile.delete(); + } + } + } + + static public class Unpackager extends Dicttool.Command { + public static final String COMMAND = "unpackage"; + + public Unpackager() { + } + + public String getHelp() { + return COMMAND + " <src_filename> <dst_filename>: Detects how a file is packaged and\n" + + "decrypts/uncompresses as necessary to produce a raw binary file."; + } + + public void run() throws FileNotFoundException, IOException { + if (mArgs.length != 2) { + throw new RuntimeException("Too many/too few arguments for command " + COMMAND); + } + final BinaryDictOffdeviceUtils.DecoderChainSpec decodedSpec = + BinaryDictOffdeviceUtils.getRawBinaryDictionaryOrNull(new File(mArgs[0])); + if (null == decodedSpec) { + System.out.println(mArgs[0] + " does not seem to be a dictionary"); + return; + } + System.out.println("Packaging : " + decodedSpec.describeChain()); + System.out.println("Uncompressed size : " + decodedSpec.mFile.length()); + final FileOutputStream dstStream = new FileOutputStream(new File(mArgs[1])); + 
BinaryDictOffdeviceUtils.copy(new BufferedInputStream( + new FileInputStream(decodedSpec.mFile)), new BufferedOutputStream(dstStream)); + } + } +} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/XmlDictInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java index 252c3d655..1fd2cba7a 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/XmlDictInputOutput.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java @@ -22,6 +22,10 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.Node; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.makedict.Word; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.Writer; @@ -52,8 +56,6 @@ public class XmlDictInputOutput { private static final String WORD_ATTR = "word"; private static final String NOT_A_WORD_ATTR = "not_a_word"; - private static final int SHORTCUT_ONLY_DEFAULT_FREQ = 1; - private static final String OPTIONS_KEY = "options"; private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing"; private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing"; @@ -63,12 +65,9 @@ public class XmlDictInputOutput { */ static private class UnigramHandler extends DefaultHandler { // Parser states - private static final int NONE = 0; private static final int START = 1; private static final int WORD = 2; - private static final int BIGRAM = 4; - private static final int END = 5; - private static final int UNKNOWN = 6; + private static final int UNKNOWN = 3; FusionDictionary mDictionary; int mState; // the state of the parser @@ -264,6 +263,35 @@ public class XmlDictInputOutput { } /** + * Basic test to find out whether the file is in the unigram XML 
format or not. + * + * Concretely this only tests the header line. + * + * @param filename The name of the file to test. + * @return true if the file is in the unigram XML format, false otherwise + */ + public static boolean isXmlUnigramDictionary(final String filename) { + BufferedReader reader = null; + try { + reader = new BufferedReader(new FileReader(new File(filename))); + final String firstLine = reader.readLine(); + return null != firstLine && firstLine.matches("^\\s*<wordlist .*>\\s*$"); + } catch (FileNotFoundException e) { + return false; + } catch (IOException e) { + return false; + } finally { + if (reader != null) { + try { + reader.close(); + } catch (IOException e) { + // do nothing + } + } + } + } + + /** * Reads a dictionary from an XML file. * * This is the public method that will parse an XML file and return the corresponding memory diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/MakedictLog.java b/tools/dicttool/src/com/android/inputmethod/latin/makedict/MakedictLog.java index 7eccff2b4..7eccff2b4 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/MakedictLog.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/makedict/MakedictLog.java diff --git a/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java b/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java new file mode 100644 index 000000000..554bd2478 --- /dev/null +++ b/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import com.android.inputmethod.latin.makedict.UnsupportedFormatException; + +import junit.framework.TestCase; + +import java.io.File; +import java.io.BufferedOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.ArrayList; +import java.util.HashMap; + +/** + * Unit tests for BinaryDictOffdeviceUtils + */ +public class BinaryDictOffdeviceUtilsTests extends TestCase { + private static final int TEST_FREQ = 37; // Some arbitrary value unlikely to happen by chance + + public void testGetRawDictWorks() throws IOException, UnsupportedFormatException { + // Create a thrice-compressed dictionary file. 
+ final FusionDictionary dict = new FusionDictionary(new Node(), + new DictionaryOptions(new HashMap<String, String>(), + false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */)); + dict.add("foo", TEST_FREQ, null, false /* isNotAWord */); + dict.add("fta", 1, null, false /* isNotAWord */); + dict.add("ftb", 1, null, false /* isNotAWord */); + dict.add("bar", 1, null, false /* isNotAWord */); + dict.add("fool", 1, null, false /* isNotAWord */); + + final File dst = File.createTempFile("testGetRawDict", ".tmp"); + dst.deleteOnExit(); + final OutputStream out = Compress.getCompressedStream( + Compress.getCompressedStream( + Compress.getCompressedStream( + new BufferedOutputStream(new FileOutputStream(dst))))); + + BinaryDictInputOutput.writeDictionaryBinary(out, dict, new FormatOptions(2, false)); + + // Test for an actually compressed dictionary and its contents + final BinaryDictOffdeviceUtils.DecoderChainSpec decodeSpec = + BinaryDictOffdeviceUtils.getRawBinaryDictionaryOrNull(dst); + for (final String step : decodeSpec.mDecoderSpec) { + assertEquals("Wrong decode spec", BinaryDictOffdeviceUtils.COMPRESSION, step); + } + assertEquals("Wrong decode spec", 3, decodeSpec.mDecoderSpec.size()); + final FileInputStream inStream = new FileInputStream(decodeSpec.mFile); + final ByteBuffer buffer = inStream.getChannel().map( + FileChannel.MapMode.READ_ONLY, 0, decodeSpec.mFile.length()); + final FusionDictionary resultDict = BinaryDictInputOutput.readDictionaryBinary( + new BinaryDictInputOutput.ByteBufferWrapper(buffer), + null /* dict : an optional dictionary to add words to, or null */); + assertEquals("Dictionary can't be read back correctly", + resultDict.findWordInTree(resultDict.mRoot, "foo").getFrequency(), TEST_FREQ); + } + + public void testGetRawDictFails() throws IOException { + // Randomly create some 4k file containing garbage + final File dst = File.createTempFile("testGetRawDict", ".tmp"); + dst.deleteOnExit(); + final OutputStream out = 
new BufferedOutputStream(new FileOutputStream(dst)); + for (int i = 0; i < 1024; ++i) { + out.write(0x12345678); + } + out.close(); + + // Test that a random data file actually fails + assertNull("Wrongly identified data file", + BinaryDictOffdeviceUtils.getRawBinaryDictionaryOrNull(dst)); + + final File gzDst = File.createTempFile("testGetRawDict", ".tmp"); + gzDst.deleteOnExit(); + final OutputStream gzOut = + Compress.getCompressedStream(new BufferedOutputStream(new FileOutputStream(gzDst))); + for (int i = 0; i < 1024; ++i) { + gzOut.write(0x12345678); + } + gzOut.close(); + + // Test that a compressed random data file actually fails + assertNull("Wrongly identified data file", + BinaryDictOffdeviceUtils.getRawBinaryDictionaryOrNull(gzDst)); + } +} diff --git a/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictInputOutputTest.java b/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictInputOutputTest.java index 88589b815..096902879 100644 --- a/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictInputOutputTest.java +++ b/tools/dicttool/tests/com/android/inputmethod/latin/makedict/BinaryDictInputOutputTest.java @@ -19,24 +19,15 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import junit.framework.TestCase; + import java.util.ArrayList; import java.util.HashMap; -import junit.framework.TestCase; - /** * Unit tests for BinaryDictInputOutput. */ public class BinaryDictInputOutputTest extends TestCase { - - public void setUp() throws Exception { - super.setUp(); - } - - public void tearDown() throws Exception { - super.tearDown(); - } - // Test the flattened array contains the expected number of nodes, and // that it does not contain any duplicates. 
public void testFlattenNodes() { @@ -55,5 +46,4 @@ public class BinaryDictInputOutputTest extends TestCase { assertFalse("Flattened array contained the same node twice", result.contains(n)); } } - } diff --git a/tools/dicttool/tests/etc/test-dicttool.sh b/tools/dicttool/tests/etc/test-dicttool.sh index 8834611cd..092120769 100755 --- a/tools/dicttool/tests/etc/test-dicttool.sh +++ b/tools/dicttool/tests/etc/test-dicttool.sh @@ -13,4 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -java -classpath ${ANDROID_HOST_OUT}/framework/junit.jar:${ANDROID_HOST_OUT}/../common/obj/JAVA_LIBRARIES/dicttool_intermediates/classes junit.textui.TestRunner com.android.inputmethod.latin.makedict.BinaryDictInputOutputTest +if [[ $(type -t mmm) != function ]]; then +echo "Usage:" 1>&2 +echo " source $0" 1>&2 +echo " or" 1>&2 +echo " . $0" 1>&2 +exit 1 +fi + +find out -name "dicttool_aosp*" -exec rm -rf {} \; > /dev/null 2>&1 +mmm -j8 external/junit +DICTTOOL_UNITTEST=true mmm -j8 packages/inputmethods/LatinIME/tools/dicttool +java -classpath ${ANDROID_HOST_OUT}/framework/junit.jar:${ANDROID_HOST_OUT}/framework/dicttool_aosp.jar junit.textui.TestRunner com.android.inputmethod.latin.makedict.BinaryDictInputOutputTest +java -classpath ${ANDROID_HOST_OUT}/framework/junit.jar:${ANDROID_HOST_OUT}/framework/dicttool_aosp.jar junit.textui.TestRunner com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtilsTests |