diff options
Diffstat (limited to 'tools')
10 files changed, 384 insertions, 198 deletions
diff --git a/tools/dicttool/Android.mk b/tools/dicttool/Android.mk index 81c0706c1..1a9f029ae 100644 --- a/tools/dicttool/Android.mk +++ b/tools/dicttool/Android.mk @@ -42,23 +42,15 @@ LATINIME_TESTS_SRC_DIR := $(LATINIME_LOCAL_DIR)/tests/src/com/android/inputmetho # a significant part of the dependencies are mocked in the compat/ directory, with empty or # nearly-empty implementations, for parts that we don't use in Dicttool. LATINIME_SRC_FILES_FOR_DICTTOOL := \ - event/Combiner.java \ - event/Event.java \ latin/BinaryDictionary.java \ latin/DicTraverseSession.java \ latin/Dictionary.java \ - latin/LastComposedWord.java \ latin/NgramContext.java \ latin/SuggestedWords.java \ - latin/WordComposer.java \ - latin/settings/NativeSuggestOptions.java \ latin/settings/SettingsValuesForSuggestion.java \ latin/utils/BinaryDictionaryUtils.java \ latin/utils/CombinedFormatUtils.java \ - latin/utils/CoordinateUtils.java \ - latin/utils/FileUtils.java \ - latin/utils/JniUtils.java \ - latin/utils/LocaleUtils.java + latin/utils/JniUtils.java LATINIME_OVERRIDABLE_SRC_FILES_FOR_DICTTOOL := \ latin/define/DebugFlags.java diff --git a/tools/dicttool/compat/com/android/inputmethod/event/CombinerChain.java b/tools/dicttool/compat/com/android/inputmethod/event/CombinerChain.java deleted file mode 100644 index c4457a1b7..000000000 --- a/tools/dicttool/compat/com/android/inputmethod/event/CombinerChain.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.inputmethod.event; - -import java.util.ArrayList; - -/** - * Compatibility class that stands in for the combiner chain in LatinIME. - * - * This is not used by dicttool, it's just needed by the dependency chain. - */ -// TODO: there should not be a dependency to this in dicttool, so there -// should be a sensible way to separate them cleanly. -public class CombinerChain { - private StringBuilder mComposingWord; - public CombinerChain(final String initialText, final Combiner... combinerList) { - mComposingWord = new StringBuilder(initialText); - } - - public Event processEvent(final ArrayList<Event> previousEvents, final Event newEvent) { - return newEvent; - } - - public void applyProcessedEvent(final Event event) { - mComposingWord.append(event.getTextToCommit()); - } - - public CharSequence getComposingWordWithCombiningFeedback() { - return mComposingWord; - } - - public void reset() { - mComposingWord.setLength(0); - } - - public static Combiner[] createCombiners(final String spec) { - // Dicttool never uses a combiner at all, so we just return a zero-sized array. - return new Combiner[0]; - } -} diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java index 1c5dfa9fb..84c3956f7 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java @@ -19,6 +19,10 @@ package com.android.inputmethod.latin.dicttool; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils; import com.android.inputmethod.latin.makedict.BinaryDictIOUtils; import com.android.inputmethod.latin.makedict.DictDecoder; +import com.android.inputmethod.latin.makedict.DictionaryHeader; +import com.android.inputmethod.latin.makedict.FormatSpec; +import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary; import com.android.inputmethod.latin.makedict.UnsupportedFormatException; @@ -27,12 +31,18 @@ import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.util.Arrays; +import java.util.ArrayList; +import java.util.HashMap; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; /** * Class grouping utilities for offline dictionary making. @@ -44,26 +54,27 @@ public final class BinaryDictOffdeviceUtils { // Prefix and suffix are arbitrary, the values do not really matter private final static String PREFIX = "dicttool"; private final static String SUFFIX = ".tmp"; - private final static int COPY_BUFFER_SIZE = 8192; - public static class DecoderChainSpec { + public static class DecoderChainSpec<T> { public final static int COMPRESSION = 1; public final static int ENCRYPTION = 2; - private final static int MAX_DECODE_DEPTH = 4; - final int[] mDecoderSpec; - File mFile; + private final static int[][] VALID_DECODER_CHAINS = { + { }, { COMPRESSION }, { ENCRYPTION, COMPRESSION } + }; + + private final int mDecoderSpecIndex; + public T mResult; public DecoderChainSpec() { - mDecoderSpec = new int[0]; - mFile = null; + mDecoderSpecIndex = 0; + mResult = null; } - public DecoderChainSpec(final DecoderChainSpec src, final int newStep) { - mDecoderSpec = Arrays.copyOf(src.mDecoderSpec, src.mDecoderSpec.length + 1); - mDecoderSpec[src.mDecoderSpec.length] = newStep; - mFile = src.mFile; + private DecoderChainSpec(final DecoderChainSpec<T> src) { + mDecoderSpecIndex = src.mDecoderSpecIndex + 1; + mResult = src.mResult; } private String getStepDescription(final int step) { @@ -79,110 +90,177 @@ public final class BinaryDictOffdeviceUtils { public String describeChain() { final StringBuilder s = new StringBuilder("raw"); - for (final int step : mDecoderSpec) { + for (final int step : VALID_DECODER_CHAINS[mDecoderSpecIndex]) { s.append(" > "); s.append(getStepDescription(step)); } return s.toString(); } - } - public static void copy(final InputStream input, final OutputStream output) throws IOException { - final byte[] buffer = new byte[COPY_BUFFER_SIZE]; - for (int readBytes = input.read(buffer); readBytes >= 0; readBytes = input.read(buffer)) { - output.write(buffer, 0, readBytes); + /** + * Returns the next sequential spec. If exhausted, return null. + */ + public DecoderChainSpec next() { + if (mDecoderSpecIndex + 1 >= VALID_DECODER_CHAINS.length) { + return null; + } + return new DecoderChainSpec(this); + } + + public InputStream getStream(final File src) throws FileNotFoundException, IOException { + InputStream input = new BufferedInputStream(new FileInputStream(src)); + for (final int step : VALID_DECODER_CHAINS[mDecoderSpecIndex]) { + switch (step) { + case COMPRESSION: + input = Compress.getUncompressedStream(input); + break; + case ENCRYPTION: + input = Crypt.getDecryptedStream(input); + break; + } + } + return input; } } - /** - * Returns a decrypted/uncompressed dictionary. - * - * This will decrypt/uncompress any number of times as necessary until it finds the - * dictionary signature, and copy the decoded file to a temporary place. - * If this is not a dictionary, the method returns null. - */ - public static DecoderChainSpec getRawDictionaryOrNull(final File src) { - return getRawDictionaryOrNullInternal(new DecoderChainSpec(), src, 0); + public interface InputProcessor<T> { + @Nonnull + public T process(@Nonnull final InputStream input) + throws IOException, UnsupportedFormatException; } - private static DecoderChainSpec getRawDictionaryOrNullInternal( - final DecoderChainSpec spec, final File src, final int depth) { - // Unfortunately the decoding scheme we use can consider any data to be encrypted - // and will produce some output, meaning it's not possible to reliably detect encrypted - // data. Thus, some non-dictionary files (especially small) ones may successfully decrypt - // over and over, ending in a stack overflow. Hence we limit the depth at which we try - // decoding the file. - if (depth > DecoderChainSpec.MAX_DECODE_DEPTH) { - return null; + public static class CopyProcessor implements InputProcessor<File> { + @Override @Nonnull + public File process(@Nonnull final InputStream input) throws IOException, + UnsupportedFormatException { + final File dst = File.createTempFile(PREFIX, SUFFIX); + dst.deleteOnExit(); + try (final OutputStream output = new BufferedOutputStream(new FileOutputStream(dst))) { + copy(input, output); + output.flush(); + output.close(); + if (BinaryDictDecoderUtils.isBinaryDictionary(dst) + || CombinedInputOutput.isCombinedDictionary(dst.getAbsolutePath())) { + return dst; + } + } + throw new UnsupportedFormatException("Input stream not at the expected format"); } - if (BinaryDictDecoderUtils.isBinaryDictionary(src) - || CombinedInputOutput.isCombinedDictionary(src.getAbsolutePath())) { - spec.mFile = src; - return spec; + } + + public static class HeaderReaderProcessor implements InputProcessor<DictionaryHeader> { + // Arbitrarily limit the header length to 32k. Sounds like it would never be larger + // than this. Revisit this if needed later. + private final int MAX_HEADER_LENGTH = 32 * 1024; + @Override @Nonnull + public DictionaryHeader process(final InputStream input) throws IOException, + UnsupportedFormatException { + // Do everything as curtly and ad-hoc as possible for performance. + final byte[] tmpBuffer = new byte[12]; + if (tmpBuffer.length != input.read(tmpBuffer)) { + throw new UnsupportedFormatException("File too short, not a dictionary"); + } + // Ad-hoc check for the magic number. See FormatSpec.java as well as + // byte_array_utils.h and BinaryDictEncoderUtils#writeDictionaryHeader(). + final int MAGIC_NUMBER_START_OFFSET = 0; + final int VERSION_START_OFFSET = 4; + final int HEADER_SIZE_OFFSET = 8; + final int magicNumber = ((tmpBuffer[MAGIC_NUMBER_START_OFFSET] & 0xFF) << 24) + + ((tmpBuffer[MAGIC_NUMBER_START_OFFSET + 1] & 0xFF) << 16) + + ((tmpBuffer[MAGIC_NUMBER_START_OFFSET + 2] & 0xFF) << 8) + + (tmpBuffer[MAGIC_NUMBER_START_OFFSET + 3] & 0xFF); + if (magicNumber != FormatSpec.MAGIC_NUMBER) { + throw new UnsupportedFormatException("Wrong magic number"); + } + final int version = ((tmpBuffer[VERSION_START_OFFSET] & 0xFF) << 8) + + (tmpBuffer[VERSION_START_OFFSET + 1] & 0xFF); + if (version != FormatSpec.VERSION2 && version != FormatSpec.VERSION201 + && version != FormatSpec.VERSION202) { + throw new UnsupportedFormatException("Only versions 2, 201, 202 are supported"); + } + final int totalHeaderSize = ((tmpBuffer[HEADER_SIZE_OFFSET] & 0xFF) << 24) + + ((tmpBuffer[HEADER_SIZE_OFFSET + 1] & 0xFF) << 16) + + ((tmpBuffer[HEADER_SIZE_OFFSET + 2] & 0xFF) << 8) + + (tmpBuffer[HEADER_SIZE_OFFSET + 3] & 0xFF); + if (totalHeaderSize > MAX_HEADER_LENGTH) { + throw new UnsupportedFormatException("Header too large"); + } + final byte[] headerBuffer = new byte[totalHeaderSize - tmpBuffer.length]; + readStreamExhaustively(input, headerBuffer); + final HashMap<String, String> attributes = + BinaryDictDecoderUtils.decodeHeaderAttributes(headerBuffer); + return new DictionaryHeader(totalHeaderSize, new DictionaryOptions(attributes), + new FormatOptions(version, false /* hasTimestamp */)); } - // It's not a raw dictionary - try to see if it's compressed. - final File uncompressedFile = tryGetUncompressedFile(src); - if (null != uncompressedFile) { - final DecoderChainSpec newSpec = - getRawDictionaryOrNullInternal(spec, uncompressedFile, depth + 1); - if (null == newSpec) return null; - return new DecoderChainSpec(newSpec, DecoderChainSpec.COMPRESSION); + } + + private static void readStreamExhaustively(final InputStream inputStream, + final byte[] outBuffer) throws IOException, UnsupportedFormatException { + int readBytes = 0; + int readBytesLastCycle = -1; + while (readBytes != outBuffer.length) { + readBytesLastCycle = inputStream.read(outBuffer, readBytes, + outBuffer.length - readBytes); + if (readBytesLastCycle == -1) + throw new UnsupportedFormatException("File shorter than specified in the header" + + " (expected " + outBuffer.length + ", read " + readBytes + ")"); + readBytes += readBytesLastCycle; } - // It's not a compressed either - try to see if it's crypted. - final File decryptedFile = tryGetDecryptedFile(src); - if (null != decryptedFile) { - final DecoderChainSpec newSpec = - getRawDictionaryOrNullInternal(spec, decryptedFile, depth + 1); - if (null == newSpec) return null; - return new DecoderChainSpec(newSpec, DecoderChainSpec.ENCRYPTION); + } + + public static void copy(final InputStream input, final OutputStream output) throws IOException { + final byte[] buffer = new byte[COPY_BUFFER_SIZE]; + for (int readBytes = input.read(buffer); readBytes >= 0; readBytes = input.read(buffer)) { + output.write(buffer, 0, readBytes); } - return null; } - /* Try to uncompress the file passed as an argument. + /** + * Process a dictionary, decrypting/uncompressing it on the fly as necessary. * - * If the file can be uncompressed, the uncompressed version is returned. Otherwise, null - * is returned. + * This will execute the given processor repeatedly with the possible alternatives + * for dictionary format until the processor does not throw an exception. + * If the processor succeeds for none of the possible formats, the method returns null. */ - private static File tryGetUncompressedFile(final File src) { - try { - final File dst = File.createTempFile(PREFIX, SUFFIX); - dst.deleteOnExit(); - try ( - final InputStream input = Compress.getUncompressedStream( - new BufferedInputStream(new FileInputStream(src))); - final OutputStream output = new BufferedOutputStream(new FileOutputStream(dst)) - ) { - copy(input, output); - return dst; + @Nullable + public static <T> DecoderChainSpec<T> decodeDictionaryForProcess(@Nonnull final File src, + @Nonnull final InputProcessor<T> processor) { + @Nonnull DecoderChainSpec spec = new DecoderChainSpec(); + while (null != spec) { + try { + final InputStream input = spec.getStream(src); + spec.mResult = processor.process(input); + try { + input.close(); + } catch (IOException e) { + // CipherInputStream doesn't like being closed without having read the + // entire stream, for some reason. But we don't want to because it's a waste + // of resources. We really, really don't care about this. + // However on close() CipherInputStream does throw this exception, wrapped + // in an IOException so we need to catch it. + if (!(e.getCause() instanceof javax.crypto.BadPaddingException)) { + throw e; + } + } + return spec; + } catch (IOException | UnsupportedFormatException | ArrayIndexOutOfBoundsException e) { + // If the format is not the right one for this file, the processor will throw one + // of these exceptions. In our case, that means we should try the next spec, + // since it may still be at another format we haven't tried yet. + // TODO: stop using exceptions for this non-exceptional case. } - } catch (final IOException e) { - // Could not uncompress the file: presumably the file is simply not a compressed file - return null; + spec = spec.next(); } + return null; } - /* Try to decrypt the file passed as an argument. - * - * If the file can be decrypted, the decrypted version is returned. Otherwise, null - * is returned. + /** + * Get a decoder chain spec with a raw dictionary file. This makes a new file on the + * disk ready for any treatment the client wants. */ - private static File tryGetDecryptedFile(final File src) { - try { - final File dst = File.createTempFile(PREFIX, SUFFIX); - dst.deleteOnExit(); - try ( - final InputStream input = Crypt.getDecryptedStream( - new BufferedInputStream(new FileInputStream(src))); - final OutputStream output = new BufferedOutputStream(new FileOutputStream(dst)) - ) { - copy(input, output); - return dst; - } - } catch (final IOException e) { - // Could not decrypt the file: presumably the file is simply not a crypted file - return null; - } + @Nullable + public static DecoderChainSpec<File> getRawDictionaryOrNull(@Nonnull final File src) { + return decodeDictionaryForProcess(src, new CopyProcessor()); } static FusionDictionary getDictionary(final String filename, final boolean report) { @@ -192,28 +270,28 @@ public final class BinaryDictOffdeviceUtils { System.out.println("Size : " + file.length() + " bytes"); } try { - final DecoderChainSpec decodedSpec = getRawDictionaryOrNull(file); + final DecoderChainSpec<File> decodedSpec = getRawDictionaryOrNull(file); if (null == decodedSpec) { throw new RuntimeException("Does not seem to be a dictionary file " + filename); } - if (CombinedInputOutput.isCombinedDictionary(decodedSpec.mFile.getAbsolutePath())) { + if (CombinedInputOutput.isCombinedDictionary(decodedSpec.mResult.getAbsolutePath())) { if (report) { System.out.println("Format : Combined format"); System.out.println("Packaging : " + decodedSpec.describeChain()); - System.out.println("Uncompressed size : " + decodedSpec.mFile.length()); + System.out.println("Uncompressed size : " + decodedSpec.mResult.length()); } try (final BufferedReader reader = new BufferedReader( - new InputStreamReader(new FileInputStream(decodedSpec.mFile), "UTF-8"))) { + new InputStreamReader(new FileInputStream(decodedSpec.mResult), "UTF-8"))) { return CombinedInputOutput.readDictionaryCombined(reader); } } final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder( - decodedSpec.mFile, 0, decodedSpec.mFile.length(), + decodedSpec.mResult, 0, decodedSpec.mResult.length(), DictDecoder.USE_BYTEARRAY); if (report) { System.out.println("Format : Binary dictionary format"); System.out.println("Packaging : " + decodedSpec.describeChain()); - System.out.println("Uncompressed size : " + decodedSpec.mFile.length()); + System.out.println("Uncompressed size : " + decodedSpec.mResult.length()); } return dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */); } catch (final IOException | UnsupportedFormatException e) { diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java index 48d2e5922..955c5728c 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java @@ -98,6 +98,7 @@ public class CombinedInputOutput { String word = null; ProbabilityInfo probabilityInfo = new ProbabilityInfo(0); boolean isNotAWord = false; + boolean isPossiblyOffensive = false; ArrayList<WeightedString> bigrams = new ArrayList<>(); ArrayList<WeightedString> shortcuts = new ArrayList<>(); while (null != (line = reader.readLine())) { @@ -106,7 +107,7 @@ public class CombinedInputOutput { if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) { if (null != word) { dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, - isNotAWord, false /* isPossiblyOffensive */); + isNotAWord, isPossiblyOffensive); for (WeightedString s : bigrams) { dict.setBigram(word, s.mWord, s.mProbabilityInfo); } @@ -114,27 +115,37 @@ public class CombinedInputOutput { if (!shortcuts.isEmpty()) shortcuts = new ArrayList<>(); if (!bigrams.isEmpty()) bigrams = new ArrayList<>(); isNotAWord = false; + isPossiblyOffensive = false; for (String param : args) { final String params[] = param.split("=", 2); if (2 != params.length) throw new RuntimeException("Wrong format : " + line); - if (CombinedFormatUtils.WORD_TAG.equals(params[0])) { - word = params[1]; - } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) { - probabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]), - probabilityInfo.mTimestamp, probabilityInfo.mLevel, - probabilityInfo.mCount); - } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) { - final String[] historicalInfoParams = - params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR); - if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) { - throw new RuntimeException("Wrong format (historical info) : " + line); - } - probabilityInfo = new ProbabilityInfo(probabilityInfo.mProbability, - Integer.parseInt(historicalInfoParams[0]), - Integer.parseInt(historicalInfoParams[1]), - Integer.parseInt(historicalInfoParams[2])); - } else if (CombinedFormatUtils.NOT_A_WORD_TAG.equals(params[0])) { - isNotAWord = "true".equals(params[1]); + switch (params[0]) { + case CombinedFormatUtils.WORD_TAG: + word = params[1]; + break; + case CombinedFormatUtils.PROBABILITY_TAG: + probabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]), + probabilityInfo.mTimestamp, probabilityInfo.mLevel, + probabilityInfo.mCount); + break; + case CombinedFormatUtils.HISTORICAL_INFO_TAG: + final String[] historicalInfoParams = params[1].split( + CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR); + if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) { + throw new RuntimeException("Wrong format (historical info) : " + + line); + } + probabilityInfo = new ProbabilityInfo(probabilityInfo.mProbability, + Integer.parseInt(historicalInfoParams[0]), + Integer.parseInt(historicalInfoParams[1]), + Integer.parseInt(historicalInfoParams[2])); + break; + case CombinedFormatUtils.NOT_A_WORD_TAG: + isNotAWord = CombinedFormatUtils.isLiteralTrue(params[1]); + break; + case CombinedFormatUtils.POSSIBLY_OFFENSIVE_TAG: + isPossiblyOffensive = CombinedFormatUtils.isLiteralTrue(params[1]); + break; } } } else if (args[0].matches(CombinedFormatUtils.SHORTCUT_TAG + "=.*")) { @@ -190,7 +201,7 @@ public class CombinedInputOutput { } if (null != word) { dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord, - false /* isPossiblyOffensive */); + isPossiblyOffensive); for (WeightedString s : bigrams) { dict.setBigram(word, s.mWord, s.mProbabilityInfo); } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CommandList.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CommandList.java index 0d93c7fa9..8fdf7633f 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CommandList.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CommandList.java @@ -18,7 +18,9 @@ package com.android.inputmethod.latin.dicttool; public class CommandList { public static void populate() { + // TODO: Move some commands to native code. Dicttool.addCommand("info", Info.class); + Dicttool.addCommand("header", Header.class); Dicttool.addCommand("diff", Diff.class); Dicttool.addCommand("compress", Compress.Compressor.class); Dicttool.addCommand("uncompress", Compress.Uncompressor.class); diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java index 8f9e4a3a6..6187853c8 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java @@ -120,7 +120,7 @@ public class DictionaryMaker { String inputCombined = null; String outputBinary = null; String outputCombined = null; - int outputBinaryFormatVersion = FormatSpec.VERSION201; // the default version is 201. + int outputBinaryFormatVersion = FormatSpec.VERSION202; // the default version is 202. // Don't use code point table by default. int codePointTableMode = Ver2DictEncoder.CODE_POINT_TABLE_OFF; diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Header.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Header.java new file mode 100644 index 000000000..ba96c0aeb --- /dev/null +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Header.java @@ -0,0 +1,70 @@ +/** + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.BinaryDictionary; +import com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtils.DecoderChainSpec; +import com.android.inputmethod.latin.makedict.DictionaryHeader; +import com.android.inputmethod.latin.makedict.UnsupportedFormatException; + +import java.io.File; +import java.util.Arrays; +import java.util.Locale; + +public class Header extends Dicttool.Command { + public static final String COMMAND = "header"; + + public Header() { + } + + @Override + public String getHelp() { + return COMMAND + " <filename>: prints the header contents of a dictionary file"; + } + + @Override + public void run() throws UnsupportedFormatException { + final boolean plumbing; + if (mArgs.length > 0 && "-p".equals(mArgs[0])) { + plumbing = true; + mArgs = Arrays.copyOfRange(mArgs, 1, mArgs.length); + } else { + plumbing = false; + } + if (mArgs.length < 1) { + throw new RuntimeException("Not enough arguments for command " + COMMAND); + } + final String filename = mArgs[0]; + final File dictFile = new File(filename); + final DecoderChainSpec<DictionaryHeader> spec = + BinaryDictOffdeviceUtils.decodeDictionaryForProcess(dictFile, + new BinaryDictOffdeviceUtils.HeaderReaderProcessor()); + if (null == spec) { + throw new UnsupportedFormatException(filename + + " doesn't seem to be a valid version 2 dictionary file"); + } + + final DictionaryHeader header = spec.mResult; + System.out.println("Dictionary : " + dictFile.getAbsolutePath()); + System.out.println("Size : " + dictFile.length() + " bytes"); + System.out.println("Format : Binary dictionary format"); + System.out.println("Format version : " + header.mFormatOptions.mVersion); + System.out.println("Packaging : " + spec.describeChain()); + System.out.println("Header attributes :"); + System.out.print(header.mDictionaryOptions.toString(2 /* indentCount */, plumbing)); + } +} diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Package.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Package.java index 47ea70629..3efa10a80 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Package.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Package.java @@ -16,6 +16,8 @@ package com.android.inputmethod.latin.dicttool; +import com.android.inputmethod.latin.makedict.DictionaryHeader; + import java.io.BufferedOutputStream; import java.io.File; import java.io.FileNotFoundException; @@ -77,16 +79,16 @@ public class Package { if (mArgs.length != 2) { throw new RuntimeException("Too many/too few arguments for command " + COMMAND); } - final BinaryDictOffdeviceUtils.DecoderChainSpec decodedSpec = - BinaryDictOffdeviceUtils.getRawDictionaryOrNull(new File(mArgs[0])); + final BinaryDictOffdeviceUtils.DecoderChainSpec<DictionaryHeader> decodedSpec = + BinaryDictOffdeviceUtils.decodeDictionaryForProcess(new File(mArgs[0]), + new BinaryDictOffdeviceUtils.HeaderReaderProcessor()); if (null == decodedSpec) { System.out.println(mArgs[0] + " does not seem to be a dictionary"); return; } System.out.println("Packaging : " + decodedSpec.describeChain()); - System.out.println("Uncompressed size : " + decodedSpec.mFile.length()); try ( - final InputStream input = getFileInputStream(decodedSpec.mFile); + final InputStream input = decodedSpec.getStream(new File(mArgs[0])); final OutputStream output = new BufferedOutputStream( getFileOutputStreamOrStdOut(mArgs[1])) ) { diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Test.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Test.java index b6383d788..e2dd5199b 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Test.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Test.java @@ -16,10 +16,10 @@ package com.android.inputmethod.latin.dicttool; +import com.android.inputmethod.latin.common.FileUtils; import com.android.inputmethod.latin.makedict.BinaryDictDecoderEncoderTests; import com.android.inputmethod.latin.makedict.BinaryDictEncoderFlattenTreeTests; import com.android.inputmethod.latin.makedict.FusionDictionaryTest; -import com.android.inputmethod.latin.utils.FileUtils; import java.io.File; import java.io.IOException; diff --git a/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java b/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java index 6cdbff7e5..ea9d4cc19 100644 --- a/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java +++ b/tools/dicttool/tests/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtilsTests.java @@ -16,10 +16,17 @@ package com.android.inputmethod.latin.dicttool; +import com.android.inputmethod.latin.common.CodePointUtils; +import com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtils; +import com.android.inputmethod.latin.dicttool.Compress; +import com.android.inputmethod.latin.dicttool.Crypt; +import com.android.inputmethod.latin.dicttool.BinaryDictOffdeviceUtils.DecoderChainSpec; import com.android.inputmethod.latin.makedict.BinaryDictIOUtils; +import com.android.inputmethod.latin.makedict.BinaryDictUtils; import com.android.inputmethod.latin.makedict.DictDecoder; import com.android.inputmethod.latin.makedict.DictEncoder; import com.android.inputmethod.latin.makedict.DictionaryHeader; +import com.android.inputmethod.latin.makedict.FormatSpec; import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary; @@ -35,13 +42,37 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; +import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Random; +import java.util.Set; /** * Unit tests for BinaryDictOffdeviceUtils */ public class BinaryDictOffdeviceUtilsTests extends TestCase { private static final int TEST_FREQ = 37; // Some arbitrary value unlikely to happen by chance + private static final int CODE_POINT_SET_SIZE = 300; + final Random mRandom; + private static final ArrayList<String> sWords = new ArrayList<>(); + + public BinaryDictOffdeviceUtilsTests(final long seed, final int maxUnigrams) { + super(); + mRandom = new Random(seed); + sWords.clear(); + generateWords(maxUnigrams, mRandom); + } + + private static void generateWords(final int maxUnigrams, final Random random) { + final int[] codePointSet = CodePointUtils.generateCodePointSet( + CODE_POINT_SET_SIZE, random); + final Set<String> wordSet = new HashSet<>(); + while (wordSet.size() < maxUnigrams) { + wordSet.add(CodePointUtils.generateWord(random, codePointSet)); + } + sWords.addAll(wordSet); + } public void testGetRawDictWorks() throws IOException, UnsupportedFormatException { final String VERSION = "1"; @@ -68,23 +99,17 @@ public class BinaryDictOffdeviceUtilsTests extends TestCase { final File dst = File.createTempFile("testGetRawDict", ".tmp"); dst.deleteOnExit(); try (final OutputStream out = Compress.getCompressedStream( - Compress.getCompressedStream( - Compress.getCompressedStream( - new BufferedOutputStream(new FileOutputStream(dst)))))) { + new BufferedOutputStream(new FileOutputStream(dst)))) { final DictEncoder dictEncoder = new Ver2DictEncoder(out); - dictEncoder.writeDictionary(dict, new FormatOptions(2, false)); + dictEncoder.writeDictionary(dict, new FormatOptions(FormatSpec.VERSION202, false)); } // Test for an actually compressed dictionary and its contents - final BinaryDictOffdeviceUtils.DecoderChainSpec decodeSpec = + final BinaryDictOffdeviceUtils.DecoderChainSpec<File> decodeSpec = BinaryDictOffdeviceUtils.getRawDictionaryOrNull(dst); - for (final int step : decodeSpec.mDecoderSpec) { - assertEquals("Wrong decode spec", - BinaryDictOffdeviceUtils.DecoderChainSpec.COMPRESSION, step); - } - assertEquals("Wrong decode spec", 3, decodeSpec.mDecoderSpec.length); - final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(decodeSpec.mFile, 0, - decodeSpec.mFile.length()); + assertEquals("Wrong decode spec", "raw > compression", decodeSpec.describeChain()); + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(decodeSpec.mResult, 0, + decodeSpec.mResult.length()); final FusionDictionary resultDict = dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */); assertEquals("Wrong version attribute", VERSION, resultDict.mOptions.mAttributes.get( @@ -125,4 +150,64 @@ public class BinaryDictOffdeviceUtilsTests extends TestCase { assertNull("Wrongly identified data file", BinaryDictOffdeviceUtils.getRawDictionaryOrNull(gzDst)); } + + public void runTestHeaderReaderProcessorWithOneSpec(final boolean compress, final boolean crypt) + throws IOException, UnsupportedFormatException { + final String dictName = "testHeaderReaderProcessor"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final FormatOptions formatOptions = BinaryDictUtils.STATIC_OPTIONS; + final int MAX_NUMBER_OF_OPTIONS_TO_ADD = 5; + final HashMap<String, String> options = new HashMap<>(); + // Required attributes + options.put("dictionary", "main:en_US"); + options.put("locale", "en_US"); + options.put("version", Integer.toString(mRandom.nextInt())); + // Add some random options for test + final int numberOfOptionsToAdd = mRandom.nextInt() % (MAX_NUMBER_OF_OPTIONS_TO_ADD + 1); + for (int i = 0; i < numberOfOptionsToAdd; ++i) { + options.put(sWords.get(2 * i), sWords.get(2 * 1 + 1)); + } + final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), + new DictionaryOptions(options)); + + for (int i = 0; i < sWords.size(); ++i) { + final String word = sWords.get(i); + dict.add(word, new ProbabilityInfo(TEST_FREQ), null /* shortcuts */, + false /* isNotAWord */, false /* isPossiblyOffensive */); + } + + File file = File.createTempFile(dictName, ".tmp"); + final DictEncoder dictEncoder = BinaryDictUtils.getDictEncoder(file, formatOptions); + dictEncoder.writeDictionary(dict, formatOptions); + + if (compress) { + final File rawFile = file; + file = File.createTempFile(dictName + ".compress", ".tmp"); + final Compress.Compressor compressCommand = new Compress.Compressor(); + compressCommand.setArgs(new String[] { rawFile.getPath(), file.getPath() }); + compressCommand.run(); + } + if (crypt) { + final File rawFile = file; + file = File.createTempFile(dictName + ".crypt", ".tmp"); + final Crypt.Encrypter cryptCommand = new Crypt.Encrypter(); + cryptCommand.setArgs(new String[] { rawFile.getPath(), file.getPath() }); + cryptCommand.run(); + } + + final DecoderChainSpec<DictionaryHeader> spec = + BinaryDictOffdeviceUtils.decodeDictionaryForProcess(file, + new BinaryDictOffdeviceUtils.HeaderReaderProcessor()); + assertNotNull("Can't decode a dictionary we just wrote : " + file, spec); + final DictionaryHeader header = spec.mResult; + assertEquals("raw" + (crypt ? " > encryption" : "") + (compress ? " > compression" : ""), + spec.describeChain()); + assertEquals(header.mDictionaryOptions.mAttributes, options); + } + + public void testHeaderReaderProcessor() throws IOException, UnsupportedFormatException { + runTestHeaderReaderProcessorWithOneSpec(false /* compress */, false /* crypt */); + runTestHeaderReaderProcessorWithOneSpec(true /* compress */, false /* crypt */); + runTestHeaderReaderProcessorWithOneSpec(true /* compress */, true /* crypt */); + } } |