diff options
Diffstat (limited to 'tools/dicttool/src')
5 files changed, 749 insertions, 0 deletions
diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java new file mode 100644 index 000000000..9ebd3bbdd --- /dev/null +++ b/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java @@ -0,0 +1,320 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.MakedictLog; +import com.android.inputmethod.latin.makedict.UnsupportedFormatException; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.util.Arrays; +import java.util.LinkedList; + +import javax.xml.parsers.ParserConfigurationException; + +import org.xml.sax.SAXException; + +/** + * Main class/method for DictionaryMaker: reads one input dictionary, writes the requested outputs. 
+ */ +public class DictionaryMaker { + /** Parsed command line; every path field is null when its option was not given. */ + static class Arguments { + private final static String OPTION_VERSION_2 = "-2"; + private final static String OPTION_INPUT_SOURCE = "-s"; + private final static String OPTION_INPUT_BIGRAM_XML = "-b"; + private final static String OPTION_INPUT_SHORTCUT_XML = "-c"; + private final static String OPTION_OUTPUT_BINARY = "-d"; + private final static String OPTION_OUTPUT_BINARY_FORMAT_VERSION_1 = "-d1"; + private final static String OPTION_OUTPUT_XML = "-x"; + private final static String OPTION_HELP = "-h"; + public final String mInputBinary; + public final String mInputUnigramXml; + public final String mInputShortcutXml; + public final String mInputBigramXml; + public final String mOutputBinary; + public final String mOutputBinaryFormat1; + public final String mOutputXml; + + private void checkIntegrity() throws IOException { + checkHasExactlyOneInput(); + checkHasAtLeastOneOutput(); + checkNotSameFile(mInputBinary, mOutputBinary); + checkNotSameFile(mInputBinary, mOutputBinaryFormat1); + checkNotSameFile(mInputBinary, mOutputXml); + checkNotSameFile(mInputUnigramXml, mOutputBinary); + checkNotSameFile(mInputUnigramXml, mOutputBinaryFormat1); + checkNotSameFile(mInputUnigramXml, mOutputXml); + checkNotSameFile(mInputShortcutXml, mOutputBinary); + checkNotSameFile(mInputShortcutXml, mOutputBinaryFormat1); + checkNotSameFile(mInputShortcutXml, mOutputXml); + checkNotSameFile(mInputBigramXml, mOutputBinary); + checkNotSameFile(mInputBigramXml, mOutputBinaryFormat1); + checkNotSameFile(mInputBigramXml, mOutputXml); + checkNotSameFile(mOutputBinary, mOutputBinaryFormat1); + checkNotSameFile(mOutputBinary, mOutputXml); + checkNotSameFile(mOutputBinaryFormat1, mOutputXml); + } + + private void checkHasExactlyOneInput() { + if (null == mInputUnigramXml && null == mInputBinary) { + throw new RuntimeException("No input file specified"); + } else if (null != mInputUnigramXml && null != mInputBinary) { + throw new RuntimeException("Both input 
XML and binary specified"); + } else if (null != mInputBinary && null != mInputBigramXml) { + throw new RuntimeException("Cannot specify a binary input and a separate bigram " + + "file"); + } + } + + private void checkHasAtLeastOneOutput() { + if (null == mOutputBinary && null == mOutputBinaryFormat1 && null == mOutputXml) { + throw new RuntimeException("No output specified"); + } + } + + /** + * Utility method that throws an exception if path1 and path2 point to the same file. + */ + private static void checkNotSameFile(final String path1, final String path2) + throws IOException { + if (null == path1 || null == path2) return; + if (new File(path1).getCanonicalPath().equals(new File(path2).getCanonicalPath())) { + throw new RuntimeException(path1 + " and " + path2 + " are the same file: " + + " refusing to process."); + } + } + + private void displayHelp() { + MakedictLog.i(getHelp()); + } + + public static String getHelp() { + return "Usage: makedict " + + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts.xml>] " + + "| -s <binary input>] [-d <binary output format version 2>] " + + "[-d1 <binary output format version 1>] [-x <xml output>] [-2]\n" + + "\n" + + " Converts a source dictionary file to one or several outputs.\n" + + " Source can be an XML file, with an optional XML bigrams file, or a\n" + + " binary dictionary file.\n" + + " Binary version 1 (Ice Cream Sandwich), 2 (Jelly Bean) and XML outputs\n" + + " are supported. All three can be output at the same time, but the same\n" + + " output format cannot be specified several times. 
The behavior is\n" + + " unspecified if the same file is specified for input and output, or for\n" + + " several outputs."; + } + + public Arguments(String[] argsArray) throws IOException { + final LinkedList<String> args = new LinkedList<String>(Arrays.asList(argsArray)); + if (args.isEmpty()) { + displayHelp(); + } + String inputBinary = null; + String inputUnigramXml = null; + String inputShortcutXml = null; + String inputBigramXml = null; + String outputBinary = null; + String outputBinaryFormat1 = null; + String outputXml = null; + + while (!args.isEmpty()) { + final String arg = args.get(0); + args.remove(0); + if (arg.charAt(0) == '-') { + if (OPTION_VERSION_2.equals(arg)) { + // Do nothing, this is the default + } else if (OPTION_HELP.equals(arg)) { + displayHelp(); + } else { + // All these options need an argument + if (args.isEmpty()) { + throw new IllegalArgumentException("Option " + arg + " is unknown or " + + "requires an argument"); + } + String filename = args.get(0); + args.remove(0); + if (OPTION_INPUT_SOURCE.equals(arg)) { + if (BinaryDictInputOutput.isBinaryDictionary(filename)) { + inputBinary = filename; + } else { + inputUnigramXml = filename; + } + } else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) { + inputShortcutXml = filename; + } else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) { + inputBigramXml = filename; + } else if (OPTION_OUTPUT_BINARY.equals(arg)) { + outputBinary = filename; + } else if (OPTION_OUTPUT_BINARY_FORMAT_VERSION_1.equals(arg)) { + outputBinaryFormat1 = filename; + } else if (OPTION_OUTPUT_XML.equals(arg)) { + outputXml = filename; + } else { + throw new IllegalArgumentException("Unknown option : " + arg); + } + } + } else { + if (null == inputBinary && null == inputUnigramXml) { + if (BinaryDictInputOutput.isBinaryDictionary(arg)) { + inputBinary = arg; + } else { + inputUnigramXml = arg; + } + } else if (null == outputBinary) { + outputBinary = arg; + } else { + throw new IllegalArgumentException("Several output binary 
files specified"); + } + } + } + + mInputBinary = inputBinary; + mInputUnigramXml = inputUnigramXml; + mInputShortcutXml = inputShortcutXml; + mInputBigramXml = inputBigramXml; + mOutputBinary = outputBinary; + mOutputBinaryFormat1 = outputBinaryFormat1; + mOutputXml = outputXml; + checkIntegrity(); + } + } + + public static void main(String[] args) + throws FileNotFoundException, ParserConfigurationException, SAXException, IOException, + UnsupportedFormatException { + final Arguments parsedArgs = new Arguments(args); + FusionDictionary dictionary = readInputFromParsedArgs(parsedArgs); + writeOutputToParsedArgs(parsedArgs, dictionary); + } + + /** + * Invoke the right input method according to args. + * + * @param args the parsed command line arguments. + * @return the read dictionary. + */ + private static FusionDictionary readInputFromParsedArgs(final Arguments args) + throws IOException, UnsupportedFormatException, ParserConfigurationException, + SAXException, FileNotFoundException { + if (null != args.mInputBinary) { + return readBinaryFile(args.mInputBinary); + } else if (null != args.mInputUnigramXml) { + return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml); + } else { + throw new RuntimeException("No input file specified"); + } + } + + /** + * Read a dictionary from the name of a binary file. + * + * @param binaryFilename the name of the file in the binary dictionary format. + * @return the read dictionary. 
+ * @throws FileNotFoundException if the file can't be found + * @throws IOException if the input file can't be read + * @throws UnsupportedFormatException if the binary file is not in the expected format + */ + private static FusionDictionary readBinaryFile(final String binaryFilename) + throws FileNotFoundException, IOException, UnsupportedFormatException { + final RandomAccessFile inputFile = new RandomAccessFile(binaryFilename, "r"); + return BinaryDictInputOutput.readDictionaryBinary(inputFile, null); + } + + /** + * Read a dictionary from a unigram XML file, and optionally shortcut and bigram XML files. + * + * @param unigramXmlFilename the name of the unigram XML file. May not be null. + * @param shortcutXmlFilename the name of the shortcut XML file, or null if there is none. + * @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams. + * @return the read dictionary. + * @throws FileNotFoundException if one of the files can't be found + * @throws SAXException if one or more of the XML files is not well-formed + * @throws IOException if one of the input files can't be read + * @throws ParserConfigurationException if the system can't create a SAX parser + */ + private static FusionDictionary readXmlFile(final String unigramXmlFilename, + final String shortcutXmlFilename, final String bigramXmlFilename) + throws FileNotFoundException, SAXException, IOException, ParserConfigurationException { + final FileInputStream unigrams = new FileInputStream(new File(unigramXmlFilename)); + final FileInputStream shortcuts = null == shortcutXmlFilename ? null : + new FileInputStream(new File(shortcutXmlFilename)); + final FileInputStream bigrams = null == bigramXmlFilename ? null : + new FileInputStream(new File(bigramXmlFilename)); + return XmlDictInputOutput.readDictionaryXml(unigrams, shortcuts, bigrams); + } + + /** + * Invoke the right output method according to args. 
+ * + * This will write the passed dictionary to the file(s) passed in the command line arguments. + * @param args the parsed arguments. + * @param dict the dictionary to output. + * @throws FileNotFoundException if one of the output files can't be created. + * @throws IOException if one of the output files can't be written to. + */ + private static void writeOutputToParsedArgs(final Arguments args, final FusionDictionary dict) + throws FileNotFoundException, IOException, UnsupportedFormatException, + IllegalArgumentException { + if (null != args.mOutputBinary) { + writeBinaryDictionary(args.mOutputBinary, dict, 2); + } + if (null != args.mOutputBinaryFormat1) { + writeBinaryDictionary(args.mOutputBinaryFormat1, dict, 1); + } + if (null != args.mOutputXml) { + writeXmlDictionary(args.mOutputXml, dict); + } + } + + /** + * Write the dictionary in binary format to the specified filename. + * + * @param outputFilename the name of the file to write to. + * @param dict the dictionary to write. + * @param version the binary format version to use. + * @throws FileNotFoundException if the output file can't be created. + * @throws IOException if the output file can't be written to. + */ + private static void writeBinaryDictionary(final String outputFilename, + final FusionDictionary dict, final int version) + throws FileNotFoundException, IOException, UnsupportedFormatException { + final File outputFile = new File(outputFilename); + BinaryDictInputOutput.writeDictionaryBinary(new FileOutputStream(outputFilename), dict, + version); + } + + /** + * Write the dictionary in XML format to the specified filename. + * + * @param outputFilename the name of the file to write to. + * @param dict the dictionary to write. + * @throws FileNotFoundException if the output file can't be created. + * @throws IOException if the output file can't be written to. 
+ */ + private static void writeXmlDictionary(final String outputFilename, + final FusionDictionary dict) throws FileNotFoundException, IOException { + XmlDictInputOutput.writeDictionaryXml(new FileWriter(outputFilename), dict); + } +} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/Dicttool.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/Dicttool.java index 8fc0423b8..c14ce7b88 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/Dicttool.java +++ b/tools/dicttool/src/android/inputmethod/latin/dicttool/Dicttool.java @@ -35,6 +35,7 @@ public class Dicttool { sCommands.put("info", Info.class); sCommands.put("compress", Compress.Compressor.class); sCommands.put("uncompress", Compress.Uncompressor.class); + sCommands.put("makedict", Makedict.class); } private static Command getCommandInstance(final String commandName) { diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/Makedict.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/Makedict.java new file mode 100644 index 000000000..c004cfbe4 --- /dev/null +++ b/tools/dicttool/src/android/inputmethod/latin/dicttool/Makedict.java @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.UnsupportedFormatException; + +import java.io.FileNotFoundException; +import java.io.IOException; +import javax.xml.parsers.ParserConfigurationException; +import org.xml.sax.SAXException; +/** Dicttool command that forwards its arguments to {@link DictionaryMaker#main}. */ +public class Makedict extends Dicttool.Command { + public static final String COMMAND = "makedict"; + + public Makedict() { + } + + public String getHelp() { + return DictionaryMaker.Arguments.getHelp(); + } + + public void run() throws FileNotFoundException, IOException, ParserConfigurationException, + SAXException, UnsupportedFormatException { + DictionaryMaker.main(mArgs); + } +} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/MakedictLog.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/MakedictLog.java new file mode 100644 index 000000000..7eccff2b4 --- /dev/null +++ b/tools/dicttool/src/android/inputmethod/latin/dicttool/MakedictLog.java @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.android.inputmethod.latin.makedict; + +/** + * Wrapper to redirect log events to the right output medium. 
+ */ +public class MakedictLog { + public static final boolean DBG = true; + /** Single sink for all levels below: prints the message to standard output. */ + private static void print(String message) { + System.out.println(message); + } + + public static void d(String message) { + print(message); + } + + public static void i(String message) { + print(message); + } + + public static void w(String message) { + print(message); + } + + public static void e(String message) { + print(message); + } +} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/XmlDictInputOutput.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/XmlDictInputOutput.java new file mode 100644 index 000000000..8e2e73505 --- /dev/null +++ b/tools/dicttool/src/android/inputmethod/latin/dicttool/XmlDictInputOutput.java @@ -0,0 +1,344 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; +import com.android.inputmethod.latin.makedict.Word; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Writer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.TreeSet; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Reads and writes XML files for a FusionDictionary. + * + * All functions in this class are static. + */ +public class XmlDictInputOutput { + + private static final String ROOT_TAG = "wordlist"; + private static final String WORD_TAG = "w"; + private static final String BIGRAM_TAG = "bigram"; + private static final String SHORTCUT_TAG = "shortcut"; + private static final String FREQUENCY_ATTR = "f"; + private static final String WORD_ATTR = "word"; + + private static final int SHORTCUT_ONLY_DEFAULT_FREQ = 1; + /** Root-tag attribute whose value selects German umlaut or French ligature processing. */ + private static final String OPTIONS_KEY = "options"; + private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing"; + private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing"; + + /** + * SAX handler for a unigram XML file. 
+ */ + static private class UnigramHandler extends DefaultHandler { + // Parser states + private static final int NONE = 0; + private static final int START = 1; + private static final int WORD = 2; + private static final int BIGRAM = 4; + private static final int END = 5; + private static final int UNKNOWN = 6; + + FusionDictionary mDictionary; + int mState; // the state of the parser + int mFreq; // the currently read freq + String mWord; // the current word + final HashMap<String, ArrayList<WeightedString>> mShortcutsMap; + + /** + * Create the handler. + * + * @param shortcuts the shortcuts as a map. This may be empty, but may not be null. + */ + public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) { + mDictionary = null; + mShortcutsMap = shortcuts; + mWord = ""; + mState = START; + mFreq = 0; + } + + public FusionDictionary getFinalDictionary() { + final FusionDictionary dict = mDictionary; + mDictionary = null; + mShortcutsMap.clear(); + mWord = ""; + mState = START; + mFreq = 0; + return dict; + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes attrs) { + if (WORD_TAG.equals(localName)) { + mState = WORD; + mWord = ""; + for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { + final String attrName = attrs.getLocalName(attrIndex); + if (FREQUENCY_ATTR.equals(attrName)) { + mFreq = Integer.parseInt(attrs.getValue(attrIndex)); + } + } + } else if (ROOT_TAG.equals(localName)) { + final HashMap<String, String> attributes = new HashMap<String, String>(); + for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { + final String attrName = attrs.getLocalName(attrIndex); + attributes.put(attrName, attrs.getValue(attrIndex)); + } + final String optionsString = attributes.get(OPTIONS_KEY); + final boolean processUmlauts = + GERMAN_UMLAUT_PROCESSING_OPTION.equals(optionsString); + final boolean processLigatures = 
+ FRENCH_LIGATURE_PROCESSING_OPTION.equals(optionsString); + mDictionary = new FusionDictionary(new Node(), new DictionaryOptions(attributes, + processUmlauts, processLigatures)); + } else { + mState = UNKNOWN; + } + } + + @Override + public void characters(char[] ch, int start, int length) { + if (WORD == mState) { + // The XML parser is free to return text in arbitrary chunks one after the + // other. In particular, this happens in some implementations when it finds + // an escape code like "&amp;". + mWord += String.copyValueOf(ch, start, length); + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + if (WORD == mState) { + mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord)); + mState = START; + } + } + } + /** SAX handler collecting, per source entry, a list of weighted target strings. */ + static private class AssociativeListHandler extends DefaultHandler { + private final String SRC_TAG; + private final String SRC_ATTRIBUTE; + private final String DST_TAG; + private final String DST_ATTRIBUTE; + private final String DST_FREQ; + + // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX + private final static int XML_MAX = 256; + // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX + private final static int MEMORY_MAX = 256; + private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX; + + private String mSrc; + private final HashMap<String, ArrayList<WeightedString>> mAssocMap; + + public AssociativeListHandler(final String srcTag, final String srcAttribute, + final String dstTag, final String dstAttribute, final String dstFreq) { + SRC_TAG = srcTag; + SRC_ATTRIBUTE = srcAttribute; + DST_TAG = dstTag; + DST_ATTRIBUTE = dstAttribute; + DST_FREQ = dstFreq; + mSrc = null; + mAssocMap = new HashMap<String, ArrayList<WeightedString>>(); + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes attrs) { + if (SRC_TAG.equals(localName)) { + mSrc = attrs.getValue(uri, SRC_ATTRIBUTE); + } else if 
+ (DST_TAG.equals(localName)) { + String dst = attrs.getValue(uri, DST_ATTRIBUTE); + int freq = Integer.parseInt(attrs.getValue(uri, DST_FREQ)); + WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO); + ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc); + if (null == bigramList) bigramList = new ArrayList<WeightedString>(); + bigramList.add(bigram); + mAssocMap.put(mSrc, bigramList); + } + } + + // This may return an empty map, but will never return null. + public HashMap<String, ArrayList<WeightedString>> getAssocMap() { + return mAssocMap; + } + } + + /** + * SAX handler for a bigram XML file (bi/w1 entries containing w/w2 targets weighted by p). + */ + static private class BigramHandler extends AssociativeListHandler { + private final static String BIGRAM_W1_TAG = "bi"; + private final static String BIGRAM_W2_TAG = "w"; + private final static String BIGRAM_W1_ATTRIBUTE = "w1"; + private final static String BIGRAM_W2_ATTRIBUTE = "w2"; + private final static String BIGRAM_FREQ_ATTRIBUTE = "p"; + + public BigramHandler() { + super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE, + BIGRAM_FREQ_ATTRIBUTE); + } + + // As per getAssocMap(), this never returns null. + public HashMap<String, ArrayList<WeightedString>> getBigramMap() { + return getAssocMap(); + } + } + + /** + * SAX handler for a shortcut XML file (entry/shortcut entries, target/replacement by priority). 
+ */ + static private class ShortcutHandler extends AssociativeListHandler { + private final static String ENTRY_TAG = "entry"; + private final static String ENTRY_ATTRIBUTE = "shortcut"; + private final static String TARGET_TAG = "target"; + private final static String REPLACEMENT_ATTRIBUTE = "replacement"; + private final static String TARGET_PRIORITY_ATTRIBUTE = "priority"; + + public ShortcutHandler() { + super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE, + TARGET_PRIORITY_ATTRIBUTE); + } + + // As per getAssocMap(), this never returns null. 
+ public HashMap<String, ArrayList<WeightedString>> getShortcutMap() { + return getAssocMap(); + } + } + + /** + * Reads a dictionary from an XML file. + * + * This is the public method that will parse an XML file and return the corresponding memory + * representation. Bigrams naming a word missing from the unigrams are skipped. + * + * @param unigrams the file to read the data from. + * @param shortcuts the file to read the shortcuts from, or null. + * @param bigrams the file to read the bigrams from, or null. + * @return the in-memory representation of the dictionary. + */ + public static FusionDictionary readDictionaryXml(final InputStream unigrams, + final InputStream shortcuts, final InputStream bigrams) + throws SAXException, IOException, ParserConfigurationException { + final SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + final SAXParser parser = factory.newSAXParser(); + final BigramHandler bigramHandler = new BigramHandler(); + if (null != bigrams) parser.parse(bigrams, bigramHandler); + + final ShortcutHandler shortcutHandler = new ShortcutHandler(); + if (null != shortcuts) parser.parse(shortcuts, shortcutHandler); + + final UnigramHandler unigramHandler = + new UnigramHandler(shortcutHandler.getShortcutMap()); + parser.parse(unigrams, unigramHandler); + final FusionDictionary dict = unigramHandler.getFinalDictionary(); + final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap(); + for (final String firstWord : bigramMap.keySet()) { + if (!dict.hasWord(firstWord)) continue; + final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord); + for (final WeightedString bigram : bigramList) { + if (!dict.hasWord(bigram.mWord)) continue; + dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency); + } + } + return dict; + } + + /** + * Reads a dictionary in the first, legacy XML format + * + * This method reads data from the parser and creates a new FusionDictionary with it. 
+ * The format parsed by this method is the format used before Ice Cream Sandwich, + * which has no support for bigrams or shortcuts. + * It is important to note that this method expects the parser to have already eaten + * the first, all-encompassing tag. + * + * @param xpp the parser to read the data from. + * @return the parsed dictionary. NOTE(review): no method follows this comment here. + */ + + /** + * Writes a dictionary to an XML file. + * + * The output format is the "second" format, which supports bigrams and shortcuts. + * + * @param destination a destination stream to write to; it is closed once writing is done. + * @param dict the dictionary to write. NOTE(review): strings are emitted without XML escaping. + */ + public static void writeDictionaryXml(Writer destination, FusionDictionary dict) + throws IOException { + final TreeSet<Word> set = new TreeSet<Word>(); + for (Word word : dict) { + set.add(word); + } + // TODO: use an XMLSerializer if this gets big + destination.write("<wordlist format=\"2\""); + final HashMap<String, String> options = dict.mOptions.mAttributes; + if (dict.mOptions.mGermanUmlautProcessing) { + destination.write(" " + OPTIONS_KEY + "=\"" + GERMAN_UMLAUT_PROCESSING_OPTION + "\""); + } else if (dict.mOptions.mFrenchLigatureProcessing) { + destination.write(" " + OPTIONS_KEY + "=\"" + FRENCH_LIGATURE_PROCESSING_OPTION + "\""); + } + for (final String key : dict.mOptions.mAttributes.keySet()) { + final String value = dict.mOptions.mAttributes.get(key); + destination.write(" " + key + "=\"" + value + "\""); + } + destination.write(">\n"); + destination.write("<!-- Warning: there is no code to read this format yet. 
-->\n"); + for (Word word : set) { + destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" " + + FREQUENCY_ATTR + "=\"" + word.mFrequency + "\">"); + if (null != word.mShortcutTargets) { + destination.write("\n"); + for (WeightedString target : word.mShortcutTargets) { + destination.write(" <" + SHORTCUT_TAG + " " + FREQUENCY_ATTR + "=\"" + + target.mFrequency + "\">" + target.mWord + "</" + SHORTCUT_TAG + + ">\n"); + } + destination.write(" "); + } + if (null != word.mBigrams) { + destination.write("\n"); + for (WeightedString bigram : word.mBigrams) { + destination.write(" <" + BIGRAM_TAG + " " + FREQUENCY_ATTR + "=\"" + + bigram.mFrequency + "\">" + bigram.mWord + "</" + BIGRAM_TAG + ">\n"); + } + destination.write(" "); + } + destination.write("</" + WORD_TAG + ">\n"); + } + destination.write("</wordlist>\n"); + destination.close(); + } +} |