diff options
author | 2012-08-03 17:05:41 +0900 | |
---|---|---|
committer | 2012-08-04 01:11:46 +0900 | |
commit | 54e84a00fc032ba566cbda41feafa71de77e1c43 (patch) | |
tree | e7f058ff1039572b8318955131a8a8003479b4a6 /tools/dicttool/src | |
parent | 1644a3c7323ae33063774d32ce2e0f8698ff712d (diff) | |
download | latinime-54e84a00fc032ba566cbda41feafa71de77e1c43.tar.gz latinime-54e84a00fc032ba566cbda41feafa71de77e1c43.tar.xz latinime-54e84a00fc032ba566cbda41feafa71de77e1c43.zip |
Make a makedict command for dicttool (A3)
This behaves exactly as the old makedict command. Further
changes will redirect the calls to makedict to this, so as
to consolidate similar code.
Groundwork for
Bug: 6429606
Change-Id: Ibeadbf48bec70f988a15ca36ebf5d1ce3b5b54ea
Diffstat (limited to 'tools/dicttool/src')
5 files changed, 749 insertions, 0 deletions
diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java new file mode 100644 index 000000000..9ebd3bbdd --- /dev/null +++ b/tools/dicttool/src/android/inputmethod/latin/dicttool/DictionaryMaker.java @@ -0,0 +1,320 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.BinaryDictInputOutput; +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.MakedictLog; +import com.android.inputmethod.latin.makedict.UnsupportedFormatException; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileWriter; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.util.Arrays; +import java.util.LinkedList; + +import javax.xml.parsers.ParserConfigurationException; + +import org.xml.sax.SAXException; + +/** + * Main class/method for DictionaryMaker. 
+ */ +public class DictionaryMaker { + + static class Arguments { + private final static String OPTION_VERSION_2 = "-2"; + private final static String OPTION_INPUT_SOURCE = "-s"; + private final static String OPTION_INPUT_BIGRAM_XML = "-b"; + private final static String OPTION_INPUT_SHORTCUT_XML = "-c"; + private final static String OPTION_OUTPUT_BINARY = "-d"; + private final static String OPTION_OUTPUT_BINARY_FORMAT_VERSION_1 = "-d1"; + private final static String OPTION_OUTPUT_XML = "-x"; + private final static String OPTION_HELP = "-h"; + public final String mInputBinary; + public final String mInputUnigramXml; + public final String mInputShortcutXml; + public final String mInputBigramXml; + public final String mOutputBinary; + public final String mOutputBinaryFormat1; + public final String mOutputXml; + + private void checkIntegrity() throws IOException { + checkHasExactlyOneInput(); + checkHasAtLeastOneOutput(); + checkNotSameFile(mInputBinary, mOutputBinary); + checkNotSameFile(mInputBinary, mOutputBinaryFormat1); + checkNotSameFile(mInputBinary, mOutputXml); + checkNotSameFile(mInputUnigramXml, mOutputBinary); + checkNotSameFile(mInputUnigramXml, mOutputBinaryFormat1); + checkNotSameFile(mInputUnigramXml, mOutputXml); + checkNotSameFile(mInputShortcutXml, mOutputBinary); + checkNotSameFile(mInputShortcutXml, mOutputBinaryFormat1); + checkNotSameFile(mInputShortcutXml, mOutputXml); + checkNotSameFile(mInputBigramXml, mOutputBinary); + checkNotSameFile(mInputBigramXml, mOutputBinaryFormat1); + checkNotSameFile(mInputBigramXml, mOutputXml); + checkNotSameFile(mOutputBinary, mOutputBinaryFormat1); + checkNotSameFile(mOutputBinary, mOutputXml); + checkNotSameFile(mOutputBinaryFormat1, mOutputXml); + } + + private void checkHasExactlyOneInput() { + if (null == mInputUnigramXml && null == mInputBinary) { + throw new RuntimeException("No input file specified"); + } else if (null != mInputUnigramXml && null != mInputBinary) { + throw new RuntimeException("Both input 
XML and binary specified"); + } else if (null != mInputBinary && null != mInputBigramXml) { + throw new RuntimeException("Cannot specify a binary input and a separate bigram " + + "file"); + } + } + + private void checkHasAtLeastOneOutput() { + if (null == mOutputBinary && null == mOutputBinaryFormat1 && null == mOutputXml) { + throw new RuntimeException("No output specified"); + } + } + + /** + * Utility method that throws an exception if path1 and path2 point to the same file. + */ + private static void checkNotSameFile(final String path1, final String path2) + throws IOException { + if (null == path1 || null == path2) return; + if (new File(path1).getCanonicalPath().equals(new File(path2).getCanonicalPath())) { + throw new RuntimeException(path1 + " and " + path2 + " are the same file: " + + " refusing to process."); + } + } + + private void displayHelp() { + MakedictLog.i(getHelp()); + } + + public static String getHelp() { + return "Usage: makedict " + + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts.xml>] " + + "| -s <binary input>] [-d <binary output format version 2>] " + + "[-d1 <binary output format version 1>] [-x <xml output>] [-2]\n" + + "\n" + + " Converts a source dictionary file to one or several outputs.\n" + + " Source can be an XML file, with an optional XML bigrams file, or a\n" + + " binary dictionary file.\n" + + " Binary version 1 (Ice Cream Sandwich), 2 (Jelly Bean) and XML outputs\n" + + " are supported. All three can be output at the same time, but the same\n" + + " output format cannot be specified several times. 
The behavior is\n" + + " unspecified if the same file is specified for input and output, or for\n" + + " several outputs."; + } + + public Arguments(String[] argsArray) throws IOException { + final LinkedList<String> args = new LinkedList<String>(Arrays.asList(argsArray)); + if (args.isEmpty()) { + displayHelp(); + } + String inputBinary = null; + String inputUnigramXml = null; + String inputShortcutXml = null; + String inputBigramXml = null; + String outputBinary = null; + String outputBinaryFormat1 = null; + String outputXml = null; + + while (!args.isEmpty()) { + final String arg = args.get(0); + args.remove(0); + if (arg.charAt(0) == '-') { + if (OPTION_VERSION_2.equals(arg)) { + // Do nothing, this is the default + } else if (OPTION_HELP.equals(arg)) { + displayHelp(); + } else { + // All these options need an argument + if (args.isEmpty()) { + throw new IllegalArgumentException("Option " + arg + " is unknown or " + + "requires an argument"); + } + String filename = args.get(0); + args.remove(0); + if (OPTION_INPUT_SOURCE.equals(arg)) { + if (BinaryDictInputOutput.isBinaryDictionary(filename)) { + inputBinary = filename; + } else { + inputUnigramXml = filename; + } + } else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) { + inputShortcutXml = filename; + } else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) { + inputBigramXml = filename; + } else if (OPTION_OUTPUT_BINARY.equals(arg)) { + outputBinary = filename; + } else if (OPTION_OUTPUT_BINARY_FORMAT_VERSION_1.equals(arg)) { + outputBinaryFormat1 = filename; + } else if (OPTION_OUTPUT_XML.equals(arg)) { + outputXml = filename; + } else { + throw new IllegalArgumentException("Unknown option : " + arg); + } + } + } else { + if (null == inputBinary && null == inputUnigramXml) { + if (BinaryDictInputOutput.isBinaryDictionary(arg)) { + inputBinary = arg; + } else { + inputUnigramXml = arg; + } + } else if (null == outputBinary) { + outputBinary = arg; + } else { + throw new IllegalArgumentException("Several output binary 
files specified"); + } + } + } + + mInputBinary = inputBinary; + mInputUnigramXml = inputUnigramXml; + mInputShortcutXml = inputShortcutXml; + mInputBigramXml = inputBigramXml; + mOutputBinary = outputBinary; + mOutputBinaryFormat1 = outputBinaryFormat1; + mOutputXml = outputXml; + checkIntegrity(); + } + } + + public static void main(String[] args) + throws FileNotFoundException, ParserConfigurationException, SAXException, IOException, + UnsupportedFormatException { + final Arguments parsedArgs = new Arguments(args); + FusionDictionary dictionary = readInputFromParsedArgs(parsedArgs); + writeOutputToParsedArgs(parsedArgs, dictionary); + } + + /** + * Invoke the right input method according to args. + * + * @param args the parsed command line arguments. + * @return the read dictionary. + */ + private static FusionDictionary readInputFromParsedArgs(final Arguments args) + throws IOException, UnsupportedFormatException, ParserConfigurationException, + SAXException, FileNotFoundException { + if (null != args.mInputBinary) { + return readBinaryFile(args.mInputBinary); + } else if (null != args.mInputUnigramXml) { + return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml); + } else { + throw new RuntimeException("No input file specified"); + } + } + + /** + * Read a dictionary from the name of a binary file. + * + * @param binaryFilename the name of the file in the binary dictionary format. + * @return the read dictionary. 
+ * @throws FileNotFoundException if the file can't be found + * @throws IOException if the input file can't be read + * @throws UnsupportedFormatException if the binary file is not in the expected format + */ + private static FusionDictionary readBinaryFile(final String binaryFilename) + throws FileNotFoundException, IOException, UnsupportedFormatException { + final RandomAccessFile inputFile = new RandomAccessFile(binaryFilename, "r"); + return BinaryDictInputOutput.readDictionaryBinary(inputFile, null); + } + + /** + * Read a dictionary from a unigram XML file, and optionally a bigram XML file. + * + * @param unigramXmlFilename the name of the unigram XML file. May not be null. + * @param shortcutXmlFilename the name of the shortcut XML file, or null if there is none. + * @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams. + * @return the read dictionary. + * @throws FileNotFoundException if one of the files can't be found + * @throws SAXException if one or more of the XML files is not well-formed + * @throws IOException if one the input files can't be read + * @throws ParserConfigurationException if the system can't create a SAX parser + */ + private static FusionDictionary readXmlFile(final String unigramXmlFilename, + final String shortcutXmlFilename, final String bigramXmlFilename) + throws FileNotFoundException, SAXException, IOException, ParserConfigurationException { + final FileInputStream unigrams = new FileInputStream(new File(unigramXmlFilename)); + final FileInputStream shortcuts = null == shortcutXmlFilename ? null : + new FileInputStream(new File(shortcutXmlFilename)); + final FileInputStream bigrams = null == bigramXmlFilename ? null : + new FileInputStream(new File(bigramXmlFilename)); + return XmlDictInputOutput.readDictionaryXml(unigrams, shortcuts, bigrams); + } + + /** + * Invoke the right output method according to args. 
+ * + * This will write the passed dictionary to the file(s) passed in the command line arguments. + * @param args the parsed arguments. + * @param dict the file to output. + * @throws FileNotFoundException if one of the output files can't be created. + * @throws IOException if one of the output files can't be written to. + */ + private static void writeOutputToParsedArgs(final Arguments args, final FusionDictionary dict) + throws FileNotFoundException, IOException, UnsupportedFormatException, + IllegalArgumentException { + if (null != args.mOutputBinary) { + writeBinaryDictionary(args.mOutputBinary, dict, 2); + } + if (null != args.mOutputBinaryFormat1) { + writeBinaryDictionary(args.mOutputBinaryFormat1, dict, 1); + } + if (null != args.mOutputXml) { + writeXmlDictionary(args.mOutputXml, dict); + } + } + + /** + * Write the dictionary in binary format to the specified filename. + * + * @param outputFilename the name of the file to write to. + * @param dict the dictionary to write. + * @param version the binary format version to use. + * @throws FileNotFoundException if the output file can't be created. + * @throws IOException if the output file can't be written to. + */ + private static void writeBinaryDictionary(final String outputFilename, + final FusionDictionary dict, final int version) + throws FileNotFoundException, IOException, UnsupportedFormatException { + final File outputFile = new File(outputFilename); + BinaryDictInputOutput.writeDictionaryBinary(new FileOutputStream(outputFilename), dict, + version); + } + + /** + * Write the dictionary in XML format to the specified filename. + * + * @param outputFilename the name of the file to write to. + * @param dict the dictionary to write. + * @throws FileNotFoundException if the output file can't be created. + * @throws IOException if the output file can't be written to. 
+ */ + private static void writeXmlDictionary(final String outputFilename, + final FusionDictionary dict) throws FileNotFoundException, IOException { + XmlDictInputOutput.writeDictionaryXml(new FileWriter(outputFilename), dict); + } +} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/Dicttool.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/Dicttool.java index 8fc0423b8..c14ce7b88 100644 --- a/tools/dicttool/src/android/inputmethod/latin/dicttool/Dicttool.java +++ b/tools/dicttool/src/android/inputmethod/latin/dicttool/Dicttool.java @@ -35,6 +35,7 @@ public class Dicttool { sCommands.put("info", Info.class); sCommands.put("compress", Compress.Compressor.class); sCommands.put("uncompress", Compress.Uncompressor.class); + sCommands.put("makedict", Makedict.class); } private static Command getCommandInstance(final String commandName) { diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/Makedict.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/Makedict.java new file mode 100644 index 000000000..c004cfbe4 --- /dev/null +++ b/tools/dicttool/src/android/inputmethod/latin/dicttool/Makedict.java @@ -0,0 +1,40 @@ +/** + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.UnsupportedFormatException; + +import java.io.FileNotFoundException; +import java.io.IOException; +import javax.xml.parsers.ParserConfigurationException; +import org.xml.sax.SAXException; + +public class Makedict extends Dicttool.Command { + public static final String COMMAND = "makedict"; + + public Makedict() { + } + + public String getHelp() { + return DictionaryMaker.Arguments.getHelp(); + } + + public void run() throws FileNotFoundException, IOException, ParserConfigurationException, + SAXException, UnsupportedFormatException { + DictionaryMaker.main(mArgs); + } +} diff --git a/tools/dicttool/src/android/inputmethod/latin/dicttool/MakedictLog.java b/tools/dicttool/src/android/inputmethod/latin/dicttool/MakedictLog.java new file mode 100644 index 000000000..7eccff2b4 --- /dev/null +++ b/tools/dicttool/src/android/inputmethod/latin/dicttool/MakedictLog.java @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package com.android.inputmethod.latin.makedict; + +/** + * Wrapper to redirect log events to the right output medium. 
/**
 * Logging facade used by the dictionary tools.
 *
 * All four levels currently end up on standard output, one message per line.
 */
public class MakedictLog {
    public static final boolean DBG = true;

    /**
     * Single sink for every level, so redirecting the output takes one change.
     */
    private static void emit(final String line) {
        System.out.println(line);
    }

    /** Logs a debug message. */
    public static void d(String message) {
        emit(message);
    }

    /** Logs an informational message. */
    public static void i(String message) {
        emit(message);
    }

    /** Logs a warning. */
    public static void w(String message) {
        emit(message);
    }

    /** Logs an error. */
    public static void e(String message) {
        emit(message);
    }
}
+ */ + +package com.android.inputmethod.latin.dicttool; + +import com.android.inputmethod.latin.makedict.FusionDictionary; +import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.Node; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; +import com.android.inputmethod.latin.makedict.Word; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Writer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.TreeSet; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Reads and writes XML files for a FusionDictionary. + * + * All functions in this class are static. + */ +public class XmlDictInputOutput { + + private static final String ROOT_TAG = "wordlist"; + private static final String WORD_TAG = "w"; + private static final String BIGRAM_TAG = "bigram"; + private static final String SHORTCUT_TAG = "shortcut"; + private static final String FREQUENCY_ATTR = "f"; + private static final String WORD_ATTR = "word"; + + private static final int SHORTCUT_ONLY_DEFAULT_FREQ = 1; + + private static final String OPTIONS_KEY = "options"; + private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing"; + private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing"; + + /** + * SAX handler for a unigram XML file. 
+ */ + static private class UnigramHandler extends DefaultHandler { + // Parser states + private static final int NONE = 0; + private static final int START = 1; + private static final int WORD = 2; + private static final int BIGRAM = 4; + private static final int END = 5; + private static final int UNKNOWN = 6; + + FusionDictionary mDictionary; + int mState; // the state of the parser + int mFreq; // the currently read freq + String mWord; // the current word + final HashMap<String, ArrayList<WeightedString>> mShortcutsMap; + + /** + * Create the handler. + * + * @param shortcuts the shortcuts as a map. This may be empty, but may not be null. + */ + public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) { + mDictionary = null; + mShortcutsMap = shortcuts; + mWord = ""; + mState = START; + mFreq = 0; + } + + public FusionDictionary getFinalDictionary() { + final FusionDictionary dict = mDictionary; + mDictionary = null; + mShortcutsMap.clear(); + mWord = ""; + mState = START; + mFreq = 0; + return dict; + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes attrs) { + if (WORD_TAG.equals(localName)) { + mState = WORD; + mWord = ""; + for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { + final String attrName = attrs.getLocalName(attrIndex); + if (FREQUENCY_ATTR.equals(attrName)) { + mFreq = Integer.parseInt(attrs.getValue(attrIndex)); + } + } + } else if (ROOT_TAG.equals(localName)) { + final HashMap<String, String> attributes = new HashMap<String, String>(); + for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { + final String attrName = attrs.getLocalName(attrIndex); + attributes.put(attrName, attrs.getValue(attrIndex)); + } + final String optionsString = attributes.get(OPTIONS_KEY); + final boolean processUmlauts = + GERMAN_UMLAUT_PROCESSING_OPTION.equals(optionsString); + final boolean processLigatures = + 
FRENCH_LIGATURE_PROCESSING_OPTION.equals(optionsString); + mDictionary = new FusionDictionary(new Node(), new DictionaryOptions(attributes, + processUmlauts, processLigatures)); + } else { + mState = UNKNOWN; + } + } + + @Override + public void characters(char[] ch, int start, int length) { + if (WORD == mState) { + // The XML parser is free to return text in arbitrary chunks one after the + // other. In particular, this happens in some implementations when it finds + // an escape code like "&". + mWord += String.copyValueOf(ch, start, length); + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + if (WORD == mState) { + mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord)); + mState = START; + } + } + } + + static private class AssociativeListHandler extends DefaultHandler { + private final String SRC_TAG; + private final String SRC_ATTRIBUTE; + private final String DST_TAG; + private final String DST_ATTRIBUTE; + private final String DST_FREQ; + + // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX + private final static int XML_MAX = 256; + // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX + private final static int MEMORY_MAX = 256; + private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX; + + private String mSrc; + private final HashMap<String, ArrayList<WeightedString>> mAssocMap; + + public AssociativeListHandler(final String srcTag, final String srcAttribute, + final String dstTag, final String dstAttribute, final String dstFreq) { + SRC_TAG = srcTag; + SRC_ATTRIBUTE = srcAttribute; + DST_TAG = dstTag; + DST_ATTRIBUTE = dstAttribute; + DST_FREQ = dstFreq; + mSrc = null; + mAssocMap = new HashMap<String, ArrayList<WeightedString>>(); + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes attrs) { + if (SRC_TAG.equals(localName)) { + mSrc = attrs.getValue(uri, SRC_ATTRIBUTE); + } else if 
(DST_TAG.equals(localName)) { + String dst = attrs.getValue(uri, DST_ATTRIBUTE); + int freq = Integer.parseInt(attrs.getValue(uri, DST_FREQ)); + WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO); + ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc); + if (null == bigramList) bigramList = new ArrayList<WeightedString>(); + bigramList.add(bigram); + mAssocMap.put(mSrc, bigramList); + } + } + + // This may return an empty map, but will never return null. + public HashMap<String, ArrayList<WeightedString>> getAssocMap() { + return mAssocMap; + } + } + + /** + * SAX handler for a bigram XML file. + */ + static private class BigramHandler extends AssociativeListHandler { + private final static String BIGRAM_W1_TAG = "bi"; + private final static String BIGRAM_W2_TAG = "w"; + private final static String BIGRAM_W1_ATTRIBUTE = "w1"; + private final static String BIGRAM_W2_ATTRIBUTE = "w2"; + private final static String BIGRAM_FREQ_ATTRIBUTE = "p"; + + public BigramHandler() { + super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE, + BIGRAM_FREQ_ATTRIBUTE); + } + + // As per getAssocMap(), this never returns null. + public HashMap<String, ArrayList<WeightedString>> getBigramMap() { + return getAssocMap(); + } + } + + /** + * SAX handler for a shortcut XML file. + */ + static private class ShortcutHandler extends AssociativeListHandler { + private final static String ENTRY_TAG = "entry"; + private final static String ENTRY_ATTRIBUTE = "shortcut"; + private final static String TARGET_TAG = "target"; + private final static String REPLACEMENT_ATTRIBUTE = "replacement"; + private final static String TARGET_PRIORITY_ATTRIBUTE = "priority"; + + public ShortcutHandler() { + super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE, + TARGET_PRIORITY_ATTRIBUTE); + } + + // As per getAssocMap(), this never returns null. 
+ public HashMap<String, ArrayList<WeightedString>> getShortcutMap() { + return getAssocMap(); + } + } + + /** + * Reads a dictionary from an XML file. + * + * This is the public method that will parse an XML file and return the corresponding memory + * representation. + * + * @param unigrams the file to read the data from. + * @param shortcuts the file to read the shortcuts from, or null. + * @param bigrams the file to read the bigrams from, or null. + * @return the in-memory representation of the dictionary. + */ + public static FusionDictionary readDictionaryXml(final InputStream unigrams, + final InputStream shortcuts, final InputStream bigrams) + throws SAXException, IOException, ParserConfigurationException { + final SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + final SAXParser parser = factory.newSAXParser(); + final BigramHandler bigramHandler = new BigramHandler(); + if (null != bigrams) parser.parse(bigrams, bigramHandler); + + final ShortcutHandler shortcutHandler = new ShortcutHandler(); + if (null != shortcuts) parser.parse(shortcuts, shortcutHandler); + + final UnigramHandler unigramHandler = + new UnigramHandler(shortcutHandler.getShortcutMap()); + parser.parse(unigrams, unigramHandler); + final FusionDictionary dict = unigramHandler.getFinalDictionary(); + final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap(); + for (final String firstWord : bigramMap.keySet()) { + if (!dict.hasWord(firstWord)) continue; + final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord); + for (final WeightedString bigram : bigramList) { + if (!dict.hasWord(bigram.mWord)) continue; + dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency); + } + } + return dict; + } + + /** + * Reads a dictionary in the first, legacy XML format + * + * This method reads data from the parser and creates a new FusionDictionary with it. 
+ * The format parsed by this method is the format used before Ice Cream Sandwich, + * which has no support for bigrams or shortcuts. + * It is important to note that this method expects the parser to have already eaten + * the first, all-encompassing tag. + * + * @param xpp the parser to read the data from. + * @return the parsed dictionary. + */ + + /** + * Writes a dictionary to an XML file. + * + * The output format is the "second" format, which supports bigrams and shortcuts. + * + * @param destination a destination stream to write to. + * @param dict the dictionary to write. + */ + public static void writeDictionaryXml(Writer destination, FusionDictionary dict) + throws IOException { + final TreeSet<Word> set = new TreeSet<Word>(); + for (Word word : dict) { + set.add(word); + } + // TODO: use an XMLSerializer if this gets big + destination.write("<wordlist format=\"2\""); + final HashMap<String, String> options = dict.mOptions.mAttributes; + if (dict.mOptions.mGermanUmlautProcessing) { + destination.write(" " + OPTIONS_KEY + "=\"" + GERMAN_UMLAUT_PROCESSING_OPTION + "\""); + } else if (dict.mOptions.mFrenchLigatureProcessing) { + destination.write(" " + OPTIONS_KEY + "=\"" + FRENCH_LIGATURE_PROCESSING_OPTION + "\""); + } + for (final String key : dict.mOptions.mAttributes.keySet()) { + final String value = dict.mOptions.mAttributes.get(key); + destination.write(" " + key + "=\"" + value + "\""); + } + destination.write(">\n"); + destination.write("<!-- Warning: there is no code to read this format yet. 
-->\n"); + for (Word word : set) { + destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" " + + FREQUENCY_ATTR + "=\"" + word.mFrequency + "\">"); + if (null != word.mShortcutTargets) { + destination.write("\n"); + for (WeightedString target : word.mShortcutTargets) { + destination.write(" <" + SHORTCUT_TAG + " " + FREQUENCY_ATTR + "=\"" + + target.mFrequency + "\">" + target.mWord + "</" + SHORTCUT_TAG + + ">\n"); + } + destination.write(" "); + } + if (null != word.mBigrams) { + destination.write("\n"); + for (WeightedString bigram : word.mBigrams) { + destination.write(" <" + BIGRAM_TAG + " " + FREQUENCY_ATTR + "=\"" + + bigram.mFrequency + "\">" + bigram.mWord + "</" + BIGRAM_TAG + ">\n"); + } + destination.write(" "); + } + destination.write("</" + WORD_TAG + ">\n"); + } + destination.write("</wordlist>\n"); + destination.close(); + } +} |