aboutsummaryrefslogtreecommitdiffstats
path: root/tools/makedict2/src/com/android/inputmethod/latin/XmlDictInputOutput.java
diff options
context:
space:
mode:
Diffstat (limited to 'tools/makedict2/src/com/android/inputmethod/latin/XmlDictInputOutput.java')
-rw-r--r--tools/makedict2/src/com/android/inputmethod/latin/XmlDictInputOutput.java215
1 files changed, 215 insertions, 0 deletions
diff --git a/tools/makedict2/src/com/android/inputmethod/latin/XmlDictInputOutput.java b/tools/makedict2/src/com/android/inputmethod/latin/XmlDictInputOutput.java
new file mode 100644
index 000000000..096bfd182
--- /dev/null
+++ b/tools/makedict2/src/com/android/inputmethod/latin/XmlDictInputOutput.java
@@ -0,0 +1,215 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.android.inputmethod.latin;
+
+import com.android.inputmethod.latin.FusionDictionary.WeightedString;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.TreeSet;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Reads and writes XML files for a FusionDictionary.
+ *
+ * All functions in this class are static.
+ */
+public class XmlDictInputOutput {
+
+ private static final String WORD_TAG = "w"; // element holding one word (read and written)
+ private static final String BIGRAM_TAG = "bigram"; // child element written for each bigram
+ private static final String FREQUENCY_ATTR = "f"; // frequency attribute (read and written)
+ private static final String WORD_ATTR = "word"; // attribute holding the word text when writing
+
+    /**
+     * SAX handler for a unigram XML file.
+     *
+     * The text content of every {@code <w>} element is passed to
+     * {@link FusionDictionary#add} as a word, together with the value of the element's
+     * {@code f} attribute and the bigram list registered for that word, if any.
+     *
+     * A SAX parser is allowed to report the text of a single element through several
+     * {@link #characters} calls, so the text is buffered and only handed to the dictionary
+     * once an element boundary is reached.
+     */
+    private static class UnigramHandler extends DefaultHandler {
+        // Parser states
+        private static final int START = 1;
+        private static final int WORD = 2;
+        private static final int UNKNOWN = 6;
+
+        final FusionDictionary mDictionary;
+        int mState; // the state of the parser
+        int mFreq; // the currently read freq
+        final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
+        // Text read so far for the word element being parsed; empty when nothing is pending.
+        private final StringBuilder mWordBuffer = new StringBuilder();
+
+        /**
+         * Create the handler.
+         *
+         * @param dict the dictionary to construct.
+         * @param bigrams the bigrams as a map. This may be empty, but may not be null.
+         */
+        public UnigramHandler(FusionDictionary dict,
+                HashMap<String, ArrayList<WeightedString>> bigrams) {
+            mDictionary = dict;
+            mBigramsMap = bigrams;
+            mState = START;
+            mFreq = 0;
+        }
+
+        /**
+         * Enters the WORD state on a word element and reads its frequency attribute; any
+         * other element switches the handler to the UNKNOWN state.
+         */
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes attrs) {
+            // An element opening inside a word element ends the text run read so far. Flush
+            // before mFreq is overwritten so the pending word keeps its own frequency.
+            flushWord();
+            if (WORD_TAG.equals(localName)) {
+                mState = WORD;
+                // NOTE(review): mFreq is not reset here, so a word element without a
+                // frequency attribute keeps the frequency of the previous word.
+                for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
+                    final String attrName = attrs.getLocalName(attrIndex);
+                    if (FREQUENCY_ATTR.equals(attrName)) {
+                        mFreq = Integer.parseInt(attrs.getValue(attrIndex));
+                    }
+                }
+            } else {
+                mState = UNKNOWN;
+            }
+        }
+
+        /**
+         * Buffers text found directly inside a word element; text seen in any other state
+         * is ignored.
+         */
+        @Override
+        public void characters(char[] ch, int start, int length) {
+            if (WORD == mState) {
+                mWordBuffer.append(ch, start, length);
+            }
+        }
+
+        /**
+         * Adds the buffered word, if any, to the dictionary and leaves the WORD state.
+         */
+        @Override
+        public void endElement(String uri, String localName, String qName) {
+            flushWord();
+            if (WORD == mState) mState = START;
+        }
+
+        /**
+         * Hands the buffered text to the dictionary as one word and clears the buffer.
+         * Does nothing when no text is pending.
+         */
+        private void flushWord() {
+            if (mWordBuffer.length() > 0) {
+                final String word = mWordBuffer.toString();
+                mWordBuffer.setLength(0);
+                mDictionary.add(word, mFreq, mBigramsMap.get(word));
+            }
+        }
+    }
+
+    /**
+     * SAX handler for a bigram XML file.
+     *
+     * A {@code bi} element supplies the first word through its {@code w1} attribute. Each
+     * {@code w} element seen afterwards supplies a second word ({@code w2}) and a frequency
+     * ({@code p}), and is recorded under the most recently read first word. The stored
+     * frequency is the parsed value divided by 8 using integer division.
+     */
+    static private class BigramHandler extends DefaultHandler {
+        private final static String BIGRAM_W1_TAG = "bi";
+        private final static String BIGRAM_W2_TAG = "w";
+        private final static String BIGRAM_W1_ATTRIBUTE = "w1";
+        private final static String BIGRAM_W2_ATTRIBUTE = "w2";
+        private final static String BIGRAM_FREQ_ATTRIBUTE = "p";
+
+        // First word of the bigrams currently being read; null until a "bi" element is seen.
+        String mW1;
+        // Maps a first word to the list of its weighted second words.
+        final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
+
+        public BigramHandler() {
+            mBigramsMap = new HashMap<String, ArrayList<WeightedString>>();
+            mW1 = null;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes attrs) {
+            if (BIGRAM_W1_TAG.equals(localName)) {
+                // Remember the first word for the second-word elements that follow.
+                mW1 = attrs.getValue(uri, BIGRAM_W1_ATTRIBUTE);
+                return;
+            }
+            if (!BIGRAM_W2_TAG.equals(localName)) {
+                return;
+            }
+
+            final String secondWord = attrs.getValue(uri, BIGRAM_W2_ATTRIBUTE);
+            final int rawFrequency =
+                    Integer.parseInt(attrs.getValue(uri, BIGRAM_FREQ_ATTRIBUTE));
+            final WeightedString entry = new WeightedString(secondWord, rawFrequency / 8);
+
+            ArrayList<WeightedString> followers = mBigramsMap.get(mW1);
+            if (null == followers) {
+                // First bigram for this word: register a fresh list under it.
+                followers = new ArrayList<WeightedString>();
+                mBigramsMap.put(mW1, followers);
+            }
+            followers.add(entry);
+        }
+
+        /**
+         * Returns the map built so far, from first word to its list of weighted second words.
+         */
+        public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
+            return mBigramsMap;
+        }
+    }
+
+    /**
+     * Reads a dictionary from XML.
+     *
+     * This is the public method that parses the XML input and returns the corresponding
+     * memory representation. The bigram stream, when present, is parsed first so that its
+     * bigrams are available while the unigrams are added to the dictionary.
+     *
+     * @param unigrams the stream to read the unigram data from.
+     * @param bigrams the stream to read the bigram data from, or null to read no bigrams.
+     * @return the in-memory representation of the dictionary.
+     */
+    public static FusionDictionary readDictionaryXml(InputStream unigrams, InputStream bigrams)
+            throws SAXException, IOException, ParserConfigurationException {
+        final SAXParserFactory parserFactory = SAXParserFactory.newInstance();
+        parserFactory.setNamespaceAware(true);
+        final SAXParser saxParser = parserFactory.newSAXParser();
+
+        // The bigram map stays empty when no bigram stream is supplied.
+        final BigramHandler bigramCollector = new BigramHandler();
+        if (bigrams != null) {
+            saxParser.parse(bigrams, bigramCollector);
+        }
+
+        final FusionDictionary result = new FusionDictionary();
+        saxParser.parse(unigrams, new UnigramHandler(result, bigramCollector.getBigramMap()));
+        return result;
+    }
+
+ /*
+ * NOTE: the method described below is not defined in this class; this comment is kept
+ * for reference only.
+ *
+ * Reads a dictionary in the first, legacy XML format.
+ *
+ * This method reads data from the parser and creates a new FusionDictionary with it.
+ * The format parsed by this method is the format used before Ice Cream Sandwich,
+ * which has no support for bigrams or shortcuts.
+ * It is important to note that this method expects the parser to have already eaten
+ * the first, all-encompassing tag.
+ *
+ * @param xpp the parser to read the data from.
+ * @return the parsed dictionary.
+ */
+
+    /**
+     * Writes a dictionary to an XML file.
+     *
+     * The output format is the "second" format, which supports bigrams and shortcuts.
+     * Words are written in the order given by a {@link TreeSet} of the dictionary's words.
+     * Word text is XML-escaped so that characters such as {@code &}, {@code <} or
+     * {@code "} cannot produce malformed output.
+     *
+     * The destination is always closed by this method, whether or not writing succeeds.
+     *
+     * @param destination a destination stream to write to.
+     * @param dict the dictionary to write.
+     * @throws IOException if writing to or closing the destination fails.
+     */
+    public static void writeDictionaryXml(Writer destination, FusionDictionary dict)
+            throws IOException {
+        try {
+            final TreeSet<Word> set = new TreeSet<Word>();
+            for (Word word : dict) {
+                set.add(word);
+            }
+            // TODO: use an XMLSerializer if this gets big
+            destination.write("<wordlist format=\"2\">\n");
+            for (Word word : set) {
+                destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\""
+                        + escapeXml(word.mWord) + "\" "
+                        + FREQUENCY_ATTR + "=\"" + word.mFrequency + "\">");
+                if (null != word.mBigrams) {
+                    destination.write("\n");
+                    for (WeightedString bigram : word.mBigrams) {
+                        destination.write(" <" + BIGRAM_TAG + " " + FREQUENCY_ATTR + "=\""
+                                + bigram.mFrequency + "\">" + escapeXml(bigram.mWord)
+                                + "</" + BIGRAM_TAG + ">\n");
+                    }
+                    destination.write(" ");
+                }
+                destination.write("</" + WORD_TAG + ">\n");
+            }
+            destination.write("</wordlist>\n");
+        } finally {
+            // Close even when a write fails so the writer is never leaked.
+            destination.close();
+        }
+    }
+
+    /**
+     * Escapes the characters that are markup-significant in XML attribute values and
+     * element text: {@code &}, {@code <}, {@code >} and the double quote.
+     *
+     * Apostrophes are left untouched because attribute values are written between double
+     * quotes. A null argument is rendered as the string "null", as string concatenation
+     * would render it.
+     *
+     * @param text the raw text to escape.
+     * @return the escaped text.
+     */
+    private static String escapeXml(String text) {
+        final String source = String.valueOf(text);
+        final StringBuilder escaped = new StringBuilder(source.length());
+        for (int i = 0; i < source.length(); ++i) {
+            final char c = source.charAt(i);
+            switch (c) {
+            case '&':
+                escaped.append("&amp;");
+                break;
+            case '<':
+                escaped.append("&lt;");
+                break;
+            case '>':
+                escaped.append("&gt;");
+                break;
+            case '"':
+                escaped.append("&quot;");
+                break;
+            default:
+                escaped.append(c);
+                break;
+            }
+        }
+        return escaped.toString();
+    }
+}