aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKen Wakasa <kwakasa@google.com>2010-12-10 20:59:10 +0900
committerKen Wakasa <kwakasa@google.com>2010-12-11 10:46:54 +0900
commitbd6f4bfc591fd1e7f6165c117636bf9275ab91b3 (patch)
treecdc6f21598a91505f554804896ebafde6ef50e07
parent35bcd5c27bb7f8eda50d50eee9bbb74d8854b1e9 (diff)
downloadlatinime-bd6f4bfc591fd1e7f6165c117636bf9275ab91b3.tar.gz
latinime-bd6f4bfc591fd1e7f6165c117636bf9275ab91b3.tar.xz
latinime-bd6f4bfc591fd1e7f6165c117636bf9275ab91b3.zip
Move tools/makedict from platform/development to platform/packages/inputmethods/LatinIME
The corresponding change is I559207ab Change-Id: I01ef7084cffa72468e87147e0ec7a9b16dd19990
-rw-r--r--tools/Android.mk17
-rw-r--r--tools/makedict/Android.mk24
-rw-r--r--tools/makedict/etc/Android.mk20
-rwxr-xr-xtools/makedict/etc/makedict63
-rw-r--r--tools/makedict/etc/manifest.txt1
-rw-r--r--tools/makedict/src/com/android/tools/dict/BigramDictionary.java286
-rwxr-xr-xtools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java443
7 files changed, 854 insertions, 0 deletions
diff --git a/tools/Android.mk b/tools/Android.mk
new file mode 100644
index 000000000..8f1acc55a
--- /dev/null
+++ b/tools/Android.mk
@@ -0,0 +1,17 @@
+# Copyright (C) 2010 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+LOCAL_PATH := $(call my-dir)
+
+include $(call all-makefiles-under,$(LOCAL_PATH))
diff --git a/tools/makedict/Android.mk b/tools/makedict/Android.mk
new file mode 100644
index 000000000..b9fc5533d
--- /dev/null
+++ b/tools/makedict/Android.mk
@@ -0,0 +1,24 @@
+#
+# Copyright (C) 2009 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(call all-java-files-under,src)
+LOCAL_JAR_MANIFEST := etc/manifest.txt
+LOCAL_MODULE := makedict
+
+include $(BUILD_HOST_JAVA_LIBRARY)
+include $(LOCAL_PATH)/etc/Android.mk
diff --git a/tools/makedict/etc/Android.mk b/tools/makedict/etc/Android.mk
new file mode 100644
index 000000000..da162868a
--- /dev/null
+++ b/tools/makedict/etc/Android.mk
@@ -0,0 +1,20 @@
+# Copyright (C) 2009 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_PREBUILT_EXECUTABLES := makedict
+include $(BUILD_HOST_PREBUILT)
+
diff --git a/tools/makedict/etc/makedict b/tools/makedict/etc/makedict
new file mode 100755
index 000000000..8420d6e5e
--- /dev/null
+++ b/tools/makedict/etc/makedict
@@ -0,0 +1,63 @@
+#!/bin/sh
+# Copyright 2009, The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Set up prog to be the path of this script, including following symlinks,
+# and set up progdir to be the fully-qualified pathname of its directory.
+prog="$0"
+while [ -h "${prog}" ]; do
+ newProg=`/bin/ls -ld "${prog}"`
+ newProg=`expr "${newProg}" : ".* -> \(.*\)$"`
+ if expr "x${newProg}" : 'x/' >/dev/null; then
+ prog="${newProg}"
+ else
+ progdir=`dirname "${prog}"`
+ prog="${progdir}/${newProg}"
+ fi
+done
+oldwd=`pwd`
+progdir=`dirname "${prog}"`
+cd "${progdir}"
+progdir=`pwd`
+prog="${progdir}"/`basename "${prog}"`
+cd "${oldwd}"
+
+jarfile=makedict.jar
+frameworkdir="$progdir"
+if [ ! -r "$frameworkdir/$jarfile" ]
+then
+ frameworkdir=`dirname "$progdir"`/tools/lib
+ libdir=`dirname "$progdir"`/tools/lib
+fi
+if [ ! -r "$frameworkdir/$jarfile" ]
+then
+ frameworkdir=`dirname "$progdir"`/framework
+ libdir=`dirname "$progdir"`/lib
+fi
+if [ ! -r "$frameworkdir/$jarfile" ]
+then
+ echo `basename "$prog"`": can't find $jarfile"
+ exit 1
+fi
+
+if [ "$OSTYPE" = "cygwin" ] ; then
+ jarpath=`cygpath -w "$frameworkdir/$jarfile"`
+ progdir=`cygpath -w "$progdir"`
+else
+ jarpath="$frameworkdir/$jarfile"
+fi
+
+# need to use "java.ext.dirs" because "-jar" causes classpath to be ignored
+# might need more memory, e.g. -Xmx128M
+exec java -Djava.ext.dirs="$frameworkdir" -jar "$jarpath" "$@"
diff --git a/tools/makedict/etc/manifest.txt b/tools/makedict/etc/manifest.txt
new file mode 100644
index 000000000..aa3a3e84c
--- /dev/null
+++ b/tools/makedict/etc/manifest.txt
@@ -0,0 +1 @@
+Main-Class: com.android.tools.dict.MakeBinaryDictionary
diff --git a/tools/makedict/src/com/android/tools/dict/BigramDictionary.java b/tools/makedict/src/com/android/tools/dict/BigramDictionary.java
new file mode 100644
index 000000000..35115bf2c
--- /dev/null
+++ b/tools/makedict/src/com/android/tools/dict/BigramDictionary.java
@@ -0,0 +1,286 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.tools.dict;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+/**
+ * Helper for MakeBinaryDictionary
+ * Deals with all the bigram data
+ */
+public class BigramDictionary {
+
+ /*
+ * Must match the values in the client side which is located in dictionary.cpp & dictionary.h
+ * Changing these values will generate totally different structure which must be also reflected
+ * on the client side.
+ */
+ public static final int FLAG_BIGRAM_READ = 0x80;
+ public static final int FLAG_BIGRAM_CHILDEXIST = 0x40;
+ public static final int FLAG_BIGRAM_CONTINUED = 0x80;
+ public static final int FLAG_BIGRAM_FREQ = 0x7F;
+
+ public static final int FOR_REVERSE_LOOKUPALL = -99;
+
+ public ArrayList<String> mBigramToFill = new ArrayList<String>();
+ public ArrayList<Integer> mBigramToFillAddress = new ArrayList<Integer>();
+
+ public HashMap<String, Bigram> mBi;
+
+ public boolean mHasBigram;
+
+ public BigramDictionary(String bigramSrcFilename, boolean hasBigram) {
+ mHasBigram = hasBigram;
+ loadBigram(bigramSrcFilename);
+ }
+
+ private void loadBigram(String filename) {
+ mBi = new HashMap<String, Bigram>();
+ if (!mHasBigram) {
+ System.out.println("Number of bigrams = " + Bigram.sBigramNum);
+ return;
+ }
+ try {
+ SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+ parser.parse(new File(filename), new DefaultHandler() {
+ String w1 = null;
+ boolean inWord1 = false;
+ boolean inWord2 = false;
+ int freq = 0, counter = 0;
+ Bigram tempBigram = null;
+
+ @Override
+ public void startElement(String uri, String localName,
+ String qName, Attributes attributes) {
+ if (qName.equals("bi")) {
+ inWord1 = true;
+ w1 = attributes.getValue(0);
+ int count = Integer.parseInt(attributes.getValue(1));
+ tempBigram = new Bigram(count);
+ counter = 0;
+ } else if (qName.equals("w")) {
+ inWord2 = true;
+ String word2 = attributes.getValue(0);
+ int freq = Integer.parseInt(attributes.getValue(1));
+ tempBigram.setWord2(counter, word2, freq);
+ counter++;
+ Bigram.sBigramNum++;
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName,
+ String qName) {
+ if (inWord2) {
+ inWord2 = false;
+ } else if (inWord1) {
+ inWord1 = false;
+ mBi.put(w1, tempBigram);
+ }
+ }
+ });
+ } catch (Exception ioe) {
+ System.err.println("Exception in parsing bigram\n" + ioe);
+ ioe.printStackTrace();
+ }
+ System.out.println("Number of bigrams = " + Bigram.sBigramNum);
+ }
+
+ byte[] writeBigrams(byte[] dict, Map<String, Integer> mDictionary) {
+ for (int i = 0; i < mBigramToFill.size(); i++) {
+ String w1 = mBigramToFill.get(i);
+ int address = mBigramToFillAddress.get(i);
+
+ Bigram temp = mBi.get(w1);
+ int word2Count = temp.count;
+ int j4;
+ for (int j = 0; j < word2Count; j++) {
+ if (!mDictionary.containsKey(temp.word2[j])) {
+ System.out.println("Not in dictionary: " + temp.word2[j]);
+ System.exit(0);
+ } else {
+ j4 = (j * 4);
+ int addressOfWord2 = mDictionary.get(temp.word2[j]);
+ dict[address + j4 + 0] = (byte) (((addressOfWord2 & 0x3F0000) >> 16)
+ | FLAG_BIGRAM_READ);
+ dict[address + j4 + 1] = (byte) ((addressOfWord2 & 0x00FF00) >> 8);
+ dict[address + j4 + 2] = (byte) ((addressOfWord2 & 0x0000FF));
+
+ if (j == (word2Count - 1)) {
+ dict[address + j4 + 3] = (byte) (temp.freq[j] & FLAG_BIGRAM_FREQ);
+ } else {
+ dict[address + j4 + 3] = (byte) ((temp.freq[j] & FLAG_BIGRAM_FREQ)
+ | FLAG_BIGRAM_CONTINUED);
+ }
+ }
+ }
+ }
+
+ return dict;
+ }
+
+ void reverseLookupAll(Map<String, Integer> mDictionary, byte[] dict) {
+ Set<String> st = mDictionary.keySet();
+ for (String s : st) {
+ searchForTerminalNode(mDictionary.get(s), FOR_REVERSE_LOOKUPALL, dict);
+ }
+ }
+
+ void searchForTerminalNode(int bigramAddress, int frequency, byte[] dict) {
+ StringBuilder sb = new StringBuilder(48);
+ int pos;
+ boolean found = false;
+ int followDownBranchAddress = 2;
+ char followingChar = ' ';
+ int depth = 0;
+ int totalLoopCount = 0;
+
+ while (!found) {
+ boolean followDownAddressSearchStop = false;
+ boolean firstAddress = true;
+ boolean haveToSearchAll = true;
+
+ if (depth > 0) {
+ sb.append(followingChar);
+ }
+ pos = followDownBranchAddress; // pos start at count
+ int count = dict[pos] & 0xFF;
+ pos++;
+ for (int i = 0; i < count; i++) {
+ totalLoopCount++;
+ // pos at data
+ pos++;
+ // pos now at flag
+ if (!MakeBinaryDictionary.getFirstBitOfByte(pos, dict)) { // non-terminal
+ if (!followDownAddressSearchStop) {
+ int addr = MakeBinaryDictionary.get22BitAddress(pos, dict);
+ if (addr > bigramAddress) {
+ followDownAddressSearchStop = true;
+ if (firstAddress) {
+ firstAddress = false;
+ haveToSearchAll = true;
+ } else if (!haveToSearchAll) {
+ break;
+ }
+ } else {
+ followDownBranchAddress = addr;
+ followingChar = (char) (0xFF & dict[pos-1]);
+ if(firstAddress) {
+ firstAddress = false;
+ haveToSearchAll = false;
+ }
+ }
+ }
+ pos += 3;
+ } else if (MakeBinaryDictionary.getFirstBitOfByte(pos, dict)) { // terminal
+ // found !!
+ if (bigramAddress == (pos-1)) {
+ sb.append((char) (0xFF & dict[pos-1]));
+ found = true;
+ break;
+ }
+
+ // address + freq (4 byte)
+ if (MakeBinaryDictionary.getSecondBitOfByte(pos, dict)) {
+ if (!followDownAddressSearchStop) {
+ int addr = MakeBinaryDictionary.get22BitAddress(pos, dict);
+ if (addr > bigramAddress) {
+ followDownAddressSearchStop = true;
+ if (firstAddress) {
+ firstAddress = false;
+ haveToSearchAll = true;
+ } else if (!haveToSearchAll) {
+ break;
+ }
+ } else {
+ followDownBranchAddress = addr;
+ followingChar = (char) (0xFF & dict[pos-1]);
+ if(firstAddress) {
+ firstAddress = false;
+ haveToSearchAll = true;
+ }
+ }
+ }
+ pos += 4;
+ } else { // freq only (2 byte)
+ pos += 2;
+ }
+ // skipping bigram
+ int bigramExist = (dict[pos] & FLAG_BIGRAM_READ);
+ if (bigramExist > 0) {
+ int nextBigramExist = 1;
+ while (nextBigramExist > 0) {
+ pos += 3;
+ nextBigramExist = (dict[pos++] & FLAG_BIGRAM_CONTINUED);
+ }
+ } else {
+ pos++;
+ }
+ }
+ }
+ depth++;
+ if (followDownBranchAddress == 2) {
+ System.out.println("ERROR!!! Cannot find bigram!!");
+ System.exit(0);
+ }
+ }
+
+ if (frequency == FOR_REVERSE_LOOKUPALL) {
+ System.out.println("Reverse: " + sb.toString() + " (" + bigramAddress + ")"
+ + " Loop: " + totalLoopCount);
+ } else {
+ System.out.println(" bigram: " + sb.toString() + " (" + bigramAddress + ") freq: "
+ + frequency + " Loop: " + totalLoopCount);
+ }
+ }
+
+ static class Bigram {
+ String[] word2;
+ int[] freq;
+ int count;
+ static int sBigramNum = 0;
+
+ String getSecondWord(int i) {
+ return word2[i];
+ }
+
+ int getFrequency(int i) {
+ return (freq[i] == 0) ? 1 : freq[i];
+ }
+
+ void setWord2(int index, String word2, int freq) {
+ this.word2[index] = word2;
+ this.freq[index] = freq;
+ }
+
+ public Bigram(int word2Count) {
+ count = word2Count;
+ word2 = new String[word2Count];
+ freq = new int[word2Count];
+ }
+ }
+}
diff --git a/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java b/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java
new file mode 100755
index 000000000..ca6de56e1
--- /dev/null
+++ b/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java
@@ -0,0 +1,443 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.tools.dict;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+/**
+ * Compresses a list of words, frequencies, and bigram data
+ * into a tree structured binary dictionary.
+ * Dictionary Version: 200 (may contain bigrams)
+ * Version number started from 200 rather than 1 because we wanted to prevent number of roots in
+ * any old dictionaries being mistaken as the version number. There is not a chance that there
+ * will be more than 200 roots. Version number should be increased when there is structural change
+ * in the data. There is no need to increase the version when only the words in the data changes.
+ */
+public class MakeBinaryDictionary {
+
+ private static final int VERSION_NUM = 200;
+
+ public static final int ALPHA_SIZE = 256;
+
+ public static final String TAG_WORD = "w";
+ public static final String ATTR_FREQ = "f";
+
+ private static final int FLAG_ADDRESS_MASK = 0x400000;
+ private static final int FLAG_TERMINAL_MASK = 0x800000;
+ private static final int ADDRESS_MASK = 0x3FFFFF;
+
+ /**
+ * Unit for this variable is in bytes
+ * If destination file name is main.dict and file limit causes dictionary to be separated into
+ * multiple file, it will generate main0.dict, main1.dict, and so forth.
+ */
+ private static int sOutputFileSize;
+ private static boolean sSplitOutput;
+
+ public static final CharNode EMPTY_NODE = new CharNode();
+
+ List<CharNode> roots;
+ Map<String, Integer> mDictionary;
+ int mWordCount;
+
+ BigramDictionary bigramDict;
+
+ static class CharNode {
+ char data;
+ int freq;
+ boolean terminal;
+ List<CharNode> children;
+ static int sNodes;
+
+ public CharNode() {
+ sNodes++;
+ }
+ }
+
+ public static void usage() {
+ System.err.println("Usage: makedict -s <src_dict.xml> [-b <src_bigram.xml>] "
+ + "-d <dest.dict> [--size filesize]");
+ System.exit(-1);
+ }
+
+ public static void main(String[] args) {
+ int checkSource = -1;
+ int checkBigram = -1;
+ int checkDest = -1;
+ int checkFileSize = -1;
+ for (int i = 0; i < args.length; i+=2) {
+ if (args[i].equals("-s")) checkSource = (i + 1);
+ if (args[i].equals("-b")) checkBigram = (i + 1);
+ if (args[i].equals("-d")) checkDest = (i + 1);
+ if (args[i].equals("--size")) checkFileSize = (i + 1);
+ }
+ if (checkFileSize >= 0) {
+ sSplitOutput = true;
+ sOutputFileSize = Integer.parseInt(args[checkFileSize]);
+ } else {
+ sSplitOutput = false;
+ }
+ if (checkDest >= 0 && !args[checkDest].endsWith(".dict")) {
+ System.err.println("Error: Dictionary output file extension should be \".dict\"");
+ usage();
+ } else if (checkSource >= 0 && checkBigram >= 0 && checkDest >= 0 &&
+ ((!sSplitOutput && args.length == 6) || (sSplitOutput && args.length == 8))) {
+ new MakeBinaryDictionary(args[checkSource], args[checkBigram], args[checkDest]);
+ } else if (checkSource >= 0 && checkDest >= 0 &&
+ ((!sSplitOutput && args.length == 4) || (sSplitOutput && args.length == 6))) {
+ new MakeBinaryDictionary(args[checkSource], null, args[checkDest]);
+ } else {
+ usage();
+ }
+ }
+
+ public MakeBinaryDictionary(String srcFilename, String bigramSrcFilename, String destFilename){
+ System.out.println("Generating dictionary version " + VERSION_NUM);
+ bigramDict = new BigramDictionary(bigramSrcFilename, (bigramSrcFilename != null));
+ populateDictionary(srcFilename);
+ writeToDict(destFilename);
+
+ // Enable the code below to verify that the generated tree is traversable
+ // and bigram data is stored correctly.
+ if (false) {
+ bigramDict.reverseLookupAll(mDictionary, dict);
+ traverseDict(2, new char[32], 0);
+ }
+ }
+
+ private void populateDictionary(String filename) {
+ roots = new ArrayList<CharNode>();
+ mDictionary = new HashMap<String, Integer>();
+ try {
+ SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+ parser.parse(new File(filename), new DefaultHandler() {
+ boolean inWord;
+ int freq;
+ StringBuilder wordBuilder = new StringBuilder(48);
+
+ @Override
+ public void startElement(String uri, String localName,
+ String qName, Attributes attributes) {
+ if (qName.equals("w")) {
+ inWord = true;
+ freq = Integer.parseInt(attributes.getValue(0));
+ wordBuilder.setLength(0);
+ }
+ }
+
+ @Override
+ public void characters(char[] data, int offset, int length) {
+ // Ignore other whitespace
+ if (!inWord) return;
+ wordBuilder.append(data, offset, length);
+ }
+
+ @Override
+ public void endElement(String uri, String localName,
+ String qName) {
+ if (qName.equals("w")) {
+ if (wordBuilder.length() > 1) {
+ addWordTop(wordBuilder.toString(), freq);
+ mWordCount++;
+ }
+ inWord = false;
+ }
+ }
+ });
+ } catch (Exception ioe) {
+ System.err.println("Exception in parsing\n" + ioe);
+ ioe.printStackTrace();
+ }
+ System.out.println("Nodes = " + CharNode.sNodes);
+ }
+
+ private int indexOf(List<CharNode> children, char c) {
+ if (children == null) {
+ return -1;
+ }
+ for (int i = 0; i < children.size(); i++) {
+ if (children.get(i).data == c) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ private void addWordTop(String word, int occur) {
+ if (occur > 255) occur = 255;
+ char firstChar = word.charAt(0);
+ int index = indexOf(roots, firstChar);
+ if (index == -1) {
+ CharNode newNode = new CharNode();
+ newNode.data = firstChar;
+ newNode.freq = occur;
+ index = roots.size();
+ roots.add(newNode);
+ } else {
+ roots.get(index).freq += occur;
+ }
+ if (word.length() > 1) {
+ addWordRec(roots.get(index), word, 1, occur);
+ } else {
+ roots.get(index).terminal = true;
+ }
+ }
+
+ private void addWordRec(CharNode parent, String word, int charAt, int occur) {
+ CharNode child = null;
+ char data = word.charAt(charAt);
+ if (parent.children == null) {
+ parent.children = new ArrayList<CharNode>();
+ } else {
+ for (int i = 0; i < parent.children.size(); i++) {
+ CharNode node = parent.children.get(i);
+ if (node.data == data) {
+ child = node;
+ break;
+ }
+ }
+ }
+ if (child == null) {
+ child = new CharNode();
+ parent.children.add(child);
+ }
+ child.data = data;
+ if (child.freq == 0) child.freq = occur;
+ if (word.length() > charAt + 1) {
+ addWordRec(child, word, charAt + 1, occur);
+ } else {
+ child.terminal = true;
+ child.freq = occur;
+ }
+ }
+
+ byte[] dict;
+ int dictSize;
+ static final int CHAR_WIDTH = 8;
+ static final int FLAGS_WIDTH = 1; // Terminal flag (word end)
+ static final int ADDR_WIDTH = 23; // Offset to children
+ static final int FREQ_WIDTH_BYTES = 1;
+ static final int COUNT_WIDTH_BYTES = 1;
+
+ private void addCount(int count) {
+ dict[dictSize++] = (byte) (0xFF & count);
+ }
+
+ private void addNode(CharNode node, String word1) {
+ if (node.terminal) { // store address of each word1
+ mDictionary.put(word1, dictSize);
+ }
+ int charData = 0xFFFF & node.data;
+ if (charData > 254) {
+ dict[dictSize++] = (byte) 255;
+ dict[dictSize++] = (byte) ((node.data >> 8) & 0xFF);
+ dict[dictSize++] = (byte) (node.data & 0xFF);
+ } else {
+ dict[dictSize++] = (byte) (0xFF & node.data);
+ }
+ if (node.children != null) {
+ dictSize += 3; // Space for children address
+ } else {
+ dictSize += 1; // Space for just the terminal/address flags
+ }
+ if ((0xFFFFFF & node.freq) > 255) {
+ node.freq = 255;
+ }
+ if (node.terminal) {
+ byte freq = (byte) (0xFF & node.freq);
+ dict[dictSize++] = freq;
+ // bigram
+ if (bigramDict.mBi.containsKey(word1)) {
+ int count = bigramDict.mBi.get(word1).count;
+ bigramDict.mBigramToFill.add(word1);
+ bigramDict.mBigramToFillAddress.add(dictSize);
+ dictSize += (4 * count);
+ } else {
+ dict[dictSize++] = (byte) (0x00);
+ }
+ }
+ }
+
+ int nullChildrenCount = 0;
+ int notTerminalCount = 0;
+
+ private void updateNodeAddress(int nodeAddress, CharNode node,
+ int childrenAddress) {
+ if ((dict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character
+ nodeAddress += 2;
+ }
+ childrenAddress = ADDRESS_MASK & childrenAddress;
+ if (childrenAddress == 0) {
+ nullChildrenCount++;
+ } else {
+ childrenAddress |= FLAG_ADDRESS_MASK;
+ }
+ if (node.terminal) {
+ childrenAddress |= FLAG_TERMINAL_MASK;
+ } else {
+ notTerminalCount++;
+ }
+ dict[nodeAddress + 1] = (byte) (childrenAddress >> 16);
+ if ((childrenAddress & FLAG_ADDRESS_MASK) != 0) {
+ dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8);
+ dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF));
+ }
+ }
+
+ void writeWordsRec(List<CharNode> children, StringBuilder word) {
+ if (children == null || children.size() == 0) {
+ return;
+ }
+ final int childCount = children.size();
+ addCount(childCount);
+ int[] childrenAddresses = new int[childCount];
+ for (int j = 0; j < childCount; j++) {
+ CharNode node = children.get(j);
+ childrenAddresses[j] = dictSize;
+ word.append(children.get(j).data);
+ addNode(node, word.toString());
+ word.deleteCharAt(word.length()-1);
+ }
+ for (int j = 0; j < childCount; j++) {
+ CharNode node = children.get(j);
+ int nodeAddress = childrenAddresses[j];
+ int cacheDictSize = dictSize;
+ word.append(children.get(j).data);
+ writeWordsRec(node.children, word);
+ word.deleteCharAt(word.length()-1);
+ updateNodeAddress(nodeAddress, node, node.children != null
+ ? cacheDictSize : 0);
+ }
+ }
+
+ void writeToDict(String dictFilename) {
+ // 4MB max, 22-bit offsets
+ dict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably
+ // < 1MB in most cases, as there is a limit in the
+ // resource size in apks.
+ dictSize = 0;
+
+ dict[dictSize++] = (byte) (0xFF & VERSION_NUM); // version info
+ dict[dictSize++] = (byte) (0xFF & (bigramDict.mHasBigram ? 1 : 0));
+
+ StringBuilder word = new StringBuilder(48);
+ writeWordsRec(roots, word);
+ dict = bigramDict.writeBigrams(dict, mDictionary);
+ System.out.println("Dict Size = " + dictSize);
+ if (!sSplitOutput) {
+ sOutputFileSize = dictSize;
+ }
+ try {
+ int currentLoc = 0;
+ int i = 0;
+ int extension = dictFilename.indexOf(".dict");
+ String filename = dictFilename.substring(0, extension);
+ while (dictSize > 0) {
+ FileOutputStream fos;
+ if (sSplitOutput) {
+ fos = new FileOutputStream(filename + i + ".dict");
+ } else {
+ fos = new FileOutputStream(filename + ".dict");
+ }
+ if (dictSize > sOutputFileSize) {
+ fos.write(dict, currentLoc, sOutputFileSize);
+ dictSize -= sOutputFileSize;
+ currentLoc += sOutputFileSize;
+ } else {
+ fos.write(dict, currentLoc, dictSize);
+ dictSize = 0;
+ }
+ fos.close();
+ i++;
+ }
+ } catch (IOException ioe) {
+ System.err.println("Error writing dict file:" + ioe);
+ }
+ }
+
+ void traverseDict(int pos, char[] word, int depth) {
+ int count = dict[pos++] & 0xFF;
+ for (int i = 0; i < count; i++) {
+ char c = (char) (dict[pos++] & 0xFF);
+ if (c == 0xFF) { // two byte character
+ c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF));
+ pos += 2;
+ }
+ word[depth] = c;
+ boolean terminal = getFirstBitOfByte(pos, dict);
+ int address = 0;
+ if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) { // address check
+ address = get22BitAddress(pos, dict);
+ pos += 3;
+ } else {
+ pos += 1;
+ }
+ if (terminal) {
+ showWord(word, depth + 1, dict[pos] & 0xFF);
+ pos++;
+
+ int bigramExist = (dict[pos] & bigramDict.FLAG_BIGRAM_READ);
+ if (bigramExist > 0) {
+ int nextBigramExist = 1;
+ while (nextBigramExist > 0) {
+ int bigramAddress = get22BitAddress(pos, dict);
+ pos += 3;
+ int frequency = (bigramDict.FLAG_BIGRAM_FREQ & dict[pos]);
+ bigramDict.searchForTerminalNode(bigramAddress, frequency, dict);
+ nextBigramExist = (dict[pos++] & bigramDict.FLAG_BIGRAM_CONTINUED);
+ }
+ } else {
+ pos++;
+ }
+ }
+ if (address != 0) {
+ traverseDict(address, word, depth + 1);
+ }
+ }
+ }
+
+ void showWord(char[] word, int size, int freq) {
+ System.out.print(new String(word, 0, size) + " " + freq + "\n");
+ }
+
+ static int get22BitAddress(int pos, byte[] dict) {
+ return ((dict[pos + 0] & 0x3F) << 16)
+ | ((dict[pos + 1] & 0xFF) << 8)
+ | ((dict[pos + 2] & 0xFF));
+ }
+
+ static boolean getFirstBitOfByte(int pos, byte[] dict) {
+ return (dict[pos] & 0x80) > 0;
+ }
+
+ static boolean getSecondBitOfByte(int pos, byte[] dict) {
+ return (dict[pos] & 0x40) > 0;
+ }
+}