diff options
Diffstat (limited to 'tools/dicttool/src')
7 files changed, 147 insertions, 172 deletions
diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java index e571bc21d..d1df81b52 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/BinaryDictOffdeviceUtils.java @@ -198,7 +198,7 @@ public final class BinaryDictOffdeviceUtils { System.out.println("Packaging : " + decodedSpec.describeChain()); System.out.println("Uncompressed size : " + decodedSpec.mFile.length()); } - return dictDecoder.readDictionaryBinary(null, false /* deleteDictIfBroken */); + return dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */); } } } catch (IOException e) { diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java index 4b6716936..b6795ea6d 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java @@ -21,7 +21,9 @@ import com.android.inputmethod.latin.makedict.FusionDictionary; import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; -import com.android.inputmethod.latin.makedict.Word; +import com.android.inputmethod.latin.makedict.ProbabilityInfo; +import com.android.inputmethod.latin.makedict.WordProperty; +import com.android.inputmethod.latin.utils.CombinedFormatUtils; import java.io.BufferedReader; import java.io.File; @@ -41,18 +43,10 @@ import java.util.TreeSet; * All functions in this class are static. */ public class CombinedInputOutput { - - private static final String DICTIONARY_TAG = "dictionary"; - private static final String BIGRAM_TAG = "bigram"; - private static final String SHORTCUT_TAG = "shortcut"; - private static final String FREQUENCY_TAG = "f"; - private static final String WORD_TAG = "word"; - private static final String NOT_A_WORD_TAG = "not_a_word"; private static final String WHITELIST_TAG = "whitelist"; private static final String OPTIONS_TAG = "options"; - private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing"; - private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing"; private static final String COMMENT_LINE_STARTER = "#"; + private static final int HISTORICAL_INFO_ELEMENT_COUNT = 3; /** * Basic test to find out whether the file is in the combined format or not. @@ -70,7 +64,8 @@ public class CombinedInputOutput { while (firstLine.startsWith(COMMENT_LINE_STARTER)) { firstLine = reader.readLine(); } - return firstLine.matches("^" + DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*"); + return firstLine.matches( + "^" + CombinedFormatUtils.DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*"); } catch (FileNotFoundException e) { return false; } catch (IOException e) { @@ -112,28 +107,25 @@ public class CombinedInputOutput { attributes.put(keyValue[0], keyValue[1]); } - final boolean processUmlauts = - GERMAN_UMLAUT_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG)); - final boolean processLigatures = - FRENCH_LIGATURE_PROCESSING_OPTION.equals(attributes.get(OPTIONS_TAG)); attributes.remove(OPTIONS_TAG); - final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), new DictionaryOptions( - attributes, processUmlauts, processLigatures)); + final FusionDictionary dict = + new FusionDictionary(new PtNodeArray(), new DictionaryOptions(attributes)); String line; String word = null; - int freq = 0; + ProbabilityInfo probabilityInfo = new ProbabilityInfo(0); boolean isNotAWord = false; ArrayList<WeightedString> bigrams = new ArrayList<WeightedString>(); ArrayList<WeightedString> shortcuts = new ArrayList<WeightedString>(); while (null != (line = reader.readLine())) { if (line.startsWith(COMMENT_LINE_STARTER)) continue; final String args[] = line.trim().split(","); - if (args[0].matches(WORD_TAG + "=.*")) { + if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) { if (null != word) { - dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord); + dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, + isNotAWord); for (WeightedString s : bigrams) { - dict.setBigram(word, s.mWord, s.mFrequency); + dict.setBigram(word, s.mWord, s.mProbabilityInfo); } } if (!shortcuts.isEmpty()) shortcuts = new ArrayList<WeightedString>(); @@ -142,23 +134,35 @@ public class CombinedInputOutput { for (String param : args) { final String params[] = param.split("=", 2); if (2 != params.length) throw new RuntimeException("Wrong format : " + line); - if (WORD_TAG.equals(params[0])) { + if (CombinedFormatUtils.WORD_TAG.equals(params[0])) { word = params[1]; - } else if (FREQUENCY_TAG.equals(params[0])) { - freq = Integer.parseInt(params[1]); - } else if (NOT_A_WORD_TAG.equals(params[0])) { + } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) { + probabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]), + probabilityInfo.mTimestamp, probabilityInfo.mLevel, + probabilityInfo.mCount); + } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) { + final String[] historicalInfoParams = + params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR); + if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) { + throw new RuntimeException("Wrong format (historical info) : " + line); + } + probabilityInfo = new ProbabilityInfo(probabilityInfo.mProbability, + Integer.parseInt(historicalInfoParams[0]), + Integer.parseInt(historicalInfoParams[1]), + Integer.parseInt(historicalInfoParams[2])); + } else if (CombinedFormatUtils.NOT_A_WORD_TAG.equals(params[0])) { isNotAWord = "true".equals(params[1]); } } - } else if (args[0].matches(SHORTCUT_TAG + "=.*")) { + } else if (args[0].matches(CombinedFormatUtils.SHORTCUT_TAG + "=.*")) { String shortcut = null; int shortcutFreq = 0; for (String param : args) { final String params[] = param.split("=", 2); if (2 != params.length) throw new RuntimeException("Wrong format : " + line); - if (SHORTCUT_TAG.equals(params[0])) { + if (CombinedFormatUtils.SHORTCUT_TAG.equals(params[0])) { shortcut = params[1]; - } else if (FREQUENCY_TAG.equals(params[0])) { + } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) { shortcutFreq = WHITELIST_TAG.equals(params[1]) ? FormatSpec.SHORTCUT_WHITELIST_FREQUENCY : Integer.parseInt(params[1]); @@ -169,29 +173,42 @@ public class CombinedInputOutput { } else { throw new RuntimeException("Wrong format : " + line); } - } else if (args[0].matches(BIGRAM_TAG + "=.*")) { + } else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) { String secondWordOfBigram = null; - int bigramFreq = 0; + ProbabilityInfo bigramProbabilityInfo = new ProbabilityInfo(0); for (String param : args) { final String params[] = param.split("=", 2); if (2 != params.length) throw new RuntimeException("Wrong format : " + line); - if (BIGRAM_TAG.equals(params[0])) { + if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) { secondWordOfBigram = params[1]; - } else if (FREQUENCY_TAG.equals(params[0])) { - bigramFreq = Integer.parseInt(params[1]); + } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) { + bigramProbabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]), + bigramProbabilityInfo.mTimestamp, bigramProbabilityInfo.mLevel, + bigramProbabilityInfo.mCount); + } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) { + final String[] historicalInfoParams = + params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR); + if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) { + throw new RuntimeException("Wrong format (historical info) : " + line); + } + bigramProbabilityInfo = new ProbabilityInfo( + bigramProbabilityInfo.mProbability, + Integer.parseInt(historicalInfoParams[0]), + Integer.parseInt(historicalInfoParams[1]), + Integer.parseInt(historicalInfoParams[2])); } } if (null != secondWordOfBigram) { - bigrams.add(new WeightedString(secondWordOfBigram, bigramFreq)); + bigrams.add(new WeightedString(secondWordOfBigram, bigramProbabilityInfo)); } else { throw new RuntimeException("Wrong format : " + line); } } } if (null != word) { - dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord); + dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord); for (WeightedString s : bigrams) { - dict.setBigram(word, s.mWord, s.mFrequency); + dict.setBigram(word, s.mWord, s.mProbabilityInfo); } } @@ -204,44 +221,16 @@ public class CombinedInputOutput { * @param destination a destination stream to write to. * @param dict the dictionary to write. */ - public static void writeDictionaryCombined(Writer destination, FusionDictionary dict) - throws IOException { - final TreeSet<Word> set = new TreeSet<Word>(); - for (Word word : dict) { - set.add(word); // This for ordering by frequency, then by asciibetic order - } - final HashMap<String, String> options = dict.mOptions.mAttributes; - destination.write(DICTIONARY_TAG + "="); - if (options.containsKey(DICTIONARY_TAG)) { - destination.write(options.get(DICTIONARY_TAG)); - options.remove(DICTIONARY_TAG); - } - if (dict.mOptions.mGermanUmlautProcessing) { - destination.write("," + OPTIONS_TAG + "=" + GERMAN_UMLAUT_PROCESSING_OPTION); - } else if (dict.mOptions.mFrenchLigatureProcessing) { - destination.write("," + OPTIONS_TAG + "=" + FRENCH_LIGATURE_PROCESSING_OPTION); - } - for (final String key : dict.mOptions.mAttributes.keySet()) { - final String value = dict.mOptions.mAttributes.get(key); - destination.write("," + key + "=" + value); + public static void writeDictionaryCombined( + final Writer destination, final FusionDictionary dict) throws IOException { + final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<WordProperty>(); + for (final WordProperty wordProperty : dict) { + // This for ordering by frequency, then by asciibetic order + wordPropertiesInDict.add(wordProperty); } - destination.write("\n"); - for (Word word : set) { - destination.write(" " + WORD_TAG + "=" + word.mWord + "," - + FREQUENCY_TAG + "=" + word.mFrequency - + (word.mIsNotAWord ? "," + NOT_A_WORD_TAG + "=true\n" : "\n")); - if (null != word.mShortcutTargets) { - for (WeightedString target : word.mShortcutTargets) { - destination.write(" " + SHORTCUT_TAG + "=" + target.mWord + "," - + FREQUENCY_TAG + "=" + target.mFrequency + "\n"); - } - } - if (null != word.mBigrams) { - for (WeightedString bigram : word.mBigrams) { - destination.write(" " + BIGRAM_TAG + "=" + bigram.mWord + "," - + FREQUENCY_TAG + "=" + bigram.mFrequency + "\n"); - } - } + destination.write(CombinedFormatUtils.formatAttributeMap(dict.mOptions.mAttributes)); + for (final WordProperty wordProperty : wordPropertiesInDict) { + destination.write(CombinedFormatUtils.formatWordProperty(wordProperty)); } destination.close(); } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java index 5c7e8b4f2..80d71fc64 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/DictionaryMaker.java @@ -23,7 +23,7 @@ import com.android.inputmethod.latin.makedict.FormatSpec; import com.android.inputmethod.latin.makedict.FusionDictionary; import com.android.inputmethod.latin.makedict.MakedictLog; import com.android.inputmethod.latin.makedict.UnsupportedFormatException; -import com.android.inputmethod.latin.makedict.Ver3DictEncoder; +import com.android.inputmethod.latin.makedict.Ver2DictEncoder; import com.android.inputmethod.latin.makedict.Ver4DictEncoder; import java.io.BufferedWriter; @@ -46,7 +46,6 @@ public class DictionaryMaker { static class Arguments { private static final String OPTION_VERSION_2 = "-2"; - private static final String OPTION_VERSION_3 = "-3"; private static final String OPTION_VERSION_4 = "-4"; private static final String OPTION_INPUT_SOURCE = "-s"; private static final String OPTION_INPUT_BIGRAM_XML = "-b"; @@ -158,10 +157,8 @@ public class DictionaryMaker { if (arg.charAt(0) == '-') { if (OPTION_VERSION_2.equals(arg)) { // Do nothing, this is the default - } else if (OPTION_VERSION_3.equals(arg)) { - outputBinaryFormatVersion = 3; } else if (OPTION_VERSION_4.equals(arg)) { - outputBinaryFormatVersion = 4; + outputBinaryFormatVersion = FormatSpec.VERSION4; } else if (OPTION_HELP.equals(arg)) { displayHelp(); } else { @@ -268,7 +265,7 @@ public class DictionaryMaker { throws FileNotFoundException, IOException, UnsupportedFormatException { final File file = new File(binaryFilename); final DictDecoder dictDecoder = FormatSpec.getDictDecoder(file); - return dictDecoder.readDictionaryBinary(null, false /* deleteDictIfBroken */); + return dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */); } /** @@ -358,10 +355,10 @@ public class DictionaryMaker { final File outputFile = new File(outputFilename); final FormatSpec.FormatOptions formatOptions = new FormatSpec.FormatOptions(version); final DictEncoder dictEncoder; - if (version == 4) { + if (version == FormatSpec.VERSION4) { dictEncoder = new Ver4DictEncoder(outputFile); } else { - dictEncoder = new Ver3DictEncoder(outputFile); + dictEncoder = new Ver2DictEncoder(outputFile); } dictEncoder.writeDictionary(dict, formatOptions); } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java index 66fd084cd..ce9b9f306 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java @@ -19,7 +19,7 @@ package com.android.inputmethod.latin.dicttool; import com.android.inputmethod.latin.makedict.FusionDictionary; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; -import com.android.inputmethod.latin.makedict.Word; +import com.android.inputmethod.latin.makedict.WordProperty; import java.util.Arrays; import java.util.ArrayList; @@ -85,18 +85,6 @@ public class Diff extends Dicttool.Command { private static void diffHeaders(final FusionDictionary dict0, final FusionDictionary dict1) { boolean hasDifferences = false; - if (dict0.mOptions.mFrenchLigatureProcessing != dict1.mOptions.mFrenchLigatureProcessing) { - System.out.println(" French ligature processing : " - + dict0.mOptions.mFrenchLigatureProcessing + " <=> " - + dict1.mOptions.mFrenchLigatureProcessing); - hasDifferences = true; - } - else if (dict0.mOptions.mGermanUmlautProcessing != dict1.mOptions.mGermanUmlautProcessing) { - System.out.println(" German umlaut processing : " - + dict0.mOptions.mGermanUmlautProcessing + " <=> " - + dict1.mOptions.mGermanUmlautProcessing); - hasDifferences = true; - } final HashMap<String, String> options1 = new HashMap<String, String>(dict1.mOptions.mAttributes); for (final String optionKey : dict0.mOptions.mAttributes.keySet()) { @@ -120,42 +108,47 @@ public class Diff extends Dicttool.Command { private static void diffWords(final FusionDictionary dict0, final FusionDictionary dict1) { boolean hasDifferences = false; - for (final Word word0 : dict0) { - final PtNode word1 = FusionDictionary.findWordInTree(dict1.mRootNodeArray, - word0.mWord); - if (null == word1) { + for (final WordProperty word0Property : dict0) { + final PtNode word1PtNode = FusionDictionary.findWordInTree(dict1.mRootNodeArray, + word0Property.mWord); + if (null == word1PtNode) { // This word is not in dict1 - System.out.println("Deleted: " + word0.mWord + " " + word0.mFrequency); + System.out.println("Deleted: " + word0Property.mWord + " " + + word0Property.getProbability()); hasDifferences = true; } else { // We found the word. Compare frequencies, shortcuts, bigrams - if (word0.mFrequency != word1.getFrequency()) { - System.out.println("Freq changed: " + word0.mWord + " " + word0.mFrequency - + " -> " + word1.getFrequency()); + if (word0Property.getProbability() != word1PtNode.getProbability()) { + System.out.println("Probability changed: " + word0Property.mWord + " " + + word0Property.getProbability() + " -> " + + word1PtNode.getProbability()); hasDifferences = true; } - if (word0.mIsNotAWord != word1.getIsNotAWord()) { - System.out.println("Not a word: " + word0.mWord + " " + word0.mIsNotAWord - + " -> " + word1.getIsNotAWord()); + if (word0Property.mIsNotAWord != word1PtNode.getIsNotAWord()) { + System.out.println("Not a word: " + word0Property.mWord + " " + + word0Property.mIsNotAWord + " -> " + word1PtNode.getIsNotAWord()); hasDifferences = true; } - if (word0.mIsBlacklistEntry != word1.getIsBlacklistEntry()) { - System.out.println("Blacklist: " + word0.mWord + " " + word0.mIsBlacklistEntry - + " -> " + word1.getIsBlacklistEntry()); + if (word0Property.mIsBlacklistEntry != word1PtNode.getIsBlacklistEntry()) { + System.out.println("Blacklist: " + word0Property.mWord + " " + + word0Property.mIsBlacklistEntry + " -> " + + word1PtNode.getIsBlacklistEntry()); hasDifferences = true; } - hasDifferences |= hasAttributesDifferencesAndPrintThemIfAny(word0.mWord, - "Bigram", word0.mBigrams, word1.getBigrams()); - hasDifferences |= hasAttributesDifferencesAndPrintThemIfAny(word0.mWord, - "Shortcut", word0.mShortcutTargets, word1.getShortcutTargets()); + hasDifferences |= hasAttributesDifferencesAndPrintThemIfAny(word0Property.mWord, + "Bigram", word0Property.mBigrams, word1PtNode.getBigrams()); + hasDifferences |= hasAttributesDifferencesAndPrintThemIfAny(word0Property.mWord, + "Shortcut", word0Property.mShortcutTargets, + word1PtNode.getShortcutTargets()); } } - for (final Word word1 : dict1) { - final PtNode word0 = FusionDictionary.findWordInTree(dict0.mRootNodeArray, - word1.mWord); - if (null == word0) { + for (final WordProperty word1Property : dict1) { + final PtNode word0PtNode = FusionDictionary.findWordInTree(dict0.mRootNodeArray, + word1Property.mWord); + if (null == word0PtNode) { // This word is not in dict0 - System.out.println("Added: " + word1.mWord + " " + word1.mFrequency); + System.out.println("Added: " + word1Property.mWord + " " + + word1Property.getProbability()); hasDifferences = true; } } @@ -171,7 +164,7 @@ public class Diff extends Dicttool.Command { if (null == list0) return false; for (final WeightedString attribute0 : list0) { System.out.println(type + " removed: " + word + " " + attribute0.mWord + " " - + attribute0.mFrequency); + + attribute0.getProbability()); } return true; } @@ -187,8 +180,8 @@ public class Diff extends Dicttool.Command { for (final WeightedString attribute1 : list1) { if (attribute0.mWord.equals(attribute1.mWord)) { System.out.println(type + " freq changed: " + word + " " - + attribute0.mWord + " " + attribute0.mFrequency + " -> " - + attribute1.mFrequency); + + attribute0.mWord + " " + attribute0.getProbability() + " -> " + + attribute1.getProbability()); list1.remove(attribute1); foundString = true; break; @@ -197,7 +190,7 @@ public class Diff extends Dicttool.Command { if (!foundString) { // We come here if we haven't found any matching string. System.out.println(type + " removed: " + word + " " + attribute0.mWord + " " - + attribute0.mFrequency); + + attribute0.getProbability()); } } else { list1.remove(attribute0); @@ -209,7 +202,7 @@ public class Diff extends Dicttool.Command { for (final WeightedString attribute1 : list1) { hasDifferences = true; System.out.println(type + " added: " + word + " " + attribute1.mWord + " " - + attribute1.mFrequency); + + attribute1.getProbability()); } return hasDifferences; } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java index 350f42772..178df5cec 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java @@ -20,7 +20,7 @@ import com.android.inputmethod.latin.makedict.FormatSpec; import com.android.inputmethod.latin.makedict.FusionDictionary; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; -import com.android.inputmethod.latin.makedict.Word; +import com.android.inputmethod.latin.makedict.WordProperty; import java.util.Arrays; import java.util.ArrayList; @@ -43,15 +43,16 @@ public class Info extends Dicttool.Command { int bigramCount = 0; int shortcutCount = 0; int whitelistCount = 0; - for (final Word w : dict) { + for (final WordProperty wordProperty : dict) { ++wordCount; - if (null != w.mBigrams) { - bigramCount += w.mBigrams.size(); + if (null != wordProperty.mBigrams) { + bigramCount += wordProperty.mBigrams.size(); } - if (null != w.mShortcutTargets) { - shortcutCount += w.mShortcutTargets.size(); - for (WeightedString shortcutTarget : w.mShortcutTargets) { - if (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY == shortcutTarget.mFrequency) { + if (null != wordProperty.mShortcutTargets) { + shortcutCount += wordProperty.mShortcutTargets.size(); + for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) { + if (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY + == shortcutTarget.getProbability()) { ++whitelistCount; } } @@ -71,7 +72,7 @@ public class Info extends Dicttool.Command { return; } System.out.println("Word: " + word); - System.out.println(" Freq: " + ptNode.getFrequency()); + System.out.println(" Freq: " + ptNode.getProbability()); if (ptNode.getIsNotAWord()) { System.out.println(" Is not a word"); } @@ -84,8 +85,9 @@ public class Info extends Dicttool.Command { } else { for (final WeightedString shortcutTarget : shortcutTargets) { System.out.println(" Shortcut target: " + shortcutTarget.mWord + " (" - + (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY == shortcutTarget.mFrequency - ? "whitelist" : shortcutTarget.mFrequency) + ")"); + + (FormatSpec.SHORTCUT_WHITELIST_FREQUENCY + == shortcutTarget.getProbability() ? + "whitelist" : shortcutTarget.getProbability()) + ")"); } } final ArrayList<WeightedString> bigrams = ptNode.getBigrams(); @@ -93,7 +95,8 @@ public class Info extends Dicttool.Command { System.out.println(" No bigrams"); } else { for (final WeightedString bigram : bigrams) { - System.out.println(" Bigram: " + bigram.mWord + " (" + bigram.mFrequency + ")"); + System.out.println( + " Bigram: " + bigram.mWord + " (" + bigram.getProbability() + ")"); } } } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Test.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Test.java index 9174238da..48817b1b1 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Test.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Test.java @@ -18,7 +18,6 @@ package com.android.inputmethod.latin.dicttool; import com.android.inputmethod.latin.makedict.BinaryDictDecoderEncoderTests; import com.android.inputmethod.latin.makedict.BinaryDictEncoderFlattenTreeTests; -import com.android.inputmethod.latin.makedict.BinaryDictIOUtilsTests; import com.android.inputmethod.latin.makedict.FusionDictionaryTest; import java.lang.reflect.Constructor; @@ -31,15 +30,15 @@ import java.util.ArrayList; */ public class Test extends Dicttool.Command { public static final String COMMAND = "test"; + private static final int DEFAULT_MAX_UNIGRAMS = 1500; private long mSeed = System.currentTimeMillis(); - private int mMaxUnigrams = BinaryDictIOUtilsTests.DEFAULT_MAX_UNIGRAMS; + private int mMaxUnigrams = DEFAULT_MAX_UNIGRAMS; private static final Class<?>[] sClassesToTest = { BinaryDictOffdeviceUtilsTests.class, FusionDictionaryTest.class, BinaryDictDecoderEncoderTests.class, BinaryDictEncoderFlattenTreeTests.class, - BinaryDictIOUtilsTests.class }; private ArrayList<Method> mAllTestMethods = new ArrayList<Method>(); private ArrayList<String> mUsedTestMethods = new ArrayList<String>(); diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java index 4e99bf979..2ac842a80 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/XmlDictInputOutput.java @@ -20,7 +20,8 @@ import com.android.inputmethod.latin.makedict.FusionDictionary; import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; -import com.android.inputmethod.latin.makedict.Word; +import com.android.inputmethod.latin.makedict.ProbabilityInfo; +import com.android.inputmethod.latin.makedict.WordProperty; import java.io.BufferedReader; import java.io.File; @@ -52,13 +53,11 @@ public class XmlDictInputOutput { private static final String WORD_TAG = "w"; private static final String BIGRAM_TAG = "bigram"; private static final String SHORTCUT_TAG = "shortcut"; - private static final String FREQUENCY_ATTR = "f"; + private static final String PROBABILITY_ATTR = "f"; private static final String WORD_ATTR = "word"; private static final String NOT_A_WORD_ATTR = "not_a_word"; private static final String OPTIONS_KEY = "options"; - private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing"; - private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing"; /** * SAX handler for a unigram XML file. @@ -68,6 +67,7 @@ public class XmlDictInputOutput { private static final int START = 1; private static final int WORD = 2; private static final int UNKNOWN = 3; + private static final int SHORTCUT_ONLY_WORD_PROBABILITY = 1; FusionDictionary mDictionary; int mState; // the state of the parser @@ -92,7 +92,8 @@ public class XmlDictInputOutput { final FusionDictionary dict = mDictionary; for (final String shortcutOnly : mShortcutsMap.keySet()) { if (dict.hasWord(shortcutOnly)) continue; - dict.add(shortcutOnly, 1, mShortcutsMap.get(shortcutOnly), true /* isNotAWord */); + dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY), + mShortcutsMap.get(shortcutOnly), true /* isNotAWord */); } mDictionary = null; mShortcutsMap.clear(); @@ -109,7 +110,7 @@ public class XmlDictInputOutput { mWord = ""; for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { final String attrName = attrs.getLocalName(attrIndex); - if (FREQUENCY_ATTR.equals(attrName)) { + if (PROBABILITY_ATTR.equals(attrName)) { mFreq = Integer.parseInt(attrs.getValue(attrIndex)); } } @@ -120,12 +121,8 @@ public class XmlDictInputOutput { attributes.put(attrName, attrs.getValue(attrIndex)); } final String optionsString = attributes.get(OPTIONS_KEY); - final boolean processUmlauts = - GERMAN_UMLAUT_PROCESSING_OPTION.equals(optionsString); - final boolean processLigatures = - FRENCH_LIGATURE_PROCESSING_OPTION.equals(optionsString); mDictionary = new FusionDictionary(new PtNodeArray(), - new DictionaryOptions(attributes, processUmlauts, processLigatures)); + new DictionaryOptions(attributes)); } else { mState = UNKNOWN; } @@ -144,7 +141,8 @@ public class XmlDictInputOutput { @Override public void endElement(String uri, String localName, String qName) { if (WORD == mState) { - mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), false /* isNotAWord */); + mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord), + false /* isNotAWord */); mState = START; } } @@ -325,7 +323,7 @@ public class XmlDictInputOutput { final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord); for (final WeightedString bigram : bigramList) { if (!dict.hasWord(bigram.mWord)) continue; - dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency); + dict.setBigram(firstWord, bigram.mWord, bigram.mProbabilityInfo); } } return dict; @@ -354,42 +352,38 @@ public class XmlDictInputOutput { */ public static void writeDictionaryXml(Writer destination, FusionDictionary dict) throws IOException { - final TreeSet<Word> set = new TreeSet<Word>(); - for (Word word : dict) { - set.add(word); + final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<WordProperty>(); + for (WordProperty wordProperty : dict) { + wordPropertiesInDict.add(wordProperty); } // TODO: use an XMLSerializer if this gets big destination.write("<wordlist format=\"2\""); - final HashMap<String, String> options = dict.mOptions.mAttributes; - if (dict.mOptions.mGermanUmlautProcessing) { - destination.write(" " + OPTIONS_KEY + "=\"" + GERMAN_UMLAUT_PROCESSING_OPTION + "\""); - } else if (dict.mOptions.mFrenchLigatureProcessing) { - destination.write(" " + OPTIONS_KEY + "=\"" + FRENCH_LIGATURE_PROCESSING_OPTION + "\""); - } for (final String key : dict.mOptions.mAttributes.keySet()) { final String value = dict.mOptions.mAttributes.get(key); destination.write(" " + key + "=\"" + value + "\""); } destination.write(">\n"); destination.write("<!-- Warning: there is no code to read this format yet. -->\n"); - for (Word word : set) { - destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" " - + FREQUENCY_ATTR + "=\"" + word.mFrequency - + (word.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "") + "\">"); - if (null != word.mShortcutTargets) { + for (WordProperty wordProperty : wordPropertiesInDict) { + destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + wordProperty.mWord + + "\" " + PROBABILITY_ATTR + "=\"" + wordProperty.getProbability() + + (wordProperty.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "") + + "\">"); + if (null != wordProperty.mShortcutTargets) { destination.write("\n"); - for (WeightedString target : word.mShortcutTargets) { - destination.write(" <" + SHORTCUT_TAG + " " + FREQUENCY_ATTR + "=\"" - + target.mFrequency + "\">" + target.mWord + "</" + SHORTCUT_TAG + for (WeightedString target : wordProperty.mShortcutTargets) { + destination.write(" <" + SHORTCUT_TAG + " " + PROBABILITY_ATTR + "=\"" + + target.getProbability() + "\">" + target.mWord + "</" + SHORTCUT_TAG + ">\n"); } destination.write(" "); } - if (null != word.mBigrams) { + if (null != wordProperty.mBigrams) { destination.write("\n"); - for (WeightedString bigram : word.mBigrams) { - destination.write(" <" + BIGRAM_TAG + " " + FREQUENCY_ATTR + "=\"" - + bigram.mFrequency + "\">" + bigram.mWord + "</" + BIGRAM_TAG + ">\n"); + for (WeightedString bigram : wordProperty.mBigrams) { + destination.write(" <" + BIGRAM_TAG + " " + PROBABILITY_ATTR + "=\"" + + bigram.getProbability() + "\">" + bigram.mWord + + "</" + BIGRAM_TAG + ">\n"); } destination.write(" "); } |