diff options
55 files changed, 72 insertions, 35 deletions
diff --git a/dictionaries/cs_wordlist.combined.gz b/dictionaries/cs_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..8cbf2e961 --- /dev/null +++ b/dictionaries/cs_wordlist.combined.gz diff --git a/dictionaries/cs_wordlist.xml.gz b/dictionaries/cs_wordlist.xml.gz Binary files differdeleted file mode 100644 index f99148b07..000000000 --- a/dictionaries/cs_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/da_wordlist.combined.gz b/dictionaries/da_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..1cccb8632 --- /dev/null +++ b/dictionaries/da_wordlist.combined.gz diff --git a/dictionaries/da_wordlist.xml.gz b/dictionaries/da_wordlist.xml.gz Binary files differdeleted file mode 100644 index a3d4318e2..000000000 --- a/dictionaries/da_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/de_wordlist.combined.gz b/dictionaries/de_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..5db1aa4f3 --- /dev/null +++ b/dictionaries/de_wordlist.combined.gz diff --git a/dictionaries/de_wordlist.xml.gz b/dictionaries/de_wordlist.xml.gz Binary files differdeleted file mode 100644 index a4267b35a..000000000 --- a/dictionaries/de_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/el_wordlist.combined.gz b/dictionaries/el_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..b61da8918 --- /dev/null +++ b/dictionaries/el_wordlist.combined.gz diff --git a/dictionaries/el_wordlist.xml.gz b/dictionaries/el_wordlist.xml.gz Binary files differdeleted file mode 100644 index af1d71d47..000000000 --- a/dictionaries/el_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/en_GB_wordlist.combined.gz b/dictionaries/en_GB_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..b5909c2da --- /dev/null +++ b/dictionaries/en_GB_wordlist.combined.gz diff --git a/dictionaries/en_US_wordlist.combined.gz b/dictionaries/en_US_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..03ea2b787 --- /dev/null +++ b/dictionaries/en_US_wordlist.combined.gz diff --git a/dictionaries/en_gb_wordlist.xml.gz b/dictionaries/en_gb_wordlist.xml.gz Binary files differdeleted file mode 100644 index 274424cf2..000000000 --- a/dictionaries/en_gb_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/en_us_wordlist.xml.gz b/dictionaries/en_us_wordlist.xml.gz Binary files differdeleted file mode 100644 index b5054ef3c..000000000 --- a/dictionaries/en_us_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/en_whitelist.xml.gz b/dictionaries/en_whitelist.xml.gz Binary files differdeleted file mode 100644 index cf70a1a84..000000000 --- a/dictionaries/en_whitelist.xml.gz +++ /dev/null diff --git a/dictionaries/en_wordlist.combined.gz b/dictionaries/en_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..62c454049 --- /dev/null +++ b/dictionaries/en_wordlist.combined.gz diff --git a/dictionaries/en_wordlist.xml.gz b/dictionaries/en_wordlist.xml.gz Binary files differdeleted file mode 100644 index 6e57f4212..000000000 --- a/dictionaries/en_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/es_wordlist.combined.gz b/dictionaries/es_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..b0a137c4a --- /dev/null +++ b/dictionaries/es_wordlist.combined.gz diff --git a/dictionaries/es_wordlist.xml.gz b/dictionaries/es_wordlist.xml.gz Binary files differdeleted file mode 100644 index aaf0aa348..000000000 --- a/dictionaries/es_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/fi_wordlist.combined.gz b/dictionaries/fi_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..32dc126c0 --- /dev/null +++ b/dictionaries/fi_wordlist.combined.gz diff --git a/dictionaries/fi_wordlist.xml.gz b/dictionaries/fi_wordlist.xml.gz Binary files differdeleted file mode 100644 index b03565c4f..000000000 --- a/dictionaries/fi_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/fr_wordlist.combined.gz b/dictionaries/fr_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..95a87e630 --- /dev/null +++ b/dictionaries/fr_wordlist.combined.gz diff --git a/dictionaries/fr_wordlist.xml.gz b/dictionaries/fr_wordlist.xml.gz Binary files differdeleted file mode 100644 index 3134a040a..000000000 --- a/dictionaries/fr_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/hr_wordlist.combined.gz b/dictionaries/hr_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..573c3e8ac --- /dev/null +++ b/dictionaries/hr_wordlist.combined.gz diff --git a/dictionaries/hr_wordlist.xml.gz b/dictionaries/hr_wordlist.xml.gz Binary files differdeleted file mode 100644 index 13998d9d5..000000000 --- a/dictionaries/hr_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/it_wordlist.combined.gz b/dictionaries/it_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..d143bc425 --- /dev/null +++ b/dictionaries/it_wordlist.combined.gz diff --git a/dictionaries/it_wordlist.xml.gz b/dictionaries/it_wordlist.xml.gz Binary files differdeleted file mode 100644 index a75553d45..000000000 --- a/dictionaries/it_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/lt_wordlist.combined.gz b/dictionaries/lt_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..03cfa8426 --- /dev/null +++ b/dictionaries/lt_wordlist.combined.gz diff --git a/dictionaries/lt_wordlist.xml.gz b/dictionaries/lt_wordlist.xml.gz Binary files differdeleted file mode 100644 index 8f00f6393..000000000 --- a/dictionaries/lt_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/lv_wordlist.combined.gz b/dictionaries/lv_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..6b2ee77d6 --- /dev/null +++ b/dictionaries/lv_wordlist.combined.gz diff --git a/dictionaries/lv_wordlist.xml.gz b/dictionaries/lv_wordlist.xml.gz Binary files differdeleted file mode 100644 index 453ebb518..000000000 --- a/dictionaries/lv_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/nb_wordlist.combined.gz b/dictionaries/nb_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..0644fc93a --- /dev/null +++ b/dictionaries/nb_wordlist.combined.gz diff --git a/dictionaries/nb_wordlist.xml.gz b/dictionaries/nb_wordlist.xml.gz Binary files differdeleted file mode 100644 index 91813b66c..000000000 --- a/dictionaries/nb_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/nl_wordlist.combined.gz b/dictionaries/nl_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..748c5ed77 --- /dev/null +++ b/dictionaries/nl_wordlist.combined.gz diff --git a/dictionaries/nl_wordlist.xml.gz b/dictionaries/nl_wordlist.xml.gz Binary files differdeleted file mode 100644 index 72ebc6136..000000000 --- a/dictionaries/nl_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/pl_wordlist.combined.gz b/dictionaries/pl_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..638c8eef5 --- /dev/null +++ b/dictionaries/pl_wordlist.combined.gz diff --git a/dictionaries/pl_wordlist.xml.gz b/dictionaries/pl_wordlist.xml.gz Binary files differdeleted file mode 100644 index 5909a5f7f..000000000 --- a/dictionaries/pl_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/pt_BR_wordlist.combined.gz b/dictionaries/pt_BR_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..6f7952004 --- /dev/null +++ b/dictionaries/pt_BR_wordlist.combined.gz diff --git a/dictionaries/pt_PT_wordlist.combined.gz b/dictionaries/pt_PT_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..66ed025fb --- /dev/null +++ b/dictionaries/pt_PT_wordlist.combined.gz diff --git a/dictionaries/pt_br_wordlist.xml.gz b/dictionaries/pt_br_wordlist.xml.gz Binary files differdeleted file mode 100644 index 53d4e4298..000000000 --- a/dictionaries/pt_br_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/pt_pt_wordlist.xml.gz b/dictionaries/pt_pt_wordlist.xml.gz Binary files differdeleted file mode 100644 index e3e8c3ac3..000000000 --- a/dictionaries/pt_pt_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/ru_wordlist.combined.gz b/dictionaries/ru_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..62ae12e90 --- /dev/null +++ b/dictionaries/ru_wordlist.combined.gz diff --git a/dictionaries/ru_wordlist.xml.gz b/dictionaries/ru_wordlist.xml.gz Binary files differdeleted file mode 100644 index 877f0608e..000000000 --- a/dictionaries/ru_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/sample.combined b/dictionaries/sample.combined new file mode 100644 index 000000000..4fa595e1e --- /dev/null +++ b/dictionaries/sample.combined @@ -0,0 +1,38 @@ +# This is a sample wordlist that can be converted to a binary dictionary +# for use by the Latin IME. +# The file is essentially a CSV file, with indent level denoting nesting. +# +# The file starts with a single CSV line with the header attributes. Whatever +# the content, these are included as is in the binary file. The first attribute +# of the file should be `dictionary'. Usual fields are `locale', `description', +# `date', `version', `options'. +# +# Each word has a `word' entry and at least a `f' argument denoting its +# probability, as an integer between 0 and 255 on a logarithmic scale, with +# 255 meaning 1 and each decrement in 1 dividing probability by 1.15. +# As a special case, a weight of 0 is taken to mean profanity - words that +# should not be considered a typo, but that should never be suggested +# explicitly. An entry may be made not a word by adding a `not_a_word' +# field with a value of `true'. The main reason for putting such entries +# into the dictionary is to add shortcut targets and maybe a whitelist +# replacement. +# +# Each word may or may not have any number of shortcut target lines +# starting with a `shortcut' entry and having at least a `f' frequency +# value between 0 and 14, or the special value `whitelist' which becomes +# 15, which is then taken to be the whitelist target of this word. +# +# Each word may also have any number of bigram lines starting with a +# `bigram' entry containing the following word whose frequency should +# override the unigram frequency when following the word this bigram is +# for. +# +dictionary=main:en,locale=en,description=Sample wordlist,date=1351495318,version=1 + word=sample,f=200 + bigram=wordlist,f=243 + word=wordlist,f=180 + word=shortcut,f=176 + shortcut=target,f=10 + word=witelisted,f=10,not_a_word=true + shortcut=whitelisted,f=whitelist + word=profanity,f=0 diff --git a/dictionaries/sample.xml b/dictionaries/sample.xml deleted file mode 100644 index ad98f2b6f..000000000 --- a/dictionaries/sample.xml +++ /dev/null @@ -1,17 +0,0 @@ -<!-- This is a sample wordlist that can be converted to a binary dictionary - for use by the Latin IME. - The format of the word list is a flat list of word entries. - Each entry has a frequency between 255 and 0. - Highest frequency words get more weight in the prediction algorithm. As a - special case, a weight of 0 is taken to mean profanity - words that should - not be considered a typo, but that should never be suggested explicitly. - You can capitalize words that must always be capitalized, such as "January". - You can have a capitalized and a non-capitalized word as separate entries, - such as "robin" and "Robin". ---> -<wordlist> - <w f="255">this</w> - <w f="255">is</w> - <w f="128">sample</w> - <w f="1">wordlist</w> -</wordlist> diff --git a/dictionaries/sl_wordlist.combined.gz b/dictionaries/sl_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..845b55a8b --- /dev/null +++ b/dictionaries/sl_wordlist.combined.gz diff --git a/dictionaries/sl_wordlist.xml.gz b/dictionaries/sl_wordlist.xml.gz Binary files differdeleted file mode 100644 index 3927b698e..000000000 --- a/dictionaries/sl_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/sr_wordlist.combined.gz b/dictionaries/sr_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..c15bc48fd --- /dev/null +++ b/dictionaries/sr_wordlist.combined.gz diff --git a/dictionaries/sr_wordlist.xml.gz b/dictionaries/sr_wordlist.xml.gz Binary files differdeleted file mode 100644 index c2eea681f..000000000 --- a/dictionaries/sr_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/sv_wordlist.combined.gz b/dictionaries/sv_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..ec399fbf5 --- /dev/null +++ b/dictionaries/sv_wordlist.combined.gz diff --git a/dictionaries/sv_wordlist.xml.gz b/dictionaries/sv_wordlist.xml.gz Binary files differdeleted file mode 100644 index caa63c435..000000000 --- a/dictionaries/sv_wordlist.xml.gz +++ /dev/null diff --git a/dictionaries/tr_wordlist.combined.gz b/dictionaries/tr_wordlist.combined.gz Binary files differnew file mode 100644 index 000000000..3e6ca3263 --- /dev/null +++ b/dictionaries/tr_wordlist.combined.gz diff --git a/dictionaries/tr_wordlist.xml.gz b/dictionaries/tr_wordlist.xml.gz Binary files differdeleted file mode 100644 index 35999208e..000000000 --- a/dictionaries/tr_wordlist.xml.gz +++ /dev/null diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index c588824fe..b0b3777df 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -290,19 +290,23 @@ public final class FusionDictionary implements Iterable<Word> { } @Override public String toString() { // Convenience method - return toString(0); + return toString(0, false); } - public String toString(final int indentCount) { + public String toString(final int indentCount, final boolean plumbing) { final StringBuilder indent = new StringBuilder(); - for (int i = 0; i < indentCount; ++i) { - indent.append(" "); + if (plumbing) { + indent.append("H:"); + } else { + for (int i = 0; i < indentCount; ++i) { + indent.append(" "); + } } final StringBuilder s = new StringBuilder(); for (final String optionKey : mAttributes.keySet()) { s.append(indent); s.append(optionKey); s.append(" = "); - if ("date".equals(optionKey)) { + if ("date".equals(optionKey) && !plumbing) { // Date needs a number of milliseconds, but the dictionary contains seconds s.append(new Date( 1000 * Long.parseLong(mAttributes.get(optionKey))).toString()); diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java index c295eb384..092ee767f 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java @@ -227,18 +227,18 @@ public class CombinedInputOutput { } destination.write("\n"); for (Word word : set) { - destination.write("\t" + WORD_TAG + "=" + word.mWord + "," + destination.write(" " + WORD_TAG + "=" + word.mWord + "," + FREQUENCY_TAG + "=" + word.mFrequency + (word.mIsNotAWord ? "," + NOT_A_WORD_TAG + "=true\n" : "\n")); if (null != word.mShortcutTargets) { for (WeightedString target : word.mShortcutTargets) { - destination.write("\t\t" + SHORTCUT_TAG + "=" + target.mWord + "," + destination.write(" " + SHORTCUT_TAG + "=" + target.mWord + "," + FREQUENCY_TAG + "=" + target.mFrequency + "\n"); } } if (null != word.mBigrams) { for (WeightedString bigram : word.mBigrams) { - destination.write("\t\t" + BIGRAM_TAG + "=" + bigram.mWord + "," + destination.write(" " + BIGRAM_TAG + "=" + bigram.mWord + "," + FREQUENCY_TAG + "=" + bigram.mFrequency + "\n"); } } diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java index 9548f2509..855c026b9 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Diff.java @@ -34,7 +34,7 @@ public class Diff extends Dicttool.Command { @Override public String getHelp() { return COMMAND + " [-p] <dict> <dict> : shows differences between two dictionaries.\n" - + " If -p (porcelain) option is given, produce output suitable for a script"; + + " If -p (plumbing) option is given, produce output suitable for a script"; } @Override @@ -42,15 +42,15 @@ public class Diff extends Dicttool.Command { if (mArgs.length < 2) { throw new RuntimeException("Not enough arguments for command " + COMMAND); } - final boolean porcelain; + final boolean plumbing; if ("-p".equals(mArgs[0])) { - porcelain = true; + plumbing = true; mArgs = Arrays.copyOfRange(mArgs, 1, mArgs.length); if (mArgs.length != 2) { // There should be only 2 arguments left throw new RuntimeException("Wrong number of arguments for command " + COMMAND); } } else { - porcelain = false; + plumbing = false; } final FusionDictionary dict0 = BinaryDictOffdeviceUtils.getDictionary(mArgs[0], false /* report */); @@ -58,7 +58,7 @@ public class Diff extends Dicttool.Command { final FusionDictionary dict1 = BinaryDictOffdeviceUtils.getDictionary(mArgs[1], false /* report */); if (null == dict1) throw new RuntimeException("Can't read dictionary " + mArgs[1]); - if (!porcelain) { + if (!plumbing) { System.out.println("Header :"); diffHeaders(dict0, dict1); if (languageDiffers(dict0, dict1)) { diff --git a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java index d91a409d3..f2894544f 100644 --- a/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java +++ b/tools/dicttool/src/com/android/inputmethod/latin/dicttool/Info.java @@ -22,6 +22,7 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.makedict.Word; +import java.util.Arrays; import java.util.ArrayList; public class Info extends Dicttool.Command { @@ -35,9 +36,9 @@ public class Info extends Dicttool.Command { return COMMAND + " <filename>: prints various information about a dictionary file"; } - private static void showInfo(final FusionDictionary dict) { + private static void showInfo(final FusionDictionary dict, final boolean plumbing) { System.out.println("Header attributes :"); - System.out.print(dict.mOptions.toString(2)); + System.out.print(dict.mOptions.toString(2, plumbing)); int wordCount = 0; int bigramCount = 0; int shortcutCount = 0; @@ -62,7 +63,8 @@ public class Info extends Dicttool.Command { + " whitelist entries)"); } - private static void showWordInfo(final FusionDictionary dict, final String word) { + private static void showWordInfo(final FusionDictionary dict, final String word, + final boolean plumbing) { final CharGroup group = FusionDictionary.findWordInTree(dict.mRoot, word); if (null == group) { System.out.println(word + " is not in the dictionary"); @@ -101,15 +103,25 @@ public class Info extends Dicttool.Command { if (mArgs.length < 1) { throw new RuntimeException("Not enough arguments for command " + COMMAND); } + final boolean plumbing; + if ("-p".equals(mArgs[0])) { + plumbing = true; + mArgs = Arrays.copyOfRange(mArgs, 1, mArgs.length); + if (mArgs.length != 1) { // There should be only 1 argument left + throw new RuntimeException("Wrong number of arguments for command " + COMMAND); + } + } else { + plumbing = false; + } final String filename = mArgs[0]; final boolean hasWordArguments = (1 == mArgs.length); final FusionDictionary dict = BinaryDictOffdeviceUtils.getDictionary(filename, hasWordArguments /* report */); if (hasWordArguments) { - showInfo(dict); + showInfo(dict, plumbing); } else { for (int i = 1; i < mArgs.length; ++i) { - showWordInfo(dict, mArgs[i]); + showWordInfo(dict, mArgs[i], plumbing); } } } |