aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Jean Chalard <jchalard@google.com> 2012-10-31 17:34:47 +0900
committer: Jean Chalard <jchalard@google.com> 2012-10-31 18:52:00 +0900
commit: a424ff06ec367d1be4cc05a72b7384d9f9834787 (patch)
tree: e18e65363c4d5f685dcbc542f78b0a1db179a1a8
parent: a8058d169dad450eca428ca76c5a0f44e45f41a7 (diff)
download: latinime-a424ff06ec367d1be4cc05a72b7384d9f9834787.tar.gz
latinime-a424ff06ec367d1be4cc05a72b7384d9f9834787.tar.xz
latinime-a424ff06ec367d1be4cc05a72b7384d9f9834787.zip
Switch the AOSP word lists to the combined format.
This will help with managing the word lists.

Bug: 7388859
Change-Id: I89f049569b177d3027fe56d6c67eaca27d44dc7d
-rw-r--r--dictionaries/cs_wordlist.combined.gzbin0 -> 945721 bytes
-rw-r--r--dictionaries/cs_wordlist.xml.gzbin934317 -> 0 bytes
-rw-r--r--dictionaries/da_wordlist.combined.gzbin0 -> 1016252 bytes
-rw-r--r--dictionaries/da_wordlist.xml.gzbin1028556 -> 0 bytes
-rw-r--r--dictionaries/de_wordlist.combined.gzbin0 -> 1594831 bytes
-rw-r--r--dictionaries/de_wordlist.xml.gzbin1264849 -> 0 bytes
-rw-r--r--dictionaries/el_wordlist.combined.gzbin0 -> 1132398 bytes
-rw-r--r--dictionaries/el_wordlist.xml.gzbin1111767 -> 0 bytes
-rw-r--r--dictionaries/en_GB_wordlist.combined.gzbin0 -> 859526 bytes
-rw-r--r--dictionaries/en_US_wordlist.combined.gzbin0 -> 876850 bytes
-rw-r--r--dictionaries/en_gb_wordlist.xml.gzbin873666 -> 0 bytes
-rw-r--r--dictionaries/en_us_wordlist.xml.gzbin891237 -> 0 bytes
-rw-r--r--dictionaries/en_whitelist.xml.gzbin816 -> 0 bytes
-rw-r--r--dictionaries/en_wordlist.combined.gzbin0 -> 901400 bytes
-rw-r--r--dictionaries/en_wordlist.xml.gzbin916081 -> 0 bytes
-rw-r--r--dictionaries/es_wordlist.combined.gzbin0 -> 948224 bytes
-rw-r--r--dictionaries/es_wordlist.xml.gzbin938245 -> 0 bytes
-rw-r--r--dictionaries/fi_wordlist.combined.gzbin0 -> 1267592 bytes
-rw-r--r--dictionaries/fi_wordlist.xml.gzbin1247226 -> 0 bytes
-rw-r--r--dictionaries/fr_wordlist.combined.gzbin0 -> 1106063 bytes
-rw-r--r--dictionaries/fr_wordlist.xml.gzbin1118441 -> 0 bytes
-rw-r--r--dictionaries/hr_wordlist.combined.gzbin0 -> 1010674 bytes
-rw-r--r--dictionaries/hr_wordlist.xml.gzbin996565 -> 0 bytes
-rw-r--r--dictionaries/it_wordlist.combined.gzbin0 -> 931870 bytes
-rw-r--r--dictionaries/it_wordlist.xml.gzbin922548 -> 0 bytes
-rw-r--r--dictionaries/lt_wordlist.combined.gzbin0 -> 977866 bytes
-rw-r--r--dictionaries/lt_wordlist.xml.gzbin963678 -> 0 bytes
-rw-r--r--dictionaries/lv_wordlist.combined.gzbin0 -> 963904 bytes
-rw-r--r--dictionaries/lv_wordlist.xml.gzbin947300 -> 0 bytes
-rw-r--r--dictionaries/nb_wordlist.combined.gzbin0 -> 964442 bytes
-rw-r--r--dictionaries/nb_wordlist.xml.gzbin944838 -> 0 bytes
-rw-r--r--dictionaries/nl_wordlist.combined.gzbin0 -> 1050110 bytes
-rw-r--r--dictionaries/nl_wordlist.xml.gzbin1031994 -> 0 bytes
-rw-r--r--dictionaries/pl_wordlist.combined.gzbin0 -> 1086804 bytes
-rw-r--r--dictionaries/pl_wordlist.xml.gzbin1073754 -> 0 bytes
-rw-r--r--dictionaries/pt_BR_wordlist.combined.gzbin0 -> 876891 bytes
-rw-r--r--dictionaries/pt_PT_wordlist.combined.gzbin0 -> 1102007 bytes
-rw-r--r--dictionaries/pt_br_wordlist.xml.gzbin868558 -> 0 bytes
-rw-r--r--dictionaries/pt_pt_wordlist.xml.gzbin1092967 -> 0 bytes
-rw-r--r--dictionaries/ru_wordlist.combined.gzbin0 -> 1394258 bytes
-rw-r--r--dictionaries/ru_wordlist.xml.gzbin1367381 -> 0 bytes
-rw-r--r--dictionaries/sample.combined38
-rw-r--r--dictionaries/sample.xml17
-rw-r--r--dictionaries/sl_wordlist.combined.gzbin0 -> 313077 bytes
-rw-r--r--dictionaries/sl_wordlist.xml.gzbin310873 -> 0 bytes
-rw-r--r--dictionaries/sr_wordlist.combined.gzbin0 -> 1049710 bytes
-rw-r--r--dictionaries/sr_wordlist.xml.gzbin1027560 -> 0 bytes
-rw-r--r--dictionaries/sv_wordlist.combined.gzbin0 -> 1137493 bytes
-rw-r--r--dictionaries/sv_wordlist.xml.gzbin1115582 -> 0 bytes
-rw-r--r--dictionaries/tr_wordlist.combined.gzbin0 -> 924020 bytes
-rw-r--r--dictionaries/tr_wordlist.xml.gzbin909030 -> 0 bytes
51 files changed, 38 insertions, 17 deletions
diff --git a/dictionaries/cs_wordlist.combined.gz b/dictionaries/cs_wordlist.combined.gz
new file mode 100644
index 000000000..8cbf2e961
--- /dev/null
+++ b/dictionaries/cs_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/cs_wordlist.xml.gz b/dictionaries/cs_wordlist.xml.gz
deleted file mode 100644
index f99148b07..000000000
--- a/dictionaries/cs_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/da_wordlist.combined.gz b/dictionaries/da_wordlist.combined.gz
new file mode 100644
index 000000000..1cccb8632
--- /dev/null
+++ b/dictionaries/da_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/da_wordlist.xml.gz b/dictionaries/da_wordlist.xml.gz
deleted file mode 100644
index a3d4318e2..000000000
--- a/dictionaries/da_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/de_wordlist.combined.gz b/dictionaries/de_wordlist.combined.gz
new file mode 100644
index 000000000..5db1aa4f3
--- /dev/null
+++ b/dictionaries/de_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/de_wordlist.xml.gz b/dictionaries/de_wordlist.xml.gz
deleted file mode 100644
index a4267b35a..000000000
--- a/dictionaries/de_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/el_wordlist.combined.gz b/dictionaries/el_wordlist.combined.gz
new file mode 100644
index 000000000..b61da8918
--- /dev/null
+++ b/dictionaries/el_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/el_wordlist.xml.gz b/dictionaries/el_wordlist.xml.gz
deleted file mode 100644
index af1d71d47..000000000
--- a/dictionaries/el_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/en_GB_wordlist.combined.gz b/dictionaries/en_GB_wordlist.combined.gz
new file mode 100644
index 000000000..b5909c2da
--- /dev/null
+++ b/dictionaries/en_GB_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/en_US_wordlist.combined.gz b/dictionaries/en_US_wordlist.combined.gz
new file mode 100644
index 000000000..03ea2b787
--- /dev/null
+++ b/dictionaries/en_US_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/en_gb_wordlist.xml.gz b/dictionaries/en_gb_wordlist.xml.gz
deleted file mode 100644
index 274424cf2..000000000
--- a/dictionaries/en_gb_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/en_us_wordlist.xml.gz b/dictionaries/en_us_wordlist.xml.gz
deleted file mode 100644
index b5054ef3c..000000000
--- a/dictionaries/en_us_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/en_whitelist.xml.gz b/dictionaries/en_whitelist.xml.gz
deleted file mode 100644
index cf70a1a84..000000000
--- a/dictionaries/en_whitelist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/en_wordlist.combined.gz b/dictionaries/en_wordlist.combined.gz
new file mode 100644
index 000000000..62c454049
--- /dev/null
+++ b/dictionaries/en_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/en_wordlist.xml.gz b/dictionaries/en_wordlist.xml.gz
deleted file mode 100644
index 6e57f4212..000000000
--- a/dictionaries/en_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/es_wordlist.combined.gz b/dictionaries/es_wordlist.combined.gz
new file mode 100644
index 000000000..b0a137c4a
--- /dev/null
+++ b/dictionaries/es_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/es_wordlist.xml.gz b/dictionaries/es_wordlist.xml.gz
deleted file mode 100644
index aaf0aa348..000000000
--- a/dictionaries/es_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/fi_wordlist.combined.gz b/dictionaries/fi_wordlist.combined.gz
new file mode 100644
index 000000000..32dc126c0
--- /dev/null
+++ b/dictionaries/fi_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/fi_wordlist.xml.gz b/dictionaries/fi_wordlist.xml.gz
deleted file mode 100644
index b03565c4f..000000000
--- a/dictionaries/fi_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/fr_wordlist.combined.gz b/dictionaries/fr_wordlist.combined.gz
new file mode 100644
index 000000000..95a87e630
--- /dev/null
+++ b/dictionaries/fr_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/fr_wordlist.xml.gz b/dictionaries/fr_wordlist.xml.gz
deleted file mode 100644
index 3134a040a..000000000
--- a/dictionaries/fr_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/hr_wordlist.combined.gz b/dictionaries/hr_wordlist.combined.gz
new file mode 100644
index 000000000..573c3e8ac
--- /dev/null
+++ b/dictionaries/hr_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/hr_wordlist.xml.gz b/dictionaries/hr_wordlist.xml.gz
deleted file mode 100644
index 13998d9d5..000000000
--- a/dictionaries/hr_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/it_wordlist.combined.gz b/dictionaries/it_wordlist.combined.gz
new file mode 100644
index 000000000..d143bc425
--- /dev/null
+++ b/dictionaries/it_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/it_wordlist.xml.gz b/dictionaries/it_wordlist.xml.gz
deleted file mode 100644
index a75553d45..000000000
--- a/dictionaries/it_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/lt_wordlist.combined.gz b/dictionaries/lt_wordlist.combined.gz
new file mode 100644
index 000000000..03cfa8426
--- /dev/null
+++ b/dictionaries/lt_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/lt_wordlist.xml.gz b/dictionaries/lt_wordlist.xml.gz
deleted file mode 100644
index 8f00f6393..000000000
--- a/dictionaries/lt_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/lv_wordlist.combined.gz b/dictionaries/lv_wordlist.combined.gz
new file mode 100644
index 000000000..6b2ee77d6
--- /dev/null
+++ b/dictionaries/lv_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/lv_wordlist.xml.gz b/dictionaries/lv_wordlist.xml.gz
deleted file mode 100644
index 453ebb518..000000000
--- a/dictionaries/lv_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/nb_wordlist.combined.gz b/dictionaries/nb_wordlist.combined.gz
new file mode 100644
index 000000000..0644fc93a
--- /dev/null
+++ b/dictionaries/nb_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/nb_wordlist.xml.gz b/dictionaries/nb_wordlist.xml.gz
deleted file mode 100644
index 91813b66c..000000000
--- a/dictionaries/nb_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/nl_wordlist.combined.gz b/dictionaries/nl_wordlist.combined.gz
new file mode 100644
index 000000000..748c5ed77
--- /dev/null
+++ b/dictionaries/nl_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/nl_wordlist.xml.gz b/dictionaries/nl_wordlist.xml.gz
deleted file mode 100644
index 72ebc6136..000000000
--- a/dictionaries/nl_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/pl_wordlist.combined.gz b/dictionaries/pl_wordlist.combined.gz
new file mode 100644
index 000000000..638c8eef5
--- /dev/null
+++ b/dictionaries/pl_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/pl_wordlist.xml.gz b/dictionaries/pl_wordlist.xml.gz
deleted file mode 100644
index 5909a5f7f..000000000
--- a/dictionaries/pl_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/pt_BR_wordlist.combined.gz b/dictionaries/pt_BR_wordlist.combined.gz
new file mode 100644
index 000000000..6f7952004
--- /dev/null
+++ b/dictionaries/pt_BR_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/pt_PT_wordlist.combined.gz b/dictionaries/pt_PT_wordlist.combined.gz
new file mode 100644
index 000000000..66ed025fb
--- /dev/null
+++ b/dictionaries/pt_PT_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/pt_br_wordlist.xml.gz b/dictionaries/pt_br_wordlist.xml.gz
deleted file mode 100644
index 53d4e4298..000000000
--- a/dictionaries/pt_br_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/pt_pt_wordlist.xml.gz b/dictionaries/pt_pt_wordlist.xml.gz
deleted file mode 100644
index e3e8c3ac3..000000000
--- a/dictionaries/pt_pt_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/ru_wordlist.combined.gz b/dictionaries/ru_wordlist.combined.gz
new file mode 100644
index 000000000..62ae12e90
--- /dev/null
+++ b/dictionaries/ru_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/ru_wordlist.xml.gz b/dictionaries/ru_wordlist.xml.gz
deleted file mode 100644
index 877f0608e..000000000
--- a/dictionaries/ru_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/sample.combined b/dictionaries/sample.combined
new file mode 100644
index 000000000..4fa595e1e
--- /dev/null
+++ b/dictionaries/sample.combined
@@ -0,0 +1,38 @@
+# This is a sample wordlist that can be converted to a binary dictionary
+# for use by the Latin IME.
+# The file is essentially a CSV file, with indent level denoting nesting.
+#
+# The file starts with a single CSV line with the header attributes. Whatever
+# the content, these are included as is in the binary file. The first attribute
+# of the file should be `dictionary'. Usual fields are `locale', `description',
+# `date', `version', `options'.
+#
+# Each word has a `word' entry and at least an `f' field denoting its
+# probability, as an integer between 0 and 255 on a logarithmic scale, with
+# 255 meaning 1 and each decrement of 1 dividing the probability by 1.15.
+# As a special case, a weight of 0 is taken to mean profanity - words that
+# should not be considered a typo, but that should never be suggested
+# explicitly. An entry may be made not a word by adding a `not_a_word'
+# field with a value of `true'. The main reason for putting such entries
+# into the dictionary is to add shortcut targets and maybe a whitelist
+# replacement.
+#
+# Each word may have any number of shortcut target lines, each
+# starting with a `shortcut' entry and having at least an `f' frequency
+# value: either an integer between 0 and 14, or the special value `whitelist',
+# which becomes 15 and marks that target as the whitelist target of this word.
+#
+# Each word may also have any number of bigram lines, each starting with a
+# `bigram' entry that names a following word, together with the frequency
+# that should override that following word's unigram frequency when it
+# comes right after the word this bigram is for.
+#
+dictionary=main:en,locale=en,description=Sample wordlist,date=1351495318,version=1
+ word=sample,f=200
+ bigram=wordlist,f=243
+ word=wordlist,f=180
+ word=shortcut,f=176
+ shortcut=target,f=10
+ word=witelisted,f=10,not_a_word=true
+ shortcut=whitelisted,f=whitelist
+ word=profanity,f=0
diff --git a/dictionaries/sample.xml b/dictionaries/sample.xml
deleted file mode 100644
index ad98f2b6f..000000000
--- a/dictionaries/sample.xml
+++ /dev/null
@@ -1,17 +0,0 @@
-<!-- This is a sample wordlist that can be converted to a binary dictionary
- for use by the Latin IME.
- The format of the word list is a flat list of word entries.
- Each entry has a frequency between 255 and 0.
- Highest frequency words get more weight in the prediction algorithm. As a
- special case, a weight of 0 is taken to mean profanity - words that should
- not be considered a typo, but that should never be suggested explicitly.
- You can capitalize words that must always be capitalized, such as "January".
- You can have a capitalized and a non-capitalized word as separate entries,
- such as "robin" and "Robin".
--->
-<wordlist>
- <w f="255">this</w>
- <w f="255">is</w>
- <w f="128">sample</w>
- <w f="1">wordlist</w>
-</wordlist>
diff --git a/dictionaries/sl_wordlist.combined.gz b/dictionaries/sl_wordlist.combined.gz
new file mode 100644
index 000000000..845b55a8b
--- /dev/null
+++ b/dictionaries/sl_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/sl_wordlist.xml.gz b/dictionaries/sl_wordlist.xml.gz
deleted file mode 100644
index 3927b698e..000000000
--- a/dictionaries/sl_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/sr_wordlist.combined.gz b/dictionaries/sr_wordlist.combined.gz
new file mode 100644
index 000000000..c15bc48fd
--- /dev/null
+++ b/dictionaries/sr_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/sr_wordlist.xml.gz b/dictionaries/sr_wordlist.xml.gz
deleted file mode 100644
index c2eea681f..000000000
--- a/dictionaries/sr_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/sv_wordlist.combined.gz b/dictionaries/sv_wordlist.combined.gz
new file mode 100644
index 000000000..ec399fbf5
--- /dev/null
+++ b/dictionaries/sv_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/sv_wordlist.xml.gz b/dictionaries/sv_wordlist.xml.gz
deleted file mode 100644
index caa63c435..000000000
--- a/dictionaries/sv_wordlist.xml.gz
+++ /dev/null
Binary files differ
diff --git a/dictionaries/tr_wordlist.combined.gz b/dictionaries/tr_wordlist.combined.gz
new file mode 100644
index 000000000..3e6ca3263
--- /dev/null
+++ b/dictionaries/tr_wordlist.combined.gz
Binary files differ
diff --git a/dictionaries/tr_wordlist.xml.gz b/dictionaries/tr_wordlist.xml.gz
deleted file mode 100644
index 35999208e..000000000
--- a/dictionaries/tr_wordlist.xml.gz
+++ /dev/null
Binary files differ