diff options
author | 2014-09-25 19:58:33 +0900 | |
---|---|---|
committer | 2014-09-26 15:15:16 +0900 | |
commit | 8a6e96d28645ce325a38423af6967a011edefc9d (patch) | |
tree | c6d3bf8693b6d95f2acb322b5801ae61983f70d6 /tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java | |
parent | f4329f7fffc43840b7fb95cf181ea016108a7664 (diff) | |
download | latinime-8a6e96d28645ce325a38423af6967a011edefc9d.tar.gz latinime-8a6e96d28645ce325a38423af6967a011edefc9d.tar.xz latinime-8a6e96d28645ce325a38423af6967a011edefc9d.zip |
Create a code point table based on occurrence counts.
Bug:17097992
Change-Id: Ifd76dbd4d385d800af416368e25c9e56a76d0fbf
Diffstat (limited to 'tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java')
-rw-r--r-- | tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java | 52 |
1 file changed, 51 insertions, 1 deletion
diff --git a/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java b/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java index 0fa75e8ee..012fd811c 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java +++ b/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java @@ -18,6 +18,7 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; @@ -28,7 +29,11 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; import java.util.Iterator; +import java.util.Map.Entry; /** * An implementation of DictEncoder for version 2 binary dictionary. 
@@ -73,6 +78,46 @@ public class Ver2DictEncoder implements DictEncoder { } } + // Package for testing + static CodePointTable makeCodePointTable(final FusionDictionary dict) { + final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>(); + for (final WordProperty word : dict) { + // Store per code point occurrence + final String wordString = word.mWord; + for (int i = 0; i < wordString.length(); ++i) { + final int codePoint = Character.codePointAt(wordString, i); + if (codePointOccurrenceCounts.containsKey(codePoint)) { + codePointOccurrenceCounts.put(codePoint, + codePointOccurrenceCounts.get(codePoint) + 1); + } else { + codePointOccurrenceCounts.put(codePoint, 1); + } + } + } + final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray = + new ArrayList<>(codePointOccurrenceCounts.entrySet()); + // Descending order sort by occurrence (value side) + Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() { + @Override + public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) { + return b.getValue().compareTo(a.getValue()); + } + }); + int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE; + // Temporary map for writing of nodes + final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>(); + for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) { + // Put a relation from the original code point to the one byte code. 
+ codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex); + if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) { + break; + } + } + // codePointToOneByteCodeMap for writing the trie + // codePointOccurrenceArray for writing the header + return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray); + } + @Override public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) throws IOException, UnsupportedFormatException { @@ -85,7 +130,12 @@ public class Ver2DictEncoder implements DictEncoder { if (mOutStream == null) { openStream(); } - BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions); + + // Make code point conversion table ordered by occurrence of code points + final CodePointTable codePointTable = makeCodePointTable(dict); + + BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions, + codePointTable.mCodePointOccurrenceArray); // Addresses are limited to 3 bytes, but since addresses can be relative to each node // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding |