diff options
Diffstat (limited to 'tests/src/org/kelar/inputmethod/latin/makedict')
15 files changed, 3966 insertions, 0 deletions
diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/AbstractDictDecoder.java b/tests/src/org/kelar/inputmethod/latin/makedict/AbstractDictDecoder.java new file mode 100644 index 000000000..978e6cd3a --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/AbstractDictDecoder.java @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.annotations.UsedForTesting; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.TreeMap; + +/** + * A base class of the binary dictionary decoder. + */ +public abstract class AbstractDictDecoder implements DictDecoder { + private static final int SUCCESS = 0; + private static final int ERROR_CANNOT_READ = 1; + private static final int ERROR_WRONG_FORMAT = 2; + + @Override @UsedForTesting + public int getTerminalPosition(final String word) + throws IOException, UnsupportedFormatException { + if (!isDictBufferOpen()) { + openDictBuffer(); + } + return BinaryDictIOUtils.getTerminalPosition(this, word); + } + + @Override @UsedForTesting + public void readUnigramsAndBigramsBinary(final TreeMap<Integer, String> words, + final TreeMap<Integer, Integer> frequencies, + final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams) + throws IOException, UnsupportedFormatException { + if (!isDictBufferOpen()) { + openDictBuffer(); + } + BinaryDictIOUtils.readUnigramsAndBigramsBinary(this, words, frequencies, bigrams); + } + + /** + * Check whether the header contains the expected information. This is a no-error method, + * that will return an error code and never throw a checked exception. + * @return an error code, either ERROR_* or SUCCESS. + */ + private int checkHeader() { + try { + readHeader(); + } catch (IOException e) { + return ERROR_CANNOT_READ; + } catch (UnsupportedFormatException e) { + return ERROR_WRONG_FORMAT; + } + return SUCCESS; + } + + @Override + public boolean hasValidRawBinaryDictionary() { + return checkHeader() == SUCCESS; + } + + // Placeholder implementations below. These are actually unused. + @Override + public void openDictBuffer() throws FileNotFoundException, IOException, + UnsupportedFormatException { + } + + @Override + public boolean isDictBufferOpen() { + return false; + } + + @Override + public PtNodeInfo readPtNode(final int ptNodePos) { + return null; + } + + @Override + public void setPosition(int newPos) { + } + + @Override + public int getPosition() { + return 0; + } + + @Override + public int readPtNodeCount() { + return 0; + } +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java new file mode 100644 index 000000000..659df6859 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java @@ -0,0 +1,675 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import android.test.AndroidTestCase; +import android.util.Log; +import android.util.Pair; +import android.util.SparseArray; + +import org.kelar.inputmethod.latin.BinaryDictionary; +import org.kelar.inputmethod.latin.common.CodePointUtils; +import org.kelar.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import org.kelar.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; +import org.kelar.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import org.kelar.inputmethod.latin.makedict.FusionDictionary.PtNode; +import org.kelar.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; +import org.kelar.inputmethod.latin.utils.BinaryDictionaryUtils; +import org.kelar.inputmethod.latin.utils.ByteArrayDictBuffer; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map.Entry; +import java.util.Random; +import java.util.Set; +import java.util.TreeMap; + +/** + * Unit tests for BinaryDictDecoderUtils and BinaryDictEncoderUtils. + */ +public class BinaryDictDecoderEncoderTests extends AndroidTestCase { + private static final String TAG = BinaryDictDecoderEncoderTests.class.getSimpleName(); + private static final int DEFAULT_MAX_UNIGRAMS = 300; + private static final int DEFAULT_CODE_POINT_SET_SIZE = 50; + private static final int LARGE_CODE_POINT_SET_SIZE = 300; + private static final int UNIGRAM_FREQ = 10; + private static final int BIGRAM_FREQ = 50; + private static final int TOLERANCE_OF_BIGRAM_FREQ = 5; + + private static final ArrayList<String> sWords = new ArrayList<>(); + private static final ArrayList<String> sWordsWithVariousCodePoints = new ArrayList<>(); + private static final SparseArray<List<Integer>> sEmptyBigrams = new SparseArray<>(); + private static final SparseArray<List<Integer>> sStarBigrams = new SparseArray<>(); + private static final SparseArray<List<Integer>> sChainBigrams = new SparseArray<>(); + + final Random mRandom; + + public BinaryDictDecoderEncoderTests() { + this(System.currentTimeMillis(), DEFAULT_MAX_UNIGRAMS); + } + + public BinaryDictDecoderEncoderTests(final long seed, final int maxUnigrams) { + super(); + BinaryDictionaryUtils.setCurrentTimeForTest(0); + Log.e(TAG, "Testing dictionary: seed is " + seed); + mRandom = new Random(seed); + sWords.clear(); + sWordsWithVariousCodePoints.clear(); + generateWords(maxUnigrams, mRandom); + + for (int i = 0; i < sWords.size(); ++i) { + sChainBigrams.put(i, new ArrayList<Integer>()); + if (i > 0) { + sChainBigrams.get(i - 1).add(i); + } + } + + sStarBigrams.put(0, new ArrayList<Integer>()); + // MAX - 1 because we added one above already + final int maxBigrams = Math.min(sWords.size(), FormatSpec.MAX_BIGRAMS_IN_A_PTNODE - 1); + for (int i = 1; i < maxBigrams; ++i) { + sStarBigrams.get(0).add(i); + } + } + + @Override + protected void setUp() throws Exception { + super.setUp(); + BinaryDictionaryUtils.setCurrentTimeForTest(0); + } + + @Override + protected void tearDown() throws Exception { + // Quit test mode. + BinaryDictionaryUtils.setCurrentTimeForTest(-1); + super.tearDown(); + } + + private static void generateWords(final int number, final Random random) { + final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, + random); + final Set<String> wordSet = new HashSet<>(); + while (wordSet.size() < number) { + wordSet.add(CodePointUtils.generateWord(random, codePointSet)); + } + sWords.addAll(wordSet); + + final int[] largeCodePointSet = CodePointUtils.generateCodePointSet( + LARGE_CODE_POINT_SET_SIZE, random); + wordSet.clear(); + while (wordSet.size() < number) { + wordSet.add(CodePointUtils.generateWord(random, largeCodePointSet)); + } + sWordsWithVariousCodePoints.addAll(wordSet); + } + + /** + * Adds unigrams to the dictionary. + */ + private static void addUnigrams(final int number, final FusionDictionary dict, + final List<String> words) { + for (int i = 0; i < number; ++i) { + final String word = words.get(i); + final ArrayList<WeightedString> shortcuts = new ArrayList<>(); + dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ), false /* isNotAWord */, + false /* isPossiblyOffensive */); + } + } + + private static void addBigrams(final FusionDictionary dict, + final List<String> words, + final SparseArray<List<Integer>> bigrams) { + for (int i = 0; i < bigrams.size(); ++i) { + final int w1 = bigrams.keyAt(i); + for (int w2 : bigrams.valueAt(i)) { + dict.setBigram(words.get(w1), words.get(w2), new ProbabilityInfo(BIGRAM_FREQ)); + } + } + } + +// The following is useful to dump the dictionary into a textual file, but it can't compile +// on-device, so it's commented out. +// private void dumpToCombinedFileForDebug(final FusionDictionary dict, final String filename) +// throws IOException { +// org.kelar.inputmethod.latin.dicttool.CombinedInputOutput.writeDictionaryCombined( +// new java.io.FileWriter(new File(filename)), dict); +// } + + private static long timeWritingDictToFile(final File file, final FusionDictionary dict, + final FormatSpec.FormatOptions formatOptions) { + + long now = -1, diff = -1; + + try { + final DictEncoder dictEncoder = BinaryDictUtils.getDictEncoder(file, formatOptions); + + now = System.currentTimeMillis(); + // If you need to dump the dict to a textual file, uncomment the line below and the + // function above + // dumpToCombinedFileForDebug(file, "/tmp/foo"); + dictEncoder.writeDictionary(dict, formatOptions); + diff = System.currentTimeMillis() - now; + } catch (IOException e) { + Log.e(TAG, "IO exception while writing file", e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "UnsupportedFormatException", e); + } + + return diff; + } + + private static void checkDictionary(final FusionDictionary dict, final List<String> words, + final SparseArray<List<Integer>> bigrams) { + assertNotNull(dict); + + // check unigram + for (final String word : words) { + final PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray, word); + assertNotNull(ptNode); + } + + // check bigram + for (int i = 0; i < bigrams.size(); ++i) { + final int w1 = bigrams.keyAt(i); + for (final int w2 : bigrams.valueAt(i)) { + final PtNode ptNode = FusionDictionary.findWordInTree(dict.mRootNodeArray, + words.get(w1)); + assertNotNull(words.get(w1) + "," + words.get(w2), ptNode.getBigram(words.get(w2))); + } + } + } + + private static String outputOptions(final int bufferType, + final FormatSpec.FormatOptions formatOptions) { + final String result = " : buffer type = " + + ((bufferType == BinaryDictUtils.USE_BYTE_BUFFER) ? "byte buffer" : "byte array"); + return result + " : version = " + formatOptions.mVersion; + } + + // Tests for readDictionaryBinary and writeDictionaryBinary + + private static long timeReadingAndCheckDict(final File file, final List<String> words, + final SparseArray<List<Integer>> bigrams, final int bufferType) { + long now, diff = -1; + + FusionDictionary dict = null; + try { + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), + bufferType); + now = System.currentTimeMillis(); + dict = dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */); + diff = System.currentTimeMillis() - now; + } catch (IOException e) { + Log.e(TAG, "IOException while reading dictionary", e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "Unsupported format", e); + } + + checkDictionary(dict, words, bigrams); + return diff; + } + + // Tests for readDictionaryBinary and writeDictionaryBinary + private String runReadAndWrite(final List<String> words, + final SparseArray<List<Integer>> bigrams, + final int bufferType, final FormatSpec.FormatOptions formatOptions, + final String message) { + + final String dictName = "runReadAndWrite"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions, + getContext().getCacheDir()); + + final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), + BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); + addUnigrams(words.size(), dict, words); + addBigrams(dict, words, bigrams); + checkDictionary(dict, words, bigrams); + + final long write = timeWritingDictToFile(file, dict, formatOptions); + final long read = timeReadingAndCheckDict(file, words, bigrams, bufferType); + + return "PROF: read=" + read + "ms, write=" + write + "ms :" + message + + " : " + outputOptions(bufferType, formatOptions); + } + + private void runReadAndWriteTests(final List<String> results, final int bufferType, + final FormatSpec.FormatOptions formatOptions) { + results.add(runReadAndWrite(sWords, sEmptyBigrams, bufferType, + formatOptions, "unigram")); + results.add(runReadAndWrite(sWords, sChainBigrams, bufferType, + formatOptions, "chain")); + results.add(runReadAndWrite(sWords, sStarBigrams, bufferType, + formatOptions, "star")); + results.add(runReadAndWrite(sWords, sEmptyBigrams, bufferType, formatOptions, + "unigram with shortcuts")); + results.add(runReadAndWrite(sWords, sChainBigrams, bufferType, formatOptions, + "chain with shortcuts")); + results.add(runReadAndWrite(sWords, sStarBigrams, bufferType, formatOptions, + "star with shortcuts")); + results.add(runReadAndWrite(sWordsWithVariousCodePoints, sEmptyBigrams, + bufferType, formatOptions, + "unigram with various code points")); + } + + public void testCharacterTableIsPresent() throws IOException, UnsupportedFormatException { + final String[] wordSource = {"words", "used", "for", "testing", "a", "code point", "table"}; + final List<String> words = Arrays.asList(wordSource); + final String correctCodePointTable = "toesdrniawuplgfcb "; + final String dictName = "codePointTableTest"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final String codePointTableAttribute = DictionaryHeader.CODE_POINT_TABLE_KEY; + final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, + BinaryDictUtils.STATIC_OPTIONS, getContext().getCacheDir()); + + // Write a test dictionary + final DictEncoder dictEncoder = new Ver2DictEncoder(file, + Ver2DictEncoder.CODE_POINT_TABLE_ON); + final FormatSpec.FormatOptions formatOptions = + new FormatSpec.FormatOptions( + FormatSpec.MINIMUM_SUPPORTED_STATIC_VERSION); + final FusionDictionary sourcedict = new FusionDictionary(new PtNodeArray(), + BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); + addUnigrams(words.size(), sourcedict, words); + dictEncoder.writeDictionary(sourcedict, formatOptions); + + // Read the dictionary + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), + DictDecoder.USE_BYTEARRAY); + final DictionaryHeader fileHeader = dictDecoder.readHeader(); + // Check if codePointTable is present + assertTrue("codePointTable is not present", + fileHeader.mDictionaryOptions.mAttributes.containsKey(codePointTableAttribute)); + final String codePointTable = + fileHeader.mDictionaryOptions.mAttributes.get(codePointTableAttribute); + // Check if codePointTable is correct + assertEquals("codePointTable is incorrect", codePointTable, correctCodePointTable); + } + + // Unit test for CharEncoding.readString and CharEncoding.writeString. + public void testCharEncoding() { + // the max length of a word in sWords is less than 50. + // See generateWords. + final byte[] buffer = new byte[50 * 3]; + final DictBuffer dictBuffer = new ByteArrayDictBuffer(buffer); + for (final String word : sWords) { + Arrays.fill(buffer, (byte) 0); + CharEncoding.writeString(buffer, 0, word, null); + dictBuffer.position(0); + final String str = CharEncoding.readString(dictBuffer); + assertEquals(word, str); + } + } + + public void testReadAndWriteWithByteBuffer() { + final List<String> results = new ArrayList<>(); + + runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_BUFFER, + BinaryDictUtils.STATIC_OPTIONS); + runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_BUFFER, + BinaryDictUtils.DYNAMIC_OPTIONS_WITHOUT_TIMESTAMP); + runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_BUFFER, + BinaryDictUtils.DYNAMIC_OPTIONS_WITH_TIMESTAMP); + for (final String result : results) { + Log.d(TAG, result); + } + } + + public void testReadAndWriteWithByteArray() { + final List<String> results = new ArrayList<>(); + + runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_ARRAY, + BinaryDictUtils.STATIC_OPTIONS); + runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_ARRAY, + BinaryDictUtils.DYNAMIC_OPTIONS_WITHOUT_TIMESTAMP); + runReadAndWriteTests(results, BinaryDictUtils.USE_BYTE_ARRAY, + BinaryDictUtils.DYNAMIC_OPTIONS_WITH_TIMESTAMP); + + for (final String result : results) { + Log.d(TAG, result); + } + } + + // Tests for readUnigramsAndBigramsBinary + + private static void checkWordMap(final List<String> expectedWords, + final SparseArray<List<Integer>> expectedBigrams, + final TreeMap<Integer, String> resultWords, + final TreeMap<Integer, Integer> resultFrequencies, + final TreeMap<Integer, ArrayList<PendingAttribute>> resultBigrams, + final boolean checkProbability) { + // check unigrams + final Set<String> actualWordsSet = new HashSet<>(resultWords.values()); + final Set<String> expectedWordsSet = new HashSet<>(expectedWords); + assertEquals(actualWordsSet, expectedWordsSet); + if (checkProbability) { + for (int freq : resultFrequencies.values()) { + assertEquals(freq, UNIGRAM_FREQ); + } + } + + // check bigrams + final HashMap<String, Set<String>> expBigrams = new HashMap<>(); + for (int i = 0; i < expectedBigrams.size(); ++i) { + final String word1 = expectedWords.get(expectedBigrams.keyAt(i)); + for (int w2 : expectedBigrams.valueAt(i)) { + if (expBigrams.get(word1) == null) { + expBigrams.put(word1, new HashSet<String>()); + } + expBigrams.get(word1).add(expectedWords.get(w2)); + } + } + + final HashMap<String, Set<String>> actBigrams = new HashMap<>(); + for (Entry<Integer, ArrayList<PendingAttribute>> entry : resultBigrams.entrySet()) { + final String word1 = resultWords.get(entry.getKey()); + final int unigramFreq = resultFrequencies.get(entry.getKey()); + for (PendingAttribute attr : entry.getValue()) { + final String word2 = resultWords.get(attr.mAddress); + if (actBigrams.get(word1) == null) { + actBigrams.put(word1, new HashSet<String>()); + } + actBigrams.get(word1).add(word2); + + if (checkProbability) { + final int bigramFreq = BinaryDictIOUtils.reconstructBigramFrequency( + unigramFreq, attr.mFrequency); + assertTrue(Math.abs(bigramFreq - BIGRAM_FREQ) < TOLERANCE_OF_BIGRAM_FREQ); + } + } + } + assertEquals(actBigrams, expBigrams); + } + + private static long timeAndCheckReadUnigramsAndBigramsBinary(final File file, + final List<String> words, final SparseArray<List<Integer>> bigrams, + final int bufferType, final boolean checkProbability) { + final TreeMap<Integer, String> resultWords = new TreeMap<>(); + final TreeMap<Integer, ArrayList<PendingAttribute>> resultBigrams = new TreeMap<>(); + final TreeMap<Integer, Integer> resultFreqs = new TreeMap<>(); + + long now = -1, diff = -1; + try { + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), + bufferType); + now = System.currentTimeMillis(); + dictDecoder.readUnigramsAndBigramsBinary(resultWords, resultFreqs, resultBigrams); + diff = System.currentTimeMillis() - now; + } catch (IOException e) { + Log.e(TAG, "IOException", e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "UnsupportedFormatException", e); + } + + checkWordMap(words, bigrams, resultWords, resultFreqs, resultBigrams, checkProbability); + return diff; + } + + private String runReadUnigramsAndBigramsBinary(final ArrayList<String> words, + final SparseArray<List<Integer>> bigrams, final int bufferType, + final FormatSpec.FormatOptions formatOptions, final String message) { + final String dictName = "runReadUnigrams"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions, + getContext().getCacheDir()); + + // making the dictionary from lists of words. + final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), + BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); + addUnigrams(words.size(), dict, words); + addBigrams(dict, words, bigrams); + + timeWritingDictToFile(file, dict, formatOptions); + + // Caveat: Currently, the Java code to read a v4 dictionary doesn't calculate the + // probability when there's a timestamp for the entry. + // TODO: Abandon the Java code, and implement the v4 dictionary reading code in native. + long wordMap = timeAndCheckReadUnigramsAndBigramsBinary(file, words, bigrams, bufferType, + !formatOptions.mHasTimestamp /* checkProbability */); + long fullReading = timeReadingAndCheckDict(file, words, bigrams, + bufferType); + + return "readDictionaryBinary=" + fullReading + ", readUnigramsAndBigramsBinary=" + wordMap + + " : " + message + " : " + outputOptions(bufferType, formatOptions); + } + + private void runReadUnigramsAndBigramsTests(final ArrayList<String> results, + final int bufferType, final FormatSpec.FormatOptions formatOptions) { + results.add(runReadUnigramsAndBigramsBinary(sWords, sEmptyBigrams, bufferType, + formatOptions, "unigram")); + results.add(runReadUnigramsAndBigramsBinary(sWords, sChainBigrams, bufferType, + formatOptions, "chain")); + results.add(runReadUnigramsAndBigramsBinary(sWords, sStarBigrams, bufferType, + formatOptions, "star")); + } + + public void testReadUnigramsAndBigramsBinaryWithByteBuffer() { + final ArrayList<String> results = new ArrayList<>(); + + runReadUnigramsAndBigramsTests(results, BinaryDictUtils.USE_BYTE_BUFFER, + BinaryDictUtils.STATIC_OPTIONS); + + for (final String result : results) { + Log.d(TAG, result); + } + } + + public void testReadUnigramsAndBigramsBinaryWithByteArray() { + final ArrayList<String> results = new ArrayList<>(); + + runReadUnigramsAndBigramsTests(results, BinaryDictUtils.USE_BYTE_ARRAY, + BinaryDictUtils.STATIC_OPTIONS); + + for (final String result : results) { + Log.d(TAG, result); + } + } + + // Tests for getTerminalPosition + private static String getWordFromBinary(final DictDecoder dictDecoder, final int address) { + if (dictDecoder.getPosition() != 0) dictDecoder.setPosition(0); + + DictionaryHeader fileHeader = null; + try { + fileHeader = dictDecoder.readHeader(); + } catch (IOException e) { + return null; + } catch (UnsupportedFormatException e) { + return null; + } + if (fileHeader == null) return null; + return BinaryDictDecoderUtils.getWordAtPosition(dictDecoder, fileHeader.mBodyOffset, + address).mWord; + } + + private static long checkGetTerminalPosition(final DictDecoder dictDecoder, final String word, + final boolean contained) { + long diff = -1; + int position = -1; + try { + final long now = System.nanoTime(); + position = dictDecoder.getTerminalPosition(word); + diff = System.nanoTime() - now; + } catch (IOException e) { + Log.e(TAG, "IOException while getTerminalPosition", e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "UnsupportedFormatException while getTerminalPosition", e); + } + + assertEquals(FormatSpec.NOT_VALID_WORD != position, contained); + if (contained) assertEquals(getWordFromBinary(dictDecoder, position), word); + return diff; + } + + private void runGetTerminalPosition(final ArrayList<String> words, + final SparseArray<List<Integer>> bigrams, final int bufferType, + final FormatOptions formatOptions, final String message) { + final String dictName = "testGetTerminalPosition"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions, + getContext().getCacheDir()); + + final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), + BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); + addUnigrams(sWords.size(), dict, sWords); + addBigrams(dict, words, bigrams); + timeWritingDictToFile(file, dict, formatOptions); + + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), + DictDecoder.USE_BYTEARRAY); + try { + dictDecoder.openDictBuffer(); + } catch (IOException e) { + Log.e(TAG, "IOException while opening the buffer", e); + } catch (UnsupportedFormatException e) { + Log.e(TAG, "IOException while opening the buffer", e); + } + assertTrue("Can't get the buffer", dictDecoder.isDictBufferOpen()); + + try { + // too long word + final String longWord = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"; + assertEquals(FormatSpec.NOT_VALID_WORD, dictDecoder.getTerminalPosition(longWord)); + + // null + assertEquals(FormatSpec.NOT_VALID_WORD, dictDecoder.getTerminalPosition(null)); + + // empty string + assertEquals(FormatSpec.NOT_VALID_WORD, dictDecoder.getTerminalPosition("")); + } catch (IOException e) { + } catch (UnsupportedFormatException e) { + } + + // Test a word that is contained within the dictionary. + long sum = 0; + for (int i = 0; i < sWords.size(); ++i) { + final long time = checkGetTerminalPosition(dictDecoder, sWords.get(i), true); + sum += time == -1 ? 0 : time; + } + Log.d(TAG, "per search : " + (((double)sum) / sWords.size() / 1000000) + " : " + message + + " : " + outputOptions(bufferType, formatOptions)); + + // Test a word that isn't contained within the dictionary. + final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, + mRandom); + for (int i = 0; i < 1000; ++i) { + final String word = CodePointUtils.generateWord(mRandom, codePointSet); + if (sWords.indexOf(word) != -1) continue; + checkGetTerminalPosition(dictDecoder, word, false); + } + } + + private void runGetTerminalPositionTests(final int bufferType, + final FormatOptions formatOptions) { + runGetTerminalPosition(sWords, sEmptyBigrams, bufferType, formatOptions, "unigram"); + } + + public void testGetTerminalPosition() { + final ArrayList<String> results = new ArrayList<>(); + + runGetTerminalPositionTests(BinaryDictUtils.USE_BYTE_ARRAY, + BinaryDictUtils.STATIC_OPTIONS); + runGetTerminalPositionTests(BinaryDictUtils.USE_BYTE_BUFFER, + BinaryDictUtils.STATIC_OPTIONS); + + for (final String result : results) { + Log.d(TAG, result); + } + } + + public void testVer2DictGetWordProperty() { + final FormatOptions formatOptions = BinaryDictUtils.STATIC_OPTIONS; + final ArrayList<String> words = sWords; + final String dictName = "testGetWordProperty"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), + BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); + addUnigrams(words.size(), dict, words); + addBigrams(dict, words, sEmptyBigrams); + final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions, + getContext().getCacheDir()); + file.delete(); + timeWritingDictToFile(file, dict, formatOptions); + final BinaryDictionary binaryDictionary = new BinaryDictionary(file.getAbsolutePath(), + 0 /* offset */, file.length(), true /* useFullEditDistance */, + Locale.ENGLISH, dictName, false /* isUpdatable */); + for (final String word : words) { + final WordProperty wordProperty = binaryDictionary.getWordProperty(word, + false /* isBeginningOfSentence */); + assertEquals(word, wordProperty.mWord); + assertEquals(UNIGRAM_FREQ, wordProperty.getProbability()); + } + } + + public void testVer2DictIteration() { + final FormatOptions formatOptions = BinaryDictUtils.STATIC_OPTIONS; + final ArrayList<String> words = sWords; + final SparseArray<List<Integer>> bigrams = sEmptyBigrams; + final String dictName = "testGetWordProperty"; + final String dictVersion = Long.toString(System.currentTimeMillis()); + final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), + BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); + addUnigrams(words.size(), dict, words); + addBigrams(dict, words, bigrams); + final File file = BinaryDictUtils.getDictFile(dictName, dictVersion, formatOptions, + getContext().getCacheDir()); + timeWritingDictToFile(file, dict, formatOptions); + Log.d(TAG, file.getAbsolutePath()); + final BinaryDictionary binaryDictionary = new BinaryDictionary(file.getAbsolutePath(), + 0 /* offset */, file.length(), true /* useFullEditDistance */, + Locale.ENGLISH, dictName, false /* isUpdatable */); + + final HashSet<String> wordSet = new HashSet<>(words); + final HashSet<Pair<String, String>> bigramSet = new HashSet<>(); + + for (int i = 0; i < words.size(); i++) { + final List<Integer> bigramList = bigrams.get(i); + if (bigramList != null) { + for (final Integer word1Index : bigramList) { + final String word1 = words.get(word1Index); + bigramSet.add(new Pair<>(words.get(i), word1)); + } + } + } + int token = 0; + do { + final BinaryDictionary.GetNextWordPropertyResult result = + binaryDictionary.getNextWordProperty(token); + final WordProperty wordProperty = result.mWordProperty; + final String word0 = wordProperty.mWord; + assertEquals(UNIGRAM_FREQ, wordProperty.mProbabilityInfo.mProbability); + wordSet.remove(word0); + if (wordProperty.mHasNgrams) { + for (final WeightedString bigramTarget : wordProperty.getBigrams()) { + final String word1 = bigramTarget.mWord; + final Pair<String, String> bigram = new Pair<>(word0, word1); + assertTrue(bigramSet.contains(bigram)); + bigramSet.remove(bigram); + } + } + token = result.mNextToken; + } while (token != 0); + assertTrue(wordSet.isEmpty()); + assertTrue(bigramSet.isEmpty()); + } +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java new file mode 100644 index 000000000..18199ae21 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java @@ -0,0 +1,425 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.annotations.UsedForTesting; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.LinkedList; + +import javax.annotation.Nonnull; + +/** + * Decodes binary files for a FusionDictionary. + * + * All the methods in this class are static. + * + * TODO: Move this file to makedict/internal. + * TODO: Rename this class to DictDecoderUtils. + */ +public final class BinaryDictDecoderUtils { + private BinaryDictDecoderUtils() { + // This utility class is not publicly instantiable. + } + + @UsedForTesting + public interface DictBuffer { + public int readUnsignedByte(); + public int readUnsignedShort(); + public int readUnsignedInt24(); + public int readInt(); + public int position(); + public void position(int newPosition); + @UsedForTesting + public void put(final byte b); + public int limit(); + @UsedForTesting + public int capacity(); + } + + public static final class ByteBufferDictBuffer implements DictBuffer { + private ByteBuffer mBuffer; + + public ByteBufferDictBuffer(final ByteBuffer buffer) { + mBuffer = buffer; + } + + @Override + public int readUnsignedByte() { + return mBuffer.get() & 0xFF; + } + + @Override + public int readUnsignedShort() { + return mBuffer.getShort() & 0xFFFF; + } + + @Override + public int readUnsignedInt24() { + final int retval = readUnsignedByte(); + return (retval << 16) + readUnsignedShort(); + } + + @Override + public int readInt() { + return mBuffer.getInt(); + } + + @Override + public int position() { + return mBuffer.position(); + } + + @Override + public void position(int newPos) { + mBuffer.position(newPos); + } + + @Override + public void put(final byte b) { + mBuffer.put(b); + } + + @Override + public int limit() { + return mBuffer.limit(); + } + + @Override + public int capacity() { + return mBuffer.capacity(); + } + } + + /** + * A class grouping utility function for our specific character encoding. + */ + static final class CharEncoding { + + /** + * Helper method to find out whether this code fits on one byte + */ + private static boolean fitsOnOneByte(final int character, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + int codePoint = character; + if (codePointToOneByteCodeMap != null) { + if (codePointToOneByteCodeMap.containsKey(character)) { + codePoint = codePointToOneByteCodeMap.get(character); + } + } + return codePoint >= FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE + && codePoint <= FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE; + } + + /** + * Compute the size of a character given its character code. + * + * Char format is: + * 1 byte = bbbbbbbb match + * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte + * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because + * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with + * 00011111 would be outside unicode. + * else: iso-latin-1 code + * This allows for the whole unicode range to be encoded, including chars outside of + * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control + * characters which should never happen anyway (and still work, but take 3 bytes). + * + * @param character the character code. + * @return the size in binary encoded-form, either 1 or 3 bytes. + */ + static int getCharSize(final int character, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + // See char encoding in FusionDictionary.java + if (fitsOnOneByte(character, codePointToOneByteCodeMap)) return 1; + if (FormatSpec.INVALID_CHARACTER == character) return 1; + return 3; + } + + /** + * Compute the byte size of a character array. + */ + static int getCharArraySize(final int[] chars, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + int size = 0; + for (int character : chars) size += getCharSize(character, codePointToOneByteCodeMap); + return size; + } + + /** + * Writes a char array to a byte buffer. + * + * @param codePoints the code point array to write. + * @param buffer the byte buffer to write to. + * @param fromIndex the index in buffer to write the character array to. + * @param codePointToOneByteCodeMap the map to convert the code point. + * @return the index after the last character. + */ + static int writeCharArray(final int[] codePoints, final byte[] buffer, final int fromIndex, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + int index = fromIndex; + for (int codePoint : codePoints) { + if (codePointToOneByteCodeMap != null) { + if (codePointToOneByteCodeMap.containsKey(codePoint)) { + // Convert code points + codePoint = codePointToOneByteCodeMap.get(codePoint); + } + } + if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) { + buffer[index++] = (byte)codePoint; + } else { + buffer[index++] = (byte)(0xFF & (codePoint >> 16)); + buffer[index++] = (byte)(0xFF & (codePoint >> 8)); + buffer[index++] = (byte)(0xFF & codePoint); + } + } + return index; + } + + /** + * Writes a string with our character format to a byte buffer. + * + * This will also write the terminator byte. + * + * @param buffer the byte buffer to write to. + * @param origin the offset to write from. + * @param word the string to write. + * @return the size written, in bytes. + */ + static int writeString(final byte[] buffer, final int origin, final String word, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + final int length = word.length(); + int index = origin; + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + int codePoint = word.codePointAt(i); + if (codePointToOneByteCodeMap != null) { + if (codePointToOneByteCodeMap.containsKey(codePoint)) { + // Convert code points + codePoint = codePointToOneByteCodeMap.get(codePoint); + } + } + if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) { + buffer[index++] = (byte)codePoint; + } else { + buffer[index++] = (byte)(0xFF & (codePoint >> 16)); + buffer[index++] = (byte)(0xFF & (codePoint >> 8)); + buffer[index++] = (byte)(0xFF & codePoint); + } + } + buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; + return index - origin; + } + + /** + * Writes a string with our character format to an OutputStream. + * + * This will also write the terminator byte. + * + * @param stream the OutputStream to write to. + * @param word the string to write. + * @return the size written, in bytes. + */ + static int writeString(final OutputStream stream, final String word, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) throws IOException { + final int length = word.length(); + int written = 0; + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + final int codePoint = word.codePointAt(i); + final int charSize = getCharSize(codePoint, codePointToOneByteCodeMap); + if (1 == charSize) { + stream.write((byte) codePoint); + } else { + stream.write((byte) (0xFF & (codePoint >> 16))); + stream.write((byte) (0xFF & (codePoint >> 8))); + stream.write((byte) (0xFF & codePoint)); + } + written += charSize; + } + stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR); + written += FormatSpec.PTNODE_TERMINATOR_SIZE; + return written; + } + + /** + * Reads a string from a DictBuffer. This is the converse of the above method. + */ + static String readString(final DictBuffer dictBuffer) { + final StringBuilder s = new StringBuilder(); + int character = readChar(dictBuffer); + while (character != FormatSpec.INVALID_CHARACTER) { + s.appendCodePoint(character); + character = readChar(dictBuffer); + } + return s.toString(); + } + + /** + * Reads a character from the buffer. + * + * This follows the character format documented earlier in this source file. + * + * @param dictBuffer the buffer, positioned over an encoded character. + * @return the character code. + */ + static int readChar(final DictBuffer dictBuffer) { + int character = dictBuffer.readUnsignedByte(); + if (!fitsOnOneByte(character, null)) { + if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) { + return FormatSpec.INVALID_CHARACTER; + } + character <<= 16; + character += dictBuffer.readUnsignedShort(); + } + return character; + } + } + + /** + * Reads and returns the PtNode count out of a buffer and forwards the pointer. + */ + /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) { + final int msb = dictBuffer.readUnsignedByte(); + if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) { + return msb; + } + return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8) + + dictBuffer.readUnsignedByte(); + } + + /** + * Finds, as a string, the word at the position passed as an argument. + * + * @param dictDecoder the dict decoder. + * @param headerSize the size of the header. + * @param pos the position to seek. + * @return the word with its frequency, as a weighted string. + */ + @UsedForTesting + /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder, + final int headerSize, final int pos) { + final WeightedString result; + final int originalPos = dictDecoder.getPosition(); + dictDecoder.setPosition(pos); + result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos); + dictDecoder.setPosition(originalPos); + return result; + } + + private static WeightedString getWordAtPositionWithoutParentAddress( + final DictDecoder dictDecoder, final int headerSize, final int pos) { + dictDecoder.setPosition(headerSize); + final int count = dictDecoder.readPtNodeCount(); + int groupPos = dictDecoder.getPosition(); + final StringBuilder builder = new StringBuilder(); + WeightedString result = null; + + PtNodeInfo last = null; + for (int i = count - 1; i >= 0; --i) { + PtNodeInfo info = dictDecoder.readPtNode(groupPos); + groupPos = info.mEndAddress; + if (info.mOriginalAddress == pos) { + builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); + result = new WeightedString(builder.toString(), info.mProbabilityInfo); + break; // and return + } + if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) { + if (info.mChildrenAddress > pos) { + if (null == last) continue; + builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); + dictDecoder.setPosition(last.mChildrenAddress); + i = dictDecoder.readPtNodeCount(); + groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); + last = null; + continue; + } + last = info; + } + if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) { + builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); + dictDecoder.setPosition(last.mChildrenAddress); + i = dictDecoder.readPtNodeCount(); + groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); + last = null; + continue; + } + } + return result; + } + + /** + * Helper method that brutally decodes a header from a byte array. + * + * @param headerBuffer a buffer containing the bytes of the header. + * @return a hashmap of the attributes stored in the header + */ + @Nonnull + public static HashMap<String, String> decodeHeaderAttributes(@Nonnull final byte[] headerBuffer) + throws UnsupportedFormatException { + final StringBuilder sb = new StringBuilder(); + final LinkedList<String> keyValues = new LinkedList<>(); + int index = 0; + while (index < headerBuffer.length) { + if (headerBuffer[index] == FormatSpec.PTNODE_CHARACTERS_TERMINATOR) { + keyValues.add(sb.toString()); + sb.setLength(0); + } else if (CharEncoding.fitsOnOneByte(headerBuffer[index] & 0xFF, + null /* codePointTable */)) { + sb.appendCodePoint(headerBuffer[index] & 0xFF); + } else { + sb.appendCodePoint(((headerBuffer[index] & 0xFF) << 16) + + ((headerBuffer[index + 1] & 0xFF) << 8) + + (headerBuffer[index + 2] & 0xFF)); + index += 2; + } + index += 1; + } + if ((keyValues.size() & 1) != 0) { + throw new UnsupportedFormatException("Odd number of attributes"); + } + final HashMap<String, String> attributes = new HashMap<>(); + for (int i = 0; i < keyValues.size(); i += 2) { + attributes.put(keyValues.get(i), keyValues.get(i + 1)); + } + return attributes; + } + + /** + * Helper method to pass a file name instead of a File object to isBinaryDictionary. + */ + public static boolean isBinaryDictionary(final String filename) { + final File file = new File(filename); + return isBinaryDictionary(file); + } + + /** + * Basic test to find out whether the file is a binary dictionary or not. + * + * @param file The file to test. + * @return true if it's a binary dictionary, false otherwise + */ + public static boolean isBinaryDictionary(final File file) { + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length()); + if (dictDecoder == null) { + return false; + } + return dictDecoder.hasValidRawBinaryDictionary(); + } +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictEncoderUtils.java b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictEncoderUtils.java new file mode 100644 index 000000000..3b35288d7 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictEncoderUtils.java @@ -0,0 +1,839 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import org.kelar.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import org.kelar.inputmethod.latin.makedict.FusionDictionary.PtNode; +import org.kelar.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map.Entry; + +/** + * Encodes binary files for a FusionDictionary. + * + * All the methods in this class are static. + * + * TODO: Rename this class to DictEncoderUtils. + */ +public class BinaryDictEncoderUtils { + + private static final boolean DBG = MakedictLog.DBG; + + private BinaryDictEncoderUtils() { + // This utility class is not publicly instantiable. + } + + // Arbitrary limit to how much passes we consider address size compression should + // terminate in. At the time of this writing, our largest dictionary completes + // compression in five passes. + // If the number of passes exceeds this number, makedict bails with an exception on + // suspicion that a bug might be causing an infinite loop. + private static final int MAX_PASSES = 24; + + /** + * Compute the binary size of the character array. + * + * If only one character, this is the size of this character. If many, it's the sum of their + * sizes + 1 byte for the terminator. + * + * @param characters the character array + * @return the size of the char array, including the terminator if any + */ + static int getPtNodeCharactersSize(final int[] characters, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + int size = CharEncoding.getCharArraySize(characters, codePointToOneByteCodeMap); + if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE; + return size; + } + + /** + * Compute the binary size of the character array in a PtNode + * + * If only one character, this is the size of this character. If many, it's the sum of their + * sizes + 1 byte for the terminator. + * + * @param ptNode the PtNode + * @return the size of the char array, including the terminator if any + */ + private static int getPtNodeCharactersSize(final PtNode ptNode, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + return getPtNodeCharactersSize(ptNode.mChars, codePointToOneByteCodeMap); + } + + /** + * Compute the binary size of the PtNode count for a node array. + * @param nodeArray the nodeArray + * @return the size of the PtNode count, either 1 or 2 bytes. + */ + private static int getPtNodeCountSize(final PtNodeArray nodeArray) { + return BinaryDictIOUtils.getPtNodeCountSize(nodeArray.mData.size()); + } + + /** + * Compute the maximum size of a PtNode, assuming 3-byte addresses for everything. + * + * @param ptNode the PtNode to compute the size of. + * @return the maximum size of the PtNode. + */ + private static int getPtNodeMaximumSize(final PtNode ptNode, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + int size = getNodeHeaderSize(ptNode, codePointToOneByteCodeMap); + if (ptNode.isTerminal()) { + // If terminal, one byte for the frequency. + size += FormatSpec.PTNODE_FREQUENCY_SIZE; + } + size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address + if (null != ptNode.mBigrams) { + size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE + + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE) + * ptNode.mBigrams.size(); + } + return size; + } + + /** + * Compute the maximum size of each PtNode of a PtNode array, assuming 3-byte addresses for + * everything, and caches it in the `mCachedSize' member of the nodes; deduce the size of + * the containing node array, and cache it it its 'mCachedSize' member. + * + * @param ptNodeArray the node array to compute the maximum size of. + */ + private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + int size = getPtNodeCountSize(ptNodeArray); + for (PtNode node : ptNodeArray.mData) { + final int nodeSize = getPtNodeMaximumSize(node, codePointToOneByteCodeMap); + node.mCachedSize = nodeSize; + size += nodeSize; + } + ptNodeArray.mCachedSize = size; + } + + /** + * Compute the size of the header (flag + [parent address] + characters size) of a PtNode. + * + * @param ptNode the PtNode of which to compute the size of the header + */ + private static int getNodeHeaderSize(final PtNode ptNode, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode, + codePointToOneByteCodeMap); + } + + /** + * Compute the size, in bytes, that an address will occupy. + * + * This can be used either for children addresses (which are always positive) or for + * attribute, which may be positive or negative but + * store their sign bit separately. + * + * @param address the address + * @return the byte size. + */ + static int getByteSize(final int address) { + assert(address <= FormatSpec.UINT24_MAX); + if (!BinaryDictIOUtils.hasChildrenAddress(address)) { + return 0; + } else if (Math.abs(address) <= FormatSpec.UINT8_MAX) { + return 1; + } else if (Math.abs(address) <= FormatSpec.UINT16_MAX) { + return 2; + } else { + return 3; + } + } + + static int writeUIntToBuffer(final byte[] buffer, final int fromPosition, final int value, + final int size) { + int position = fromPosition; + switch(size) { + case 4: + buffer[position++] = (byte) ((value >> 24) & 0xFF); + /* fall through */ + case 3: + buffer[position++] = (byte) ((value >> 16) & 0xFF); + /* fall through */ + case 2: + buffer[position++] = (byte) ((value >> 8) & 0xFF); + /* fall through */ + case 1: + buffer[position++] = (byte) (value & 0xFF); + break; + default: + /* nop */ + } + return position; + } + + static void writeUIntToStream(final OutputStream stream, final int value, final int size) + throws IOException { + switch(size) { + case 4: + stream.write((value >> 24) & 0xFF); + /* fall through */ + case 3: + stream.write((value >> 16) & 0xFF); + /* fall through */ + case 2: + stream.write((value >> 8) & 0xFF); + /* fall through */ + case 1: + stream.write(value & 0xFF); + break; + default: + /* nop */ + } + } + + // End utility methods + + // This method is responsible for finding a nice ordering of the nodes that favors run-time + // cache performance and dictionary size. + /* package for tests */ static ArrayList<PtNodeArray> flattenTree( + final PtNodeArray rootNodeArray) { + final int treeSize = FusionDictionary.countPtNodes(rootNodeArray); + MakedictLog.i("Counted nodes : " + treeSize); + final ArrayList<PtNodeArray> flatTree = new ArrayList<>(treeSize); + return flattenTreeInner(flatTree, rootNodeArray); + } + + private static ArrayList<PtNodeArray> flattenTreeInner(final ArrayList<PtNodeArray> list, + final PtNodeArray ptNodeArray) { + // Removing the node is necessary if the tails are merged, because we would then + // add the same node several times when we only want it once. A number of places in + // the code also depends on any node being only once in the list. + // Merging tails can only be done if there are no attributes. Searching for attributes + // in LatinIME code depends on a total breadth-first ordering, which merging tails + // breaks. If there are no attributes, it should be fine (and reduce the file size) + // to merge tails, and removing the node from the list would be necessary. However, + // we don't merge tails because breaking the breadth-first ordering would result in + // extreme overhead at bigram lookup time (it would make the search function O(n) instead + // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty + // high). + // If no nodes are ever merged, we can't have the same node twice in the list, hence + // searching for duplicates in unnecessary. It is also very performance consuming, + // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making + // this simple list.remove operation O(n*n) overall. On Android this overhead is very + // high. + // For future reference, the code to remove duplicate is a simple : list.remove(node); + list.add(ptNodeArray); + final ArrayList<PtNode> branches = ptNodeArray.mData; + for (PtNode ptNode : branches) { + if (null != ptNode.mChildren) flattenTreeInner(list, ptNode.mChildren); + } + return list; + } + + /** + * Get the offset from a position inside a current node array to a target node array, during + * update. + * + * If the current node array is before the target node array, the target node array has not + * been updated yet, so we should return the offset from the old position of the current node + * array to the old position of the target node array. If on the other hand the target is + * before the current node array, it already has been updated, so we should return the offset + * from the new position in the current node array to the new position in the target node + * array. + * + * @param currentNodeArray node array containing the PtNode where the offset will be written + * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray + * @param targetNodeArray the target node array to get the offset to + * @return the offset to the target node array + */ + private static int getOffsetToTargetNodeArrayDuringUpdate(final PtNodeArray currentNodeArray, + final int offsetFromStartOfCurrentNodeArray, final PtNodeArray targetNodeArray) { + final boolean isTargetBeforeCurrent = (targetNodeArray.mCachedAddressBeforeUpdate + < currentNodeArray.mCachedAddressBeforeUpdate); + if (isTargetBeforeCurrent) { + return targetNodeArray.mCachedAddressAfterUpdate + - (currentNodeArray.mCachedAddressAfterUpdate + + offsetFromStartOfCurrentNodeArray); + } + return targetNodeArray.mCachedAddressBeforeUpdate + - (currentNodeArray.mCachedAddressBeforeUpdate + offsetFromStartOfCurrentNodeArray); + } + + /** + * Get the offset from a position inside a current node array to a target PtNode, during + * update. + * + * @param currentNodeArray node array containing the PtNode where the offset will be written + * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray + * @param targetPtNode the target PtNode to get the offset to + * @return the offset to the target PtNode + */ + // TODO: is there any way to factorize this method with the one above? + private static int getOffsetToTargetPtNodeDuringUpdate(final PtNodeArray currentNodeArray, + final int offsetFromStartOfCurrentNodeArray, final PtNode targetPtNode) { + final int oldOffsetBasePoint = currentNodeArray.mCachedAddressBeforeUpdate + + offsetFromStartOfCurrentNodeArray; + final boolean isTargetBeforeCurrent = (targetPtNode.mCachedAddressBeforeUpdate + < oldOffsetBasePoint); + // If the target is before the current node array, then its address has already been + // updated. We can use the AfterUpdate member, and compare it to our own member after + // update. Otherwise, the AfterUpdate member is not updated yet, so we need to use the + // BeforeUpdate member, and of course we have to compare this to our own address before + // update. + if (isTargetBeforeCurrent) { + final int newOffsetBasePoint = currentNodeArray.mCachedAddressAfterUpdate + + offsetFromStartOfCurrentNodeArray; + return targetPtNode.mCachedAddressAfterUpdate - newOffsetBasePoint; + } + return targetPtNode.mCachedAddressBeforeUpdate - oldOffsetBasePoint; + } + + /** + * Computes the actual node array size, based on the cached addresses of the children nodes. + * + * Each node array stores its tentative address. During dictionary address computing, these + * are not final, but they can be used to compute the node array size (the node array size + * depends on the address of the children because the number of bytes necessary to store an + * address depends on its numeric value. The return value indicates whether the node array + * contents (as in, any of the addresses stored in the cache fields) have changed with + * respect to their previous value. + * + * @param ptNodeArray the node array to compute the size of. + * @param dict the dictionary in which the word/attributes are to be found. + * @return false if none of the cached addresses inside the node array changed, true otherwise. + */ + private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray, + final FusionDictionary dict, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + boolean changed = false; + int size = getPtNodeCountSize(ptNodeArray); + for (PtNode ptNode : ptNodeArray.mData) { + ptNode.mCachedAddressAfterUpdate = ptNodeArray.mCachedAddressAfterUpdate + size; + if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) { + changed = true; + } + int nodeSize = getNodeHeaderSize(ptNode, codePointToOneByteCodeMap); + if (ptNode.isTerminal()) { + nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; + } + if (null != ptNode.mChildren) { + nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, + nodeSize + size, ptNode.mChildren)); + } + if (null != ptNode.mBigrams) { + for (WeightedString bigram : ptNode.mBigrams) { + final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, + nodeSize + size + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE, + FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord)); + nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; + } + } + ptNode.mCachedSize = nodeSize; + size += nodeSize; + } + if (ptNodeArray.mCachedSize != size) { + ptNodeArray.mCachedSize = size; + changed = true; + } + return changed; + } + + /** + * Initializes the cached addresses of node arrays and their containing nodes from their size. + * + * @param flatNodes the list of node arrays. + * @return the byte size of the entire stack. + */ + private static int initializePtNodeArraysCachedAddresses( + final ArrayList<PtNodeArray> flatNodes) { + int nodeArrayOffset = 0; + for (final PtNodeArray nodeArray : flatNodes) { + nodeArray.mCachedAddressBeforeUpdate = nodeArrayOffset; + int nodeCountSize = getPtNodeCountSize(nodeArray); + int nodeffset = 0; + for (final PtNode ptNode : nodeArray.mData) { + ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate = + nodeCountSize + nodeArrayOffset + nodeffset; + nodeffset += ptNode.mCachedSize; + } + nodeArrayOffset += nodeArray.mCachedSize; + } + return nodeArrayOffset; + } + + /** + * Updates the cached addresses of node arrays after recomputing their new positions. + * + * @param flatNodes the list of node arrays. + */ + private static void updatePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes) { + for (final PtNodeArray nodeArray : flatNodes) { + nodeArray.mCachedAddressBeforeUpdate = nodeArray.mCachedAddressAfterUpdate; + for (final PtNode ptNode : nodeArray.mData) { + ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate; + } + } + } + + /** + * Compute the addresses and sizes of an ordered list of PtNode arrays. + * + * This method takes a list of PtNode arrays and will update their cached address and size + * values so that they can be written into a file. It determines the smallest size each of the + * PtNode arrays can be given the addresses of its children and attributes, and store that into + * each PtNode. + * The order of the PtNode is given by the order of the array. This method makes no effort + * to find a good order; it only mechanically computes the size this order results in. + * + * @param dict the dictionary + * @param flatNodes the ordered list of PtNode arrays + * @return the same array it was passed. The nodes have been updated for address and size. + */ + /* package */ static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict, + final ArrayList<PtNodeArray> flatNodes, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + // First get the worst possible sizes and offsets + for (final PtNodeArray n : flatNodes) { + calculatePtNodeArrayMaximumSize(n, codePointToOneByteCodeMap); + } + final int offset = initializePtNodeArraysCachedAddresses(flatNodes); + + MakedictLog.i("Compressing the array addresses. Original size : " + offset); + MakedictLog.i("(Recursively seen size : " + offset + ")"); + + int passes = 0; + boolean changesDone = false; + do { + changesDone = false; + int ptNodeArrayStartOffset = 0; + for (final PtNodeArray ptNodeArray : flatNodes) { + ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset; + final int oldNodeArraySize = ptNodeArray.mCachedSize; + final boolean changed = computeActualPtNodeArraySize(ptNodeArray, dict, + codePointToOneByteCodeMap); + final int newNodeArraySize = ptNodeArray.mCachedSize; + if (oldNodeArraySize < newNodeArraySize) { + throw new RuntimeException("Increased size ?!"); + } + ptNodeArrayStartOffset += newNodeArraySize; + changesDone |= changed; + } + updatePtNodeArraysCachedAddresses(flatNodes); + ++passes; + if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug"); + } while (changesDone); + + final PtNodeArray lastPtNodeArray = flatNodes.get(flatNodes.size() - 1); + MakedictLog.i("Compression complete in " + passes + " passes."); + MakedictLog.i("After address compression : " + + (lastPtNodeArray.mCachedAddressAfterUpdate + lastPtNodeArray.mCachedSize)); + + return flatNodes; + } + + /** + * Validity-checking method. + * + * This method checks a list of PtNode arrays for juxtaposition, that is, it will do + * nothing if each node array's cached address is actually the previous node array's address + * plus the previous node's size. + * If this is not the case, it will throw an exception. + * + * @param arrays the list of node arrays to check + */ + /* package */ static void checkFlatPtNodeArrayList(final ArrayList<PtNodeArray> arrays) { + int offset = 0; + int index = 0; + for (final PtNodeArray ptNodeArray : arrays) { + // BeforeUpdate and AfterUpdate addresses are the same here, so it does not matter + // which we use. + if (ptNodeArray.mCachedAddressAfterUpdate != offset) { + throw new RuntimeException("Wrong address for node " + index + + " : expected " + offset + ", got " + + ptNodeArray.mCachedAddressAfterUpdate); + } + ++index; + offset += ptNodeArray.mCachedSize; + } + } + + /** + * Helper method to write a children position to a file. + * + * @param buffer the buffer to write to. + * @param fromIndex the index in the buffer to write the address to. + * @param position the position to write. + * @return the size in bytes the address actually took. + */ + /* package */ static int writeChildrenPosition(final byte[] buffer, final int fromIndex, + final int position) { + int index = fromIndex; + switch (getByteSize(position)) { + case 1: + buffer[index++] = (byte)position; + return 1; + case 2: + buffer[index++] = (byte)(0xFF & (position >> 8)); + buffer[index++] = (byte)(0xFF & position); + return 2; + case 3: + buffer[index++] = (byte)(0xFF & (position >> 16)); + buffer[index++] = (byte)(0xFF & (position >> 8)); + buffer[index++] = (byte)(0xFF & position); + return 3; + case 0: + return 0; + default: + throw new RuntimeException("Position " + position + " has a strange size"); + } + } + + /** + * Makes the flag value for a PtNode. + * + * @param hasMultipleChars whether the PtNode has multiple chars. + * @param isTerminal whether the PtNode is terminal. + * @param childrenAddressSize the size of a children address. + * @param hasBigrams whether the PtNode has bigrams. + * @param isNotAWord whether the PtNode is not a word. + * @param isPossiblyOffensive whether the PtNode is a possibly offensive entry. + * @return the flags + */ + static int makePtNodeFlags(final boolean hasMultipleChars, final boolean isTerminal, + final int childrenAddressSize, final boolean hasBigrams, + final boolean isNotAWord, final boolean isPossiblyOffensive) { + byte flags = 0; + if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS; + if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL; + switch (childrenAddressSize) { + case 1: + flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE; + break; + case 2: + flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES; + break; + case 3: + flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES; + break; + case 0: + flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS; + break; + default: + throw new RuntimeException("Node with a strange address"); + } + if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS; + if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD; + if (isPossiblyOffensive) flags |= FormatSpec.FLAG_IS_POSSIBLY_OFFENSIVE; + return flags; + } + + /* package */ static byte makePtNodeFlags(final PtNode node, final int childrenOffset) { + return (byte) makePtNodeFlags(node.mChars.length > 1, node.isTerminal(), + getByteSize(childrenOffset), + node.mBigrams != null && !node.mBigrams.isEmpty(), + node.mIsNotAWord, node.mIsPossiblyOffensive); + } + + /** + * Makes the flag value for a bigram. + * + * @param more whether there are more bigrams after this one. + * @param offset the offset of the bigram. + * @param bigramFrequency the frequency of the bigram, 0..255. + * @param unigramFrequency the unigram frequency of the same word, 0..255. + * @param word the second bigram, for debugging purposes + * @return the flags + */ + /* package */ static int makeBigramFlags(final boolean more, final int offset, + final int bigramFrequency, final int unigramFrequency, final String word) { + int bigramFlags = (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0) + + (offset < 0 ? FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE : 0); + switch (getByteSize(offset)) { + case 1: + bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE; + break; + case 2: + bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES; + break; + case 3: + bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES; + break; + default: + throw new RuntimeException("Strange offset size"); + } + final int frequency; + if (unigramFrequency > bigramFrequency) { + MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word + + "\". Bigram freq is " + bigramFrequency + ", unigram freq for " + + word + " is " + unigramFrequency); + frequency = unigramFrequency; + } else { + frequency = bigramFrequency; + } + bigramFlags += getBigramFrequencyDiff(unigramFrequency, frequency) + & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY; + return bigramFlags; + } + + public static int getBigramFrequencyDiff(final int unigramFrequency, + final int bigramFrequency) { + // We compute the difference between 255 (which means probability = 1) and the + // unigram score. We split this into a number of discrete steps. + // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15 + // represents an increase of 16 steps: a value of 15 will be interpreted as the median + // value of the 16th step. In all justice, if the bigram frequency is low enough to be + // rounded below the first step (which means it is less than half a step higher than the + // unigram frequency) then the unigram frequency itself is the best approximation of the + // bigram freq that we could possibly supply, hence we should *not* include this bigram + // in the file at all. + // until this is done, we'll write 0 and slightly overestimate this case. + // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step + // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to + // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the + // step size. Then we compute the start of the first step (the one where value 0 starts) + // by adding half-a-step to the unigramFrequency. From there, we compute the integer + // number of steps to the bigramFrequency. One last thing: we want our steps to include + // their lower bound and exclude their higher bound so we need to have the first step + // start at exactly 1 unit higher than floor(unigramFreq + half a step). + // Note : to reconstruct the score, the dictionary reader will need to divide + // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise to get the value of the step, + // and add (discretizedFrequency + 0.5 + 0.5) times this value to get the best + // approximation. (0.5 to get the first step start, and 0.5 to get the middle of the + // step pointed by the discretized frequency. + final float stepSize = + (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency) + / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY); + final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f); + final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize); + // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1 + // here. The best approximation would be the unigram freq itself, so we should not + // include this bigram in the dictionary. For now, register as 0, and live with the + // small over-estimation that we get in this case. TODO: actually remove this bigram + // if discretizedFrequency < 0. + return discretizedFrequency > 0 ? discretizedFrequency : 0; + } + + /* package */ static int getChildrenPosition(final PtNode ptNode, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate + + getNodeHeaderSize(ptNode, codePointToOneByteCodeMap); + if (ptNode.isTerminal()) { + // A terminal node has the frequency. + // If positionOfChildrenPosField is incorrect, we may crash when jumping to the children + // position. + positionOfChildrenPosField += FormatSpec.PTNODE_FREQUENCY_SIZE; + } + return null == ptNode.mChildren ? FormatSpec.NO_CHILDREN_ADDRESS + : ptNode.mChildren.mCachedAddressAfterUpdate - positionOfChildrenPosField; + } + + /** + * Write a PtNodeArray. The PtNodeArray is expected to have its final position cached. + * + * @param dict the dictionary the node array is a part of (for relative offsets). + * @param dictEncoder the dictionary encoder. + * @param ptNodeArray the node array to write. + * @param codePointToOneByteCodeMap the map to convert the code points. + */ + /* package */ static void writePlacedPtNodeArray(final FusionDictionary dict, + final DictEncoder dictEncoder, final PtNodeArray ptNodeArray, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + // TODO: Make the code in common with BinaryDictIOUtils#writePtNode + dictEncoder.setPosition(ptNodeArray.mCachedAddressAfterUpdate); + + final int ptNodeCount = ptNodeArray.mData.size(); + dictEncoder.writePtNodeCount(ptNodeCount); + for (int i = 0; i < ptNodeCount; ++i) { + final PtNode ptNode = ptNodeArray.mData.get(i); + if (dictEncoder.getPosition() != ptNode.mCachedAddressAfterUpdate) { + throw new RuntimeException("Bug: write index is not the same as the cached address " + + "of the node : " + dictEncoder.getPosition() + " <> " + + ptNode.mCachedAddressAfterUpdate); + } + // Validity checks. + if (DBG && ptNode.getProbability() > FormatSpec.MAX_TERMINAL_FREQUENCY) { + throw new RuntimeException("A node has a frequency > " + + FormatSpec.MAX_TERMINAL_FREQUENCY + + " : " + ptNode.mProbabilityInfo.toString()); + } + dictEncoder.writePtNode(ptNode, dict, codePointToOneByteCodeMap); + } + if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate + + ptNodeArray.mCachedSize) { + throw new RuntimeException("Not the same size : written " + + (dictEncoder.getPosition() - ptNodeArray.mCachedAddressAfterUpdate) + + " bytes from a node that should have " + ptNodeArray.mCachedSize + " bytes"); + } + } + + /** + * Dumps a collection of useful statistics about a list of PtNode arrays. + * + * This prints purely informative stuff, like the total estimated file size, the + * number of PtNode arrays, of PtNodes, the repartition of each address size, etc + * + * @param ptNodeArrays the list of PtNode arrays. + */ + /* package */ static void showStatistics(ArrayList<PtNodeArray> ptNodeArrays) { + int firstTerminalAddress = Integer.MAX_VALUE; + int lastTerminalAddress = Integer.MIN_VALUE; + int size = 0; + int ptNodes = 0; + int maxNodes = 0; + int maxRuns = 0; + for (final PtNodeArray ptNodeArray : ptNodeArrays) { + if (maxNodes < ptNodeArray.mData.size()) maxNodes = ptNodeArray.mData.size(); + for (final PtNode ptNode : ptNodeArray.mData) { + ++ptNodes; + if (ptNode.mChars.length > maxRuns) maxRuns = ptNode.mChars.length; + if (ptNode.isTerminal()) { + if (ptNodeArray.mCachedAddressAfterUpdate < firstTerminalAddress) + firstTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate; + if (ptNodeArray.mCachedAddressAfterUpdate > lastTerminalAddress) + lastTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate; + } + } + if (ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize > size) { + size = ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize; + } + } + final int[] ptNodeCounts = new int[maxNodes + 1]; + final int[] runCounts = new int[maxRuns + 1]; + for (final PtNodeArray ptNodeArray : ptNodeArrays) { + ++ptNodeCounts[ptNodeArray.mData.size()]; + for (final PtNode ptNode : ptNodeArray.mData) { + ++runCounts[ptNode.mChars.length]; + } + } + + MakedictLog.i("Statistics:\n" + + " Total file size " + size + "\n" + + " " + ptNodeArrays.size() + " node arrays\n" + + " " + ptNodes + " PtNodes (" + ((float)ptNodes / ptNodeArrays.size()) + + " PtNodes per node)\n" + + " First terminal at " + firstTerminalAddress + "\n" + + " Last terminal at " + lastTerminalAddress + "\n" + + " PtNode stats : max = " + maxNodes); + } + + /** + * Writes a file header to an output stream. + * + * @param destination the stream to write the file header to. + * @param dict the dictionary to write. + * @param formatOptions file format options. + * @param codePointOccurrenceArray code points ordered by occurrence count. + * @return the size of the header. + */ + /* package */ static int writeDictionaryHeader(final OutputStream destination, + final FusionDictionary dict, final FormatOptions formatOptions, + final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) + throws IOException, UnsupportedFormatException { + final int version = formatOptions.mVersion; + if ((version >= FormatSpec.MINIMUM_SUPPORTED_STATIC_VERSION && + version <= FormatSpec.MAXIMUM_SUPPORTED_STATIC_VERSION) || ( + version >= FormatSpec.MINIMUM_SUPPORTED_DYNAMIC_VERSION && + version <= FormatSpec.MAXIMUM_SUPPORTED_DYNAMIC_VERSION)) { + // Dictionary is valid + } else { + throw new UnsupportedFormatException("Requested file format version " + version + + ", but this implementation only supports static versions " + + FormatSpec.MINIMUM_SUPPORTED_STATIC_VERSION + " through " + + FormatSpec.MAXIMUM_SUPPORTED_STATIC_VERSION + " and dynamic versions " + + FormatSpec.MINIMUM_SUPPORTED_DYNAMIC_VERSION + " through " + + FormatSpec.MAXIMUM_SUPPORTED_DYNAMIC_VERSION); + } + + ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256); + + // The magic number in big-endian order. + // Magic number for all versions. + headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 24))); + headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 16))); + headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 8))); + headerBuffer.write((byte) (0xFF & FormatSpec.MAGIC_NUMBER)); + // Dictionary version. + headerBuffer.write((byte) (0xFF & (version >> 8))); + headerBuffer.write((byte) (0xFF & version)); + + // Options flags + // TODO: Remove this field. + final int options = 0; + headerBuffer.write((byte) (0xFF & (options >> 8))); + headerBuffer.write((byte) (0xFF & options)); + final int headerSizeOffset = headerBuffer.size(); + // Placeholder to be written later with header size. + for (int i = 0; i < 4; ++i) { + headerBuffer.write(0); + } + // Write out the options. + for (final String key : dict.mOptions.mAttributes.keySet()) { + final String value = dict.mOptions.mAttributes.get(key); + CharEncoding.writeString(headerBuffer, key, null); + CharEncoding.writeString(headerBuffer, value, null); + } + // Write out the codePointTable if there is codePointOccurrenceArray. + if (codePointOccurrenceArray != null) { + final String codePointTableString = + encodeCodePointTable(codePointOccurrenceArray); + CharEncoding.writeString(headerBuffer, DictionaryHeader.CODE_POINT_TABLE_KEY, null); + CharEncoding.writeString(headerBuffer, codePointTableString, null); + } + final int size = headerBuffer.size(); + final byte[] bytes = headerBuffer.toByteArray(); + // Write out the header size. + bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24)); + bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16)); + bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8)); + bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0)); + destination.write(bytes); + + headerBuffer.close(); + return size; + } + + static final class CodePointTable { + final HashMap<Integer, Integer> mCodePointToOneByteCodeMap; + final ArrayList<Entry<Integer, Integer>> mCodePointOccurrenceArray; + + // Let code point table empty for version 200 dictionary which used in test + CodePointTable() { + mCodePointToOneByteCodeMap = null; + mCodePointOccurrenceArray = null; + } + + CodePointTable(final HashMap<Integer, Integer> codePointToOneByteCodeMap, + final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) { + mCodePointToOneByteCodeMap = codePointToOneByteCodeMap; + mCodePointOccurrenceArray = codePointOccurrenceArray; + } + } + + private static String encodeCodePointTable( + final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) { + final StringBuilder codePointTableString = new StringBuilder(); + int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE; + for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) { + // Native reads the table as a string + codePointTableString.appendCodePoint(entry.getKey()); + if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) { + break; + } + } + return codePointTableString.toString(); + } +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictIOUtils.java b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictIOUtils.java new file mode 100644 index 000000000..055d6492d --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictIOUtils.java @@ -0,0 +1,292 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.annotations.UsedForTesting; +import org.kelar.inputmethod.latin.define.DecoderSpecificConstants; +import org.kelar.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Map; +import java.util.Stack; + +public final class BinaryDictIOUtils { + private static final boolean DBG = false; + + private BinaryDictIOUtils() { + // This utility class is not publicly instantiable. + } + + /** + * Returns new dictionary decoder. + * + * @param dictFile the dictionary file. + * @param bufferType The type of buffer, as one of USE_* in DictDecoder. + * @return new dictionary decoder if the dictionary file exists, otherwise null. + */ + public static DictDecoder getDictDecoder(final File dictFile, final long offset, + final long length, final int bufferType) { + return new Ver4DictDecoder(dictFile); + } + + public static DictDecoder getDictDecoder(final File dictFile, final long offset, + final long length, final DictionaryBufferFactory factory) { + return new Ver4DictDecoder(dictFile); + } + + public static DictDecoder getDictDecoder(final File dictFile, final long offset, + final long length) { + return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER); + } + + private static final class Position { + public static final int NOT_READ_PTNODE_COUNT = -1; + + public int mAddress; + public int mNumOfPtNode; + public int mPosition; + public int mLength; + + public Position(int address, int length) { + mAddress = address; + mLength = length; + mNumOfPtNode = NOT_READ_PTNODE_COUNT; + } + } + + /** + * Retrieves all node arrays without recursive call. + */ + private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder, + final int bodyOffset, final Map<Integer, String> words, + final Map<Integer, Integer> frequencies, + final Map<Integer, ArrayList<PendingAttribute>> bigrams) { + int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1]; + + Stack<Position> stack = new Stack<>(); + int index = 0; + + Position initPos = new Position(bodyOffset, 0); + stack.push(initPos); + + while (!stack.empty()) { + Position p = stack.peek(); + + if (DBG) { + MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" + + p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength); + } + + if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress); + if (index != p.mLength) index = p.mLength; + + if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) { + p.mNumOfPtNode = dictDecoder.readPtNodeCount(); + p.mAddress = dictDecoder.getPosition(); + p.mPosition = 0; + } + if (p.mNumOfPtNode == 0) { + stack.pop(); + continue; + } + final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress); + for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) { + pushedChars[index++] = ptNodeInfo.mCharacters[i]; + } + p.mPosition++; + if (ptNodeInfo.isTerminal()) {// found word + words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index)); + frequencies.put( + ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability); + if (ptNodeInfo.mBigrams != null) { + bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams); + } + } + + if (p.mPosition == p.mNumOfPtNode) { + stack.pop(); + } else { + // The PtNode array has more PtNodes. + p.mAddress = dictDecoder.getPosition(); + } + + if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) { + final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index); + stack.push(childrenPos); + } + } + } + + /** + * Reads unigrams and bigrams from the binary file. + * Doesn't store a full memory representation of the dictionary. + * + * @param dictDecoder the dict decoder. + * @param words the map to store the address as a key and the word as a value. + * @param frequencies the map to store the address as a key and the frequency as a value. + * @param bigrams the map to store the address as a key and the list of address as a value. + * @throws IOException if the file can't be read. + * @throws UnsupportedFormatException if the format of the file is not recognized. + */ + /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder, + final Map<Integer, String> words, final Map<Integer, Integer> frequencies, + final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException, + UnsupportedFormatException { + // Read header + final DictionaryHeader header = dictDecoder.readHeader(); + readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words, + frequencies, bigrams); + } + + /** + * Gets the address of the last PtNode of the exact matching word in the dictionary. + * If no match is found, returns NOT_VALID_WORD. + * + * @param dictDecoder the dict decoder. + * @param word the word we search for. + * @return the address of the terminal node. + * @throws IOException if the file can't be read. + * @throws UnsupportedFormatException if the format of the file is not recognized. + */ + @UsedForTesting + /* package */ static int getTerminalPosition(final DictDecoder dictDecoder, + final String word) throws IOException, UnsupportedFormatException { + if (word == null) return FormatSpec.NOT_VALID_WORD; + dictDecoder.setPosition(0); + dictDecoder.readHeader(); + int wordPos = 0; + final int wordLen = word.codePointCount(0, word.length()); + for (int depth = 0; depth < DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH; ++depth) { + if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; + + do { + final int ptNodeCount = dictDecoder.readPtNodeCount(); + boolean foundNextPtNode = false; + for (int i = 0; i < ptNodeCount; ++i) { + final int ptNodePos = dictDecoder.getPosition(); + final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos); + boolean same = true; + for (int p = 0, j = word.offsetByCodePoints(0, wordPos); + p < currentInfo.mCharacters.length; + ++p, j = word.offsetByCodePoints(j, 1)) { + if (wordPos + p >= wordLen + || word.codePointAt(j) != currentInfo.mCharacters[p]) { + same = false; + break; + } + } + + if (same) { + // found the PtNode matches the word. + if (wordPos + currentInfo.mCharacters.length == wordLen) { + return currentInfo.isTerminal() ? ptNodePos : FormatSpec.NOT_VALID_WORD; + } + wordPos += currentInfo.mCharacters.length; + if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { + return FormatSpec.NOT_VALID_WORD; + } + foundNextPtNode = true; + dictDecoder.setPosition(currentInfo.mChildrenAddress); + break; + } + } + if (foundNextPtNode) break; + return FormatSpec.NOT_VALID_WORD; + } while(true); + } + return FormatSpec.NOT_VALID_WORD; + } + + /** + * Writes a PtNodeCount to the stream. + * + * @param destination the stream to write. + * @param ptNodeCount the count. + * @return the size written in bytes. + */ + @UsedForTesting + static int writePtNodeCount(final OutputStream destination, final int ptNodeCount) + throws IOException { + final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); + // the count must fit on one byte or two bytes. + // Please see comments in FormatSpec. + if (countSize != 1 && countSize != 2) { + throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize); + } + final int encodedPtNodeCount = (countSize == 2) ? + (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; + BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize); + return countSize; + } + + /** + * Helper method to hide the actual value of the no children address. + */ + public static boolean hasChildrenAddress(final int address) { + return FormatSpec.NO_CHILDREN_ADDRESS != address; + } + + /** + * Compute the binary size of the node count + * @param count the node count + * @return the size of the node count, either 1 or 2 bytes. + */ + public static int getPtNodeCountSize(final int count) { + if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) { + return 1; + } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) { + return 2; + } else { + throw new RuntimeException("Can't have more than " + + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found " + + count + ")"); + } + } + + static int getChildrenAddressSize(final int optionFlags) { + switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: + return 1; + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES: + return 2; + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES: + return 3; + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS: + default: + return 0; + } + } + + /** + * Calculate bigram frequency from compressed value + * + * @param unigramFrequency + * @param bigramFrequency compressed frequency + * @return approximate bigram frequency + */ + @UsedForTesting + public static int reconstructBigramFrequency(final int unigramFrequency, + final int bigramFrequency) { + final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency) + / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY); + final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f); + return (int)resultFreqFloat; + } +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictUtils.java b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictUtils.java new file mode 100644 index 000000000..48e3d95f2 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictUtils.java @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; +import org.kelar.inputmethod.latin.makedict.FormatSpec.FormatOptions; + +import java.io.File; +import java.util.HashMap; + +public class BinaryDictUtils { + public static final int USE_BYTE_ARRAY = 1; + public static final int USE_BYTE_BUFFER = 2; + + public static final String TEST_DICT_FILE_EXTENSION = ".testDict"; + + public static final FormatSpec.FormatOptions STATIC_OPTIONS = + new FormatSpec.FormatOptions(FormatSpec.VERSION202); + public static final FormatSpec.FormatOptions DYNAMIC_OPTIONS_WITHOUT_TIMESTAMP = + new FormatSpec.FormatOptions(FormatSpec.VERSION4, false /* hasTimestamp */); + public static final FormatSpec.FormatOptions DYNAMIC_OPTIONS_WITH_TIMESTAMP = + new FormatSpec.FormatOptions(FormatSpec.VERSION4, true /* hasTimestamp */); + + public static DictionaryOptions makeDictionaryOptions(final String id, final String version, + final FormatSpec.FormatOptions formatOptions) { + final DictionaryOptions options = new DictionaryOptions(new HashMap<String, String>()); + options.mAttributes.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, "en_US"); + options.mAttributes.put(DictionaryHeader.DICTIONARY_ID_KEY, id); + options.mAttributes.put(DictionaryHeader.DICTIONARY_VERSION_KEY, version); + if (formatOptions.mHasTimestamp) { + options.mAttributes.put(DictionaryHeader.HAS_HISTORICAL_INFO_KEY, + DictionaryHeader.ATTRIBUTE_VALUE_TRUE); + options.mAttributes.put(DictionaryHeader.USES_FORGETTING_CURVE_KEY, + DictionaryHeader.ATTRIBUTE_VALUE_TRUE); + } + return options; + } + + public static File getDictFile(final String name, final String version, + final FormatOptions formatOptions, final File directory) { + if (formatOptions.mVersion == FormatSpec.VERSION2 + || formatOptions.mVersion == FormatSpec.VERSION201 + || formatOptions.mVersion == FormatSpec.VERSION202) { + return new File(directory, name + "." + version + TEST_DICT_FILE_EXTENSION); + } else if (formatOptions.mVersion == FormatSpec.VERSION4) { + return new File(directory, name + "." + version); + } else { + throw new RuntimeException("the format option has a wrong version : " + + formatOptions.mVersion); + } + } + + public static DictEncoder getDictEncoder(final File file, final FormatOptions formatOptions) { + if (formatOptions.mVersion == FormatSpec.VERSION4) { + if (!file.isDirectory()) { + file.mkdir(); + } + return new Ver4DictEncoder(file); + } else if (formatOptions.mVersion == FormatSpec.VERSION202) { + return new Ver2DictEncoder(file, Ver2DictEncoder.CODE_POINT_TABLE_OFF); + } else { + throw new RuntimeException("The format option has a wrong version : " + + formatOptions.mVersion); + } + } +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/DictDecoder.java b/tests/src/org/kelar/inputmethod/latin/makedict/DictDecoder.java new file mode 100644 index 000000000..97fb3596b --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/DictDecoder.java @@ -0,0 +1,222 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.annotations.UsedForTesting; +import org.kelar.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; +import org.kelar.inputmethod.latin.utils.ByteArrayDictBuffer; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.ArrayList; +import java.util.TreeMap; + +/** + * An interface of binary dictionary decoders. + */ +// TODO: Straighten out responsibility for the buffer's file pointer. +public interface DictDecoder { + + /** + * Reads and returns the file header. + */ + public DictionaryHeader readHeader() throws IOException, UnsupportedFormatException; + + /** + * Reads PtNode from ptNodePos. + * @param ptNodePos the position of PtNode. + * @return PtNodeInfo. + */ + public PtNodeInfo readPtNode(final int ptNodePos); + + /** + * Reads a buffer and returns the memory representation of the dictionary. + * + * This high-level method takes a buffer and reads its contents, populating a + * FusionDictionary structure. + * + * @param deleteDictIfBroken a flag indicating whether this method should remove the broken + * dictionary or not. + * @return the created dictionary. + */ + @UsedForTesting + public FusionDictionary readDictionaryBinary(final boolean deleteDictIfBroken) + throws FileNotFoundException, IOException, UnsupportedFormatException; + + /** + * Gets the address of the last PtNode of the exact matching word in the dictionary. + * If no match is found, returns NOT_VALID_WORD. + * + * @param word the word we search for. + * @return the address of the terminal node. + * @throws IOException if the file can't be read. + * @throws UnsupportedFormatException if the format of the file is not recognized. + */ + @UsedForTesting + public int getTerminalPosition(final String word) + throws IOException, UnsupportedFormatException; + + /** + * Reads unigrams and bigrams from the binary file. + * Doesn't store a full memory representation of the dictionary. + * + * @param words the map to store the address as a key and the word as a value. + * @param frequencies the map to store the address as a key and the frequency as a value. + * @param bigrams the map to store the address as a key and the list of address as a value. + * @throws IOException if the file can't be read. + * @throws UnsupportedFormatException if the format of the file is not recognized. + */ + @UsedForTesting + public void readUnigramsAndBigramsBinary(final TreeMap<Integer, String> words, + final TreeMap<Integer, Integer> frequencies, + final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams) + throws IOException, UnsupportedFormatException; + + /** + * Sets the position of the buffer to the given value. + * + * @param newPos the new position + */ + public void setPosition(final int newPos); + + /** + * Gets the position of the buffer. + * + * @return the position + */ + public int getPosition(); + + /** + * Reads and returns the PtNode count out of a buffer and forwards the pointer. + */ + public int readPtNodeCount(); + + /** + * Opens the dictionary file and makes DictBuffer. + */ + @UsedForTesting + public void openDictBuffer() throws FileNotFoundException, IOException, + UnsupportedFormatException; + @UsedForTesting + public boolean isDictBufferOpen(); + + // Constants for DictionaryBufferFactory. + public static final int USE_READONLY_BYTEBUFFER = 0x01000000; + public static final int USE_BYTEARRAY = 0x02000000; + public static final int USE_WRITABLE_BYTEBUFFER = 0x03000000; + public static final int MASK_DICTBUFFER = 0x0F000000; + + public interface DictionaryBufferFactory { + public DictBuffer getDictionaryBuffer(final File file) + throws FileNotFoundException, IOException; + } + + /** + * Creates DictionaryBuffer using a ByteBuffer + * + * This class uses less memory than DictionaryBufferFromByteArrayFactory, + * but doesn't perform as fast. + * When operating on a big dictionary, this class is preferred. + */ + public static final class DictionaryBufferFromReadOnlyByteBufferFactory + implements DictionaryBufferFactory { + @Override + public DictBuffer getDictionaryBuffer(final File file) + throws FileNotFoundException, IOException { + FileInputStream inStream = null; + ByteBuffer buffer = null; + try { + inStream = new FileInputStream(file); + buffer = inStream.getChannel().map(FileChannel.MapMode.READ_ONLY, + 0, file.length()); + } finally { + if (inStream != null) { + inStream.close(); + } + } + if (buffer != null) { + return new BinaryDictDecoderUtils.ByteBufferDictBuffer(buffer); + } + return null; + } + } + + /** + * Creates DictionaryBuffer using a byte array + * + * This class performs faster than other classes, but consumes more memory. + * When operating on a small dictionary, this class is preferred. + */ + public static final class DictionaryBufferFromByteArrayFactory + implements DictionaryBufferFactory { + @Override + public DictBuffer getDictionaryBuffer(final File file) + throws FileNotFoundException, IOException { + FileInputStream inStream = null; + try { + inStream = new FileInputStream(file); + final byte[] array = new byte[(int) file.length()]; + inStream.read(array); + return new ByteArrayDictBuffer(array); + } finally { + if (inStream != null) { + inStream.close(); + } + } + } + } + + /** + * Creates DictionaryBuffer using a writable ByteBuffer and a RandomAccessFile. + * + * This class doesn't perform as fast as other classes, + * but this class is the only option available for destructive operations (insert or delete) + * on a dictionary. + */ + @UsedForTesting + public static final class DictionaryBufferFromWritableByteBufferFactory + implements DictionaryBufferFactory { + @Override + public DictBuffer getDictionaryBuffer(final File file) + throws FileNotFoundException, IOException { + RandomAccessFile raFile = null; + ByteBuffer buffer = null; + try { + raFile = new RandomAccessFile(file, "rw"); + buffer = raFile.getChannel().map(FileChannel.MapMode.READ_WRITE, 0, file.length()); + } finally { + if (raFile != null) { + raFile.close(); + } + } + if (buffer != null) { + return new BinaryDictDecoderUtils.ByteBufferDictBuffer(buffer); + } + return null; + } + } + + /** + * @return whether this decoder has a valid binary dictionary that it can decode. + */ + public boolean hasValidRawBinaryDictionary(); +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/DictEncoder.java b/tests/src/org/kelar/inputmethod/latin/makedict/DictEncoder.java new file mode 100644 index 000000000..d637b6d2a --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/DictEncoder.java @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.annotations.UsedForTesting; +import org.kelar.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import org.kelar.inputmethod.latin.makedict.FusionDictionary.PtNode; + +import java.io.IOException; +import java.util.HashMap; + +/** + * An interface of binary dictionary encoder. + */ +public interface DictEncoder { + @UsedForTesting + public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) + throws IOException, UnsupportedFormatException; + + public void setPosition(final int position); + public int getPosition(); + public void writePtNodeCount(final int ptNodeCount); + public void writePtNode(final PtNode ptNode, final FusionDictionary dict, + final HashMap<Integer, Integer> codePointToOneByteCodeMap); +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/FusionDictionary.java b/tests/src/org/kelar/inputmethod/latin/makedict/FusionDictionary.java new file mode 100644 index 000000000..817d69f99 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/FusionDictionary.java @@ -0,0 +1,646 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.annotations.UsedForTesting; +import org.kelar.inputmethod.latin.define.DecoderSpecificConstants; +import org.kelar.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedList; + +/** + * A dictionary that can fusion heads and tails of words for more compression. + */ +@UsedForTesting +public final class FusionDictionary implements Iterable<WordProperty> { + private static final boolean DBG = MakedictLog.DBG; + + private static int CHARACTER_NOT_FOUND_INDEX = -1; + + /** + * A node array of the dictionary, containing several PtNodes. + * + * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the + * real information. + * This class also contains fields to cache size and address, to help with binary + * generation. + */ + public static final class PtNodeArray { + ArrayList<PtNode> mData; + // To help with binary generation + int mCachedSize = Integer.MIN_VALUE; + // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They + // always hold the same value except between dictionary address compression, during which + // the update process needs to know about both values at the same time. Updating will + // update the AfterUpdate value, and the code will move them to BeforeUpdate before + // the next update pass. + int mCachedAddressBeforeUpdate = Integer.MIN_VALUE; + int mCachedAddressAfterUpdate = Integer.MIN_VALUE; + int mCachedParentAddress = 0; + + public PtNodeArray() { + mData = new ArrayList<>(); + } + public PtNodeArray(ArrayList<PtNode> data) { + Collections.sort(data, PTNODE_COMPARATOR); + mData = data; + } + } + + /** + * PtNode is a group of characters, with probability information, shortcut targets, bigrams, + * and children (Pt means Patricia Trie). + * + * This is the central class of the in-memory representation. A PtNode is what can + * be seen as a traditional "trie node", except it can hold several characters at the + * same time. A PtNode essentially represents one or several characters in the middle + * of the trie tree; as such, it can be a terminal, and it can have children. + * In this in-memory representation, whether the PtNode is a terminal or not is represented + * by mProbabilityInfo. The PtNode is a terminal when the mProbabilityInfo is not null and the + * PtNode is not a terminal when the mProbabilityInfo is null. A terminal may have non-null + * shortcuts and/or bigrams, but a non-terminal may not. Moreover, children, if present, + * are non-null. + */ + public static final class PtNode { + private static final int NOT_A_TERMINAL = -1; + final int mChars[]; + ArrayList<WeightedString> mBigrams; + // null == mProbabilityInfo indicates this is not a terminal. + ProbabilityInfo mProbabilityInfo; + int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal. + PtNodeArray mChildren; + boolean mIsNotAWord; // Only a shortcut + boolean mIsPossiblyOffensive; + // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary + // generation. Before and After always hold the same value except during dictionary + // address compression, where the update process needs to know about both values at the + // same time. Updating will update the AfterUpdate value, and the code will move them + // to BeforeUpdate before the next update pass. + // The update process does not need two versions of mCachedSize. + int mCachedSize; // The size, in bytes, of this PtNode. + int mCachedAddressBeforeUpdate; // The address of this PtNode (before update) + int mCachedAddressAfterUpdate; // The address of this PtNode (after update) + + public PtNode(final int[] chars, final ArrayList<WeightedString> bigrams, + final ProbabilityInfo probabilityInfo, final boolean isNotAWord, + final boolean isPossiblyOffensive) { + mChars = chars; + mProbabilityInfo = probabilityInfo; + mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability; + mBigrams = bigrams; + mChildren = null; + mIsNotAWord = isNotAWord; + mIsPossiblyOffensive = isPossiblyOffensive; + } + + public PtNode(final int[] chars, final ArrayList<WeightedString> bigrams, + final ProbabilityInfo probabilityInfo, final boolean isNotAWord, + final boolean isPossiblyOffensive, final PtNodeArray children) { + mChars = chars; + mProbabilityInfo = probabilityInfo; + mBigrams = bigrams; + mChildren = children; + mIsNotAWord = isNotAWord; + mIsPossiblyOffensive = isPossiblyOffensive; + } + + public void addChild(PtNode n) { + if (null == mChildren) { + mChildren = new PtNodeArray(); + } + mChildren.mData.add(n); + } + + public int getTerminalId() { + return mTerminalId; + } + + public boolean isTerminal() { + return mProbabilityInfo != null; + } + + public int getProbability() { + return isTerminal() ? mProbabilityInfo.mProbability : NOT_A_TERMINAL; + } + + public boolean getIsNotAWord() { + return mIsNotAWord; + } + + public boolean getIsPossiblyOffensive() { + return mIsPossiblyOffensive; + } + + public ArrayList<WeightedString> getBigrams() { + // We don't want write permission to escape outside the package, so we return a copy + if (null == mBigrams) return null; + final ArrayList<WeightedString> copyOfBigrams = new ArrayList<>(mBigrams); + return copyOfBigrams; + } + + public boolean hasSeveralChars() { + assert(mChars.length > 0); + return 1 < mChars.length; + } + + /** + * Adds a word to the bigram list. Updates the probability information if the word already + * exists. + */ + public void addBigram(final String word, final ProbabilityInfo probabilityInfo) { + if (mBigrams == null) { + mBigrams = new ArrayList<>(); + } + WeightedString bigram = getBigram(word); + if (bigram != null) { + bigram.mProbabilityInfo = probabilityInfo; + } else { + bigram = new WeightedString(word, probabilityInfo); + mBigrams.add(bigram); + } + } + + /** + * Gets the bigram for the given word. + * Returns null if the word is not in the bigrams list. + */ + public WeightedString getBigram(final String word) { + // TODO: Don't do a linear search + if (mBigrams != null) { + final int size = mBigrams.size(); + for (int i = 0; i < size; ++i) { + WeightedString bigram = mBigrams.get(i); + if (bigram.mWord.equals(word)) { + return bigram; + } + } + } + return null; + } + + /** + * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to + * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only + * updated if they are higher than the existing ones. + */ + void update(final ProbabilityInfo probabilityInfo, + final ArrayList<WeightedString> bigrams, + final boolean isNotAWord, final boolean isPossiblyOffensive) { + mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo); + if (bigrams != null) { + if (mBigrams == null) { + mBigrams = bigrams; + } else { + final int size = bigrams.size(); + for (int i = 0; i < size; ++i) { + final WeightedString bigram = bigrams.get(i); + final WeightedString existingBigram = getBigram(bigram.mWord); + if (existingBigram == null) { + mBigrams.add(bigram); + } else { + existingBigram.mProbabilityInfo = ProbabilityInfo.max( + existingBigram.mProbabilityInfo, bigram.mProbabilityInfo); + } + } + } + } + mIsNotAWord = isNotAWord; + mIsPossiblyOffensive = isPossiblyOffensive; + } + } + + public final DictionaryOptions mOptions; + public final PtNodeArray mRootNodeArray; + + public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) { + mRootNodeArray = rootNodeArray; + mOptions = options; + } + + public void addOptionAttribute(final String key, final String value) { + mOptions.mAttributes.put(key, value); + } + + /** + * Helper method to convert a String to an int array. + */ + static int[] getCodePoints(final String word) { + // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray, + // which is not visible from the makedict package. Factor this code. + final int length = word.length(); + if (length <= 0) return new int[] {}; + final char[] characters = word.toCharArray(); + final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; + int codePoint = Character.codePointAt(characters, 0); + int dsti = 0; + for (int srci = Character.charCount(codePoint); + srci < length; srci += Character.charCount(codePoint), ++dsti) { + codePoints[dsti] = codePoint; + codePoint = Character.codePointAt(characters, srci); + } + codePoints[dsti] = codePoint; + return codePoints; + } + + /** + * Helper method to add a word as a string. + * + * This method adds a word to the dictionary with the given frequency. Optional + * lists of bigrams can be passed here. For each word inside, + * they will be added to the dictionary as necessary. + * @param word the word to add. + * @param probabilityInfo probability information of the word. + * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) + * @param isPossiblyOffensive true if this word is possibly offensive + */ + public void add(final String word, final ProbabilityInfo probabilityInfo, + final boolean isNotAWord, final boolean isPossiblyOffensive) { + add(getCodePoints(word), probabilityInfo, isNotAWord, isPossiblyOffensive); + } + + /** + * Validity check for a PtNode array. + * + * This method checks that all PtNodes in a node array are ordered as expected. + * If they are, nothing happens. If they aren't, an exception is thrown. + */ + private static void checkStack(PtNodeArray ptNodeArray) { + ArrayList<PtNode> stack = ptNodeArray.mData; + int lastValue = -1; + for (int i = 0; i < stack.size(); ++i) { + int currentValue = stack.get(i).mChars[0]; + if (currentValue <= lastValue) { + throw new RuntimeException("Invalid stack"); + } + lastValue = currentValue; + } + } + + /** + * Helper method to add a new bigram to the dictionary. + * + * @param word0 the previous word of the context + * @param word1 the next word of the context + * @param probabilityInfo the bigram probability info + */ + public void setBigram(final String word0, final String word1, + final ProbabilityInfo probabilityInfo) { + PtNode ptNode0 = findWordInTree(mRootNodeArray, word0); + if (ptNode0 != null) { + final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1); + if (ptNode1 == null) { + add(getCodePoints(word1), new ProbabilityInfo(0), false /* isNotAWord */, + false /* isPossiblyOffensive */); + // The PtNode for the first word may have moved by the above insertion, + // if word1 and word2 share a common stem that happens not to have been + // a cutting point until now. In this case, we need to refresh ptNode. + ptNode0 = findWordInTree(mRootNodeArray, word0); + } + ptNode0.addBigram(word1, probabilityInfo); + } else { + throw new RuntimeException("First word of bigram not found " + word0); + } + } + + /** + * Add a word to this dictionary. + * + * The shortcuts, if any, have to be in the dictionary already. If they aren't, + * an exception is thrown. + * @param word the word, as an int array. + * @param probabilityInfo the probability information of the word. + * @param isNotAWord true if this is not a word for spellchecking purposes (shortcut only or so) + * @param isPossiblyOffensive true if this word is possibly offensive + */ + private void add(final int[] word, final ProbabilityInfo probabilityInfo, + final boolean isNotAWord, final boolean isPossiblyOffensive) { + assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY); + if (word.length >= DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH) { + MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); + return; + } + + PtNodeArray currentNodeArray = mRootNodeArray; + int charIndex = 0; + + PtNode currentPtNode = null; + int differentCharIndex = 0; // Set by the loop to the index of the char that differs + int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]); + while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) { + currentPtNode = currentNodeArray.mData.get(nodeIndex); + differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex); + if (ARRAYS_ARE_EQUAL != differentCharIndex + && differentCharIndex < currentPtNode.mChars.length) break; + if (null == currentPtNode.mChildren) break; + charIndex += currentPtNode.mChars.length; + if (charIndex >= word.length) break; + currentNodeArray = currentPtNode.mChildren; + nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]); + } + + if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) { + // No node at this point to accept the word. Create one. + final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]); + final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length), + null /* bigrams */, probabilityInfo, isNotAWord, + isPossiblyOffensive); + currentNodeArray.mData.add(insertionIndex, newPtNode); + if (DBG) checkStack(currentNodeArray); + } else { + // There is a word with a common prefix. + if (differentCharIndex == currentPtNode.mChars.length) { + if (charIndex + differentCharIndex >= word.length) { + // The new word is a prefix of an existing word, but the node on which it + // should end already exists as is. Since the old PtNode was not a terminal, + // make it one by filling in its frequency and other attributes + currentPtNode.update(probabilityInfo, null, isNotAWord, + isPossiblyOffensive); + } else { + // The new word matches the full old word and extends past it. + // We only have to create a new node and add it to the end of this. + final PtNode newNode = new PtNode( + Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), + null /* bigrams */, probabilityInfo, + isNotAWord, isPossiblyOffensive); + currentPtNode.mChildren = new PtNodeArray(); + currentPtNode.mChildren.mData.add(newNode); + } + } else { + if (0 == differentCharIndex) { + // Exact same word. Update the frequency if higher. This will also add the + // new shortcuts to the existing shortcut list if it already exists. + currentPtNode.update(probabilityInfo, null, + currentPtNode.mIsNotAWord && isNotAWord, + currentPtNode.mIsPossiblyOffensive || isPossiblyOffensive); + } else { + // Partial prefix match only. We have to replace the current node with a node + // containing the current prefix and create two new ones for the tails. + PtNodeArray newChildren = new PtNodeArray(); + final PtNode newOldWord = new PtNode( + Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex, + currentPtNode.mChars.length), + currentPtNode.mBigrams, currentPtNode.mProbabilityInfo, + currentPtNode.mIsNotAWord, currentPtNode.mIsPossiblyOffensive, + currentPtNode.mChildren); + newChildren.mData.add(newOldWord); + + final PtNode newParent; + if (charIndex + differentCharIndex >= word.length) { + newParent = new PtNode( + Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), + null /* bigrams */, probabilityInfo, + isNotAWord, isPossiblyOffensive, newChildren); + } else { + newParent = new PtNode( + Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), + null /* bigrams */, null /* probabilityInfo */, + false /* isNotAWord */, false /* isPossiblyOffensive */, + newChildren); + final PtNode newWord = new PtNode(Arrays.copyOfRange(word, + charIndex + differentCharIndex, word.length), + null /* bigrams */, probabilityInfo, + isNotAWord, isPossiblyOffensive); + final int addIndex = word[charIndex + differentCharIndex] + > currentPtNode.mChars[differentCharIndex] ? 1 : 0; + newChildren.mData.add(addIndex, newWord); + } + currentNodeArray.mData.set(nodeIndex, newParent); + } + if (DBG) checkStack(currentNodeArray); + } + } + } + + private static int ARRAYS_ARE_EQUAL = 0; + + /** + * Custom comparison of two int arrays taken to contain character codes. + * + * This method compares the two arrays passed as an argument in a lexicographic way, + * with an offset in the dst string. + * This method does NOT test for the first character. It is taken to be equal. + * I repeat: this method starts the comparison at 1 <> dstOffset + 1. + * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the + * strings are equal. This works BECAUSE we don't look at the first character. + * + * @param src the left-hand side string of the comparison. + * @param dst the right-hand side string of the comparison. + * @param dstOffset the offset in the right-hand side string. + * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. + */ + private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) { + // We do NOT test the first char, because we come from a method that already + // tested it. + for (int i = 1; i < src.length; ++i) { + if (dstOffset + i >= dst.length) return i; + if (src[i] != dst[dstOffset + i]) return i; + } + if (dst.length > src.length) return src.length; + return ARRAYS_ARE_EQUAL; + } + + /** + * Helper class that compares and sorts two PtNodes according to their + * first element only. I repeat: ONLY the first element is considered, the rest + * is ignored. + * This comparator imposes orderings that are inconsistent with equals. + */ + static final class PtNodeComparator implements java.util.Comparator<PtNode> { + @Override + public int compare(PtNode p1, PtNode p2) { + if (p1.mChars[0] == p2.mChars[0]) return 0; + return p1.mChars[0] < p2.mChars[0] ? -1 : 1; + } + } + final static PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator(); + + /** + * Finds the insertion index of a character within a node array. + */ + private static int findInsertionIndex(final PtNodeArray nodeArray, int character) { + final ArrayList<PtNode> data = nodeArray.mData; + final PtNode reference = new PtNode(new int[] { character }, + null /* bigrams */, null /* probabilityInfo */, + false /* isNotAWord */, false /* isPossiblyOffensive */); + int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR); + return result >= 0 ? result : -result - 1; + } + + /** + * Find the index of a char in a node array, if it exists. + * + * @param nodeArray the node array to search in. + * @param character the character to search for. + * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else. + */ + private static int findIndexOfChar(final PtNodeArray nodeArray, int character) { + final int insertionIndex = findInsertionIndex(nodeArray, character); + if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX; + return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex + : CHARACTER_NOT_FOUND_INDEX; + } + + /** + * Helper method to find a word in a given branch. + */ + public static PtNode findWordInTree(final PtNodeArray rootNodeArray, final String string) { + PtNodeArray nodeArray = rootNodeArray; + int index = 0; + final StringBuilder checker = DBG ? new StringBuilder() : null; + final int[] codePoints = getCodePoints(string); + + PtNode currentPtNode; + do { + int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]); + if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null; + currentPtNode = nodeArray.mData.get(indexOfGroup); + + if (codePoints.length - index < currentPtNode.mChars.length) return null; + int newIndex = index; + while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) { + if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null; + newIndex++; + } + index = newIndex; + + if (DBG) { + checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length)); + } + if (index < codePoints.length) { + nodeArray = currentPtNode.mChildren; + } + } while (null != nodeArray && index < codePoints.length); + + if (index < codePoints.length) return null; + if (!currentPtNode.isTerminal()) return null; + if (DBG && !string.equals(checker.toString())) return null; + return currentPtNode; + } + + /** + * Helper method to find out whether a word is in the dict or not. + */ + public boolean hasWord(final String s) { + if (null == s || "".equals(s)) { + throw new RuntimeException("Can't search for a null or empty string"); + } + return null != findWordInTree(mRootNodeArray, s); + } + + /** + * Recursively count the number of PtNodes in a given branch of the trie. + * + * @param nodeArray the parent node. + * @return the number of PtNodes in all the branch under this node. + */ + public static int countPtNodes(final PtNodeArray nodeArray) { + final int nodeSize = nodeArray.mData.size(); + int size = nodeSize; + for (int i = nodeSize - 1; i >= 0; --i) { + PtNode ptNode = nodeArray.mData.get(i); + if (null != ptNode.mChildren) + size += countPtNodes(ptNode.mChildren); + } + return size; + } + + /** + * Iterator to walk through a dictionary. + * + * This is purely for convenience. + */ + public static final class DictionaryIterator implements Iterator<WordProperty> { + private static final class Position { + public Iterator<PtNode> pos; + public int length; + public Position(ArrayList<PtNode> ptNodes) { + pos = ptNodes.iterator(); + length = 0; + } + } + final StringBuilder mCurrentString; + final LinkedList<Position> mPositions; + + public DictionaryIterator(ArrayList<PtNode> ptRoot) { + mCurrentString = new StringBuilder(); + mPositions = new LinkedList<>(); + final Position rootPos = new Position(ptRoot); + mPositions.add(rootPos); + } + + @Override + public boolean hasNext() { + for (Position p : mPositions) { + if (p.pos.hasNext()) { + return true; + } + } + return false; + } + + @Override + public WordProperty next() { + Position currentPos = mPositions.getLast(); + mCurrentString.setLength(currentPos.length); + + do { + if (currentPos.pos.hasNext()) { + final PtNode currentPtNode = currentPos.pos.next(); + currentPos.length = mCurrentString.length(); + for (int i : currentPtNode.mChars) { + mCurrentString.append(Character.toChars(i)); + } + if (null != currentPtNode.mChildren) { + currentPos = new Position(currentPtNode.mChildren.mData); + currentPos.length = mCurrentString.length(); + mPositions.addLast(currentPos); + } + if (currentPtNode.isTerminal()) { + return new WordProperty(mCurrentString.toString(), + currentPtNode.mProbabilityInfo, currentPtNode.mBigrams, + currentPtNode.mIsNotAWord, currentPtNode.mIsPossiblyOffensive); + } + } else { + mPositions.removeLast(); + currentPos = mPositions.getLast(); + mCurrentString.setLength(mPositions.getLast().length); + } + } while (true); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Unsupported yet"); + } + + } + + /** + * Method to return an iterator. + * + * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x + * and say : for (Word w : x) {} + */ + @Override + public Iterator<WordProperty> iterator() { + return new DictionaryIterator(mRootNodeArray.mData); + } +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/MakedictLog.java b/tests/src/org/kelar/inputmethod/latin/makedict/MakedictLog.java new file mode 100644 index 000000000..07270d1d0 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/MakedictLog.java @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +/** + * Wrapper to redirect log events to the right output medium. + */ +public class MakedictLog { + public static final boolean DBG = true; + + private static void print(String message) { + System.out.println(message); + } + + public static void d(String message) { + print(message); + } + + public static void i(String message) { + print(message); + } + + public static void w(String message) { + print(message); + } + + public static void e(String message) { + print(message); + } +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/PendingAttribute.java b/tests/src/org/kelar/inputmethod/latin/makedict/PendingAttribute.java new file mode 100644 index 000000000..0abce4848 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/PendingAttribute.java @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +/** + * A not-yet-resolved attribute. + * + * An attribute is either a bigram or a shortcut. + * All instances of this class are always immutable. + */ +public final class PendingAttribute { + public final int mFrequency; + public final int mAddress; + public PendingAttribute(final int frequency, final int address) { + mFrequency = frequency; + mAddress = address; + } +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/PtNodeInfo.java b/tests/src/org/kelar/inputmethod/latin/makedict/PtNodeInfo.java new file mode 100644 index 000000000..65209a832 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/PtNodeInfo.java @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import java.util.ArrayList; + +/** + * Raw PtNode info straight out of a file. This will contain numbers for addresses. + */ +public final class PtNodeInfo { + public final int mOriginalAddress; + public final int mEndAddress; + public final int mFlags; + public final int[] mCharacters; + public final ProbabilityInfo mProbabilityInfo; + public final int mChildrenAddress; + public final ArrayList<WeightedString> mShortcutTargets; + public final ArrayList<PendingAttribute> mBigrams; + + public PtNodeInfo(final int originalAddress, final int endAddress, final int flags, + final int[] characters, final ProbabilityInfo probabilityInfo, + final int childrenAddress, final ArrayList<WeightedString> shortcutTargets, + final ArrayList<PendingAttribute> bigrams) { + mOriginalAddress = originalAddress; + mEndAddress = endAddress; + mFlags = flags; + mCharacters = characters; + mProbabilityInfo = probabilityInfo; + mChildrenAddress = childrenAddress; + mShortcutTargets = shortcutTargets; + mBigrams = bigrams; + } + + public boolean isTerminal() { + return mProbabilityInfo != null; + } +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/Ver2DictEncoder.java b/tests/src/org/kelar/inputmethod/latin/makedict/Ver2DictEncoder.java new file mode 100644 index 000000000..8b3636208 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/Ver2DictEncoder.java @@ -0,0 +1,280 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.annotations.UsedForTesting; +import org.kelar.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import org.kelar.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable; +import org.kelar.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import org.kelar.inputmethod.latin.makedict.FusionDictionary.PtNode; +import org.kelar.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map.Entry; +import java.util.Objects; + +/** + * An implementation of DictEncoder for version 2 binary dictionary. + */ +@UsedForTesting +public class Ver2DictEncoder implements DictEncoder { + + private final File mDictFile; + private OutputStream mOutStream; + private byte[] mBuffer; + private int mPosition; + private final int mCodePointTableMode; + public static final int CODE_POINT_TABLE_OFF = 0; + public static final int CODE_POINT_TABLE_ON = 1; + + @UsedForTesting + public Ver2DictEncoder(final File dictFile, final int codePointTableMode) { + mDictFile = dictFile; + mOutStream = null; + mBuffer = null; + mCodePointTableMode = codePointTableMode; + } + + // This constructor is used only by BinaryDictOffdeviceUtilsTests. + // If you want to use this in the production code, you should consider keeping consistency of + // the interface of Ver3DictDecoder by using factory. + @UsedForTesting + public Ver2DictEncoder(final OutputStream outStream) { + mDictFile = null; + mOutStream = outStream; + mCodePointTableMode = CODE_POINT_TABLE_OFF; + } + + private void openStream() throws FileNotFoundException { + mOutStream = new FileOutputStream(mDictFile); + } + + private void close() throws IOException { + if (mOutStream != null) { + mOutStream.close(); + mOutStream = null; + } + } + + // Package for testing + static CodePointTable makeCodePointTable(final FusionDictionary dict) { + final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>(); + for (final WordProperty word : dict) { + // Store per code point occurrence + final String wordString = word.mWord; + for (int i = 0; i < wordString.length(); ++i) { + final int codePoint = Character.codePointAt(wordString, i); + if (codePointOccurrenceCounts.containsKey(codePoint)) { + codePointOccurrenceCounts.put(codePoint, + codePointOccurrenceCounts.get(codePoint) + 1); + } else { + codePointOccurrenceCounts.put(codePoint, 1); + } + } + } + final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray = + new ArrayList<>(codePointOccurrenceCounts.entrySet()); + // Descending order sort by occurrence (value side) + Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() { + @Override + public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) { + if (!Objects.equals(a.getValue(), b.getValue())) { + return b.getValue().compareTo(a.getValue()); + } + return b.getKey().compareTo(a.getKey()); + } + }); + int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE; + // Temporary map for writing of nodes + final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>(); + for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) { + // Put a relation from the original code point to the one byte code. + codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex); + if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) { + break; + } + } + // codePointToOneByteCodeMap for writing the trie + // codePointOccurrenceArray for writing the header + return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray); + } + + @Override + public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) + throws IOException, UnsupportedFormatException { + // We no longer support anything but the latest version of v2. + if (formatOptions.mVersion != FormatSpec.VERSION202) { + throw new UnsupportedFormatException( + "The given format options has wrong version number : " + + formatOptions.mVersion); + } + + if (mOutStream == null) { + openStream(); + } + + // Make code point conversion table ordered by occurrence of code points + // Version 201 or later have codePointTable + final CodePointTable codePointTable; + if (mCodePointTableMode == CODE_POINT_TABLE_OFF || formatOptions.mVersion + < FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE) { + codePointTable = new CodePointTable(); + } else { + codePointTable = makeCodePointTable(dict); + } + + BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions, + codePointTable.mCodePointOccurrenceArray); + + // Addresses are limited to 3 bytes, but since addresses can be relative to each node + // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding + // the order of the PtNode arrays becomes a quite complicated problem, because though the + // dictionary itself does not have a size limit, each node array must still be within 16MB + // of all its children and parents. As long as this is ensured, the dictionary file may + // grow to any size. + + // Leave the choice of the optimal node order to the flattenTree function. + MakedictLog.i("Flattening the tree..."); + ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); + + MakedictLog.i("Computing addresses..."); + BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, + codePointTable.mCodePointToOneByteCodeMap); + MakedictLog.i("Checking PtNode array..."); + if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); + + // Create a buffer that matches the final dictionary size. + final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); + final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; + mBuffer = new byte[bufferSize]; + + MakedictLog.i("Writing file..."); + + for (PtNodeArray nodeArray : flatNodes) { + BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, + codePointTable.mCodePointToOneByteCodeMap); + } + if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes); + mOutStream.write(mBuffer, 0, mPosition); + + MakedictLog.i("Done"); + close(); + } + + @Override + public void setPosition(final int position) { + if (mBuffer == null || position < 0 || position >= mBuffer.length) return; + mPosition = position; + } + + @Override + public int getPosition() { + return mPosition; + } + + @Override + public void writePtNodeCount(final int ptNodeCount) { + final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); + if (countSize != 1 && countSize != 2) { + throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); + } + final int encodedPtNodeCount = (countSize == 2) ? + (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount, + countSize); + } + + private void writePtNodeFlags(final PtNode ptNode, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, + codePointToOneByteCodeMap); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, + BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos), + FormatSpec.PTNODE_FLAGS_SIZE); + } + + private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition, + codePointToOneByteCodeMap); + if (hasSeveralChars) { + mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; + } + } + + private void writeFrequency(final int frequency) { + if (frequency >= 0) { + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency, + FormatSpec.PTNODE_FREQUENCY_SIZE); + } + } + + private void writeChildrenPosition(final PtNode ptNode, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, + codePointToOneByteCodeMap); + mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, + childrenPos); + } + + /** + * Write a bigram attributes list to mBuffer. + * + * @param bigrams the bigram attributes list. + * @param dict the dictionary the node array is a part of (for relative offsets). + */ + private void writeBigrams(final ArrayList<WeightedString> bigrams, + final FusionDictionary dict) { + if (bigrams == null) return; + + final Iterator<WeightedString> bigramIterator = bigrams.iterator(); + while (bigramIterator.hasNext()) { + final WeightedString bigram = bigramIterator.next(); + final PtNode target = + FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); + final int addressOfBigram = target.mCachedAddressAfterUpdate; + final int unigramFrequencyForThisWord = target.getProbability(); + final int offset = addressOfBigram + - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), + offset, bigram.getProbability(), unigramFrequencyForThisWord, bigram.mWord); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags, + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, + Math.abs(offset)); + } + } + + @Override + public void writePtNode(final PtNode ptNode, final FusionDictionary dict, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + writePtNodeFlags(ptNode, codePointToOneByteCodeMap); + writeCharacters(ptNode.mChars, ptNode.hasSeveralChars(), codePointToOneByteCodeMap); + writeFrequency(ptNode.getProbability()); + writeChildrenPosition(ptNode, codePointToOneByteCodeMap); + writeBigrams(ptNode.mBigrams, dict); + } +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/Ver4DictDecoder.java b/tests/src/org/kelar/inputmethod/latin/makedict/Ver4DictDecoder.java new file mode 100644 index 000000000..351fd72c4 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/Ver4DictDecoder.java @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.annotations.UsedForTesting; +import org.kelar.inputmethod.latin.BinaryDictionary; +import org.kelar.inputmethod.latin.common.FileUtils; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; + +/** + * An implementation of binary dictionary decoder for version 4 binary dictionary. + */ +@UsedForTesting +public class Ver4DictDecoder extends AbstractDictDecoder { + final File mDictDirectory; + + @UsedForTesting + /* package */ Ver4DictDecoder(final File dictDirectory) { + mDictDirectory = dictDirectory; + + } + + @Override + public DictionaryHeader readHeader() throws IOException, UnsupportedFormatException { + // dictType is not being used in dicttool. Passing an empty string. + final BinaryDictionary binaryDictionary= new BinaryDictionary( + mDictDirectory.getAbsolutePath(), 0 /* offset */, 0 /* length */, + true /* useFullEditDistance */, null /* locale */, + "" /* dictType */, true /* isUpdatable */); + final DictionaryHeader header = binaryDictionary.getHeader(); + binaryDictionary.close(); + if (header == null) { + throw new IOException("Cannot read the dictionary header."); + } + return header; + } + + @Override + public FusionDictionary readDictionaryBinary(final boolean deleteDictIfBroken) + throws FileNotFoundException, IOException, UnsupportedFormatException { + // dictType is not being used in dicttool. Passing an empty string. + final BinaryDictionary binaryDictionary = new BinaryDictionary( + mDictDirectory.getAbsolutePath(), 0 /* offset */, 0 /* length */, + true /* useFullEditDistance */, null /* locale */, + "" /* dictType */, true /* isUpdatable */); + final DictionaryHeader header = readHeader(); + final FusionDictionary fusionDict = + new FusionDictionary(new FusionDictionary.PtNodeArray(), header.mDictionaryOptions); + int token = 0; + final ArrayList<WordProperty> wordProperties = new ArrayList<>(); + do { + final BinaryDictionary.GetNextWordPropertyResult result = + binaryDictionary.getNextWordProperty(token); + final WordProperty wordProperty = result.mWordProperty; + if (wordProperty == null) { + binaryDictionary.close(); + if (deleteDictIfBroken) { + FileUtils.deleteRecursively(mDictDirectory); + } + return null; + } + wordProperties.add(wordProperty); + token = result.mNextToken; + } while (token != 0); + + // Insert unigrams into the fusion dictionary. + for (final WordProperty wordProperty : wordProperties) { + fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo, + wordProperty.mIsNotAWord, + wordProperty.mIsPossiblyOffensive); + } + // Insert bigrams into the fusion dictionary. + // TODO: Support ngrams. + for (final WordProperty wordProperty : wordProperties) { + if (!wordProperty.mHasNgrams) { + continue; + } + final String word0 = wordProperty.mWord; + for (final WeightedString bigram : wordProperty.getBigrams()) { + fusionDict.setBigram(word0, bigram.mWord, bigram.mProbabilityInfo); + } + } + binaryDictionary.close(); + return fusionDict; + } +} diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/Ver4DictEncoder.java b/tests/src/org/kelar/inputmethod/latin/makedict/Ver4DictEncoder.java new file mode 100644 index 000000000..2eb3010c2 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/Ver4DictEncoder.java @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.annotations.UsedForTesting; +import org.kelar.inputmethod.latin.BinaryDictionary; +import org.kelar.inputmethod.latin.Dictionary; +import org.kelar.inputmethod.latin.NgramContext; +import org.kelar.inputmethod.latin.common.LocaleUtils; +import org.kelar.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import org.kelar.inputmethod.latin.makedict.FusionDictionary.PtNode; +import org.kelar.inputmethod.latin.utils.BinaryDictionaryUtils; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; + +/** + * An implementation of DictEncoder for version 4 binary dictionary. + */ +@UsedForTesting +public class Ver4DictEncoder implements DictEncoder { + private final File mDictPlacedDir; + + @UsedForTesting + public Ver4DictEncoder(final File dictPlacedDir) { + mDictPlacedDir = dictPlacedDir; + } + + // TODO: This builds a FusionDictionary first and iterates it to add words to the binary + // dictionary. However, it is possible to just add words directly to the binary dictionary + // instead. + // In the long run, when we stop supporting version 2, FusionDictionary will become deprecated + // and we can remove it. Then we'll be able to just call BinaryDictionary directly. + @Override + public void writeDictionary(FusionDictionary dict, FormatOptions formatOptions) + throws IOException, UnsupportedFormatException { + if (formatOptions.mVersion != FormatSpec.VERSION4) { + throw new UnsupportedFormatException("File header has a wrong version number : " + + formatOptions.mVersion); + } + if (!mDictPlacedDir.isDirectory()) { + throw new UnsupportedFormatException("Given path is not a directory."); + } + if (!BinaryDictionaryUtils.createEmptyDictFile(mDictPlacedDir.getAbsolutePath(), + FormatSpec.VERSION4, LocaleUtils.constructLocaleFromString( + dict.mOptions.mAttributes.get(DictionaryHeader.DICTIONARY_LOCALE_KEY)), + dict.mOptions.mAttributes)) { + throw new IOException("Cannot create dictionary file : " + + mDictPlacedDir.getAbsolutePath()); + } + final BinaryDictionary binaryDict = new BinaryDictionary(mDictPlacedDir.getAbsolutePath(), + 0l, mDictPlacedDir.length(), true /* useFullEditDistance */, + LocaleUtils.constructLocaleFromString(dict.mOptions.mAttributes.get( + DictionaryHeader.DICTIONARY_LOCALE_KEY)), + Dictionary.TYPE_USER /* Dictionary type. Does not matter for us */, + true /* isUpdatable */); + if (!binaryDict.isValidDictionary()) { + // Somehow createEmptyDictFile returned true, but the file was not created correctly + throw new IOException("Cannot create dictionary file"); + } + for (final WordProperty wordProperty : dict) { + if (!binaryDict.addUnigramEntry(wordProperty.mWord, wordProperty.getProbability(), + wordProperty.mIsBeginningOfSentence, wordProperty.mIsNotAWord, + wordProperty.mIsPossiblyOffensive, 0 /* timestamp */)) { + MakedictLog.e("Cannot add unigram entry for " + wordProperty.mWord); + } + if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) { + if (!binaryDict.flushWithGC()) { + MakedictLog.e("Cannot flush dict with GC."); + return; + } + } + } + for (final WordProperty word0Property : dict) { + if (!word0Property.mHasNgrams) continue; + // TODO: Support ngram. + for (final WeightedString word1 : word0Property.getBigrams()) { + final NgramContext ngramContext = + new NgramContext(new NgramContext.WordInfo(word0Property.mWord)); + if (!binaryDict.addNgramEntry(ngramContext, word1.mWord, + word1.getProbability(), 0 /* timestamp */)) { + MakedictLog.e("Cannot add n-gram entry for " + + ngramContext + " -> " + word1.mWord); + return; + } + if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) { + if (!binaryDict.flushWithGC()) { + MakedictLog.e("Cannot flush dict with GC."); + return; + } + } + } + } + if (!binaryDict.flushWithGC()) { + MakedictLog.e("Cannot flush dict with GC."); + return; + } + binaryDict.close(); + } + + @Override + public void setPosition(int position) { + } + + @Override + public int getPosition() { + return 0; + } + + @Override + public void writePtNodeCount(int ptNodeCount) { + } + + @Override + public void writePtNode(PtNode ptNode, FusionDictionary dict, + HashMap<Integer, Integer> codePointToOneByteCodeMap) { + } +} |