diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java')
-rw-r--r-- | java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java | 269 |
1 files changed, 269 insertions, 0 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java new file mode 100644 index 000000000..75b75ae2e --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java @@ -0,0 +1,269 @@ +/* +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; +import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Iterator; + +/** + * An implementation of DictEncoder for version 4 binary dictionary. + */ +@UsedForTesting +public class Ver4DictEncoder implements DictEncoder { + private final File mDictPlacedDir; + private byte[] mTrieBuf; + private byte[] mFreqBuf; + private int mTriePos; + private OutputStream mTrieOutStream; + private OutputStream mFreqOutStream; + + @UsedForTesting + public Ver4DictEncoder(final File dictPlacedDir) { + mDictPlacedDir = dictPlacedDir; + } + + private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions) + throws FileNotFoundException, IOException { + final FileHeader header = new FileHeader(0, dictOptions, formatOptions); + final String filename = header.getId() + "." + header.getVersion(); + final File mDictDir = new File(mDictPlacedDir, filename); + final File trieFile = new File(mDictDir, filename + FormatSpec.TRIE_FILE_EXTENSION); + final File freqFile = new File(mDictDir, filename + FormatSpec.FREQ_FILE_EXTENSION); + if (!mDictDir.isDirectory()) { + if (mDictDir.exists()) mDictDir.delete(); + mDictDir.mkdirs(); + } + if (!trieFile.exists()) trieFile.createNewFile(); + if (!freqFile.exists()) freqFile.createNewFile(); + mTrieOutStream = new FileOutputStream(trieFile); + mFreqOutStream = new FileOutputStream(freqFile); + } + + private void close() throws IOException { + try { + if (mTrieOutStream != null) { + mTrieOutStream.close(); + } + if (mFreqOutStream != null) { + mFreqOutStream.close(); + } + } finally { + mTrieOutStream = null; + mFreqOutStream = null; + } + } + + @Override + public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) + throws IOException, UnsupportedFormatException { + if (formatOptions.mVersion != FormatSpec.VERSION4) { + throw new UnsupportedFormatException("File header has a wrong version number : " + + formatOptions.mVersion); + } + if (!mDictPlacedDir.isDirectory()) { + throw new UnsupportedFormatException("Given path is not a directory."); + } + + if (mTrieOutStream == null) { + openStreams(formatOptions, dict.mOptions); + } + + BinaryDictEncoderUtils.writeDictionaryHeader(mTrieOutStream, dict, formatOptions); + + MakedictLog.i("Flattening the tree..."); + ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); + int terminalCount = 0; + for (final PtNodeArray array : flatNodes) { + for (final PtNode node : array.mData) { + if (node.isTerminal()) node.mTerminalId = terminalCount++; + } + } + + MakedictLog.i("Computing addresses..."); + BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, formatOptions); + if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); + + final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); + final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; + mTrieBuf = new byte[bufferSize]; + mFreqBuf = new byte[terminalCount * FormatSpec.FREQUENCY_AND_FLAGS_SIZE]; + + MakedictLog.i("Writing file..."); + for (PtNodeArray nodeArray : flatNodes) { + BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, formatOptions); + } + if (MakedictLog.DBG) { + BinaryDictEncoderUtils.showStatistics(flatNodes); + MakedictLog.i("has " + terminalCount + " terminals."); + } + mTrieOutStream.write(mTrieBuf); + mFreqOutStream.write(mFreqBuf); + + MakedictLog.i("Done"); + close(); + } + + @Override + public void setPosition(int position) { + if (mTrieBuf == null || position < 0 || position >- mTrieBuf.length) return; + mTriePos = position; + } + + @Override + public int getPosition() { + return mTriePos; + } + + @Override + public void writePtNodeCount(int ptNodeCount) { + final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); + // ptNodeCount must fit on one byte or two bytes. + // Please see comments in FormatSpec + if (countSize != 1 && countSize != 2) { + throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize); + } + mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, ptNodeCount, + countSize); + } + + private void writePtNodeFlags(final PtNode ptNode, final int parentAddress, + final FormatOptions formatOptions) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); + mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, + BinaryDictEncoderUtils.makePtNodeFlags(ptNode, mTriePos, childrenPos, + formatOptions), + FormatSpec.PTNODE_FLAGS_SIZE); + } + + private void writeParentPosition(int parentPos, final PtNode ptNode, + final FormatOptions formatOptions) { + if (parentPos != FormatSpec.NO_PARENT_ADDRESS) { + parentPos -= ptNode.mCachedAddressAfterUpdate; + } + mTriePos = BinaryDictEncoderUtils.writeParentAddress(mTrieBuf, mTriePos, parentPos, + formatOptions); + } + + private void writeCharacters(final int[] characters, final boolean hasSeveralChars) { + mTriePos = CharEncoding.writeCharArray(characters, mTrieBuf, mTriePos); + if (hasSeveralChars) { + mTrieBuf[mTriePos++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; + } + } + + private void writeTerminalId(final int terminalId) { + mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, terminalId, + FormatSpec.PTNODE_TERMINAL_ID_SIZE); + } + + private void writeFrequency(final int frequency, final int terminalId) { + final int freqPos = terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE; + BinaryDictEncoderUtils.writeUIntToBuffer(mFreqBuf, freqPos, frequency, + FormatSpec.FREQUENCY_AND_FLAGS_SIZE); + } + + private void writeChildrenPosition(PtNode ptNode, FormatOptions formatOptions) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions); + if (formatOptions.mSupportsDynamicUpdate) { + mTriePos += BinaryDictEncoderUtils.writeSignedChildrenPosition(mTrieBuf, + mTriePos, childrenPos); + } else { + mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf, + mTriePos, childrenPos); + } + } + + private void writeShortcuts(ArrayList<WeightedString> shortcuts) { + if (null == shortcuts || shortcuts.isEmpty()) return; + + final int indexOfShortcutByteSize = mTriePos; + mTriePos += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; + final Iterator<WeightedString> shortcutIterator = shortcuts.iterator(); + while (shortcutIterator.hasNext()) { + final WeightedString target = shortcutIterator.next(); + final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( + shortcutIterator.hasNext(), + target.mFrequency); + mTrieBuf[mTriePos++] = (byte)shortcutFlags; + final int shortcutShift = CharEncoding.writeString(mTrieBuf, mTriePos, + target.mWord); + mTriePos += shortcutShift; + } + final int shortcutByteSize = mTriePos - indexOfShortcutByteSize; + if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) { + throw new RuntimeException("Shortcut list too large : " + shortcutByteSize); + } + BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, indexOfShortcutByteSize, + shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); + } + + private void writeBigrams(ArrayList<WeightedString> bigrams, FusionDictionary dict) { + if (bigrams == null) return; + + final Iterator<WeightedString> bigramIterator = bigrams.iterator(); + while (bigramIterator.hasNext()) { + final WeightedString bigram = bigramIterator.next(); + final PtNode target = + FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); + final int addressOfBigram = target.mCachedAddressAfterUpdate; + final int unigramFrequencyForThisWord = target.mFrequency; + final int offset = addressOfBigram + - (mTriePos + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), + offset, bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord); + mTrieBuf[mTriePos++] = (byte) bigramFlags; + mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf, + mTriePos, Math.abs(offset)); + } + } + + @Override + public void writeForwardLinkAddress(int forwardLinkAddress) { + mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, + forwardLinkAddress, FormatSpec.FORWARD_LINK_ADDRESS_SIZE); + } + + @Override + public void writePtNode(final PtNode ptNode, final int parentPosition, + final FormatOptions formatOptions, final FusionDictionary dict) { + writePtNodeFlags(ptNode, parentPosition, formatOptions); + writeParentPosition(parentPosition, ptNode, formatOptions); + writeCharacters(ptNode.mChars, ptNode.hasSeveralChars()); + if (ptNode.isTerminal()) { + writeTerminalId(ptNode.mTerminalId); + writeFrequency(ptNode.mFrequency, ptNode.mTerminalId); + } + writeChildrenPosition(ptNode, formatOptions); + writeShortcuts(ptNode.mShortcutTargets); + writeBigrams(ptNode.mBigrams, dict); + } +} |