diff options
author | 2013-09-10 17:16:32 +0900 | |
---|---|---|
committer | 2013-09-13 17:33:51 +0900 | |
commit | a141d8ef7dcf8f942eb7bd4ca006f63da1744319 (patch) | |
tree | 9dfc96843392b8c287baf3a9317ca54d07f2a9ed /java/src | |
parent | 610f3eb4ec7f073c9ed058598b33e637e8bd6188 (diff) | |
download | latinime-a141d8ef7dcf8f942eb7bd4ca006f63da1744319.tar.gz latinime-a141d8ef7dcf8f942eb7bd4ca006f63da1744319.tar.xz latinime-a141d8ef7dcf8f942eb7bd4ca006f63da1744319.zip |
Add Ver4DictEncoder.
Bug: 9618601
Change-Id: I161d2845906f07c1251deb8005fdffe49c5b7940
Diffstat (limited to 'java/src')
5 files changed, 305 insertions, 7 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java index 21e9811ef..f333b0d86 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java @@ -126,8 +126,14 @@ public class BinaryDictEncoderUtils { */ private static int getPtNodeMaximumSize(final PtNode ptNode, final FormatOptions options) { int size = getNodeHeaderSize(ptNode, options); - // If terminal, one byte for the frequency - if (ptNode.isTerminal()) size += FormatSpec.PTNODE_FREQUENCY_SIZE; + if (ptNode.isTerminal()) { + // If terminal, one byte for the frequency or four bytes for the terminal id. + if (options.mHasTerminalId) { + size += FormatSpec.PTNODE_TERMINAL_ID_SIZE; + } else { + size += FormatSpec.PTNODE_FREQUENCY_SIZE; + } + } size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address size += getShortcutListSize(ptNode.mShortcutTargets); if (null != ptNode.mBigrams) { @@ -345,7 +351,13 @@ public class BinaryDictEncoderUtils { changed = true; } int nodeSize = getNodeHeaderSize(ptNode, formatOptions); - if (ptNode.isTerminal()) nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; + if (ptNode.isTerminal()) { + if (formatOptions.mHasTerminalId) { + nodeSize += FormatSpec.PTNODE_TERMINAL_ID_SIZE; + } else { + nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; + } + } if (formatOptions.mSupportsDynamicUpdate) { nodeSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; } else if (null != ptNode.mChildren) { @@ -787,7 +799,6 @@ public class BinaryDictEncoderUtils { + FormatSpec.MAX_TERMINAL_FREQUENCY + " : " + ptNode.mFrequency); } - dictEncoder.writePtNode(ptNode, parentPosition, formatOptions, dict); } if (formatOptions.mSupportsDynamicUpdate) { diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java 
index 44ae33de1..96ccd8e49 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -198,9 +198,12 @@ public final class FormatSpec { public static final int MAGIC_NUMBER = 0x9BC13AFE; static final int MINIMUM_SUPPORTED_VERSION = 2; - static final int MAXIMUM_SUPPORTED_VERSION = 3; + static final int MAXIMUM_SUPPORTED_VERSION = 4; static final int NOT_A_VERSION_NUMBER = -1; static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3; + static final int FIRST_VERSION_WITH_TERMINAL_ID = 4; + static final int VERSION3 = 3; + static final int VERSION4 = 4; // These options need to be the same numeric values as the one in the native reading code. static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; @@ -251,11 +254,17 @@ public final class FormatSpec { static final int PTNODE_TERMINATOR_SIZE = 1; static final int PTNODE_FLAGS_SIZE = 1; static final int PTNODE_FREQUENCY_SIZE = 1; + static final int PTNODE_TERMINAL_ID_SIZE = 4; static final int PTNODE_MAX_ADDRESS_SIZE = 3; static final int PTNODE_ATTRIBUTE_FLAGS_SIZE = 1; static final int PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE = 3; static final int PTNODE_SHORTCUT_LIST_SIZE_SIZE = 2; + // These values are used only by version 4 or later. 
+ static final String TRIE_FILE_EXTENSION = ".trie"; + static final String FREQ_FILE_EXTENSION = ".freq"; + static final int FREQUENCY_AND_FLAGS_SIZE = 2; + static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; static final int NO_PARENT_ADDRESS = 0; static final int NO_FORWARD_LINK_ADDRESS = 0; @@ -264,6 +273,7 @@ public final class FormatSpec { static final int MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT = 0x7F; // 127 static final int MAX_PTNODES_IN_A_PT_NODE_ARRAY = 0x7FFF; // 32767 static final int MAX_BIGRAMS_IN_A_PTNODE = 10000; + static final int MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE = 0xFFFF; static final int MAX_TERMINAL_FREQUENCY = 255; static final int MAX_BIGRAM_FREQUENCY = 15; @@ -287,6 +297,7 @@ public final class FormatSpec { public static final class FormatOptions { public final int mVersion; public final boolean mSupportsDynamicUpdate; + public final boolean mHasTerminalId; @UsedForTesting public FormatOptions(final int version) { this(version, false); @@ -300,6 +311,7 @@ public final class FormatSpec { + FIRST_VERSION_WITH_DYNAMIC_UPDATE + " and ulterior."); } mSupportsDynamicUpdate = supportsDynamicUpdate; + mHasTerminalId = (version >= FIRST_VERSION_WITH_TERMINAL_ID); } } diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java index 3e685a3d7..be653feec 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java +++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java @@ -111,6 +111,7 @@ public final class FusionDictionary implements Iterable<Word> { ArrayList<WeightedString> mShortcutTargets; ArrayList<WeightedString> mBigrams; int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. + int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal. 
PtNodeArray mChildren; boolean mIsNotAWord; // Only a shortcut boolean mIsBlacklistEntry; @@ -129,6 +130,7 @@ public final class FusionDictionary implements Iterable<Word> { final boolean isNotAWord, final boolean isBlacklistEntry) { mChars = chars; mFrequency = frequency; + mTerminalId = frequency; mShortcutTargets = shortcutTargets; mBigrams = bigrams; mChildren = null; @@ -156,6 +158,10 @@ public final class FusionDictionary implements Iterable<Word> { mChildren.mData.add(n); } + public int getTerminalId() { + return mTerminalId; + } + public boolean isTerminal() { return NOT_A_TERMINAL != mFrequency; } diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java index 48a823d43..222a0f474 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java @@ -68,7 +68,7 @@ public class Ver3DictEncoder implements DictEncoder { @Override public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) throws IOException, UnsupportedFormatException { - if (formatOptions.mVersion > 3) { + if (formatOptions.mVersion > FormatSpec.VERSION3) { throw new UnsupportedFormatException( "The given format options has wrong version number : " + formatOptions.mVersion); @@ -200,7 +200,7 @@ public class Ver3DictEncoder implements DictEncoder { mPosition += shortcutShift; } final int shortcutByteSize = mPosition - indexOfShortcutByteSize; - if (shortcutByteSize > 0xFFFF) { + if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) { throw new RuntimeException("Shortcut list too large"); } BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, indexOfShortcutByteSize, shortcutByteSize, diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java new file mode 100644 index 
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;

/**
 * An implementation of DictEncoder for version 4 binary dictionary.
 *
 * <p>A dictionary is written as a directory named {@code <id>.<version>} placed under the
 * directory given to the constructor. That directory receives two files: a trie file
 * ({@link FormatSpec#TRIE_FILE_EXTENSION}) holding the dictionary header followed by the
 * PtNode arrays, and a frequency file ({@link FormatSpec#FREQ_FILE_EXTENSION}) holding
 * {@link FormatSpec#FREQUENCY_AND_FLAGS_SIZE} bytes per terminal, indexed by terminal id.
 */
@UsedForTesting
public class Ver4DictEncoder implements DictEncoder {
    /** Existing directory under which the dictionary directory is created. */
    private final File mDictPlacedDir;
    /** In-memory image of the PtNode arrays; flushed to the trie file once complete. */
    private byte[] mTrieBuf;
    /** In-memory image of the frequency table; flushed to the frequency file once complete. */
    private byte[] mFreqBuf;
    /** Current write position inside {@link #mTrieBuf}. */
    private int mTriePos;
    private OutputStream mTrieOutStream;
    private OutputStream mFreqOutStream;

    /**
     * @param dictPlacedDir the directory in which the dictionary directory will be created.
     *        {@link #writeDictionary} rejects it if it is not an existing directory.
     */
    @UsedForTesting
    public Ver4DictEncoder(final File dictPlacedDir) {
        mDictPlacedDir = dictPlacedDir;
    }

    /**
     * Creates the dictionary directory if needed and opens the trie and frequency streams.
     *
     * @param formatOptions the format options, used to derive the dictionary file name.
     * @param dictOptions the dictionary options, used to derive the dictionary file name.
     * @throws FileNotFoundException if one of the output files cannot be opened for writing.
     * @throws IOException if the dictionary directory cannot be prepared.
     */
    private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
            throws FileNotFoundException, IOException {
        final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
        final String filename = header.getId() + "." + header.getVersion();
        final File dictDir = new File(mDictPlacedDir, filename);
        final File trieFile = new File(dictDir, filename + FormatSpec.TRIE_FILE_EXTENSION);
        final File freqFile = new File(dictDir, filename + FormatSpec.FREQ_FILE_EXTENSION);
        if (!dictDir.isDirectory()) {
            // Something that is not a directory is in the way: remove it first.
            if (dictDir.exists() && !dictDir.delete()) {
                throw new IOException("Cannot delete " + dictDir.getAbsolutePath());
            }
            if (!dictDir.mkdirs() && !dictDir.isDirectory()) {
                throw new IOException("Cannot create directory " + dictDir.getAbsolutePath());
            }
        }
        // FileOutputStream creates the files when they do not exist yet and truncates them
        // otherwise, so no explicit createNewFile() is needed.
        mTrieOutStream = new FileOutputStream(trieFile);
        mFreqOutStream = new FileOutputStream(freqFile);
    }

    /**
     * Closes both output streams and forgets them.
     *
     * <p>The frequency stream is closed even when closing the trie stream fails, and both
     * fields are reset to null in every case so that the encoder can be used again.
     *
     * @throws IOException if closing either stream fails.
     */
    private void close() throws IOException {
        try {
            if (mTrieOutStream != null) {
                mTrieOutStream.close();
            }
        } finally {
            try {
                if (mFreqOutStream != null) {
                    mFreqOutStream.close();
                }
            } finally {
                mTrieOutStream = null;
                mFreqOutStream = null;
            }
        }
    }

    /**
     * Writes the given dictionary in version 4 format.
     *
     * @param dict the dictionary to write.
     * @param formatOptions the format options; the version must be {@link FormatSpec#VERSION4}.
     * @throws IOException if the output files cannot be created or written.
     * @throws UnsupportedFormatException if the version is not 4 or if the path given to the
     *         constructor is not a directory.
     */
    @Override
    public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
            throws IOException, UnsupportedFormatException {
        if (formatOptions.mVersion != FormatSpec.VERSION4) {
            throw new UnsupportedFormatException("File header has a wrong version number : "
                    + formatOptions.mVersion);
        }
        if (!mDictPlacedDir.isDirectory()) {
            throw new UnsupportedFormatException("Given path is not a directory.");
        }

        try {
            if (mTrieOutStream == null) {
                openStreams(formatOptions, dict.mOptions);
            }

            BinaryDictEncoderUtils.writeDictionaryHeader(mTrieOutStream, dict, formatOptions);

            MakedictLog.i("Flattening the tree...");
            final ArrayList<PtNodeArray> flatNodes =
                    BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
            // Terminal ids are handed out in flattened order. They index the frequency table.
            int terminalCount = 0;
            for (final PtNodeArray array : flatNodes) {
                for (final PtNode node : array.mData) {
                    if (node.isTerminal()) {
                        node.mTerminalId = terminalCount++;
                    }
                }
            }

            MakedictLog.i("Computing addresses...");
            BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, formatOptions);
            if (MakedictLog.DBG) {
                BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
            }

            // The trie buffer ends where the last PtNode array ends.
            final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
            final int bufferSize =
                    lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
            mTrieBuf = new byte[bufferSize];
            mFreqBuf = new byte[terminalCount * FormatSpec.FREQUENCY_AND_FLAGS_SIZE];
            // Start from the beginning of the fresh buffer, even if this encoder was used before.
            mTriePos = 0;

            MakedictLog.i("Writing file...");
            for (final PtNodeArray nodeArray : flatNodes) {
                BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray,
                        formatOptions);
            }
            if (MakedictLog.DBG) {
                BinaryDictEncoderUtils.showStatistics(flatNodes);
                MakedictLog.i("has " + terminalCount + " terminals.");
            }
            mTrieOutStream.write(mTrieBuf);
            mFreqOutStream.write(mFreqBuf);

            MakedictLog.i("Done");
        } finally {
            // Release the file handles whether or not writing succeeded.
            close();
        }
    }

    /**
     * Moves the write position inside the trie buffer.
     *
     * <p>The request is ignored when no buffer has been allocated yet or when the position
     * lies outside of the buffer.
     *
     * @param position the new position, in bytes from the beginning of the trie buffer.
     */
    @Override
    public void setPosition(int position) {
        if (mTrieBuf == null || position < 0 || position >= mTrieBuf.length) return;
        mTriePos = position;
    }

    /**
     * @return the current write position inside the trie buffer.
     */
    @Override
    public int getPosition() {
        return mTriePos;
    }

    /**
     * Writes the number of PtNodes of a PtNode array at the current position.
     *
     * @param ptNodeCount the number of PtNodes.
     * @throws RuntimeException if the count cannot be stored on one or two bytes.
     */
    @Override
    public void writePtNodeCount(int ptNodeCount) {
        final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
        // ptNodeCount must fit on one byte or two bytes.
        // Please see comments in FormatSpec
        if (countSize != 1 && countSize != 2) {
            throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
        }
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, ptNodeCount,
                countSize);
    }

    /**
     * Writes the flags byte of a PtNode at the current position.
     *
     * @param ptNode the PtNode whose flags are written.
     * @param parentAddress currently unused by this method.
     * @param formatOptions the format options.
     */
    private void writePtNodeFlags(final PtNode ptNode, final int parentAddress,
            final FormatOptions formatOptions) {
        final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
                BinaryDictEncoderUtils.makePtNodeFlags(ptNode, mTriePos, childrenPos,
                        formatOptions),
                FormatSpec.PTNODE_FLAGS_SIZE);
    }

    /**
     * Writes the parent position of a PtNode.
     *
     * <p>Unless it is {@link FormatSpec#NO_PARENT_ADDRESS}, the position is stored relative to
     * the address of the PtNode itself.
     *
     * @param parentPos the absolute parent position, or {@link FormatSpec#NO_PARENT_ADDRESS}.
     * @param ptNode the PtNode being written.
     * @param formatOptions the format options.
     */
    private void writeParentPosition(int parentPos, final PtNode ptNode,
            final FormatOptions formatOptions) {
        if (parentPos != FormatSpec.NO_PARENT_ADDRESS) {
            parentPos -= ptNode.mCachedAddressAfterUpdate;
        }
        mTriePos = BinaryDictEncoderUtils.writeParentAddress(mTrieBuf, mTriePos, parentPos,
                formatOptions);
    }

    /**
     * Writes the characters of a PtNode, followed by a terminator when there are several.
     *
     * @param characters the code points of the PtNode.
     * @param hasSeveralChars whether the PtNode holds more than one character.
     */
    private void writeCharacters(final int[] characters, final boolean hasSeveralChars) {
        mTriePos = CharEncoding.writeCharArray(characters, mTrieBuf, mTriePos);
        if (hasSeveralChars) {
            mTrieBuf[mTriePos++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
        }
    }

    /**
     * Writes a terminal id on {@link FormatSpec#PTNODE_TERMINAL_ID_SIZE} bytes.
     *
     * @param terminalId the id of the terminal.
     */
    private void writeTerminalId(final int terminalId) {
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, terminalId,
                FormatSpec.PTNODE_TERMINAL_ID_SIZE);
    }

    /**
     * Stores the frequency of a terminal in the frequency table.
     *
     * @param frequency the frequency of the terminal.
     * @param terminalId the id of the terminal, which selects the slot in the table.
     */
    private void writeFrequency(final int frequency, final int terminalId) {
        final int freqPos = terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE;
        BinaryDictEncoderUtils.writeUIntToBuffer(mFreqBuf, freqPos, frequency,
                FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
    }

    /**
     * Writes the children position of a PtNode, signed when dynamic update is supported.
     *
     * @param ptNode the PtNode whose children position is written.
     * @param formatOptions the format options.
     */
    private void writeChildrenPosition(PtNode ptNode, FormatOptions formatOptions) {
        final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
        // NOTE(review): both helpers appear to return the number of bytes written - confirm.
        if (formatOptions.mSupportsDynamicUpdate) {
            mTriePos += BinaryDictEncoderUtils.writeSignedChildrenPosition(mTrieBuf,
                    mTriePos, childrenPos);
        } else {
            mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf,
                    mTriePos, childrenPos);
        }
    }

    /**
     * Writes a shortcut list: its byte size followed by a flags byte and a string per target.
     *
     * <p>Nothing is written when the list is null or empty.
     *
     * @param shortcuts the shortcut targets, possibly null.
     * @throws RuntimeException if the list does not fit in
     *         {@link FormatSpec#MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE} bytes.
     */
    private void writeShortcuts(ArrayList<WeightedString> shortcuts) {
        if (null == shortcuts || shortcuts.isEmpty()) return;

        // Leave room for the size. It is only known once every target has been written.
        final int indexOfShortcutByteSize = mTriePos;
        mTriePos += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
        final Iterator<WeightedString> shortcutIterator = shortcuts.iterator();
        while (shortcutIterator.hasNext()) {
            final WeightedString target = shortcutIterator.next();
            final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags(
                    shortcutIterator.hasNext(),
                    target.mFrequency);
            mTrieBuf[mTriePos++] = (byte)shortcutFlags;
            final int shortcutShift = CharEncoding.writeString(mTrieBuf, mTriePos,
                    target.mWord);
            mTriePos += shortcutShift;
        }
        // The size counts the size field itself.
        final int shortcutByteSize = mTriePos - indexOfShortcutByteSize;
        if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) {
            throw new RuntimeException("Shortcut list too large : " + shortcutByteSize);
        }
        BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, indexOfShortcutByteSize,
                shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE);
    }

    /**
     * Writes a bigram list: a flags byte and the offset to the target PtNode per bigram.
     *
     * <p>Nothing is written when the list is null.
     *
     * @param bigrams the bigrams, possibly null.
     * @param dict the dictionary in which the bigram targets are looked up.
     * @throws RuntimeException if a bigram target is not found in the dictionary.
     */
    private void writeBigrams(ArrayList<WeightedString> bigrams, FusionDictionary dict) {
        if (bigrams == null) return;

        final Iterator<WeightedString> bigramIterator = bigrams.iterator();
        while (bigramIterator.hasNext()) {
            final WeightedString bigram = bigramIterator.next();
            final PtNode target =
                    FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
            if (target == null) {
                throw new RuntimeException("Bigram target not found in the dictionary : "
                        + bigram.mWord);
            }
            final int addressOfBigram = target.mCachedAddressAfterUpdate;
            final int unigramFrequencyForThisWord = target.mFrequency;
            // The offset is relative to the byte that follows the flags byte.
            final int offset = addressOfBigram
                    - (mTriePos + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
            final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(
                    bigramIterator.hasNext(), offset, bigram.mFrequency,
                    unigramFrequencyForThisWord, bigram.mWord);
            mTrieBuf[mTriePos++] = (byte) bigramFlags;
            // Only the magnitude is written here.
            // NOTE(review): the sign is presumably carried by the flags - confirm.
            mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf,
                    mTriePos, Math.abs(offset));
        }
    }

    /**
     * Writes a forward link address at the current position.
     *
     * @param forwardLinkAddress the address to write.
     */
    @Override
    public void writeForwardLinkAddress(int forwardLinkAddress) {
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
                forwardLinkAddress, FormatSpec.FORWARD_LINK_ADDRESS_SIZE);
    }

    /**
     * Writes a whole PtNode at the current position.
     *
     * <p>The fields are written in this order: flags, parent position, characters, terminal id
     * (terminals only), children position, shortcuts, bigrams. The frequency of a terminal
     * goes to the frequency table instead of the trie.
     *
     * @param ptNode the PtNode to write.
     * @param parentPosition the position of the parent PtNode.
     * @param formatOptions the format options.
     * @param dict the dictionary, used to resolve bigram targets.
     */
    @Override
    public void writePtNode(final PtNode ptNode, final int parentPosition,
            final FormatOptions formatOptions, final FusionDictionary dict) {
        writePtNodeFlags(ptNode, parentPosition, formatOptions);
        writeParentPosition(parentPosition, ptNode, formatOptions);
        writeCharacters(ptNode.mChars, ptNode.hasSeveralChars());
        if (ptNode.isTerminal()) {
            writeTerminalId(ptNode.mTerminalId);
            writeFrequency(ptNode.mFrequency, ptNode.mTerminalId);
        }
        writeChildrenPosition(ptNode, formatOptions);
        writeShortcuts(ptNode.mShortcutTargets);
        writeBigrams(ptNode.mBigrams, dict);
    }
}