diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java')
-rw-r--r-- | java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java | 340 |
1 files changed, 116 insertions, 224 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java index 3be62f066..734223ec2 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java @@ -40,52 +40,26 @@ import java.util.Arrays; public class Ver4DictDecoder extends AbstractDictDecoder { private static final String TAG = Ver4DictDecoder.class.getSimpleName(); - protected static final int FILETYPE_TRIE = 1; - protected static final int FILETYPE_FREQUENCY = 2; - protected static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; - protected static final int FILETYPE_BIGRAM_FREQ = 4; - protected static final int FILETYPE_SHORTCUT = 5; - protected static final int FILETYPE_HEADER = 6; - - protected final File mDictDirectory; - protected final DictionaryBufferFactory mBufferFactory; + private static final int FILETYPE_TRIE = 1; + private static final int FILETYPE_FREQUENCY = 2; + private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; + private static final int FILETYPE_BIGRAM_FREQ = 4; + private static final int FILETYPE_SHORTCUT = 5; + + private final File mDictDirectory; + private final DictionaryBufferFactory mBufferFactory; protected DictBuffer mDictBuffer; - protected DictBuffer mHeaderBuffer; - protected DictBuffer mFrequencyBuffer; - protected DictBuffer mTerminalAddressTableBuffer; - private BigramContentReader mBigramReader; - private ShortcutContentReader mShortcutReader; - - /** - * Raw PtNode info straight out of a trie file in version 4 dictionary. - */ - protected static final class Ver4PtNodeInfo { - public final int mFlags; - public final int[] mCharacters; - public final int mTerminalId; - public final int mChildrenPos; - public final int mParentPos; - public final int mNodeSize; - public int mStartIndexOfCharacters; - public int mEndIndexOfCharacters; // exclusive - - public Ver4PtNodeInfo(final int flags, final int[] characters, final int terminalId, - final int childrenPos, final int parentPos, final int nodeSize) { - mFlags = flags; - mCharacters = characters; - mTerminalId = terminalId; - mChildrenPos = childrenPos; - mParentPos = parentPos; - mNodeSize = nodeSize; - mStartIndexOfCharacters = 0; - mEndIndexOfCharacters = characters.length; - } - } + private DictBuffer mFrequencyBuffer; + private DictBuffer mTerminalAddressTableBuffer; + private DictBuffer mBigramBuffer; + private DictBuffer mShortcutBuffer; + private SparseTable mBigramAddressTable; + private SparseTable mShortcutAddressTable; @UsedForTesting /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) { mDictDirectory = dictDirectory; - mDictBuffer = mHeaderBuffer = mFrequencyBuffer = null; + mDictBuffer = mFrequencyBuffer = null; if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) { mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); @@ -102,16 +76,13 @@ public class Ver4DictDecoder extends AbstractDictDecoder { /* package */ Ver4DictDecoder(final File dictDirectory, final DictionaryBufferFactory factory) { mDictDirectory = dictDirectory; mBufferFactory = factory; - mDictBuffer = mHeaderBuffer = mFrequencyBuffer = null; + mDictBuffer = mFrequencyBuffer = null; } - protected File getFile(final int fileType) throws UnsupportedFormatException { + private File getFile(final int fileType) { if (fileType == FILETYPE_TRIE) { return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.TRIE_FILE_EXTENSION); - } else if (fileType == FILETYPE_HEADER) { - return new File(mDictDirectory, - mDictDirectory.getName() + FormatSpec.HEADER_FILE_EXTENSION); } else if (fileType == FILETYPE_FREQUENCY) { return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.FREQ_FILE_EXTENSION); @@ -127,27 +98,20 @@ public class Ver4DictDecoder extends AbstractDictDecoder { mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.SHORTCUT_CONTENT_ID); } else { - throw new UnsupportedFormatException("Unsupported kind of file : " + fileType); + throw new RuntimeException("Unsupported kind of file : " + fileType); } } @Override - public void openDictBuffer() throws FileNotFoundException, IOException, - UnsupportedFormatException { - if (!mDictDirectory.isDirectory()) { - throw new UnsupportedFormatException("Format 4 dictionary needs a directory"); - } - mHeaderBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_HEADER)); + public void openDictBuffer() throws FileNotFoundException, IOException { mDictBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_TRIE)); mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY)); mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer( getFile(FILETYPE_TERMINAL_ADDRESS_TABLE)); - mBigramReader = new BigramContentReader(mDictDirectory.getName(), - mDictDirectory, mBufferFactory, false); - mBigramReader.openBuffers(); - mShortcutReader = new ShortcutContentReader(mDictDirectory.getName(), mDictDirectory, - mBufferFactory); - mShortcutReader.openBuffers(); + mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ)); + loadBigramAddressSparseTable(); + mShortcutBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_SHORTCUT)); + loadShortcutAddressSparseTable(); } @Override @@ -155,134 +119,46 @@ public class Ver4DictDecoder extends AbstractDictDecoder { return mDictBuffer != null; } - @UsedForTesting - /* package */ DictBuffer getHeaderBuffer() { - return mHeaderBuffer; - } - - @UsedForTesting /* package */ DictBuffer getDictBuffer() { return mDictBuffer; } @Override public FileHeader readHeader() throws IOException, UnsupportedFormatException { - if (mHeaderBuffer == null) { + if (mDictBuffer == null) { openDictBuffer(); } - mHeaderBuffer.position(0); - final FileHeader header = super.readHeader(mHeaderBuffer); + final FileHeader header = super.readHeader(mDictBuffer); final int version = header.mFormatOptions.mVersion; - if (version != FormatSpec.VERSION4) { + if (version != 4) { throw new UnsupportedFormatException("File header has a wrong version : " + version); } return header; } - /** - * An auxiliary class for reading bigrams. - */ - protected static class BigramContentReader extends SparseTableContentReader { - public BigramContentReader(final String name, final File baseDir, - final DictionaryBufferFactory factory, final boolean hasTimestamp) { - super(name + FormatSpec.BIGRAM_FILE_EXTENSION, - FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir, - getContentFilenames(name, hasTimestamp), getContentIds(hasTimestamp), factory); - } - - // TODO: Consolidate this method and BigramContentWriter.getContentFilenames. - protected static String[] getContentFilenames(final String name, - final boolean hasTimestamp) { - final String[] contentFilenames; - if (hasTimestamp) { - contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION, - name + FormatSpec.BIGRAM_FILE_EXTENSION }; - } else { - contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION }; - } - return contentFilenames; - } - - // TODO: Consolidate this method and BigramContentWriter.getContentIds. - protected static String[] getContentIds(final boolean hasTimestamp) { - final String[] contentIds; - if (hasTimestamp) { - contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID, - FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID }; - } else { - contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID }; - } - return contentIds; - } - - public ArrayList<PendingAttribute> readTargetsAndFrequencies(final int terminalId, - final DictBuffer terminalAddressTableBuffer) { - final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList(); - read(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, - new SparseTableContentReaderInterface() { - @Override - public void read(final DictBuffer buffer) { - while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE, - // remaining bigram entries are ignored. - final int bigramFlags = buffer.readUnsignedByte(); - final int targetTerminalId = buffer.readUnsignedInt24(); - terminalAddressTableBuffer.position(targetTerminalId - * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); - final int targetAddress = - terminalAddressTableBuffer.readUnsignedInt24(); - bigrams.add(new PendingAttribute(bigramFlags - & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, - targetAddress)); - if (0 == (bigramFlags - & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) { - break; - } - } - if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - throw new RuntimeException("Too many bigrams in a PtNode (" - + bigrams.size() + " but max is " - + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); - } - } - }); - if (bigrams.isEmpty()) return null; - return bigrams; - } + private void loadBigramAddressSparseTable() throws IOException { + final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); + final File freqsFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + + FormatSpec.BIGRAM_FREQ_CONTENT_ID); + mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { freqsFile }, + FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE); } - /** - * An auxiliary class for reading shortcuts. - */ - protected static class ShortcutContentReader extends SparseTableContentReader { - public ShortcutContentReader(final String name, final File baseDir, - final DictionaryBufferFactory factory) { - super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, - FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir, - new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION }, - new String[] { FormatSpec.SHORTCUT_CONTENT_ID }, factory); - } - - public ArrayList<WeightedString> readShortcuts(final int terminalId) { - final ArrayList<WeightedString> shortcuts = CollectionUtils.newArrayList(); - read(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, - new SparseTableContentReaderInterface() { - @Override - public void read(final DictBuffer buffer) { - while (true) { - final int flags = buffer.readUnsignedByte(); - final String word = CharEncoding.readString(buffer); - shortcuts.add(new WeightedString(word, - flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); - if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) { - break; - } - } - } - }); - if (shortcuts.isEmpty()) return null; - return shortcuts; - } + // TODO: Let's have something like SparseTableContentsReader in this class. + private void loadShortcutAddressSparseTable() throws IOException { + final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); + final File contentFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + + FormatSpec.SHORTCUT_CONTENT_ID); + final File timestampsFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + + FormatSpec.SHORTCUT_CONTENT_ID); + mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile, + new File[] { contentFile, timestampsFile }, + FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE); } protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader { @@ -296,82 +172,102 @@ public class Ver4DictDecoder extends AbstractDictDecoder { } } - private final int[] mCharacterBufferForReadingVer4PtNodeInfo - = new int[FormatSpec.MAX_WORD_LENGTH]; + private ArrayList<WeightedString> readShortcuts(final int terminalId) { + if (mShortcutAddressTable.get(0, terminalId) == SparseTable.NOT_EXIST) return null; + + final ArrayList<WeightedString> ret = CollectionUtils.newArrayList(); + final int posOfShortcuts = mShortcutAddressTable.get(FormatSpec.SHORTCUT_CONTENT_INDEX, + terminalId); + mShortcutBuffer.position(posOfShortcuts); + while (true) { + final int flags = mShortcutBuffer.readUnsignedByte(); + final String word = CharEncoding.readString(mShortcutBuffer); + ret.add(new WeightedString(word, + flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); + if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; + } + return ret; + } - /** - * Reads PtNode from ptNodePos in the trie file and returns Ver4PtNodeInfo. - * - * @param ptNodePos the position of PtNode. - * @param options the format options. - * @return Ver4PtNodeInfo. - */ // TODO: Make this buffer thread safe. // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH. - protected Ver4PtNodeInfo readVer4PtNodeInfo(final int ptNodePos, final FormatOptions options) { - int readingPos = ptNodePos; + private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; + @Override + public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) { + int addressPointer = ptNodePos; final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); - readingPos += FormatSpec.PTNODE_FLAGS_SIZE; + addressPointer += FormatSpec.PTNODE_FLAGS_SIZE; - final int parentPos = PtNodeReader.readParentAddress(mDictBuffer, options); + final int parentAddress = PtNodeReader.readParentAddress(mDictBuffer, options); if (BinaryDictIOUtils.supportsDynamicUpdate(options)) { - readingPos += FormatSpec.PARENT_ADDRESS_SIZE; + addressPointer += FormatSpec.PARENT_ADDRESS_SIZE; } final int characters[]; if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) { int index = 0; int character = CharEncoding.readChar(mDictBuffer); - readingPos += CharEncoding.getCharSize(character); + addressPointer += CharEncoding.getCharSize(character); while (FormatSpec.INVALID_CHARACTER != character && index < FormatSpec.MAX_WORD_LENGTH) { - mCharacterBufferForReadingVer4PtNodeInfo[index++] = character; + mCharacterBuffer[index++] = character; character = CharEncoding.readChar(mDictBuffer); - readingPos += CharEncoding.getCharSize(character); + addressPointer += CharEncoding.getCharSize(character); } - characters = Arrays.copyOfRange(mCharacterBufferForReadingVer4PtNodeInfo, 0, index); + characters = Arrays.copyOfRange(mCharacterBuffer, 0, index); } else { final int character = CharEncoding.readChar(mDictBuffer); - readingPos += CharEncoding.getCharSize(character); + addressPointer += CharEncoding.getCharSize(character); characters = new int[] { character }; } final int terminalId; if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { terminalId = PtNodeReader.readTerminalId(mDictBuffer); - readingPos += FormatSpec.PTNODE_TERMINAL_ID_SIZE; + addressPointer += FormatSpec.PTNODE_TERMINAL_ID_SIZE; } else { terminalId = PtNode.NOT_A_TERMINAL; } - int childrenPos = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options); - if (childrenPos != FormatSpec.NO_CHILDREN_ADDRESS) { - childrenPos += readingPos; - } - readingPos += BinaryDictIOUtils.getChildrenAddressSize(flags, options); - - return new Ver4PtNodeInfo(flags, characters, terminalId, childrenPos, parentPos, - readingPos - ptNodePos); - } - - @Override - public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) { - final Ver4PtNodeInfo nodeInfo = readVer4PtNodeInfo(ptNodePos, options); - final int frequency; - if (0 != (FormatSpec.FLAG_IS_TERMINAL & nodeInfo.mFlags)) { - frequency = PtNodeReader.readFrequency(mFrequencyBuffer, nodeInfo.mTerminalId); + if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { + frequency = PtNodeReader.readFrequency(mFrequencyBuffer, terminalId); } else { frequency = PtNode.NOT_A_TERMINAL; } - - final ArrayList<WeightedString> shortcutTargets = mShortcutReader.readShortcuts( - nodeInfo.mTerminalId); - final ArrayList<PendingAttribute> bigrams = mBigramReader.readTargetsAndFrequencies( - nodeInfo.mTerminalId, mTerminalAddressTableBuffer); - - return new PtNodeInfo(ptNodePos, ptNodePos + nodeInfo.mNodeSize, nodeInfo.mFlags, - nodeInfo.mCharacters, frequency, nodeInfo.mParentPos, nodeInfo.mChildrenPos, - shortcutTargets, bigrams); + int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options); + if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { + childrenAddress += addressPointer; + } + addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); + final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId); + + final ArrayList<PendingAttribute> bigrams; + if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { + bigrams = new ArrayList<PendingAttribute>(); + final int posOfBigrams = mBigramAddressTable.get(0 /* contentTableIndex */, terminalId); + mBigramBuffer.position(posOfBigrams); + while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE, + // remaining bigram entries are ignored. + final int bigramFlags = mBigramBuffer.readUnsignedByte(); + final int targetTerminalId = mBigramBuffer.readUnsignedInt24(); + mTerminalAddressTableBuffer.position( + targetTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); + final int targetAddress = mTerminalAddressTableBuffer.readUnsignedInt24(); + bigrams.add(new PendingAttribute( + bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, + targetAddress)); + if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; + } + if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size() + + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); + } + } else { + bigrams = null; + } + return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, frequency, + parentAddress, childrenAddress, shortcutTargets, bigrams); } private void deleteDictFiles() { @@ -422,14 +318,10 @@ public class Ver4DictDecoder extends AbstractDictDecoder { @Override public boolean readAndFollowForwardLink() { - final int forwardLinkPos = mDictBuffer.position(); - int nextRelativePos = BinaryDictDecoderUtils.readSInt24(mDictBuffer); - if (nextRelativePos != FormatSpec.NO_FORWARD_LINK_ADDRESS) { - final int nextPos = forwardLinkPos + nextRelativePos; - if (nextPos >= 0 && nextPos < mDictBuffer.limit()) { - mDictBuffer.position(nextPos); - return true; - } + final int nextAddress = mDictBuffer.readUnsignedInt24(); + if (nextAddress >= 0 && nextAddress < mDictBuffer.limit()) { + mDictBuffer.position(nextAddress); + return true; } return false; } |