diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java')
-rw-r--r-- | java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java | 343 |
1 files changed, 227 insertions, 116 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java index 734223ec2..07522b54b 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java @@ -40,26 +40,52 @@ import java.util.Arrays; public class Ver4DictDecoder extends AbstractDictDecoder { private static final String TAG = Ver4DictDecoder.class.getSimpleName(); - private static final int FILETYPE_TRIE = 1; - private static final int FILETYPE_FREQUENCY = 2; - private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; - private static final int FILETYPE_BIGRAM_FREQ = 4; - private static final int FILETYPE_SHORTCUT = 5; - - private final File mDictDirectory; - private final DictionaryBufferFactory mBufferFactory; + protected static final int FILETYPE_TRIE = 1; + protected static final int FILETYPE_FREQUENCY = 2; + protected static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; + protected static final int FILETYPE_BIGRAM_FREQ = 4; + protected static final int FILETYPE_SHORTCUT = 5; + protected static final int FILETYPE_HEADER = 6; + + protected final File mDictDirectory; + protected final DictionaryBufferFactory mBufferFactory; protected DictBuffer mDictBuffer; - private DictBuffer mFrequencyBuffer; - private DictBuffer mTerminalAddressTableBuffer; - private DictBuffer mBigramBuffer; - private DictBuffer mShortcutBuffer; - private SparseTable mBigramAddressTable; - private SparseTable mShortcutAddressTable; + protected DictBuffer mHeaderBuffer; + protected DictBuffer mFrequencyBuffer; + protected DictBuffer mTerminalAddressTableBuffer; + private BigramContentReader mBigramReader; + private ShortcutContentReader mShortcutReader; + + /** + * Raw PtNode info straight out of a trie file in version 4 dictionary. + */ + protected static final class Ver4PtNodeInfo { + public final int mFlags; + public final int[] mCharacters; + public final int mTerminalId; + public final int mChildrenPos; + public final int mParentPos; + public final int mNodeSize; + public int mStartIndexOfCharacters; + public int mEndIndexOfCharacters; // exclusive + + public Ver4PtNodeInfo(final int flags, final int[] characters, final int terminalId, + final int childrenPos, final int parentPos, final int nodeSize) { + mFlags = flags; + mCharacters = characters; + mTerminalId = terminalId; + mChildrenPos = childrenPos; + mParentPos = parentPos; + mNodeSize = nodeSize; + mStartIndexOfCharacters = 0; + mEndIndexOfCharacters = characters.length; + } + } @UsedForTesting /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) { mDictDirectory = dictDirectory; - mDictBuffer = mFrequencyBuffer = null; + mDictBuffer = mHeaderBuffer = mFrequencyBuffer = null; if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) { mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory(); @@ -76,13 +102,16 @@ public class Ver4DictDecoder extends AbstractDictDecoder { /* package */ Ver4DictDecoder(final File dictDirectory, final DictionaryBufferFactory factory) { mDictDirectory = dictDirectory; mBufferFactory = factory; - mDictBuffer = mFrequencyBuffer = null; + mDictBuffer = mHeaderBuffer = mFrequencyBuffer = null; } - private File getFile(final int fileType) { + protected File getFile(final int fileType) throws UnsupportedFormatException { if (fileType == FILETYPE_TRIE) { return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.TRIE_FILE_EXTENSION); + } else if (fileType == FILETYPE_HEADER) { + return new File(mDictDirectory, + mDictDirectory.getName() + FormatSpec.HEADER_FILE_EXTENSION); } else if (fileType == FILETYPE_FREQUENCY) { return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.FREQ_FILE_EXTENSION); @@ -98,20 +127,27 @@ public class Ver4DictDecoder extends AbstractDictDecoder { mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.SHORTCUT_CONTENT_ID); } else { - throw new RuntimeException("Unsupported kind of file : " + fileType); + throw new UnsupportedFormatException("Unsupported kind of file : " + fileType); } } @Override - public void openDictBuffer() throws FileNotFoundException, IOException { + public void openDictBuffer() throws FileNotFoundException, IOException, + UnsupportedFormatException { + if (!mDictDirectory.isDirectory()) { + throw new UnsupportedFormatException("Format 4 dictionary needs a directory"); + } + mHeaderBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_HEADER)); mDictBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_TRIE)); mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY)); mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer( getFile(FILETYPE_TERMINAL_ADDRESS_TABLE)); - mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ)); - loadBigramAddressSparseTable(); - mShortcutBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_SHORTCUT)); - loadShortcutAddressSparseTable(); + mBigramReader = new BigramContentReader(mDictDirectory.getName(), + mDictDirectory, mBufferFactory, false); + mBigramReader.openBuffers(); + mShortcutReader = new ShortcutContentReader(mDictDirectory.getName(), mDictDirectory, + mBufferFactory); + mShortcutReader.openBuffers(); } @Override @@ -119,46 +155,137 @@ public class Ver4DictDecoder extends AbstractDictDecoder { return mDictBuffer != null; } + @UsedForTesting + /* package */ DictBuffer getHeaderBuffer() { + return mHeaderBuffer; + } + + @UsedForTesting /* package */ DictBuffer getDictBuffer() { return mDictBuffer; } @Override public FileHeader readHeader() throws IOException, UnsupportedFormatException { - if (mDictBuffer == null) { + if (mHeaderBuffer == null) { openDictBuffer(); } - final FileHeader header = super.readHeader(mDictBuffer); + mHeaderBuffer.position(0); + final FileHeader header = super.readHeader(mHeaderBuffer); final int version = header.mFormatOptions.mVersion; - if (version != 4) { + if (version != FormatSpec.VERSION4) { throw new UnsupportedFormatException("File header has a wrong version : " + version); } return header; } - private void loadBigramAddressSparseTable() throws IOException { - final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName() - + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); - final File freqsFile = new File(mDictDirectory, mDictDirectory.getName() - + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX - + FormatSpec.BIGRAM_FREQ_CONTENT_ID); - mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { freqsFile }, - FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE); + /** + * An auxiliary class for reading bigrams. + */ + protected static class BigramContentReader extends SparseTableContentReader { + private final boolean mHasTimestamp; + + public BigramContentReader(final String name, final File baseDir, + final DictionaryBufferFactory factory, final boolean hasTimestamp) { + super(name + FormatSpec.BIGRAM_FILE_EXTENSION, + FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir, + getContentFilenames(name, hasTimestamp), getContentIds(hasTimestamp), factory); + mHasTimestamp = hasTimestamp; + } + + // TODO: Consolidate this method and BigramContentWriter.getContentFilenames. + protected static String[] getContentFilenames(final String name, + final boolean hasTimestamp) { + final String[] contentFilenames; + if (hasTimestamp) { + contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION, + name + FormatSpec.BIGRAM_FILE_EXTENSION }; + } else { + contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION }; + } + return contentFilenames; + } + + // TODO: Consolidate this method and BigramContentWriter.getContentIds. + protected static String[] getContentIds(final boolean hasTimestamp) { + final String[] contentIds; + if (hasTimestamp) { + contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID, + FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID }; + } else { + contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID }; + } + return contentIds; + } + + public ArrayList<PendingAttribute> readTargetsAndFrequencies(final int terminalId, + final DictBuffer terminalAddressTableBuffer) { + final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList(); + read(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, + new SparseTableContentReaderInterface() { + @Override + public void read(final DictBuffer buffer) { + while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE, + // remaining bigram entries are ignored. + final int bigramFlags = buffer.readUnsignedByte(); + final int targetTerminalId = buffer.readUnsignedInt24(); + terminalAddressTableBuffer.position(targetTerminalId + * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); + final int targetAddress = + terminalAddressTableBuffer.readUnsignedInt24(); + bigrams.add(new PendingAttribute(bigramFlags + & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, + targetAddress)); + if (0 == (bigramFlags + & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) { + break; + } + } + if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + throw new RuntimeException("Too many bigrams in a PtNode (" + + bigrams.size() + " but max is " + + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); + } + } + }); + if (bigrams.isEmpty()) return null; + return bigrams; + } } - // TODO: Let's have something like SparseTableContentsReader in this class. - private void loadShortcutAddressSparseTable() throws IOException { - final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName() - + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); - final File contentFile = new File(mDictDirectory, mDictDirectory.getName() - + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX - + FormatSpec.SHORTCUT_CONTENT_ID); - final File timestampsFile = new File(mDictDirectory, mDictDirectory.getName() - + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX - + FormatSpec.SHORTCUT_CONTENT_ID); - mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile, - new File[] { contentFile, timestampsFile }, - FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE); + /** + * An auxiliary class for reading shortcuts. + */ + protected static class ShortcutContentReader extends SparseTableContentReader { + public ShortcutContentReader(final String name, final File baseDir, + final DictionaryBufferFactory factory) { + super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, + FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir, + new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION }, + new String[] { FormatSpec.SHORTCUT_CONTENT_ID }, factory); + } + + public ArrayList<WeightedString> readShortcuts(final int terminalId) { + final ArrayList<WeightedString> shortcuts = CollectionUtils.newArrayList(); + read(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, + new SparseTableContentReaderInterface() { + @Override + public void read(final DictBuffer buffer) { + while (true) { + final int flags = buffer.readUnsignedByte(); + final String word = CharEncoding.readString(buffer); + shortcuts.add(new WeightedString(word, + flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); + if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) { + break; + } + } + } + }); + if (shortcuts.isEmpty()) return null; + return shortcuts; + } } protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader { @@ -172,102 +299,82 @@ public class Ver4DictDecoder extends AbstractDictDecoder { } } - private ArrayList<WeightedString> readShortcuts(final int terminalId) { - if (mShortcutAddressTable.get(0, terminalId) == SparseTable.NOT_EXIST) return null; - - final ArrayList<WeightedString> ret = CollectionUtils.newArrayList(); - final int posOfShortcuts = mShortcutAddressTable.get(FormatSpec.SHORTCUT_CONTENT_INDEX, - terminalId); - mShortcutBuffer.position(posOfShortcuts); - while (true) { - final int flags = mShortcutBuffer.readUnsignedByte(); - final String word = CharEncoding.readString(mShortcutBuffer); - ret.add(new WeightedString(word, - flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); - if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; - } - return ret; - } + private final int[] mCharacterBufferForReadingVer4PtNodeInfo + = new int[FormatSpec.MAX_WORD_LENGTH]; + /** + * Reads PtNode from ptNodePos in the trie file and returns Ver4PtNodeInfo. + * + * @param ptNodePos the position of PtNode. + * @param options the format options. + * @return Ver4PtNodeInfo. + */ // TODO: Make this buffer thread safe. // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH. - private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; - @Override - public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) { - int addressPointer = ptNodePos; + protected Ver4PtNodeInfo readVer4PtNodeInfo(final int ptNodePos, final FormatOptions options) { + int readingPos = ptNodePos; final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); - addressPointer += FormatSpec.PTNODE_FLAGS_SIZE; + readingPos += FormatSpec.PTNODE_FLAGS_SIZE; - final int parentAddress = PtNodeReader.readParentAddress(mDictBuffer, options); + final int parentPos = PtNodeReader.readParentAddress(mDictBuffer, options); if (BinaryDictIOUtils.supportsDynamicUpdate(options)) { - addressPointer += FormatSpec.PARENT_ADDRESS_SIZE; + readingPos += FormatSpec.PARENT_ADDRESS_SIZE; } final int characters[]; if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) { int index = 0; int character = CharEncoding.readChar(mDictBuffer); - addressPointer += CharEncoding.getCharSize(character); + readingPos += CharEncoding.getCharSize(character); while (FormatSpec.INVALID_CHARACTER != character && index < FormatSpec.MAX_WORD_LENGTH) { - mCharacterBuffer[index++] = character; + mCharacterBufferForReadingVer4PtNodeInfo[index++] = character; character = CharEncoding.readChar(mDictBuffer); - addressPointer += CharEncoding.getCharSize(character); + readingPos += CharEncoding.getCharSize(character); } - characters = Arrays.copyOfRange(mCharacterBuffer, 0, index); + characters = Arrays.copyOfRange(mCharacterBufferForReadingVer4PtNodeInfo, 0, index); } else { final int character = CharEncoding.readChar(mDictBuffer); - addressPointer += CharEncoding.getCharSize(character); + readingPos += CharEncoding.getCharSize(character); characters = new int[] { character }; } final int terminalId; if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { terminalId = PtNodeReader.readTerminalId(mDictBuffer); - addressPointer += FormatSpec.PTNODE_TERMINAL_ID_SIZE; + readingPos += FormatSpec.PTNODE_TERMINAL_ID_SIZE; } else { terminalId = PtNode.NOT_A_TERMINAL; } + int childrenPos = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options); + if (childrenPos != FormatSpec.NO_CHILDREN_ADDRESS) { + childrenPos += readingPos; + } + readingPos += BinaryDictIOUtils.getChildrenAddressSize(flags, options); + + return new Ver4PtNodeInfo(flags, characters, terminalId, childrenPos, parentPos, + readingPos - ptNodePos); + } + + @Override + public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) { + final Ver4PtNodeInfo nodeInfo = readVer4PtNodeInfo(ptNodePos, options); + final int frequency; - if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { - frequency = PtNodeReader.readFrequency(mFrequencyBuffer, terminalId); + if (0 != (FormatSpec.FLAG_IS_TERMINAL & nodeInfo.mFlags)) { + frequency = PtNodeReader.readFrequency(mFrequencyBuffer, nodeInfo.mTerminalId); } else { frequency = PtNode.NOT_A_TERMINAL; } - int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options); - if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { - childrenAddress += addressPointer; - } - addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); - final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId); - - final ArrayList<PendingAttribute> bigrams; - if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { - bigrams = new ArrayList<PendingAttribute>(); - final int posOfBigrams = mBigramAddressTable.get(0 /* contentTableIndex */, terminalId); - mBigramBuffer.position(posOfBigrams); - while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE, - // remaining bigram entries are ignored. - final int bigramFlags = mBigramBuffer.readUnsignedByte(); - final int targetTerminalId = mBigramBuffer.readUnsignedInt24(); - mTerminalAddressTableBuffer.position( - targetTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); - final int targetAddress = mTerminalAddressTableBuffer.readUnsignedInt24(); - bigrams.add(new PendingAttribute( - bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, - targetAddress)); - if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; - } - if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size() - + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); - } - } else { - bigrams = null; - } - return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, frequency, - parentAddress, childrenAddress, shortcutTargets, bigrams); + + final ArrayList<WeightedString> shortcutTargets = mShortcutReader.readShortcuts( + nodeInfo.mTerminalId); + final ArrayList<PendingAttribute> bigrams = mBigramReader.readTargetsAndFrequencies( + nodeInfo.mTerminalId, mTerminalAddressTableBuffer); + + return new PtNodeInfo(ptNodePos, ptNodePos + nodeInfo.mNodeSize, nodeInfo.mFlags, + nodeInfo.mCharacters, frequency, nodeInfo.mParentPos, nodeInfo.mChildrenPos, + shortcutTargets, bigrams); } private void deleteDictFiles() { @@ -318,10 +425,14 @@ public class Ver4DictDecoder extends AbstractDictDecoder { @Override public boolean readAndFollowForwardLink() { - final int nextAddress = mDictBuffer.readUnsignedInt24(); - if (nextAddress >= 0 && nextAddress < mDictBuffer.limit()) { - mDictBuffer.position(nextAddress); - return true; + final int forwardLinkPos = mDictBuffer.position(); + int nextRelativePos = BinaryDictDecoderUtils.readSInt24(mDictBuffer); + if (nextRelativePos != FormatSpec.NO_FORWARD_LINK_ADDRESS) { + final int nextPos = forwardLinkPos + nextRelativePos; + if (nextPos >= 0 && nextPos < mDictBuffer.limit()) { + mDictBuffer.position(nextPos); + return true; + } } return false; } |