aboutsummaryrefslogtreecommitdiffstats
path: root/java/src
diff options
context:
space:
mode:
authorKeisuke Kuroyanagi <ksk@google.com>2014-02-06 13:00:09 +0000
committerAndroid (Google) Code Review <android-gerrit@google.com>2014-02-06 13:00:09 +0000
commit6422f77e9497120fe6b2d7ec012c214c06829bab (patch)
tree16d17e911f015c995c321c8442eb034e4c647168 /java/src
parentfc6d0f8738193057296db4bdb06e3f9be14192c5 (diff)
parentab6a93773ba3cbe93002bc37b6b61f874fc09144 (diff)
downloadlatinime-6422f77e9497120fe6b2d7ec012c214c06829bab.tar.gz
latinime-6422f77e9497120fe6b2d7ec012c214c06829bab.tar.xz
latinime-6422f77e9497120fe6b2d7ec012c214c06829bab.zip
Merge "Use native logic to read Ver4 dict."
Diffstat (limited to 'java/src')
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java46
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java22
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/SparseTable.java223
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/SparseTableContentReader.java126
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/SparseTableContentWriter.java93
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java448
6 files changed, 105 insertions, 853 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java
index 5c7c4b8e3..e7d1c98a9 100644
--- a/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java
+++ b/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java
@@ -22,6 +22,7 @@ import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
@@ -223,4 +224,49 @@ public abstract class AbstractDictDecoder implements DictDecoder {
public boolean hasValidRawBinaryDictionary() {
return checkHeader() == SUCCESS;
}
+
+ // Placeholder implementations below. These are actually unused.
+ @Override
+ public void openDictBuffer() throws FileNotFoundException, IOException,
+ UnsupportedFormatException {
+ }
+
+ @Override
+ public boolean isDictBufferOpen() {
+ return false;
+ }
+
+ @Override
+ public PtNodeInfo readPtNode(final int ptNodePos, final FormatOptions options) {
+ return null;
+ }
+
+ @Override
+ public void setPosition(int newPos) {
+ }
+
+ @Override
+ public int getPosition() {
+ return 0;
+ }
+
+ @Override
+ public int readPtNodeCount() {
+ return 0;
+ }
+
+ @Override
+ public boolean readAndFollowForwardLink() {
+ return false;
+ }
+
+ @Override
+ public boolean hasNextPtNodeArray() {
+ return false;
+ }
+
+ @Override
+ @UsedForTesting
+ public void skipPtNode(final FormatOptions formatOptions) {
+ }
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
index e9561afd3..ca4a2e9bb 100644
--- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
+++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
@@ -436,25 +436,25 @@ public final class FusionDictionary implements Iterable<WordProperty> {
/**
* Helper method to add a new bigram to the dictionary.
*
- * @param word1 the previous word of the context
- * @param word2 the next word of the context
+ * @param word0 the previous word of the context
+ * @param word1 the next word of the context
* @param frequency the bigram frequency
*/
- public void setBigram(final String word1, final String word2, final int frequency) {
- PtNode ptNode = findWordInTree(mRootNodeArray, word1);
- if (ptNode != null) {
- final PtNode ptNode2 = findWordInTree(mRootNodeArray, word2);
- if (ptNode2 == null) {
- add(getCodePoints(word2), 0, null, false /* isNotAWord */,
+ public void setBigram(final String word0, final String word1, final int frequency) {
+ PtNode ptNode0 = findWordInTree(mRootNodeArray, word0);
+ if (ptNode0 != null) {
+ final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1);
+ if (ptNode1 == null) {
+ add(getCodePoints(word1), 0, null, false /* isNotAWord */,
false /* isBlacklistEntry */);
// The PtNode for the first word may have moved by the above insertion,
// if word0 and word1 share a common stem that happens not to have been
// a cutting point until now. In this case, we need to refresh ptNode0.
- ptNode = findWordInTree(mRootNodeArray, word1);
+ ptNode0 = findWordInTree(mRootNodeArray, word0);
}
- ptNode.addBigram(word2, frequency);
+ ptNode0.addBigram(word1, frequency);
} else {
- throw new RuntimeException("First word of bigram not found");
+ throw new RuntimeException("First word of bigram not found " + word0);
}
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/SparseTable.java b/java/src/com/android/inputmethod/latin/makedict/SparseTable.java
deleted file mode 100644
index 7592a0c13..000000000
--- a/java/src/com/android/inputmethod/latin/makedict/SparseTable.java
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.android.inputmethod.latin.makedict;
-
-import com.android.inputmethod.annotations.UsedForTesting;
-import com.android.inputmethod.latin.utils.CollectionUtils;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-
-/**
- * SparseTable is an extensible map from integer to integer.
- * This holds one value for every mBlockSize keys, so it uses 1/mBlockSize'th of the full index
- * memory.
- */
-@UsedForTesting
-public class SparseTable {
-
- /**
- * mLookupTable is indexed by terminal ID, containing exactly one entry for every mBlockSize
- * terminals.
- * It contains at index i = j / mBlockSize the index in each ArrayList in mContentsTables where
- * the values for terminals with IDs j to j + mBlockSize - 1 are stored as an mBlockSize-sized
- * integer array.
- */
- private final ArrayList<Integer> mLookupTable;
- private final ArrayList<ArrayList<Integer>> mContentTables;
-
- private final int mBlockSize;
- private final int mContentTableCount;
- public static final int NOT_EXIST = -1;
- public static final int SIZE_OF_INT_IN_BYTES = 4;
-
- @UsedForTesting
- public SparseTable(final int initialCapacity, final int blockSize,
- final int contentTableCount) {
- mBlockSize = blockSize;
- final int lookupTableSize = initialCapacity / mBlockSize
- + (initialCapacity % mBlockSize > 0 ? 1 : 0);
- mLookupTable = new ArrayList<Integer>(Collections.nCopies(lookupTableSize, NOT_EXIST));
- mContentTableCount = contentTableCount;
- mContentTables = CollectionUtils.newArrayList();
- for (int i = 0; i < mContentTableCount; ++i) {
- mContentTables.add(new ArrayList<Integer>());
- }
- }
-
- @UsedForTesting
- public SparseTable(final ArrayList<Integer> lookupTable,
- final ArrayList<ArrayList<Integer>> contentTables, final int blockSize) {
- mBlockSize = blockSize;
- mContentTableCount = contentTables.size();
- mLookupTable = lookupTable;
- mContentTables = contentTables;
- }
-
- /**
- * Converts an byte array to an int array considering each set of 4 bytes is an int stored in
- * big-endian.
- * The length of byteArray must be a multiple of four.
- * Otherwise, IndexOutOfBoundsException will be raised.
- */
- @UsedForTesting
- private static ArrayList<Integer> convertByteArrayToIntegerArray(final byte[] byteArray) {
- final ArrayList<Integer> integerArray = new ArrayList<Integer>(byteArray.length / 4);
- for (int i = 0; i < byteArray.length; i += 4) {
- int value = 0;
- for (int j = i; j < i + 4; ++j) {
- value <<= 8;
- value |= byteArray[j] & 0xFF;
- }
- integerArray.add(value);
- }
- return integerArray;
- }
-
- @UsedForTesting
- public int get(final int contentTableIndex, final int index) {
- if (!contains(index)) {
- return NOT_EXIST;
- }
- return mContentTables.get(contentTableIndex).get(
- mLookupTable.get(index / mBlockSize) + (index % mBlockSize));
- }
-
- @UsedForTesting
- public ArrayList<Integer> getAll(final int index) {
- final ArrayList<Integer> ret = CollectionUtils.newArrayList();
- for (int i = 0; i < mContentTableCount; ++i) {
- ret.add(get(i, index));
- }
- return ret;
- }
-
- @UsedForTesting
- public void set(final int contentTableIndex, final int index, final int value) {
- if (mLookupTable.get(index / mBlockSize) == NOT_EXIST) {
- mLookupTable.set(index / mBlockSize, mContentTables.get(contentTableIndex).size());
- for (int i = 0; i < mContentTableCount; ++i) {
- for (int j = 0; j < mBlockSize; ++j) {
- mContentTables.get(i).add(NOT_EXIST);
- }
- }
- }
- mContentTables.get(contentTableIndex).set(
- mLookupTable.get(index / mBlockSize) + (index % mBlockSize), value);
- }
-
- public void remove(final int indexOfContent, final int index) {
- set(indexOfContent, index, NOT_EXIST);
- }
-
- @UsedForTesting
- public int size() {
- return mLookupTable.size() * mBlockSize;
- }
-
- @UsedForTesting
- /* package */ int getContentTableSize() {
- // This class always has at least one content table.
- return mContentTables.get(0).size();
- }
-
- @UsedForTesting
- /* package */ int getLookupTableSize() {
- return mLookupTable.size();
- }
-
- public boolean contains(final int index) {
- if (index < 0 || index / mBlockSize >= mLookupTable.size()
- || mLookupTable.get(index / mBlockSize) == NOT_EXIST) {
- return false;
- }
- return true;
- }
-
- @UsedForTesting
- public void write(final OutputStream lookupOutStream, final OutputStream[] contentOutStreams)
- throws IOException {
- if (contentOutStreams.length != mContentTableCount) {
- throw new RuntimeException(contentOutStreams.length + " streams are given, but the"
- + " table has " + mContentTableCount + " content tables.");
- }
- for (final int index : mLookupTable) {
- BinaryDictEncoderUtils.writeUIntToStream(lookupOutStream, index, SIZE_OF_INT_IN_BYTES);
- }
-
- for (int i = 0; i < contentOutStreams.length; ++i) {
- for (final int data : mContentTables.get(i)) {
- BinaryDictEncoderUtils.writeUIntToStream(contentOutStreams[i], data,
- SIZE_OF_INT_IN_BYTES);
- }
- }
- }
-
- @UsedForTesting
- public void writeToFiles(final File lookupTableFile, final File[] contentFiles)
- throws IOException {
- FileOutputStream lookupTableOutStream = null;
- final FileOutputStream[] contentTableOutStreams = new FileOutputStream[mContentTableCount];
- try {
- lookupTableOutStream = new FileOutputStream(lookupTableFile);
- for (int i = 0; i < contentFiles.length; ++i) {
- contentTableOutStreams[i] = new FileOutputStream(contentFiles[i]);
- }
- write(lookupTableOutStream, contentTableOutStreams);
- } finally {
- if (lookupTableOutStream != null) {
- lookupTableOutStream.close();
- }
- for (int i = 0; i < contentTableOutStreams.length; ++i) {
- if (contentTableOutStreams[i] != null) {
- contentTableOutStreams[i].close();
- }
- }
- }
- }
-
- private static byte[] readFileToByteArray(final File file) throws IOException {
- final byte[] contents = new byte[(int) file.length()];
- FileInputStream inStream = null;
- try {
- inStream = new FileInputStream(file);
- inStream.read(contents);
- } finally {
- if (inStream != null) {
- inStream.close();
- }
- }
- return contents;
- }
-
- @UsedForTesting
- public static SparseTable readFromFiles(final File lookupTableFile, final File[] contentFiles,
- final int blockSize) throws IOException {
- final ArrayList<ArrayList<Integer>> contentTables =
- new ArrayList<ArrayList<Integer>>(contentFiles.length);
- for (int i = 0; i < contentFiles.length; ++i) {
- contentTables.add(convertByteArrayToIntegerArray(readFileToByteArray(contentFiles[i])));
- }
- return new SparseTable(convertByteArrayToIntegerArray(readFileToByteArray(lookupTableFile)),
- contentTables, blockSize);
- }
-}
diff --git a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentReader.java b/java/src/com/android/inputmethod/latin/makedict/SparseTableContentReader.java
deleted file mode 100644
index 63e1f56f5..000000000
--- a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentReader.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.android.inputmethod.latin.makedict;
-
-import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
-import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-
-/**
- * An auxiliary class for reading SparseTable and data written by SparseTableContentWriter.
- */
-public class SparseTableContentReader {
-
- /**
- * An interface of a function which is passed to SparseTableContentReader.read.
- */
- public interface SparseTableContentReaderInterface {
- /**
- * Reads data.
- *
- * @param buffer the DictBuffer. The position of the buffer is set to the head of data.
- */
- public void read(final DictBuffer buffer);
- }
-
- protected final int mContentCount;
- protected final int mBlockSize;
- protected final File mBaseDir;
- protected final File mLookupTableFile;
- protected final File[] mAddressTableFiles;
- protected final File[] mContentFiles;
- protected DictBuffer mLookupTableBuffer;
- protected final DictBuffer[] mAddressTableBuffers;
- private final DictBuffer[] mContentBuffers;
- protected final DictionaryBufferFactory mFactory;
-
- /**
- * Sole constructor of SparseTableContentReader.
- *
- * @param name the name of SparseTable.
- * @param blockSize the block size of the content table.
- * @param baseDir the directory which contains the files of the content table.
- * @param contentFilenames the file names of content files.
- * @param contentSuffixes the ids of contents. These ids are used for a suffix of a name of
- * address files and content files.
- * @param factory the DictionaryBufferFactory which is used for opening the files.
- */
- public SparseTableContentReader(final String name, final int blockSize, final File baseDir,
- final String[] contentFilenames, final String[] contentSuffixes,
- final DictionaryBufferFactory factory) {
- if (contentFilenames.length != contentSuffixes.length) {
- throw new RuntimeException("The length of contentFilenames and the length of"
- + " contentSuffixes are different " + contentFilenames.length + ", "
- + contentSuffixes.length);
- }
- mBlockSize = blockSize;
- mBaseDir = baseDir;
- mFactory = factory;
- mContentCount = contentFilenames.length;
- mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
- mAddressTableFiles = new File[mContentCount];
- mContentFiles = new File[mContentCount];
- for (int i = 0; i < mContentCount; ++i) {
- mAddressTableFiles[i] = new File(mBaseDir,
- name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentSuffixes[i]);
- mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentSuffixes[i]);
- }
- mAddressTableBuffers = new DictBuffer[mContentCount];
- mContentBuffers = new DictBuffer[mContentCount];
- }
-
- public void openBuffers() throws FileNotFoundException, IOException {
- mLookupTableBuffer = mFactory.getDictionaryBuffer(mLookupTableFile);
- for (int i = 0; i < mContentCount; ++i) {
- mAddressTableBuffers[i] = mFactory.getDictionaryBuffer(mAddressTableFiles[i]);
- mContentBuffers[i] = mFactory.getDictionaryBuffer(mContentFiles[i]);
- }
- }
-
- /**
- * Calls the read() callback of the reader with the appropriate buffer appropriately positioned.
- * @param contentNumber the index in the original contentFilenames[] array.
- * @param terminalId the terminal ID to read.
- * @param reader the reader on which to call the callback.
- */
- protected void read(final int contentNumber, final int terminalId,
- final SparseTableContentReaderInterface reader) {
- if (terminalId < 0 || (terminalId / mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES
- >= mLookupTableBuffer.limit()) {
- return;
- }
-
- mLookupTableBuffer.position((terminalId / mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES);
- final int indexInAddressTable = mLookupTableBuffer.readInt();
- if (indexInAddressTable == SparseTable.NOT_EXIST) {
- return;
- }
-
- mAddressTableBuffers[contentNumber].position(SparseTable.SIZE_OF_INT_IN_BYTES
- * ((indexInAddressTable * mBlockSize) + (terminalId % mBlockSize)));
- final int address = mAddressTableBuffers[contentNumber].readInt();
- if (address == SparseTable.NOT_EXIST) {
- return;
- }
-
- mContentBuffers[contentNumber].position(address);
- reader.read(mContentBuffers[contentNumber]);
- }
-} \ No newline at end of file
diff --git a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentWriter.java b/java/src/com/android/inputmethod/latin/makedict/SparseTableContentWriter.java
deleted file mode 100644
index 49f0fd624..000000000
--- a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentWriter.java
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.android.inputmethod.latin.makedict;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-
-/**
- * An auxiliary class for writing data associated with SparseTable to files.
- */
-public class SparseTableContentWriter {
- public interface SparseTableContentWriterInterface {
- public void write(final OutputStream outStream) throws IOException;
- }
-
- private final int mContentCount;
- private final SparseTable mSparseTable;
- private final File mLookupTableFile;
- protected final File mBaseDir;
- private final File[] mAddressTableFiles;
- private final File[] mContentFiles;
- protected final OutputStream[] mContentOutStreams;
-
- /**
- * Sole constructor of SparseTableContentWriter.
- *
- * @param name the name of SparseTable.
- * @param initialCapacity the initial capacity of SparseTable.
- * @param blockSize the block size of the content table.
- * @param baseDir the directory which contains the files of the content table.
- * @param contentFilenames the file names of content files.
- * @param contentIds the ids of contents. These ids are used for a suffix of a name of address
- * files and content files.
- */
- public SparseTableContentWriter(final String name, final int initialCapacity,
- final int blockSize, final File baseDir, final String[] contentFilenames,
- final String[] contentIds) {
- if (contentFilenames.length != contentIds.length) {
- throw new RuntimeException("The length of contentFilenames and the length of"
- + " contentIds are different " + contentFilenames.length + ", "
- + contentIds.length);
- }
- mContentCount = contentFilenames.length;
- mSparseTable = new SparseTable(initialCapacity, blockSize, mContentCount);
- mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
- mAddressTableFiles = new File[mContentCount];
- mContentFiles = new File[mContentCount];
- mBaseDir = baseDir;
- for (int i = 0; i < mContentCount; ++i) {
- mAddressTableFiles[i] = new File(mBaseDir,
- name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]);
- mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]);
- }
- mContentOutStreams = new OutputStream[mContentCount];
- }
-
- public void openStreams() throws FileNotFoundException {
- for (int i = 0; i < mContentCount; ++i) {
- mContentOutStreams[i] = new FileOutputStream(mContentFiles[i]);
- }
- }
-
- protected void write(final int contentIndex, final int index,
- final SparseTableContentWriterInterface writer) throws IOException {
- mSparseTable.set(contentIndex, index, (int) mContentFiles[contentIndex].length());
- writer.write(mContentOutStreams[contentIndex]);
- mContentOutStreams[contentIndex].flush();
- }
-
- public void closeStreams() throws IOException {
- mSparseTable.writeToFiles(mLookupTableFile, mAddressTableFiles);
- for (int i = 0; i < mContentCount; ++i) {
- mContentOutStreams[i].close();
- }
- }
-} \ No newline at end of file
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java
index 9ddaaf734..83707480d 100644
--- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java
+++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java
@@ -17,20 +17,15 @@
package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.annotations.UsedForTesting;
-import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
-import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
-import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
-import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
+import com.android.inputmethod.latin.BinaryDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import com.android.inputmethod.latin.utils.CollectionUtils;
-
-import android.util.Log;
+import com.android.inputmethod.latin.utils.FileUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Arrays;
/**
* An implementation of binary dictionary decoder for version 4 binary dictionary.
@@ -39,421 +34,74 @@ import java.util.Arrays;
public class Ver4DictDecoder extends AbstractDictDecoder {
private static final String TAG = Ver4DictDecoder.class.getSimpleName();
- protected static final int FILETYPE_TRIE = 1;
- protected static final int FILETYPE_FREQUENCY = 2;
- protected static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
- protected static final int FILETYPE_BIGRAM_FREQ = 4;
- protected static final int FILETYPE_SHORTCUT = 5;
- protected static final int FILETYPE_HEADER = 6;
-
- protected final File mDictDirectory;
- protected final DictionaryBufferFactory mBufferFactory;
- protected DictBuffer mDictBuffer;
- protected DictBuffer mHeaderBuffer;
- protected DictBuffer mFrequencyBuffer;
- protected DictBuffer mTerminalAddressTableBuffer;
- private BigramContentReader mBigramReader;
- private ShortcutContentReader mShortcutReader;
-
- /**
- * Raw PtNode info straight out of a trie file in version 4 dictionary.
- */
- protected static final class Ver4PtNodeInfo {
- public final int mFlags;
- public final int[] mCharacters;
- public final int mTerminalId;
- public final int mChildrenPos;
- public final int mParentPos;
- public final int mNodeSize;
- public int mStartIndexOfCharacters;
- public int mEndIndexOfCharacters; // exclusive
-
- public Ver4PtNodeInfo(final int flags, final int[] characters, final int terminalId,
- final int childrenPos, final int parentPos, final int nodeSize) {
- mFlags = flags;
- mCharacters = characters;
- mTerminalId = terminalId;
- mChildrenPos = childrenPos;
- mParentPos = parentPos;
- mNodeSize = nodeSize;
- mStartIndexOfCharacters = 0;
- mEndIndexOfCharacters = characters.length;
- }
- }
+ final File mDictDirectory;
+ final BinaryDictionary mBinaryDictionary;
@UsedForTesting
/* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) {
- mDictDirectory = dictDirectory;
- mDictBuffer = mHeaderBuffer = mFrequencyBuffer = null;
-
- if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) {
- mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
- } else if ((factoryFlag & MASK_DICTBUFFER) == USE_BYTEARRAY) {
- mBufferFactory = new DictionaryBufferFromByteArrayFactory();
- } else if ((factoryFlag & MASK_DICTBUFFER) == USE_WRITABLE_BYTEBUFFER) {
- mBufferFactory = new DictionaryBufferFromWritableByteBufferFactory();
- } else {
- mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
- }
+ this(dictDirectory, null /* factory */);
}
@UsedForTesting
/* package */ Ver4DictDecoder(final File dictDirectory, final DictionaryBufferFactory factory) {
mDictDirectory = dictDirectory;
- mBufferFactory = factory;
- mDictBuffer = mHeaderBuffer = mFrequencyBuffer = null;
- }
-
- protected File getFile(final int fileType) throws UnsupportedFormatException {
- if (fileType == FILETYPE_TRIE) {
- return new File(mDictDirectory,
- mDictDirectory.getName() + FormatSpec.TRIE_FILE_EXTENSION);
- } else if (fileType == FILETYPE_HEADER) {
- return new File(mDictDirectory,
- mDictDirectory.getName() + FormatSpec.HEADER_FILE_EXTENSION);
- } else if (fileType == FILETYPE_FREQUENCY) {
- return new File(mDictDirectory,
- mDictDirectory.getName() + FormatSpec.FREQ_FILE_EXTENSION);
- } else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) {
- return new File(mDictDirectory,
- mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
- } else if (fileType == FILETYPE_BIGRAM_FREQ) {
- return new File(mDictDirectory,
- mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION
- + FormatSpec.BIGRAM_FREQ_CONTENT_ID);
- } else if (fileType == FILETYPE_SHORTCUT) {
- return new File(mDictDirectory,
- mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION
- + FormatSpec.SHORTCUT_CONTENT_ID);
- } else {
- throw new UnsupportedFormatException("Unsupported kind of file : " + fileType);
- }
- }
-
- @Override
- public void openDictBuffer() throws FileNotFoundException, IOException,
- UnsupportedFormatException {
- if (!mDictDirectory.isDirectory()) {
- throw new UnsupportedFormatException("Format 4 dictionary needs a directory");
- }
- mHeaderBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_HEADER));
- mDictBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_TRIE));
- mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY));
- mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer(
- getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
- mBigramReader = new BigramContentReader(mDictDirectory.getName(),
- mDictDirectory, mBufferFactory);
- mBigramReader.openBuffers();
- mShortcutReader = new ShortcutContentReader(mDictDirectory.getName(), mDictDirectory,
- mBufferFactory);
- mShortcutReader.openBuffers();
- }
-
- @Override
- public boolean isDictBufferOpen() {
- return mDictBuffer != null;
- }
-
- @UsedForTesting
- /* package */ DictBuffer getHeaderBuffer() {
- return mHeaderBuffer;
- }
-
- @UsedForTesting
- /* package */ DictBuffer getDictBuffer() {
- return mDictBuffer;
+ mBinaryDictionary = new BinaryDictionary(dictDirectory.getAbsolutePath(),
+ 0 /* offset */, 0 /* length */, true /* useFullEditDistance */, null /* locale */,
+ "" /* dictType */, true /* isUpdatable */);
}
@Override
public DictionaryHeader readHeader() throws IOException, UnsupportedFormatException {
- if (mHeaderBuffer == null) {
- openDictBuffer();
- }
- mHeaderBuffer.position(0);
- final DictionaryHeader header = super.readHeader(mHeaderBuffer);
- final int version = header.mFormatOptions.mVersion;
- if (version != FormatSpec.VERSION4) {
- throw new UnsupportedFormatException("File header has a wrong version : " + version);
- }
- return header;
- }
-
- /**
- * An auxiliary class for reading bigrams.
- */
- protected static class BigramContentReader extends SparseTableContentReader {
- public BigramContentReader(final String name, final File baseDir,
- final DictionaryBufferFactory factory) {
- super(name + FormatSpec.BIGRAM_FILE_EXTENSION,
- FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
- getContentFilenames(name), getContentIds(), factory);
- }
-
- // TODO: Consolidate this method and BigramContentWriter.getContentFilenames.
- protected static String[] getContentFilenames(final String name) {
- return new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION };
- }
-
- // TODO: Consolidate this method and BigramContentWriter.getContentIds.
- protected static String[] getContentIds() {
- return new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID };
- }
-
- public ArrayList<PendingAttribute> readTargetsAndFrequencies(final int terminalId,
- final DictBuffer terminalAddressTableBuffer, final FormatOptions options) {
- final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList();
- read(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
- new SparseTableContentReaderInterface() {
- @Override
- public void read(final DictBuffer buffer) {
- while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
- // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE,
- // remaining bigram entries are ignored.
- final int bigramFlags = buffer.readUnsignedByte();
- final int probability;
-
- if (options.mHasTimestamp) {
- probability = buffer.readUnsignedByte();
- // Skip timestamp
- buffer.readInt();
- // Skip level
- buffer.readUnsignedByte();
- // Skip count
- buffer.readUnsignedByte();
- } else {
- probability = bigramFlags
- & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY;
- }
- final int targetTerminalId = buffer.readUnsignedInt24();
- terminalAddressTableBuffer.position(targetTerminalId
- * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
- final int targetAddress =
- terminalAddressTableBuffer.readUnsignedInt24();
- bigrams.add(new PendingAttribute(probability, targetAddress));
- if (0 == (bigramFlags
- & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) {
- break;
- }
- }
- if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
- throw new RuntimeException("Too many bigrams in a PtNode ("
- + bigrams.size() + " but max is "
- + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")");
- }
- }
- });
- if (bigrams.isEmpty()) return null;
- return bigrams;
- }
- }
-
- /**
- * An auxiliary class for reading shortcuts.
- */
- protected static class ShortcutContentReader extends SparseTableContentReader {
- public ShortcutContentReader(final String name, final File baseDir,
- final DictionaryBufferFactory factory) {
- super(name + FormatSpec.SHORTCUT_FILE_EXTENSION,
- FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
- new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
- new String[] { FormatSpec.SHORTCUT_CONTENT_ID }, factory);
- }
-
- public ArrayList<WeightedString> readShortcuts(final int terminalId) {
- final ArrayList<WeightedString> shortcuts = CollectionUtils.newArrayList();
- read(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId,
- new SparseTableContentReaderInterface() {
- @Override
- public void read(final DictBuffer buffer) {
- while (true) {
- final int flags = buffer.readUnsignedByte();
- final String word = CharEncoding.readString(buffer);
- shortcuts.add(new WeightedString(word,
- flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
- if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) {
- break;
- }
- }
- }
- });
- if (shortcuts.isEmpty()) return null;
- return shortcuts;
- }
- }
-
- protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader {
- protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId,
- final FormatOptions formatOptions) {
- final int readingPos;
- if (formatOptions.mHasTimestamp) {
- final int entrySize = FormatSpec.FREQUENCY_AND_FLAGS_SIZE
- + FormatSpec.UNIGRAM_TIMESTAMP_SIZE + FormatSpec.UNIGRAM_LEVEL_SIZE
- + FormatSpec.UNIGRAM_COUNTER_SIZE;
- readingPos = terminalId * entrySize + FormatSpec.FLAGS_IN_FREQ_FILE_SIZE;
- } else {
- readingPos = terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE
- + FormatSpec.FLAGS_IN_FREQ_FILE_SIZE;
- }
- frequencyBuffer.position(readingPos);
- return frequencyBuffer.readUnsignedByte();
- }
-
- protected static int readTerminalId(final DictBuffer dictBuffer) {
- return dictBuffer.readInt();
- }
- }
-
- private final int[] mCharacterBufferForReadingVer4PtNodeInfo
- = new int[FormatSpec.MAX_WORD_LENGTH];
-
- /**
- * Reads PtNode from ptNodePos in the trie file and returns Ver4PtNodeInfo.
- *
- * @param ptNodePos the position of PtNode.
- * @param options the format options.
- * @return Ver4PtNodeInfo.
- */
- // TODO: Make this buffer thread safe.
- // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH.
- protected Ver4PtNodeInfo readVer4PtNodeInfo(final int ptNodePos, final FormatOptions options) {
- int readingPos = ptNodePos;
- final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
- readingPos += FormatSpec.PTNODE_FLAGS_SIZE;
-
- final int parentPos = PtNodeReader.readParentAddress(mDictBuffer, options);
- if (BinaryDictIOUtils.supportsDynamicUpdate(options)) {
- readingPos += FormatSpec.PARENT_ADDRESS_SIZE;
- }
-
- final int characters[];
- if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) {
- int index = 0;
- int character = CharEncoding.readChar(mDictBuffer);
- readingPos += CharEncoding.getCharSize(character);
- while (FormatSpec.INVALID_CHARACTER != character
- && index < FormatSpec.MAX_WORD_LENGTH) {
- mCharacterBufferForReadingVer4PtNodeInfo[index++] = character;
- character = CharEncoding.readChar(mDictBuffer);
- readingPos += CharEncoding.getCharSize(character);
- }
- characters = Arrays.copyOfRange(mCharacterBufferForReadingVer4PtNodeInfo, 0, index);
- } else {
- final int character = CharEncoding.readChar(mDictBuffer);
- readingPos += CharEncoding.getCharSize(character);
- characters = new int[] { character };
- }
- final int terminalId;
- if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
- terminalId = PtNodeReader.readTerminalId(mDictBuffer);
- readingPos += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
- } else {
- terminalId = PtNode.NOT_A_TERMINAL;
- }
-
- int childrenPos = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options);
- if (childrenPos != FormatSpec.NO_CHILDREN_ADDRESS) {
- childrenPos += readingPos;
- }
- readingPos += BinaryDictIOUtils.getChildrenAddressSize(flags, options);
-
- return new Ver4PtNodeInfo(flags, characters, terminalId, childrenPos, parentPos,
- readingPos - ptNodePos);
- }
-
- @Override
- public PtNodeInfo readPtNode(final int ptNodePos, final FormatOptions options) {
- final Ver4PtNodeInfo nodeInfo = readVer4PtNodeInfo(ptNodePos, options);
-
- final int frequency;
- if (0 != (FormatSpec.FLAG_IS_TERMINAL & nodeInfo.mFlags)) {
- frequency = PtNodeReader.readFrequency(mFrequencyBuffer, nodeInfo.mTerminalId, options);
- } else {
- frequency = PtNode.NOT_A_TERMINAL;
- }
-
- final ArrayList<WeightedString> shortcutTargets = mShortcutReader.readShortcuts(
- nodeInfo.mTerminalId);
- final ArrayList<PendingAttribute> bigrams = mBigramReader.readTargetsAndFrequencies(
- nodeInfo.mTerminalId, mTerminalAddressTableBuffer, options);
-
- return new PtNodeInfo(ptNodePos, ptNodePos + nodeInfo.mNodeSize, nodeInfo.mFlags,
- nodeInfo.mCharacters, frequency, nodeInfo.mParentPos, nodeInfo.mChildrenPos,
- shortcutTargets, bigrams);
- }
-
- private void deleteDictFiles() {
- final File[] files = mDictDirectory.listFiles();
- for (int i = 0; i < files.length; ++i) {
- files[i].delete();
- }
+ return mBinaryDictionary.getHeader();
}
@Override
public FusionDictionary readDictionaryBinary(final FusionDictionary dict,
final boolean deleteDictIfBroken)
throws FileNotFoundException, IOException, UnsupportedFormatException {
- if (mDictBuffer == null) {
- openDictBuffer();
- }
- try {
- return BinaryDictDecoderUtils.readDictionaryBinary(this, dict);
- } catch (IOException e) {
- Log.e(TAG, "The dictionary " + mDictDirectory.getName() + " is broken.", e);
- if (deleteDictIfBroken) {
- deleteDictFiles();
+ final DictionaryHeader header = readHeader();
+ final FusionDictionary fusionDict = dict != null ? dict :
+ new FusionDictionary(new FusionDictionary.PtNodeArray(), header.mDictionaryOptions);
+ int token = 0;
+ final ArrayList<WordProperty> wordProperties = CollectionUtils.newArrayList();
+ do {
+ final BinaryDictionary.GetNextWordPropertyResult result =
+ mBinaryDictionary.getNextWordProperty(token);
+ final WordProperty wordProperty = result.mWordProperty;
+ if (wordProperty == null) {
+ if (deleteDictIfBroken) {
+ mBinaryDictionary.close();
+ FileUtils.deleteRecursively(mDictDirectory);
+ }
+ return null;
}
- throw e;
- } catch (UnsupportedFormatException e) {
- Log.e(TAG, "The dictionary " + mDictDirectory.getName() + " is broken.", e);
- if (deleteDictIfBroken) {
- deleteDictFiles();
+ wordProperties.add(wordProperty);
+ token = result.mNextToken;
+ } while (token != 0);
+
+ // Insert unigrams into the fusion dictionary.
+ for (final WordProperty wordProperty : wordProperties) {
+ // TODO: Support probability that is -1.
+ final int probability = wordProperty.getProbability() < 0 ?
+ 0 : wordProperty.getProbability();
+ if (wordProperty.mIsBlacklistEntry) {
+ fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets,
+ wordProperty.mIsNotAWord);
+ } else {
+ fusionDict.add(wordProperty.mWord, probability,
+ wordProperty.mShortcutTargets, wordProperty.mIsNotAWord);
}
- throw e;
}
- }
-
- @Override
- public void setPosition(int newPos) {
- mDictBuffer.position(newPos);
- }
-
- @Override
- public int getPosition() {
- return mDictBuffer.position();
- }
-
- @Override
- public int readPtNodeCount() {
- return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer);
- }
-
- @Override
- public boolean readAndFollowForwardLink() {
- final int forwardLinkPos = mDictBuffer.position();
- int nextRelativePos = BinaryDictDecoderUtils.readSInt24(mDictBuffer);
- if (nextRelativePos != FormatSpec.NO_FORWARD_LINK_ADDRESS) {
- final int nextPos = forwardLinkPos + nextRelativePos;
- if (nextPos >= 0 && nextPos < mDictBuffer.limit()) {
- mDictBuffer.position(nextPos);
- return true;
+ // Insert bigrams into the fusion dictionary.
+ for (final WordProperty wordProperty : wordProperties) {
+ if (wordProperty.mBigrams == null) {
+ continue;
+ }
+ final String word0 = wordProperty.mWord;
+ for (final WeightedString bigram : wordProperty.mBigrams) {
+ fusionDict.setBigram(word0, bigram.mWord, bigram.getProbability());
}
}
- return false;
- }
-
- @Override
- public boolean hasNextPtNodeArray() {
- return mDictBuffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS;
- }
-
- @Override
- @UsedForTesting
- public void skipPtNode(final FormatOptions formatOptions) {
- final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
- PtNodeReader.readParentAddress(mDictBuffer, formatOptions);
- BinaryDictIOUtils.skipString(mDictBuffer,
- (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
- if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) PtNodeReader.readTerminalId(mDictBuffer);
- PtNodeReader.readChildrenAddress(mDictBuffer, flags, formatOptions);
+ return fusionDict;
}
}