aboutsummaryrefslogtreecommitdiffstats
path: root/java/src
diff options
context:
space:
mode:
author Yuichiro Hanada <yhanada@google.com> 2013-10-01 23:08:27 +0900
committer Yuichiro Hanada <yhanada@google.com> 2013-10-03 20:31:01 +0900
commit 3dd77a6d6696bb426b200b27adeb8be7e887a667 (patch)
tree aadac0046a6ea76b8797763db83e7be38612a47a /java/src
parent d6e307a4b7933ad5efebc6b0d3b775c5ab5c0e6e (diff)
download latinime-3dd77a6d6696bb426b200b27adeb8be7e887a667.tar.gz
latinime-3dd77a6d6696bb426b200b27adeb8be7e887a667.tar.xz
latinime-3dd77a6d6696bb426b200b27adeb8be7e887a667.zip
Add SparseTableContentWriter to Ver4DictEncoder.
Bug: 10920165 Change-Id: I6372492e97297baad4c5aeeb3fb36dcccd7a944b
Diffstat (limited to 'java/src')
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/FormatSpec.java 7
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java 20
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java 144
3 files changed, 119 insertions, 52 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java
index 9481a8c14..a5516bd41 100644
--- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java
+++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java
@@ -266,11 +266,14 @@ public final class FormatSpec {
// tat = Terminal Address Table
static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
static final String BIGRAM_FILE_EXTENSION = ".bigram";
- static final String BIGRAM_LOOKUP_TABLE_FILE_EXTENSION = ".bigram_lookup";
- static final String BIGRAM_ADDRESS_TABLE_FILE_EXTENSION = ".bigram_index";
+ static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup";
+ static final String CONTENT_TABLE_FILE_SUFFIX = "_index";
static final int FREQUENCY_AND_FLAGS_SIZE = 2;
static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
+ static final int BIGRAM_CONTENT_COUNT = 1;
+ static final int BIGRAM_FREQ_CONTENT_INDEX = 0;
+ static final String BIGRAM_FREQ_CONTENT_ID = "_freq";
static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
static final int NO_PARENT_ADDRESS = 0;
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java
index 624b2784f..5089687da 100644
--- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java
+++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java
@@ -42,7 +42,7 @@ public class Ver4DictDecoder extends DictDecoder {
private static final int FILETYPE_TRIE = 1;
private static final int FILETYPE_FREQUENCY = 2;
private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
- private static final int FILETYPE_BIGRAM = 4;
+ private static final int FILETYPE_BIGRAM_FREQ = 4;
private final File mDictDirectory;
private final DictionaryBufferFactory mBufferFactory;
@@ -85,9 +85,10 @@ public class Ver4DictDecoder extends DictDecoder {
} else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) {
return new File(mDictDirectory,
mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
- } else if (fileType == FILETYPE_BIGRAM) {
+ } else if (fileType == FILETYPE_BIGRAM_FREQ) {
return new File(mDictDirectory,
- mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION);
+ mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION
+ + FormatSpec.BIGRAM_FREQ_CONTENT_ID);
} else {
throw new RuntimeException("Unsupported kind of file : " + fileType);
}
@@ -99,7 +100,7 @@ public class Ver4DictDecoder extends DictDecoder {
mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY));
mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer(
getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
- mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM));
+ mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ));
loadBigramAddressSparseTable();
}
@@ -126,11 +127,12 @@ public class Ver4DictDecoder extends DictDecoder {
}
private void loadBigramAddressSparseTable() throws IOException {
- final File lookupIndexFile = new File(mDictDirectory,
- mDictDirectory.getName() + FormatSpec.BIGRAM_LOOKUP_TABLE_FILE_EXTENSION);
- final File contentFile = new File(mDictDirectory,
- mDictDirectory.getName() + FormatSpec.BIGRAM_ADDRESS_TABLE_FILE_EXTENSION);
- mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { contentFile },
+ final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName()
+ + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
+ final File freqsFile = new File(mDictDirectory, mDictDirectory.getName()
+ + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
+ + FormatSpec.BIGRAM_FREQ_CONTENT_ID);
+ mBigramAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { freqsFile },
FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
index a403e25db..b38c33019 100644
--- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
+++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
@@ -26,7 +26,6 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
-import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
@@ -44,19 +43,115 @@ public class Ver4DictEncoder implements DictEncoder {
private byte[] mTrieBuf;
private int mTriePos;
private int mHeaderSize;
- private SparseTable mBigramAddressTable;
private OutputStream mTrieOutStream;
private OutputStream mFreqOutStream;
private OutputStream mTerminalAddressTableOutStream;
- private OutputStream mBigramOutStream;
private File mDictDir;
private String mBaseFilename;
+ private BigramContentWriter mBigramWriter;
@UsedForTesting
public Ver4DictEncoder(final File dictPlacedDir) {
mDictPlacedDir = dictPlacedDir;
}
+ private interface SparseTableContentWriterInterface {
+ public void write(final OutputStream outStream) throws IOException;
+ }
+
+ private static class SparseTableContentWriter {
+ private final int mContentCount;
+ private final SparseTable mSparseTable;
+ private final File mLookupTableFile;
+ protected final File mBaseDir;
+ private final File[] mAddressTableFiles;
+ private final File[] mContentFiles;
+ protected final OutputStream[] mContentOutStreams;
+
+ public SparseTableContentWriter(final String name, final int contentCount,
+ final int initialCapacity, final int blockSize, final File baseDir,
+ final String[] contentFilenames, final String[] contentIds) {
+ if (contentFilenames.length != contentIds.length) {
+ throw new RuntimeException("The length of contentFilenames and the length of"
+ + " contentIds are different " + contentFilenames.length + ", "
+ + contentIds.length);
+ }
+ mContentCount = contentCount;
+ mSparseTable = new SparseTable(initialCapacity, blockSize, contentCount);
+ mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
+ mAddressTableFiles = new File[mContentCount];
+ mContentFiles = new File[mContentCount];
+ mBaseDir = baseDir;
+ for (int i = 0; i < mContentCount; ++i) {
+ mAddressTableFiles[i] = new File(mBaseDir,
+ name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]);
+ mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]);
+ }
+ mContentOutStreams = new OutputStream[mContentCount];
+ }
+
+ public void openStreams() throws FileNotFoundException {
+ for (int i = 0; i < mContentCount; ++i) {
+ mContentOutStreams[i] = new FileOutputStream(mContentFiles[i]);
+ }
+ }
+
+ protected void write(final int contentIndex, final int index,
+ final SparseTableContentWriterInterface writer) throws IOException {
+ mSparseTable.set(contentIndex, index, (int) mContentFiles[contentIndex].length());
+ writer.write(mContentOutStreams[contentIndex]);
+ mContentOutStreams[contentIndex].flush();
+ }
+
+ public void closeStreams() throws IOException {
+ mSparseTable.writeToFiles(mLookupTableFile, mAddressTableFiles);
+ for (int i = 0; i < mContentCount; ++i) {
+ mContentOutStreams[i].close();
+ }
+ }
+ }
+
+ private static class BigramContentWriter extends SparseTableContentWriter {
+
+ public BigramContentWriter(final String name, final int initialCapacity,
+ final File baseDir) {
+ super(name + FormatSpec.BIGRAM_FILE_EXTENSION, FormatSpec.BIGRAM_CONTENT_COUNT,
+ initialCapacity, FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
+ new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION },
+ new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID });
+ }
+
+ public void writeBigramsForOneWord(final int terminalId,
+ final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
+ throws IOException {
+ write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
+ new SparseTableContentWriterInterface() {
+ @Override
+ public void write(final OutputStream outStream) throws IOException {
+ writeBigramsForOneWordInternal(outStream, bigramIterator, dict);
+ }
+ });
+ }
+
+ private void writeBigramsForOneWordInternal(final OutputStream outStream,
+ final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
+ throws IOException {
+ while (bigramIterator.hasNext()) {
+ final WeightedString bigram = bigramIterator.next();
+ final PtNode target =
+ FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
+ final int unigramFrequencyForThisWord = target.mFrequency;
+ final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(
+ bigramIterator.hasNext(), 0, bigram.mFrequency,
+ unigramFrequencyForThisWord, bigram.mWord);
+ BinaryDictEncoderUtils.writeUIntToStream(outStream, bigramFlags,
+ FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
+ BinaryDictEncoderUtils.writeUIntToStream(outStream, target.mTerminalId,
+ FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
+ }
+ }
+ }
+
private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
throws FileNotFoundException, IOException {
final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
@@ -66,8 +161,6 @@ public class Ver4DictEncoder implements DictEncoder {
final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
final File terminalAddressTableFile = new File(mDictDir,
mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
- final File bigramFile = new File(mDictDir,
- mBaseFilename + FormatSpec.BIGRAM_FILE_EXTENSION);
if (!mDictDir.isDirectory()) {
if (mDictDir.exists()) mDictDir.delete();
mDictDir.mkdirs();
@@ -78,7 +171,6 @@ public class Ver4DictEncoder implements DictEncoder {
mTrieOutStream = new FileOutputStream(trieFile);
mFreqOutStream = new FileOutputStream(freqFile);
mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
- mBigramOutStream = new FileOutputStream(bigramFile);
}
private void close() throws IOException {
@@ -92,14 +184,10 @@ public class Ver4DictEncoder implements DictEncoder {
if (mTerminalAddressTableOutStream != null) {
mTerminalAddressTableOutStream.close();
}
- if (mBigramOutStream != null) {
- mBigramOutStream.close();
- }
} finally {
mTrieOutStream = null;
mFreqOutStream = null;
mTerminalAddressTableOutStream = null;
- mBigramOutStream = null;
}
}
@@ -135,10 +223,8 @@ public class Ver4DictEncoder implements DictEncoder {
if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
writeTerminalData(flatNodes, terminalCount);
- mBigramAddressTable = new SparseTable(terminalCount,
- FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, 1 /* contentTableCount */);
+ mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir);
writeBigrams(flatNodes, dict);
- writeBigramAddressSparseTable();
final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
@@ -245,40 +331,16 @@ public class Ver4DictEncoder implements DictEncoder {
private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict)
throws IOException {
- final ByteArrayOutputStream bigramBuffer = new ByteArrayOutputStream();
-
+ mBigramWriter.openStreams();
for (final PtNodeArray nodeArray : flatNodes) {
for (final PtNode ptNode : nodeArray.mData) {
if (ptNode.mBigrams != null) {
- final int startPos = bigramBuffer.size();
- mBigramAddressTable.set(0 /* contentTableIndex */, ptNode.mTerminalId,
- startPos);
- final Iterator<WeightedString> bigramIterator = ptNode.mBigrams.iterator();
- while (bigramIterator.hasNext()) {
- final WeightedString bigram = bigramIterator.next();
- final PtNode target =
- FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
- final int unigramFrequencyForThisWord = target.mFrequency;
- final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(
- bigramIterator.hasNext(), 0, bigram.mFrequency,
- unigramFrequencyForThisWord, bigram.mWord);
- BinaryDictEncoderUtils.writeUIntToStream(bigramBuffer, bigramFlags,
- FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
- BinaryDictEncoderUtils.writeUIntToStream(bigramBuffer, target.mTerminalId,
- FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
- }
+ mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId,
+ ptNode.mBigrams.iterator(), dict);
}
}
}
- bigramBuffer.writeTo(mBigramOutStream);
- }
-
- private void writeBigramAddressSparseTable() throws IOException {
- final File lookupIndexFile =
- new File(mDictDir, mBaseFilename + FormatSpec.BIGRAM_LOOKUP_TABLE_FILE_EXTENSION);
- final File contentFile =
- new File(mDictDir, mBaseFilename + FormatSpec.BIGRAM_ADDRESS_TABLE_FILE_EXTENSION);
- mBigramAddressTable.writeToFiles(lookupIndexFile, new File[] { contentFile });
+ mBigramWriter.closeStreams();
}
@Override