aboutsummaryrefslogtreecommitdiffstats
path: root/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
diff options
context:
space:
mode:
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java')
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java427
1 files changed, 50 insertions, 377 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
index 8b80ebe63..a746f9945 100644
--- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
+++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
@@ -1,5 +1,4 @@
/*
-/*
* Copyright (C) 2013 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,25 +17,15 @@
package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.annotations.UsedForTesting;
-import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
-import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
+import com.android.inputmethod.latin.BinaryDictionary;
+import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
-import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
-import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
-import com.android.inputmethod.latin.utils.CollectionUtils;
-import com.android.inputmethod.latin.utils.FileUtils;
+import com.android.inputmethod.latin.utils.LocaleUtils;
import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
/**
* An implementation of DictEncoder for version 4 binary dictionary.
@@ -44,197 +33,19 @@ import java.util.Iterator;
@UsedForTesting
public class Ver4DictEncoder implements DictEncoder {
private final File mDictPlacedDir;
- private byte[] mTrieBuf;
- private int mTriePos;
- private OutputStream mTrieOutStream;
- private OutputStream mHeaderOutStream;
- private OutputStream mFreqOutStream;
- private OutputStream mUnigramTimestampOutStream;
- private OutputStream mTerminalAddressTableOutStream;
- private File mDictDir;
- private String mBaseFilename;
- private BigramContentWriter mBigramWriter;
- private ShortcutContentWriter mShortcutWriter;
@UsedForTesting
public Ver4DictEncoder(final File dictPlacedDir) {
mDictPlacedDir = dictPlacedDir;
}
- private static class BigramContentWriter extends SparseTableContentWriter {
- private final boolean mWriteTimestamp;
-
- public BigramContentWriter(final String name, final int initialCapacity,
- final File baseDir, final boolean writeTimestamp) {
- super(name + FormatSpec.BIGRAM_FILE_EXTENSION, initialCapacity,
- FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
- getContentFilenames(name, writeTimestamp), getContentIds(writeTimestamp));
- mWriteTimestamp = writeTimestamp;
- }
-
- private static String[] getContentFilenames(final String name,
- final boolean writeTimestamp) {
- final String[] contentFilenames;
- if (writeTimestamp) {
- contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION,
- name + FormatSpec.BIGRAM_FILE_EXTENSION };
- } else {
- contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION };
- }
- return contentFilenames;
- }
-
- private static String[] getContentIds(final boolean writeTimestamp) {
- final String[] contentIds;
- if (writeTimestamp) {
- contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID,
- FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID };
- } else {
- contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID };
- }
- return contentIds;
- }
-
- public void writeBigramsForOneWord(final int terminalId, final int bigramCount,
- final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
- throws IOException {
- write(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
- new SparseTableContentWriterInterface() {
- @Override
- public void write(final OutputStream outStream) throws IOException {
- writeBigramsForOneWordInternal(outStream, bigramIterator, dict);
- }});
- if (mWriteTimestamp) {
- write(FormatSpec.BIGRAM_TIMESTAMP_CONTENT_INDEX, terminalId,
- new SparseTableContentWriterInterface() {
- @Override
- public void write(final OutputStream outStream) throws IOException {
- initBigramTimestampsCountersAndLevelsForOneWordInternal(outStream,
- bigramCount);
- }});
- }
- }
-
- private void writeBigramsForOneWordInternal(final OutputStream outStream,
- final Iterator<WeightedString> bigramIterator, final FusionDictionary dict)
- throws IOException {
- while (bigramIterator.hasNext()) {
- final WeightedString bigram = bigramIterator.next();
- final PtNode target =
- FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
- final int unigramFrequencyForThisWord = target.mFrequency;
- final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(
- bigramIterator.hasNext(), 0, bigram.mFrequency,
- unigramFrequencyForThisWord, bigram.mWord);
- BinaryDictEncoderUtils.writeUIntToStream(outStream, bigramFlags,
- FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
- BinaryDictEncoderUtils.writeUIntToStream(outStream, target.mTerminalId,
- FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
- }
- }
-
- private void initBigramTimestampsCountersAndLevelsForOneWordInternal(
- final OutputStream outStream, final int bigramCount) throws IOException {
- for (int i = 0; i < bigramCount; ++i) {
- // TODO: Figure out what initial values should be.
- BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */,
- FormatSpec.BIGRAM_TIMESTAMP_SIZE);
- BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */,
- FormatSpec.BIGRAM_COUNTER_SIZE);
- BinaryDictEncoderUtils.writeUIntToStream(outStream, 0 /* value */,
- FormatSpec.BIGRAM_LEVEL_SIZE);
- }
- }
- }
-
- private static class ShortcutContentWriter extends SparseTableContentWriter {
- public ShortcutContentWriter(final String name, final int initialCapacity,
- final File baseDir) {
- super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, initialCapacity,
- FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
- new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
- new String[] { FormatSpec.SHORTCUT_CONTENT_ID });
- }
-
- public void writeShortcutForOneWord(final int terminalId,
- final Iterator<WeightedString> shortcutIterator) throws IOException {
- write(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId,
- new SparseTableContentWriterInterface() {
- @Override
- public void write(final OutputStream outStream) throws IOException {
- writeShortcutForOneWordInternal(outStream, shortcutIterator);
- }
- });
- }
-
- private void writeShortcutForOneWordInternal(final OutputStream outStream,
- final Iterator<WeightedString> shortcutIterator) throws IOException {
- while (shortcutIterator.hasNext()) {
- final WeightedString target = shortcutIterator.next();
- final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags(
- shortcutIterator.hasNext(), target.mFrequency);
- BinaryDictEncoderUtils.writeUIntToStream(outStream, shortcutFlags,
- FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
- CharEncoding.writeString(outStream, target.mWord);
- }
- }
- }
-
- private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
- throws FileNotFoundException, IOException {
- final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
- mBaseFilename = header.getId() + "." + header.getVersion();
- mDictDir = new File(mDictPlacedDir, mBaseFilename);
- final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION);
- final File headerFile = new File(mDictDir,
- mBaseFilename + FormatSpec.HEADER_FILE_EXTENSION);
- final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
- final File timestampFile = new File(mDictDir,
- mBaseFilename + FormatSpec.UNIGRAM_TIMESTAMP_FILE_EXTENSION);
- final File terminalAddressTableFile = new File(mDictDir,
- mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
- if (!mDictDir.isDirectory()) {
- if (mDictDir.exists()) {
- FileUtils.deleteRecursively(mDictDir);
- }
- mDictDir.mkdirs();
- }
- mTrieOutStream = new FileOutputStream(trieFile);
- mHeaderOutStream = new FileOutputStream(headerFile);
- mFreqOutStream = new FileOutputStream(freqFile);
- mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
- if (formatOptions.mHasTimestamp) {
- mUnigramTimestampOutStream = new FileOutputStream(timestampFile);
- }
- }
-
- private void close() throws IOException {
- try {
- if (mTrieOutStream != null) {
- mTrieOutStream.close();
- }
- if (mHeaderOutStream != null) {
- mHeaderOutStream.close();
- }
- if (mFreqOutStream != null) {
- mFreqOutStream.close();
- }
- if (mTerminalAddressTableOutStream != null) {
- mTerminalAddressTableOutStream.close();
- }
- if (mUnigramTimestampOutStream != null) {
- mUnigramTimestampOutStream.close();
- }
- } finally {
- mTrieOutStream = null;
- mHeaderOutStream = null;
- mFreqOutStream = null;
- mTerminalAddressTableOutStream = null;
- }
- }
-
+ // TODO: This builds a FusionDictionary first and iterates it to add words to the binary
+ // dictionary. However, it is possible to just add words directly to the binary dictionary
+ // instead.
+ // In the long run, when we stop supporting version 2, FusionDictionary will become deprecated
+ // and we can remove it. Then we'll be able to just call BinaryDictionary directly.
@Override
- public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
+ public void writeDictionary(FusionDictionary dict, FormatOptions formatOptions)
throws IOException, UnsupportedFormatException {
if (formatOptions.mVersion != FormatSpec.VERSION4) {
throw new UnsupportedFormatException("File header has a wrong version number : "
@@ -243,208 +54,70 @@ public class Ver4DictEncoder implements DictEncoder {
if (!mDictPlacedDir.isDirectory()) {
throw new UnsupportedFormatException("Given path is not a directory.");
}
-
- if (mTrieOutStream == null) {
- openStreams(formatOptions, dict.mOptions);
- }
-
- BinaryDictEncoderUtils.writeDictionaryHeader(mHeaderOutStream, dict, formatOptions);
-
- MakedictLog.i("Flattening the tree...");
- ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
- int terminalCount = 0;
- final ArrayList<PtNode> nodes = CollectionUtils.newArrayList();
- for (final PtNodeArray array : flatNodes) {
- for (final PtNode node : array.mData) {
- if (node.isTerminal()) {
- nodes.add(node);
- node.mTerminalId = terminalCount++;
+ if (!BinaryDictionary.createEmptyDictFile(mDictPlacedDir.getAbsolutePath(),
+ FormatSpec.VERSION4, dict.mOptions.mAttributes)) {
+ throw new IOException("Cannot create dictionary file");
+ }
+ final BinaryDictionary binaryDict = new BinaryDictionary(mDictPlacedDir.getAbsolutePath(),
+ 0l, mDictPlacedDir.length(), true /* useFullEditDistance */,
+ LocaleUtils.constructLocaleFromString(dict.mOptions.mAttributes.get(
+ FormatSpec.FileHeader.DICTIONARY_LOCALE_ATTRIBUTE)),
+ Dictionary.TYPE_USER /* Dictionary type. Does not matter for us */,
+ true /* isUpdatable */);
+ if (!binaryDict.isValidDictionary()) {
+ // Somehow createEmptyDictFile returned true, but the file was not created correctly
+ throw new IOException("Cannot create dictionary file");
+ }
+ for (final Word word : dict) {
+ // TODO: switch to addMultipleDictionaryEntries when they support shortcuts
+ if (null == word.mShortcutTargets || word.mShortcutTargets.isEmpty()) {
+ binaryDict.addUnigramWord(word.mWord, word.mFrequency,
+ null /* shortcutTarget */, 0 /* shortcutProbability */,
+ word.mIsNotAWord, word.mIsBlacklistEntry, 0 /* timestamp */);
+ } else {
+ for (final WeightedString shortcutTarget : word.mShortcutTargets) {
+ binaryDict.addUnigramWord(word.mWord, word.mFrequency,
+ shortcutTarget.mWord, shortcutTarget.mFrequency,
+ word.mIsNotAWord, word.mIsBlacklistEntry, 0 /* timestamp */);
}
}
- }
- Collections.sort(nodes, new Comparator<PtNode>() {
- @Override
- public int compare(final PtNode lhs, final PtNode rhs) {
- if (lhs.mFrequency != rhs.mFrequency) {
- return lhs.mFrequency < rhs.mFrequency ? -1 : 1;
- }
- if (lhs.mTerminalId < rhs.mTerminalId) return -1;
- if (lhs.mTerminalId > rhs.mTerminalId) return 1;
- return 0;
+ if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) {
+ binaryDict.flushWithGC();
}
- });
- int count = 0;
- for (final PtNode node : nodes) {
- node.mTerminalId = count++;
- }
-
- MakedictLog.i("Computing addresses...");
- BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, formatOptions);
- if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
-
- writeTerminalData(flatNodes, terminalCount);
- if (formatOptions.mHasTimestamp) {
- initUnigramTimestamps(terminalCount);
- }
- mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir,
- formatOptions.mHasTimestamp);
- writeBigrams(flatNodes, dict);
- mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir);
- writeShortcuts(flatNodes);
-
- final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
- final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
- mTrieBuf = new byte[bufferSize];
-
- MakedictLog.i("Writing file...");
- for (PtNodeArray nodeArray : flatNodes) {
- BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, formatOptions);
}
- if (MakedictLog.DBG) {
- BinaryDictEncoderUtils.showStatistics(flatNodes);
- MakedictLog.i("has " + terminalCount + " terminals.");
+ for (final Word word0 : dict) {
+ if (null == word0.mBigrams) continue;
+ for (final WeightedString word1 : word0.mBigrams) {
+ binaryDict.addBigramWords(word0.mWord, word1.mWord, word1.mFrequency,
+ 0 /* timestamp */);
+ }
+ if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) {
+ binaryDict.flushWithGC();
+ }
}
- mTrieOutStream.write(mTrieBuf);
-
- MakedictLog.i("Done");
- close();
+ binaryDict.flushWithGC();
+ binaryDict.close();
}
@Override
public void setPosition(int position) {
- if (mTrieBuf == null || position < 0 || position > mTrieBuf.length) return;
- mTriePos = position;
}
@Override
public int getPosition() {
- return mTriePos;
+ return 0;
}
@Override
public void writePtNodeCount(int ptNodeCount) {
- final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
- // ptNodeCount must fit on one byte or two bytes.
- // Please see comments in FormatSpec
- if (countSize != 1 && countSize != 2) {
- throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
- }
- final int encodedPtNodeCount = (countSize == 2) ?
- (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
- mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, encodedPtNodeCount,
- countSize);
- }
-
- private void writePtNodeFlags(final PtNode ptNode, final FormatOptions formatOptions) {
- final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
- mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
- BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos, formatOptions),
- FormatSpec.PTNODE_FLAGS_SIZE);
- }
-
- private void writeParentPosition(int parentPos, final PtNode ptNode,
- final FormatOptions formatOptions) {
- if (parentPos != FormatSpec.NO_PARENT_ADDRESS) {
- parentPos -= ptNode.mCachedAddressAfterUpdate;
- }
- mTriePos = BinaryDictEncoderUtils.writeParentAddress(mTrieBuf, mTriePos, parentPos,
- formatOptions);
- }
-
- private void writeCharacters(final int[] characters, final boolean hasSeveralChars) {
- mTriePos = CharEncoding.writeCharArray(characters, mTrieBuf, mTriePos);
- if (hasSeveralChars) {
- mTrieBuf[mTriePos++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
- }
- }
-
- private void writeTerminalId(final int terminalId) {
- mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, terminalId,
- FormatSpec.PTNODE_TERMINAL_ID_SIZE);
- }
-
- private void writeChildrenPosition(PtNode ptNode, FormatOptions formatOptions) {
- final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
- if (formatOptions.supportsDynamicUpdate()) {
- mTriePos += BinaryDictEncoderUtils.writeSignedChildrenPosition(mTrieBuf,
- mTriePos, childrenPos);
- } else {
- mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf,
- mTriePos, childrenPos);
- }
- }
-
- private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict)
- throws IOException {
- mBigramWriter.openStreams();
- for (final PtNodeArray nodeArray : flatNodes) {
- for (final PtNode ptNode : nodeArray.mData) {
- if (ptNode.mBigrams != null) {
- mBigramWriter.writeBigramsForOneWord(ptNode.mTerminalId, ptNode.mBigrams.size(),
- ptNode.mBigrams.iterator(), dict);
- }
- }
- }
- mBigramWriter.closeStreams();
- }
-
- private void writeShortcuts(final ArrayList<PtNodeArray> flatNodes) throws IOException {
- mShortcutWriter.openStreams();
- for (final PtNodeArray nodeArray : flatNodes) {
- for (final PtNode ptNode : nodeArray.mData) {
- if (ptNode.mShortcutTargets != null && !ptNode.mShortcutTargets.isEmpty()) {
- mShortcutWriter.writeShortcutForOneWord(ptNode.mTerminalId,
- ptNode.mShortcutTargets.iterator());
- }
- }
- }
- mShortcutWriter.closeStreams();
}
@Override
public void writeForwardLinkAddress(int forwardLinkAddress) {
- mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
- forwardLinkAddress, FormatSpec.FORWARD_LINK_ADDRESS_SIZE);
}
@Override
- public void writePtNode(final PtNode ptNode, final int parentPosition,
- final FormatOptions formatOptions, final FusionDictionary dict) {
- writePtNodeFlags(ptNode, formatOptions);
- writeParentPosition(parentPosition, ptNode, formatOptions);
- writeCharacters(ptNode.mChars, ptNode.hasSeveralChars());
- if (ptNode.isTerminal()) {
- writeTerminalId(ptNode.mTerminalId);
- }
- writeChildrenPosition(ptNode, formatOptions);
- }
-
- private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes,
- final int terminalCount) throws IOException {
- final byte[] freqBuf = new byte[terminalCount * FormatSpec.FREQUENCY_AND_FLAGS_SIZE];
- final byte[] terminalAddressTableBuf =
- new byte[terminalCount * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE];
- for (final PtNodeArray nodeArray : flatNodes) {
- for (final PtNode ptNode : nodeArray.mData) {
- if (ptNode.isTerminal()) {
- BinaryDictEncoderUtils.writeUIntToBuffer(freqBuf,
- ptNode.mTerminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE,
- ptNode.mFrequency, FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
- BinaryDictEncoderUtils.writeUIntToBuffer(terminalAddressTableBuf,
- ptNode.mTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE,
- ptNode.mCachedAddressAfterUpdate,
- FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
- }
- }
- }
- mFreqOutStream.write(freqBuf);
- mTerminalAddressTableOutStream.write(terminalAddressTableBuf);
- }
-
- private void initUnigramTimestamps(final int terminalCount) throws IOException {
- // Initial value of time stamps for each word is 0.
- final byte[] unigramTimestampBuf =
- new byte[terminalCount * FormatSpec.UNIGRAM_TIMESTAMP_SIZE];
- mUnigramTimestampOutStream.write(unigramTimestampBuf);
+ public void writePtNode(
+ PtNode ptNode, int parentPosition, FormatOptions formatOptions, FusionDictionary dict) {
}
}