about | summary | refs | log | tree | commit | diff | stats
path: root/java/src/com/android/inputmethod/latin/makedict
diff options
context:
space:
mode:
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict')
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java 48
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java 102
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java 60
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java 88
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/DictDecoder.java 11
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java 6
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/FormatSpec.java 152
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java 147
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/SparseTableContentReader.java 120
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/SparseTableContentUpdater.java 123
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/SparseTableContentWriter.java 93
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java 2
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java 340
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java 103
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java 723
15 files changed, 549 insertions, 1569 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java
index f8fa68f45..fda97dafc 100644
--- a/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java
+++ b/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java
@@ -32,35 +32,36 @@ import java.util.TreeMap;
* A base class of the binary dictionary decoder.
*/
public abstract class AbstractDictDecoder implements DictDecoder {
- private static final int SUCCESS = 0;
- private static final int ERROR_CANNOT_READ = 1;
- private static final int ERROR_WRONG_FORMAT = 2;
-
- protected FileHeader readHeader(final DictBuffer headerBuffer)
+ protected FileHeader readHeader(final DictBuffer dictBuffer)
throws IOException, UnsupportedFormatException {
- if (headerBuffer == null) {
+ if (dictBuffer == null) {
openDictBuffer();
}
- final int version = HeaderReader.readVersion(headerBuffer);
+ final int version = HeaderReader.readVersion(dictBuffer);
if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION
|| version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) {
throw new UnsupportedFormatException("Unsupported version : " + version);
}
// TODO: Remove this field.
- final int optionsFlags = HeaderReader.readOptionFlags(headerBuffer);
- final int headerSize = HeaderReader.readHeaderSize(headerBuffer);
+ final int optionsFlags = HeaderReader.readOptionFlags(dictBuffer);
+
+ final int headerSize = HeaderReader.readHeaderSize(dictBuffer);
+
if (headerSize < 0) {
throw new UnsupportedFormatException("header size can't be negative.");
}
- final HashMap<String, String> attributes = HeaderReader.readAttributes(headerBuffer,
+ final HashMap<String, String> attributes = HeaderReader.readAttributes(dictBuffer,
headerSize);
final FileHeader header = new FileHeader(headerSize,
- new FusionDictionary.DictionaryOptions(attributes),
- new FormatOptions(version,
- 0 != (optionsFlags & FormatSpec.CONTAINS_TIMESTAMP_FLAG)));
+ new FusionDictionary.DictionaryOptions(attributes,
+ 0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG),
+ 0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)),
+ new FormatOptions(version,
+ 0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE),
+ 0 != (optionsFlags & FormatSpec.CONTAINS_TIMESTAMP_FLAG)));
return header;
}
@@ -203,25 +204,4 @@ public abstract class AbstractDictDecoder implements DictDecoder {
return readLength;
}
}
-
- /**
- * Check whether the header contains the expected information. This is a no-error method,
- * that will return an error code and never throw a checked exception.
- * @return an error code, either ERROR_* or SUCCESS.
- */
- private int checkHeader() {
- try {
- readHeader();
- } catch (IOException e) {
- return ERROR_CANNOT_READ;
- } catch (UnsupportedFormatException e) {
- return ERROR_WRONG_FORMAT;
- }
- return SUCCESS;
- }
-
- @Override
- public boolean hasValidRawBinaryDictionary() {
- return checkHeader() == SUCCESS;
- }
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
index 7f0aa777f..216492b4d 100644
--- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
+++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
@@ -24,9 +24,12 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Map;
import java.util.TreeMap;
@@ -166,14 +169,6 @@ public final class BinaryDictDecoderUtils {
return size;
}
- static int getCharArraySize(final int[] chars, final int start, final int end) {
- int size = 0;
- for (int i = start; i < end; ++i) {
- size += getCharSize(chars[i]);
- }
- return size;
- }
-
/**
* Writes a char array to a byte buffer.
*
@@ -205,7 +200,8 @@ public final class BinaryDictDecoderUtils {
* @param word the string to write.
* @return the size written, in bytes.
*/
- static int writeString(final byte[] buffer, final int origin, final String word) {
+ static int writeString(final byte[] buffer, final int origin,
+ final String word) {
final int length = word.length();
int index = origin;
for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
@@ -227,62 +223,22 @@ public final class BinaryDictDecoderUtils {
*
* This will also write the terminator byte.
*
- * @param stream the OutputStream to write to.
+ * @param buffer the OutputStream to write to.
* @param word the string to write.
- * @return the size written, in bytes.
*/
- static int writeString(final OutputStream stream, final String word) throws IOException {
+ static void writeString(final OutputStream buffer, final String word) throws IOException {
final int length = word.length();
- int written = 0;
for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
final int codePoint = word.codePointAt(i);
- final int charSize = getCharSize(codePoint);
- if (1 == charSize) {
- stream.write((byte) codePoint);
- } else {
- stream.write((byte) (0xFF & (codePoint >> 16)));
- stream.write((byte) (0xFF & (codePoint >> 8)));
- stream.write((byte) (0xFF & codePoint));
- }
- written += charSize;
- }
- stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
- written += FormatSpec.PTNODE_TERMINATOR_SIZE;
- return written;
- }
-
- /**
- * Writes an array of code points with our character format to an OutputStream.
- *
- * This will also write the terminator byte.
- *
- * @param stream the OutputStream to write to.
- * @param codePoints the array of code points
- * @return the size written, in bytes.
- */
- // TODO: Merge this method with writeCharArray and rename the various write* methods to
- // make the difference clear.
- static int writeCodePoints(final OutputStream stream, final int[] codePoints,
- final int startIndex, final int endIndex)
- throws IOException {
- int written = 0;
- for (int i = startIndex; i < endIndex; ++i) {
- final int codePoint = codePoints[i];
- final int charSize = getCharSize(codePoint);
- if (1 == charSize) {
- stream.write((byte) codePoint);
+ if (1 == getCharSize(codePoint)) {
+ buffer.write((byte) codePoint);
} else {
- stream.write((byte) (0xFF & (codePoint >> 16)));
- stream.write((byte) (0xFF & (codePoint >> 8)));
- stream.write((byte) (0xFF & codePoint));
+ buffer.write((byte) (0xFF & (codePoint >> 16)));
+ buffer.write((byte) (0xFF & (codePoint >> 8)));
+ buffer.write((byte) (0xFF & codePoint));
}
- written += charSize;
}
- if (endIndex - startIndex > 1) {
- stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
- written += FormatSpec.PTNODE_TERMINATOR_SIZE;
- }
- return written;
+ buffer.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
}
/**
@@ -330,7 +286,7 @@ public final class BinaryDictDecoderUtils {
static int readChildrenAddress(final DictBuffer dictBuffer,
final int optionFlags, final FormatOptions options) {
- if (options.supportsDynamicUpdate()) {
+ if (options.mSupportsDynamicUpdate) {
final int address = dictBuffer.readUnsignedInt24();
if (address == 0) return FormatSpec.NO_CHILDREN_ADDRESS;
if ((address & FormatSpec.MSB24) != 0) {
@@ -540,11 +496,11 @@ public final class BinaryDictDecoderUtils {
}
// reach the end of the array.
- if (options.supportsDynamicUpdate()) {
+ if (options.mSupportsDynamicUpdate) {
final boolean hasValidForwardLink = dictDecoder.readAndFollowForwardLink();
if (!hasValidForwardLink) break;
}
- } while (options.supportsDynamicUpdate() && dictDecoder.hasNextPtNodeArray());
+ } while (options.mSupportsDynamicUpdate && dictDecoder.hasNextPtNodeArray());
final PtNodeArray nodeArray = new PtNodeArray(nodeArrayContents);
nodeArray.mCachedAddressBeforeUpdate = nodeArrayOriginPos;
@@ -600,7 +556,7 @@ public final class BinaryDictDecoderUtils {
Map<Integer, PtNodeArray> reverseNodeArrayMapping = new TreeMap<Integer, PtNodeArray>();
Map<Integer, PtNode> reversePtNodeMapping = new TreeMap<Integer, PtNode>();
- final PtNodeArray root = readNodeArray(dictDecoder, fileHeader.mBodyOffset,
+ final PtNodeArray root = readNodeArray(dictDecoder, fileHeader.mHeaderSize,
reverseNodeArrayMapping, reversePtNodeMapping, fileHeader.mFormatOptions);
FusionDictionary newDict = new FusionDictionary(root, fileHeader.mDictionaryOptions);
@@ -636,10 +592,32 @@ public final class BinaryDictDecoderUtils {
/**
* Basic test to find out whether the file is a binary dictionary or not.
*
+ * Concretely this only tests the magic number.
+ *
* @param file The file to test.
* @return true if it's a binary dictionary, false otherwise
*/
public static boolean isBinaryDictionary(final File file) {
- return FormatSpec.getDictDecoder(file).hasValidRawBinaryDictionary();
+ FileInputStream inStream = null;
+ try {
+ inStream = new FileInputStream(file);
+ final ByteBuffer buffer = inStream.getChannel().map(
+ FileChannel.MapMode.READ_ONLY, 0, file.length());
+ final int version = getFormatVersion(new ByteBufferDictBuffer(buffer));
+ return (version >= FormatSpec.MINIMUM_SUPPORTED_VERSION
+ && version <= FormatSpec.MAXIMUM_SUPPORTED_VERSION);
+ } catch (FileNotFoundException e) {
+ return false;
+ } catch (IOException e) {
+ return false;
+ } finally {
+ if (inStream != null) {
+ try {
+ inStream.close();
+ } catch (IOException e) {
+ // do nothing
+ }
+ }
+ }
}
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java
index 8ba0797de..f761829de 100644
--- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java
+++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java
@@ -17,9 +17,9 @@
package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
-import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
+import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
@@ -160,7 +160,7 @@ public class BinaryDictEncoderUtils {
node.mCachedSize = nodeSize;
size += nodeSize;
}
- if (options.supportsDynamicUpdate()) {
+ if (options.mSupportsDynamicUpdate) {
size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
}
ptNodeArray.mCachedSize = size;
@@ -245,26 +245,6 @@ public class BinaryDictEncoderUtils {
}
}
- static void writeUIntToDictBuffer(final DictBuffer dictBuffer, final int value,
- final int size) {
- switch(size) {
- case 4:
- dictBuffer.put((byte) ((value >> 24) & 0xFF));
- /* fall through */
- case 3:
- dictBuffer.put((byte) ((value >> 16) & 0xFF));
- /* fall through */
- case 2:
- dictBuffer.put((byte) ((value >> 8) & 0xFF));
- /* fall through */
- case 1:
- dictBuffer.put((byte) (value & 0xFF));
- break;
- default:
- /* nop */
- }
- }
-
// End utility methods
// This method is responsible for finding a nice ordering of the nodes that favors run-time
@@ -397,7 +377,7 @@ public class BinaryDictEncoderUtils {
nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE;
}
}
- if (formatOptions.supportsDynamicUpdate()) {
+ if (formatOptions.mSupportsDynamicUpdate) {
nodeSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
} else if (null != ptNode.mChildren) {
nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray,
@@ -417,7 +397,7 @@ public class BinaryDictEncoderUtils {
ptNode.mCachedSize = nodeSize;
size += nodeSize;
}
- if (formatOptions.supportsDynamicUpdate()) {
+ if (formatOptions.mSupportsDynamicUpdate) {
size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
}
if (ptNodeArray.mCachedSize != size) {
@@ -533,7 +513,7 @@ public class BinaryDictEncoderUtils {
if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug");
} while (changesDone);
- if (formatOptions.supportsDynamicUpdate()) {
+ if (formatOptions.mSupportsDynamicUpdate) {
computeParentAddresses(flatNodes);
}
final PtNodeArray lastPtNodeArray = flatNodes.get(flatNodes.size() - 1);
@@ -642,7 +622,7 @@ public class BinaryDictEncoderUtils {
byte flags = 0;
if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS;
if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL;
- if (formatOptions.supportsDynamicUpdate()) {
+ if (formatOptions.mSupportsDynamicUpdate) {
flags |= FormatSpec.FLAG_IS_NOT_MOVED;
} else if (true) {
switch (childrenAddressSize) {
@@ -710,13 +690,6 @@ public class BinaryDictEncoderUtils {
+ word + " is " + unigramFrequency);
bigramFrequency = unigramFrequency;
}
- bigramFlags += getBigramFrequencyDiff(unigramFrequency, bigramFrequency)
- & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY;
- return bigramFlags;
- }
-
- public static int getBigramFrequencyDiff(final int unigramFrequency,
- final int bigramFrequency) {
// We compute the difference between 255 (which means probability = 1) and the
// unigram score. We split this into a number of discrete steps.
// Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15
@@ -750,15 +723,22 @@ public class BinaryDictEncoderUtils {
// include this bigram in the dictionary. For now, register as 0, and live with the
// small over-estimation that we get in this case. TODO: actually remove this bigram
// if discretizedFrequency < 0.
- return discretizedFrequency > 0 ? discretizedFrequency : 0;
+ final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0;
+ bigramFlags += finalBigramFrequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY;
+ return bigramFlags;
}
/**
- * Makes the 2-byte value for options flags. Unused at the moment, and always 0.
+ * Makes the 2-byte value for options flags.
*/
- private static final int makeOptionsValue(final FormatOptions formatOptions) {
- // TODO: why doesn't this handle CONTAINS_TIMESTAMP_FLAG?
- return 0;
+ private static final int makeOptionsValue(final FusionDictionary dictionary,
+ final FormatOptions formatOptions) {
+ final DictionaryOptions options = dictionary.mOptions;
+ final boolean hasBigrams = dictionary.hasBigrams();
+ return (options.mFrenchLigatureProcessing ? FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG : 0)
+ + (options.mGermanUmlautProcessing ? FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG : 0)
+ + (hasBigrams ? FormatSpec.CONTAINS_BIGRAMS_FLAG : 0)
+ + (formatOptions.mSupportsDynamicUpdate ? FormatSpec.SUPPORTS_DYNAMIC_UPDATE : 0);
}
/**
@@ -846,7 +826,7 @@ public class BinaryDictEncoderUtils {
}
dictEncoder.writePtNode(ptNode, parentPosition, formatOptions, dict);
}
- if (formatOptions.supportsDynamicUpdate()) {
+ if (formatOptions.mSupportsDynamicUpdate) {
dictEncoder.writeForwardLinkAddress(FormatSpec.NO_FORWARD_LINK_ADDRESS);
}
if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate
@@ -947,7 +927,7 @@ public class BinaryDictEncoderUtils {
headerBuffer.write((byte) (0xFF & version));
// Options flags
- final int options = makeOptionsValue(formatOptions);
+ final int options = makeOptionsValue(dict, formatOptions);
headerBuffer.write((byte) (0xFF & (options >> 8)));
headerBuffer.write((byte) (0xFF & options));
final int headerSizeOffset = headerBuffer.size();
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
index 640d778bb..d5516ef46 100644
--- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
+++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
@@ -62,7 +62,7 @@ public final class BinaryDictIOUtils {
* Retrieves all node arrays without recursive call.
*/
private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder,
- final int bodyOffset, final Map<Integer, String> words,
+ final int headerSize, final Map<Integer, String> words,
final Map<Integer, Integer> frequencies,
final Map<Integer, ArrayList<PendingAttribute>> bigrams,
final FormatOptions formatOptions) {
@@ -71,7 +71,7 @@ public final class BinaryDictIOUtils {
Stack<Position> stack = new Stack<Position>();
int index = 0;
- Position initPos = new Position(bodyOffset, 0);
+ Position initPos = new Position(headerSize, 0);
stack.push(initPos);
while (!stack.empty()) {
@@ -112,7 +112,7 @@ public final class BinaryDictIOUtils {
}
if (p.mPosition == p.mNumOfPtNode) {
- if (formatOptions.supportsDynamicUpdate()) {
+ if (formatOptions.mSupportsDynamicUpdate) {
final boolean hasValidForwardLinkAddress =
dictDecoder.readAndFollowForwardLink();
if (hasValidForwardLinkAddress && dictDecoder.hasNextPtNodeArray()) {
@@ -154,7 +154,7 @@ public final class BinaryDictIOUtils {
UnsupportedFormatException {
// Read header
final FileHeader header = dictDecoder.readHeader();
- readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words,
+ readUnigramsAndBigramsBinaryInner(dictDecoder, header.mHeaderSize, words,
frequencies, bigrams, header.mFormatOptions);
}
@@ -228,7 +228,7 @@ public final class BinaryDictIOUtils {
// a forward link address that we need to consult and possibly resume
// search on the next node array in the linked list.
if (foundNextPtNode) break;
- if (!header.mFormatOptions.supportsDynamicUpdate()) {
+ if (!header.mFormatOptions.mSupportsDynamicUpdate) {
return FormatSpec.NOT_VALID_WORD;
}
@@ -245,7 +245,8 @@ public final class BinaryDictIOUtils {
/**
* @return the size written, in bytes. Always 3 bytes.
*/
- static int writeSInt24ToBuffer(final DictBuffer dictBuffer, final int value) {
+ static int writeSInt24ToBuffer(final DictBuffer dictBuffer,
+ final int value) {
final int absValue = Math.abs(value);
dictBuffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
dictBuffer.put((byte)((absValue >> 8) & 0xFF));
@@ -300,6 +301,35 @@ public final class BinaryDictIOUtils {
}
/**
+ * Write a string to a stream.
+ *
+ * @param destination the stream to write.
+ * @param word the string to be written.
+ * @return the size written, in bytes.
+ * @throws IOException
+ */
+ private static int writeString(final OutputStream destination, final String word)
+ throws IOException {
+ int size = 0;
+ final int length = word.length();
+ for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
+ final int codePoint = word.codePointAt(i);
+ if (CharEncoding.getCharSize(codePoint) == 1) {
+ destination.write((byte)codePoint);
+ size++;
+ } else {
+ destination.write((byte)(0xFF & (codePoint >> 16)));
+ destination.write((byte)(0xFF & (codePoint >> 8)));
+ destination.write((byte)(0xFF & codePoint));
+ size += 3;
+ }
+ }
+ destination.write((byte)FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
+ size += FormatSpec.PTNODE_TERMINATOR_SIZE;
+ return size;
+ }
+
+ /**
* Write a PtNode to an output stream from a PtNodeInfo.
* A PtNode is an in-memory representation of a node in the patricia trie.
* A PtNode info is a container for low-level information about how the
@@ -357,7 +387,7 @@ public final class BinaryDictIOUtils {
destination.write((byte)BinaryDictEncoderUtils.makeShortcutFlags(
shortcutIterator.hasNext(), target.mFrequency));
size++;
- size += CharEncoding.writeString(destination, target.mWord);
+ size += writeString(destination, target.mWord);
}
}
@@ -415,27 +445,6 @@ public final class BinaryDictIOUtils {
}
/**
- * Writes a PtNodeCount to the stream.
- *
- * @param destination the stream to write.
- * @param ptNodeCount the count.
- * @return the size written in bytes.
- */
- static int writePtNodeCount(final OutputStream destination, final int ptNodeCount)
- throws IOException {
- final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
- // the count must fit on one byte or two bytes.
- // Please see comments in FormatSpec.
- if (countSize != 1 && countSize != 2) {
- throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
- }
- final int encodedPtNodeCount = (countSize == 2) ?
- (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
- BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize);
- return countSize;
- }
-
- /**
* Write a node array to the stream.
*
* @param destination the stream to write.
@@ -445,7 +454,20 @@ public final class BinaryDictIOUtils {
*/
static int writeNodes(final OutputStream destination, final PtNodeInfo[] infos)
throws IOException {
- int size = writePtNodeCount(destination, infos.length);
+ int size = getPtNodeCountSize(infos.length);
+ switch (getPtNodeCountSize(infos.length)) {
+ case 1:
+ destination.write((byte)infos.length);
+ break;
+ case 2:
+ final int encodedPtNodeCount =
+ infos.length | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG;
+ destination.write((byte)(encodedPtNodeCount >> 8));
+ destination.write((byte)(encodedPtNodeCount & 0xFF));
+ break;
+ default:
+ throw new RuntimeException("Invalid node count size.");
+ }
for (final PtNodeInfo info : infos) size += writePtNode(destination, info);
writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS);
return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
@@ -507,7 +529,7 @@ public final class BinaryDictIOUtils {
* Helper method to check whether the node is moved.
*/
public static boolean isMovedPtNode(final int flags, final FormatOptions options) {
- return options.supportsDynamicUpdate()
+ return options.mSupportsDynamicUpdate
&& ((flags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) == FormatSpec.FLAG_IS_MOVED);
}
@@ -516,14 +538,14 @@ public final class BinaryDictIOUtils {
*/
public static boolean supportsDynamicUpdate(final FormatOptions options) {
return options.mVersion >= FormatSpec.FIRST_VERSION_WITH_DYNAMIC_UPDATE
- && options.supportsDynamicUpdate();
+ && options.mSupportsDynamicUpdate;
}
/**
* Helper method to check whether the node is deleted.
*/
public static boolean isDeletedPtNode(final int flags, final FormatOptions formatOptions) {
- return formatOptions.supportsDynamicUpdate()
+ return formatOptions.mSupportsDynamicUpdate
&& ((flags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) == FormatSpec.FLAG_IS_DELETED);
}
@@ -546,7 +568,7 @@ public final class BinaryDictIOUtils {
static int getChildrenAddressSize(final int optionFlags,
final FormatOptions formatOptions) {
- if (formatOptions.supportsDynamicUpdate()) return FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
+ if (formatOptions.mSupportsDynamicUpdate) return FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
return 1;
diff --git a/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java
index b4838f00f..3dbeee099 100644
--- a/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java
+++ b/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java
@@ -35,7 +35,6 @@ import java.util.TreeMap;
/**
* An interface of binary dictionary decoders.
*/
-// TODO: Straighten out responsibility for the buffer's file pointer.
public interface DictDecoder {
/**
@@ -44,7 +43,7 @@ public interface DictDecoder {
public FileHeader readHeader() throws IOException, UnsupportedFormatException;
/**
- * Reads PtNode from ptNodePos.
+ * Reads PtNode from nodeAddress.
* @param ptNodePos the position of PtNode.
* @param formatOptions the format options.
* @return PtNodeInfo.
@@ -128,8 +127,7 @@ public interface DictDecoder {
* Opens the dictionary file and makes DictBuffer.
*/
@UsedForTesting
- public void openDictBuffer() throws FileNotFoundException, IOException,
- UnsupportedFormatException;
+ public void openDictBuffer() throws FileNotFoundException, IOException;
@UsedForTesting
public boolean isDictBufferOpen();
@@ -230,9 +228,4 @@ public interface DictDecoder {
}
public void skipPtNode(final FormatOptions formatOptions);
-
- /**
- * @return whether this decoder has a valid binary dictionary that it can decode.
- */
- public boolean hasValidRawBinaryDictionary();
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java b/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java
index ff03190a3..28da9ffdd 100644
--- a/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java
+++ b/java/src/com/android/inputmethod/latin/makedict/DynamicBinaryDictIOUtils.java
@@ -37,7 +37,7 @@ import java.util.Arrays;
@UsedForTesting
public final class DynamicBinaryDictIOUtils {
private static final boolean DBG = false;
- static final int MAX_JUMPS = 10000;
+ private static final int MAX_JUMPS = 10000;
private DynamicBinaryDictIOUtils() {
// This utility class is not publicly instantiable.
@@ -61,7 +61,7 @@ public final class DynamicBinaryDictIOUtils {
final DictBuffer dictBuffer = dictUpdater.getDictBuffer();
final int originalPosition = dictBuffer.position();
dictBuffer.position(ptNodeOriginAddress);
- if (!formatOptions.supportsDynamicUpdate()) {
+ if (!formatOptions.mSupportsDynamicUpdate) {
throw new RuntimeException("this file format does not support parent addresses");
}
final int flags = dictBuffer.readUnsignedByte();
@@ -102,7 +102,7 @@ public final class DynamicBinaryDictIOUtils {
}
if (!dictUpdater.readAndFollowForwardLink()) break;
if (dictUpdater.getPosition() == FormatSpec.NO_FORWARD_LINK_ADDRESS) break;
- } while (formatOptions.supportsDynamicUpdate());
+ } while (formatOptions.mSupportsDynamicUpdate);
dictUpdater.setPosition(originalPosition);
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java
index 20ddba836..b56234f6d 100644
--- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java
+++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java
@@ -40,8 +40,12 @@ public final class FormatSpec {
* p | not used 3 bits
* t | each unigram and bigram entry has a time stamp?
* i | 1 bit, 1 = yes, 0 = no : CONTAINS_TIMESTAMP_FLAG
- * o |
- * nflags
+ * o | has bigrams ? 1 bit, 1 = yes, 0 = no : CONTAINS_BIGRAMS_FLAG
+ * n | FRENCH_LIGATURE_PROCESSING_FLAG
+ * f | supports dynamic updates ? 1 bit, 1 = yes, 0 = no : SUPPORTS_DYNAMIC_UPDATE
+ * l | GERMAN_UMLAUT_PROCESSING_FLAG
+ * a |
+ * gs
*
* h |
* e | size of the file header, 4bytes
@@ -78,36 +82,45 @@ public final class FormatSpec {
* s
*
* f |
- * o | forward link address, 3byte
- * r | 1 byte = bbbbbbbb match
- * w | case 1xxxxxxx => -((xxxxxxx << 16) + (next byte << 8) + next byte)
- * a | otherwise => (xxxxxxx << 16) + (next byte << 8) + next byte
- * r |
- * dlinkaddress
+ * o | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header)
+ * r | forward link address, 3byte
+ * w | 1 byte = bbbbbbbb match
+ * a | case 1xxxxxxx => -((xxxxxxx << 16) + (next byte << 8) + next byte)
+ * r | otherwise => (xxxxxxx << 16) + (next byte << 8) + next byte
+ * d |
+ * linkaddress
*/
/* Node (FusionDictionary.PtNode) layout is as follows:
- * | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED
- * | This must be the same as FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES
- * | 01 = yes : FLAG_IS_MOVED
- * f | the new address is stored in the same place as the parent address
- * l | is deleted? 10 = yes : FLAG_IS_DELETED
- * a | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS
- * g | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL
- * s | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
+ * | IF !SUPPORTS_DYNAMIC_UPDATE
+ * | addressType xx : mask with MASK_CHILDREN_ADDRESS_TYPE
+ * | 2 bits, 00 = no children : FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS
+ * f | 01 = 1 byte : FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE
+ * l | 10 = 2 bytes : FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES
+ * a | 11 = 3 bytes : FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES
+ * g | ELSE
+ * s | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED
+ * | This must be the same as FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES
+ * | 01 = yes : FLAG_IS_MOVED
+ * | the new address is stored in the same place as the parent address
+ * | is deleted? 10 = yes : FLAG_IS_DELETED
+ * | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS
+ * | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL
+ * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
* | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS
* | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD
* | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED
*
* p |
- * a | parent address, 3byte
- * r | 1 byte = bbbbbbbb match
- * e | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte)
- * n | otherwise => (bbbbbbbb << 16) + (next byte << 8) + next byte
- * t | This address is relative to the head of the PtNode.
- * a | If the node doesn't have a parent, this field is set to 0.
+ * a | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header)
+ * r | parent address, 3byte
+ * e | 1 byte = bbbbbbbb match
+ * n | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte)
+ * t | otherwise => (bbbbbbbb << 16) + (next byte << 8) + next byte
+ * a | This address is relative to the head of the PtNode.
+ * d | If the node doesn't have a parent, this field is set to 0.
* d |
- * dress
+ * ress
*
* c | IF FLAG_HAS_MULTIPLE_CHARS
* h | char, char, char, char n * (1 or 3 bytes) : use PtNodeInfo for i/o helpers
@@ -121,16 +134,23 @@ public final class FormatSpec {
* e | frequency 1 byte
* q |
*
- * c |
- * h | children address, 3 bytes
- * i | 1 byte = bbbbbbbb match
- * l | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte)
- * d | otherwise => (bbbbbbbb<<16) + (next byte << 8) + next byte
- * r | if this node doesn't have children, this field is set to 0.
- * e | (see BinaryDictEncoderUtils#writeVariableSignedAddress)
- * n | This address is relative to the position of this field.
- * a |
- * ddress
+ * c | IF SUPPORTS_DYNAMIC_UPDATE
+ * h | children address, 3 bytes
+ * i | 1 byte = bbbbbbbb match
+ * l | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte)
+ * d | otherwise => (bbbbbbbb<<16) + (next byte << 8) + next byte
+ * r | if this node doesn't have children, this field is set to 0.
+ * e | (see BinaryDictEncoderUtils#writeVariableSignedAddress)
+ * n | ELSIF 00 = FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS == addressType
+ * a | // nothing
+ * d | ELSIF 01 = FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE == addressType
+ * d | children address, 1 byte
+ * r | ELSIF 10 = FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES == addressType
+ * e | children address, 2 bytes
+ * s | ELSE // 11 = FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES = addressType
+ * s | children address, 3 bytes
+ * | END
+ * | This address is relative to the position of this field.
*
* | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS
* | shortcut string list
@@ -179,22 +199,20 @@ public final class FormatSpec {
*/
public static final int MAGIC_NUMBER = 0x9BC13AFE;
+ static final int MINIMUM_SUPPORTED_VERSION = 2;
+ static final int MAXIMUM_SUPPORTED_VERSION = 4;
static final int NOT_A_VERSION_NUMBER = -1;
static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3;
static final int FIRST_VERSION_WITH_TERMINAL_ID = 4;
-
- // These MUST have the same values as the relevant constants in format_utils.h.
- // From version 4 on, we use version * 100 + revision as a version number. That allows
- // us to change the format during development while having testing devices remove
- // older files with each upgrade, while still having a readable versioning scheme.
- public static final int VERSION2 = 2;
- public static final int VERSION3 = 3;
- public static final int VERSION4 = 400;
- static final int MINIMUM_SUPPORTED_VERSION = VERSION2;
- static final int MAXIMUM_SUPPORTED_VERSION = VERSION4;
+ static final int VERSION3 = 3;
+ static final int VERSION4 = 4;
// These options need to be the same numeric values as the one in the native reading code.
+ static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
// TODO: Make the native reading code read this variable.
+ static final int SUPPORTS_DYNAMIC_UPDATE = 0x2;
+ static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
+ static final int CONTAINS_BIGRAMS_FLAG = 0x8;
static final int CONTAINS_TIMESTAMP_FLAG = 0x10;
// TODO: Make this value adaptative to content data, store it in the header, and
@@ -245,10 +263,8 @@ public final class FormatSpec {
static final int PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
static final int PTNODE_SHORTCUT_LIST_SIZE_SIZE = 2;
- // These values are used only by version 4 or later. They MUST match the definitions in
- // ver4_dict_constants.cpp.
+ // These values are used only by version 4 or later.
static final String TRIE_FILE_EXTENSION = ".trie";
- public static final String HEADER_FILE_EXTENSION = ".header";
static final String FREQ_FILE_EXTENSION = ".freq";
static final String UNIGRAM_TIMESTAMP_FILE_EXTENSION = ".timestamp";
// tat = Terminal Address Table
@@ -262,9 +278,9 @@ public final class FormatSpec {
static final int UNIGRAM_TIMESTAMP_SIZE = 4;
// With the English main dictionary as of October 2013, the size of bigram address table is
- // is 345KB with the block size being 16.
- // This is 54% of that of full address table.
- static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16;
+ // 584KB with the block size being 4.
+ // This is 91% of that of full address table.
+ static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
static final int BIGRAM_CONTENT_COUNT = 2;
static final int BIGRAM_FREQ_CONTENT_INDEX = 0;
static final int BIGRAM_TIMESTAMP_CONTENT_INDEX = 1;
@@ -277,7 +293,7 @@ public final class FormatSpec {
static final int SHORTCUT_CONTENT_COUNT = 1;
static final int SHORTCUT_CONTENT_INDEX = 0;
// With the English main dictionary as of October 2013, the size of shortcut address table is
- // 26KB with the block size being 64.
+ // 29KB with the block size being 64.
// This is only 4.4% of that of full address table.
static final int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
static final String SHORTCUT_CONTENT_ID = "_shortcut";
@@ -315,36 +331,43 @@ public final class FormatSpec {
*/
public static final class FormatOptions {
public final int mVersion;
+ public final boolean mSupportsDynamicUpdate;
public final boolean mHasTerminalId;
public final boolean mHasTimestamp;
-
@UsedForTesting
public FormatOptions(final int version) {
- this(version, false /* hasTimestamp */);
+ this(version, false);
}
- public FormatOptions(final int version, final boolean hasTimestamp) {
+ @UsedForTesting
+ public FormatOptions(final int version, final boolean supportsDynamicUpdate) {
+ this(version, supportsDynamicUpdate, false /* hasTimestamp */);
+ }
+
+ public FormatOptions(final int version, final boolean supportsDynamicUpdate,
+ final boolean hasTimestamp) {
mVersion = version;
+ if (version < FIRST_VERSION_WITH_DYNAMIC_UPDATE && supportsDynamicUpdate) {
+ throw new RuntimeException("Dynamic updates are only supported with versions "
+ + FIRST_VERSION_WITH_DYNAMIC_UPDATE + " and ulterior.");
+ }
+ mSupportsDynamicUpdate = supportsDynamicUpdate;
mHasTerminalId = (version >= FIRST_VERSION_WITH_TERMINAL_ID);
mHasTimestamp = hasTimestamp;
}
-
- public boolean supportsDynamicUpdate() {
- return mVersion >= FIRST_VERSION_WITH_DYNAMIC_UPDATE;
- }
}
/**
* Class representing file header.
*/
public static final class FileHeader {
- public final int mBodyOffset;
+ public final int mHeaderSize;
public final DictionaryOptions mDictionaryOptions;
public final FormatOptions mFormatOptions;
// Note that these are corresponding definitions in native code in latinime::HeaderPolicy
// and latinime::HeaderReadWriteUtils.
+ public static final String SUPPORTS_DYNAMIC_UPDATE_ATTRIBUTE = "SUPPORTS_DYNAMIC_UPDATE";
public static final String USES_FORGETTING_CURVE_ATTRIBUTE = "USES_FORGETTING_CURVE";
- public static final String HAS_HISTORICAL_INFO_ATTRIBUTE = "HAS_HISTORICAL_INFO";
public static final String ATTRIBUTE_VALUE_TRUE = "1";
public static final String DICTIONARY_VERSION_ATTRIBUTE = "version";
@@ -353,18 +376,9 @@ public final class FormatSpec {
private static final String DICTIONARY_DESCRIPTION_ATTRIBUTE = "description";
public FileHeader(final int headerSize, final DictionaryOptions dictionaryOptions,
final FormatOptions formatOptions) {
+ mHeaderSize = headerSize;
mDictionaryOptions = dictionaryOptions;
mFormatOptions = formatOptions;
- mBodyOffset = formatOptions.mVersion < VERSION4 ? headerSize : 0;
- if (null == getLocaleString()) {
- throw new RuntimeException("Cannot create a FileHeader without a locale");
- }
- if (null == getVersion()) {
- throw new RuntimeException("Cannot create a FileHeader without a version");
- }
- if (null == getId()) {
- throw new RuntimeException("Cannot create a FileHeader without an ID");
- }
}
// Helper method to get the locale as a String
diff --git a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
index fdf2ae7b5..3bb218bea 100644
--- a/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
+++ b/java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java
@@ -303,9 +303,14 @@ public final class FusionDictionary implements Iterable<Word> {
* Options global to the dictionary.
*/
public static final class DictionaryOptions {
+ public final boolean mGermanUmlautProcessing;
+ public final boolean mFrenchLigatureProcessing;
public final HashMap<String, String> mAttributes;
- public DictionaryOptions(final HashMap<String, String> attributes) {
+ public DictionaryOptions(final HashMap<String, String> attributes,
+ final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) {
mAttributes = attributes;
+ mGermanUmlautProcessing = germanUmlautProcessing;
+ mFrenchLigatureProcessing = frenchLigatureProcessing;
}
@Override
public String toString() { // Convenience method
@@ -334,6 +339,14 @@ public final class FusionDictionary implements Iterable<Word> {
}
s.append("\n");
}
+ if (mGermanUmlautProcessing) {
+ s.append(indent);
+ s.append("Needs German umlaut processing\n");
+ }
+ if (mFrenchLigatureProcessing) {
+ s.append(indent);
+ s.append("Needs French ligature processing\n");
+ }
return s.toString();
}
}
@@ -688,6 +701,138 @@ public final class FusionDictionary implements Iterable<Word> {
}
/**
+ * Recursively count the number of node arrays in a given branch of the trie.
+ *
+ * @param nodeArray the root node array of the branch to count.
+ * @return the number of node arrays in this branch, including the given one.
+ */
+ public static int countNodeArrays(final PtNodeArray nodeArray) {
+ int size = 1;
+ for (int i = nodeArray.mData.size() - 1; i >= 0; --i) {
+ PtNode ptNode = nodeArray.mData.get(i);
+ if (null != ptNode.mChildren)
+ size += countNodeArrays(ptNode.mChildren);
+ }
+ return size;
+ }
+
+ // Recursively find out whether there are any bigrams.
+ // This can be pretty expensive especially if there aren't any (we return as soon
+ // as we find one, so it's much cheaper if there are bigrams)
+ private static boolean hasBigramsInternal(final PtNodeArray nodeArray) {
+ if (null == nodeArray) return false;
+ for (int i = nodeArray.mData.size() - 1; i >= 0; --i) {
+ PtNode ptNode = nodeArray.mData.get(i);
+ if (null != ptNode.mBigrams) return true;
+ if (hasBigramsInternal(ptNode.mChildren)) return true;
+ }
+ return false;
+ }
+
+ /**
+ * Finds out whether there are any bigrams in this dictionary.
+ *
+ * @return true if there is any bigram, false otherwise.
+ */
+ // TODO: this is expensive especially for large dictionaries without any bigram.
+ // The up side is, this is always accurate and correct and uses no memory. We should
+ // find a more efficient way of doing this, without compromising too much on memory
+ // and ease of use.
+ public boolean hasBigrams() {
+ return hasBigramsInternal(mRootNodeArray);
+ }
+
+ // Historically, the tails of the words were going to be merged to save space.
+ // However, that would prevent the code from searching for a specific address in log(n)
+ // time so this was abandoned.
+ // The code is still of interest as it does add some compression to any dictionary
+ // that has no need for attributes. Implementations that do not read attributes should be
+ // able to read a dictionary with merged tails.
+ // Also, the following code does support frequencies, as in, it will only merge
+ // tails that share the same frequency. Though it would result in the above loss of
+ // performance while searching by address, it is still technically possible to merge
+ // tails that contain attributes, but this code does not take that into account - it does
+ // not compare attributes and will merge terminals with different attributes regardless.
+ public void mergeTails() {
+ MakedictLog.i("Do not merge tails");
+ return;
+
+// MakedictLog.i("Merging PtNodes. Number of PtNodes : " + countPtNodes(root));
+// MakedictLog.i("Number of PtNodes : " + countPtNodes(root));
+//
+// final HashMap<String, ArrayList<PtNodeArray>> repository =
+// new HashMap<String, ArrayList<PtNodeArray>>();
+// mergeTailsInner(repository, root);
+//
+// MakedictLog.i("Number of different pseudohashes : " + repository.size());
+// int size = 0;
+// for (ArrayList<PtNodeArray> a : repository.values()) {
+// size += a.size();
+// }
+// MakedictLog.i("Number of nodes after merge : " + (1 + size));
+// MakedictLog.i("Recursively seen nodes : " + countNodes(root));
+ }
+
+ // The following methods are used by the deactivated mergeTails()
+// private static boolean isEqual(PtNodeArray a, PtNodeArray b) {
+// if (null == a && null == b) return true;
+// if (null == a || null == b) return false;
+// if (a.data.size() != b.data.size()) return false;
+// final int size = a.data.size();
+// for (int i = size - 1; i >= 0; --i) {
+// PtNode aPtNode = a.data.get(i);
+// PtNode bPtNode = b.data.get(i);
+// if (aPtNode.frequency != bPtNode.frequency) return false;
+// if (aPtNode.alternates == null && bPtNode.alternates != null) return false;
+// if (aPtNode.alternates != null && !aPtNode.equals(bPtNode.alternates)) return false;
+// if (!Arrays.equals(aPtNode.chars, bPtNode.chars)) return false;
+// if (!isEqual(aPtNode.children, bPtNode.children)) return false;
+// }
+// return true;
+// }
+
+// static private HashMap<String, ArrayList<PtNodeArray>> mergeTailsInner(
+// final HashMap<String, ArrayList<PtNodeArray>> map, final PtNodeArray nodeArray) {
+// final ArrayList<PtNode> branches = nodeArray.data;
+// final int nodeSize = branches.size();
+// for (int i = 0; i < nodeSize; ++i) {
+// PtNode ptNode = branches.get(i);
+// if (null != ptNode.children) {
+// String pseudoHash = getPseudoHash(ptNode.children);
+// ArrayList<PtNodeArray> similarList = map.get(pseudoHash);
+// if (null == similarList) {
+// similarList = new ArrayList<PtNodeArray>();
+// map.put(pseudoHash, similarList);
+// }
+// boolean merged = false;
+// for (PtNodeArray similar : similarList) {
+// if (isEqual(ptNode.children, similar)) {
+// ptNode.children = similar;
+// merged = true;
+// break;
+// }
+// }
+// if (!merged) {
+// similarList.add(ptNode.children);
+// }
+// mergeTailsInner(map, ptNode.children);
+// }
+// }
+// return map;
+// }
+
+// private static String getPseudoHash(final PtNodeArray nodeArray) {
+// StringBuilder s = new StringBuilder();
+// for (PtNode ptNode : nodeArray.data) {
+// s.append(ptNode.frequency);
+// for (int ch : ptNode.chars) {
+// s.append(Character.toChars(ch));
+// }
+// }
+// return s.toString();
+// }
+
+ /**
* Iterator to walk through a dictionary.
*
* This is purely for convenience.
diff --git a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentReader.java b/java/src/com/android/inputmethod/latin/makedict/SparseTableContentReader.java
deleted file mode 100644
index 06088b651..000000000
--- a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentReader.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.android.inputmethod.latin.makedict;
-
-import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
-import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-
-/**
- * An auxiliary class for reading SparseTable and data written by SparseTableContentWriter.
- */
-public class SparseTableContentReader {
-
- /**
- * An interface of a function which is passed to SparseTableContentReader.read.
- */
- public interface SparseTableContentReaderInterface {
- /**
- * Reads data.
- *
- * @param buffer the DictBuffer. The position of the buffer is set to the head of data.
- */
- public void read(final DictBuffer buffer);
- }
-
- protected final int mContentCount;
- protected final int mBlockSize;
- protected final File mBaseDir;
- protected final File mLookupTableFile;
- protected final File[] mAddressTableFiles;
- protected final File[] mContentFiles;
- protected DictBuffer mLookupTableBuffer;
- protected final DictBuffer[] mAddressTableBuffers;
- private final DictBuffer[] mContentBuffers;
- protected final DictionaryBufferFactory mFactory;
-
- /**
- * Sole constructor of SparseTableContentReader.
- *
- * @param name the name of SparseTable.
- * @param blockSize the block size of the content table.
- * @param baseDir the directory which contains the files of the content table.
- * @param contentFilenames the file names of content files.
- * @param contentIds the ids of contents. These ids are used for a suffix of a name of
- * address files and content files.
- * @param factory the DictionaryBufferFactory which is used for opening the files.
- */
- public SparseTableContentReader(final String name, final int blockSize, final File baseDir,
- final String[] contentFilenames, final String[] contentIds,
- final DictionaryBufferFactory factory) {
- if (contentFilenames.length != contentIds.length) {
- throw new RuntimeException("The length of contentFilenames and the length of"
- + " contentIds are different " + contentFilenames.length + ", "
- + contentIds.length);
- }
- mBlockSize = blockSize;
- mBaseDir = baseDir;
- mFactory = factory;
- mContentCount = contentFilenames.length;
- mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
- mAddressTableFiles = new File[mContentCount];
- mContentFiles = new File[mContentCount];
- for (int i = 0; i < mContentCount; ++i) {
- mAddressTableFiles[i] = new File(mBaseDir,
- name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]);
- mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]);
- }
- mAddressTableBuffers = new DictBuffer[mContentCount];
- mContentBuffers = new DictBuffer[mContentCount];
- }
-
- public void openBuffers() throws FileNotFoundException, IOException {
- mLookupTableBuffer = mFactory.getDictionaryBuffer(mLookupTableFile);
- for (int i = 0; i < mContentCount; ++i) {
- mAddressTableBuffers[i] = mFactory.getDictionaryBuffer(mAddressTableFiles[i]);
- mContentBuffers[i] = mFactory.getDictionaryBuffer(mContentFiles[i]);
- }
- }
-
- protected void read(final int contentIndex, final int index,
- final SparseTableContentReaderInterface reader) {
- if (index < 0 || (index / mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES
- >= mLookupTableBuffer.limit()) {
- return;
- }
-
- mLookupTableBuffer.position((index / mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES);
- final int posInAddressTable = mLookupTableBuffer.readInt();
- if (posInAddressTable == SparseTable.NOT_EXIST) {
- return;
- }
-
- mAddressTableBuffers[contentIndex].position(
- (posInAddressTable + index % mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES);
- final int address = mAddressTableBuffers[contentIndex].readInt();
- if (address == SparseTable.NOT_EXIST) {
- return;
- }
-
- mContentBuffers[contentIndex].position(address);
- reader.read(mContentBuffers[contentIndex]);
- }
-} \ No newline at end of file
diff --git a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentUpdater.java b/java/src/com/android/inputmethod/latin/makedict/SparseTableContentUpdater.java
deleted file mode 100644
index 4518f21b9..000000000
--- a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentUpdater.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.android.inputmethod.latin.makedict;
-
-import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-
-/**
- * An auxiliary class for updating data associated with SparseTable.
- */
-public class SparseTableContentUpdater extends SparseTableContentReader {
- protected OutputStream mLookupTableOutStream;
- protected OutputStream[] mAddressTableOutStreams;
- protected OutputStream[] mContentOutStreams;
-
- public SparseTableContentUpdater(final String name, final int blockSize,
- final File baseDir, final String[] contentFilenames, final String[] contentIds,
- final DictionaryBufferFactory factory) {
- super(name, blockSize, baseDir, contentFilenames, contentIds, factory);
- mAddressTableOutStreams = new OutputStream[mContentCount];
- mContentOutStreams = new OutputStream[mContentCount];
- }
-
- protected void openStreamsAndBuffers() throws IOException {
- openBuffers();
- mLookupTableOutStream = new FileOutputStream(mLookupTableFile, true /* append */);
- for (int i = 0; i < mContentCount; ++i) {
- mAddressTableOutStreams[i] = new FileOutputStream(mAddressTableFiles[i],
- true /* append */);
- mContentOutStreams[i] = new FileOutputStream(mContentFiles[i], true /* append */);
- }
- }
-
- /**
- * Set the contentIndex-th elements of contentId-th table.
- *
- * @param contentId the id of the content table.
- * @param contentIndex the index where to set the valie.
- * @param value the value to set.
- */
- protected void setContentValue(final int contentId, final int contentIndex, final int value)
- throws IOException {
- if ((contentIndex / mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES
- >= mLookupTableBuffer.limit()) {
- // Need to extend the lookup table
- final int currentSize = mLookupTableBuffer.limit()
- / SparseTable.SIZE_OF_INT_IN_BYTES;
- final int target = contentIndex / mBlockSize + 1;
- for (int i = currentSize; i < target; ++i) {
- BinaryDictEncoderUtils.writeUIntToStream(mLookupTableOutStream,
- SparseTable.NOT_EXIST, SparseTable.SIZE_OF_INT_IN_BYTES);
- }
- // We need to reopen the byte buffer of the lookup table because a MappedByteBuffer in
- // Java isn't expanded automatically when the underlying file is expanded.
- reopenLookupTable();
- }
-
- mLookupTableBuffer.position((contentIndex / mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES);
- int posInAddressTable = mLookupTableBuffer.readInt();
- if (posInAddressTable == SparseTable.NOT_EXIST) {
- // Need to extend the address table
- mLookupTableBuffer.position(mLookupTableBuffer.position()
- - SparseTable.SIZE_OF_INT_IN_BYTES);
- posInAddressTable = mAddressTableBuffers[0].limit() / mBlockSize;
- BinaryDictEncoderUtils.writeUIntToDictBuffer(mLookupTableBuffer,
- posInAddressTable, SparseTable.SIZE_OF_INT_IN_BYTES);
- for (int i = 0; i < mContentCount; ++i) {
- for (int j = 0; j < mBlockSize; ++j) {
- BinaryDictEncoderUtils.writeUIntToStream(mAddressTableOutStreams[i],
- SparseTable.NOT_EXIST, SparseTable.SIZE_OF_INT_IN_BYTES);
- }
- }
- // We need to reopen the byte buffers of the address tables because a MappedByteBuffer
- // in Java isn't expanded automatically when the underlying file is expanded.
- reopenAddressTables();
- }
- posInAddressTable += (contentIndex % mBlockSize) * SparseTable.SIZE_OF_INT_IN_BYTES;
-
- mAddressTableBuffers[contentId].position(posInAddressTable);
- BinaryDictEncoderUtils.writeUIntToDictBuffer(mAddressTableBuffers[contentId],
- value, SparseTable.SIZE_OF_INT_IN_BYTES);
- }
-
- private void reopenLookupTable() throws IOException {
- mLookupTableOutStream.flush();
- mLookupTableBuffer = mFactory.getDictionaryBuffer(mLookupTableFile);
- }
-
- private void reopenAddressTables() throws IOException {
- for (int i = 0; i < mContentCount; ++i) {
- mAddressTableOutStreams[i].flush();
- mAddressTableBuffers[i] = mFactory.getDictionaryBuffer(mAddressTableFiles[i]);
- }
- }
-
- protected void close() throws IOException {
- mLookupTableOutStream.close();
- for (final OutputStream stream : mAddressTableOutStreams) {
- stream.close();
- }
- for (final OutputStream stream : mContentOutStreams) {
- stream.close();
- }
- }
-}
diff --git a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentWriter.java b/java/src/com/android/inputmethod/latin/makedict/SparseTableContentWriter.java
deleted file mode 100644
index 49f0fd624..000000000
--- a/java/src/com/android/inputmethod/latin/makedict/SparseTableContentWriter.java
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.android.inputmethod.latin.makedict;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-
-/**
- * An auxiliary class for writing data associated with SparseTable to files.
- */
-public class SparseTableContentWriter {
- public interface SparseTableContentWriterInterface {
- public void write(final OutputStream outStream) throws IOException;
- }
-
- private final int mContentCount;
- private final SparseTable mSparseTable;
- private final File mLookupTableFile;
- protected final File mBaseDir;
- private final File[] mAddressTableFiles;
- private final File[] mContentFiles;
- protected final OutputStream[] mContentOutStreams;
-
- /**
- * Sole constructor of SparseTableContentWriter.
- *
- * @param name the name of SparseTable.
- * @param initialCapacity the initial capacity of SparseTable.
- * @param blockSize the block size of the content table.
- * @param baseDir the directory which contains the files of the content table.
- * @param contentFilenames the file names of content files.
- * @param contentIds the ids of contents. These ids are used for a suffix of a name of address
- * files and content files.
- */
- public SparseTableContentWriter(final String name, final int initialCapacity,
- final int blockSize, final File baseDir, final String[] contentFilenames,
- final String[] contentIds) {
- if (contentFilenames.length != contentIds.length) {
- throw new RuntimeException("The length of contentFilenames and the length of"
- + " contentIds are different " + contentFilenames.length + ", "
- + contentIds.length);
- }
- mContentCount = contentFilenames.length;
- mSparseTable = new SparseTable(initialCapacity, blockSize, mContentCount);
- mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
- mAddressTableFiles = new File[mContentCount];
- mContentFiles = new File[mContentCount];
- mBaseDir = baseDir;
- for (int i = 0; i < mContentCount; ++i) {
- mAddressTableFiles[i] = new File(mBaseDir,
- name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]);
- mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]);
- }
- mContentOutStreams = new OutputStream[mContentCount];
- }
-
- public void openStreams() throws FileNotFoundException {
- for (int i = 0; i < mContentCount; ++i) {
- mContentOutStreams[i] = new FileOutputStream(mContentFiles[i]);
- }
- }
-
- protected void write(final int contentIndex, final int index,
- final SparseTableContentWriterInterface writer) throws IOException {
- mSparseTable.set(contentIndex, index, (int) mContentFiles[contentIndex].length());
- writer.write(mContentOutStreams[contentIndex]);
- mContentOutStreams[contentIndex].flush();
- }
-
- public void closeStreams() throws IOException {
- mSparseTable.writeToFiles(mLookupTableFile, mAddressTableFiles);
- for (int i = 0; i < mContentCount; ++i) {
- mContentOutStreams[i].close();
- }
- }
-} \ No newline at end of file
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java
index 92eb861d6..5da34534e 100644
--- a/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java
+++ b/java/src/com/android/inputmethod/latin/makedict/Ver3DictEncoder.java
@@ -169,7 +169,7 @@ public class Ver3DictEncoder implements DictEncoder {
private void writeChildrenPosition(final PtNode ptNode, final FormatOptions formatOptions) {
final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
- if (formatOptions.supportsDynamicUpdate()) {
+ if (formatOptions.mSupportsDynamicUpdate) {
mPosition += BinaryDictEncoderUtils.writeSignedChildrenPosition(mBuffer, mPosition,
childrenPos);
} else {
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java
index 3be62f066..734223ec2 100644
--- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java
+++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java
@@ -40,52 +40,26 @@ import java.util.Arrays;
public class Ver4DictDecoder extends AbstractDictDecoder {
private static final String TAG = Ver4DictDecoder.class.getSimpleName();
- protected static final int FILETYPE_TRIE = 1;
- protected static final int FILETYPE_FREQUENCY = 2;
- protected static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
- protected static final int FILETYPE_BIGRAM_FREQ = 4;
- protected static final int FILETYPE_SHORTCUT = 5;
- protected static final int FILETYPE_HEADER = 6;
-
- protected final File mDictDirectory;
- protected final DictionaryBufferFactory mBufferFactory;
+ private static final int FILETYPE_TRIE = 1;
+ private static final int FILETYPE_FREQUENCY = 2;
+ private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
+ private static final int FILETYPE_BIGRAM_FREQ = 4;
+ private static final int FILETYPE_SHORTCUT = 5;
+
+ private final File mDictDirectory;
+ private final DictionaryBufferFactory mBufferFactory;
protected DictBuffer mDictBuffer;
- protected DictBuffer mHeaderBuffer;
- protected DictBuffer mFrequencyBuffer;
- protected DictBuffer mTerminalAddressTableBuffer;
- private BigramContentReader mBigramReader;
- private ShortcutContentReader mShortcutReader;
-
- /**
- * Raw PtNode info straight out of a trie file in version 4 dictionary.
- */
- protected static final class Ver4PtNodeInfo {
- public final int mFlags;
- public final int[] mCharacters;
- public final int mTerminalId;
- public final int mChildrenPos;
- public final int mParentPos;
- public final int mNodeSize;
- public int mStartIndexOfCharacters;
- public int mEndIndexOfCharacters; // exclusive
-
- public Ver4PtNodeInfo(final int flags, final int[] characters, final int terminalId,
- final int childrenPos, final int parentPos, final int nodeSize) {
- mFlags = flags;
- mCharacters = characters;
- mTerminalId = terminalId;
- mChildrenPos = childrenPos;
- mParentPos = parentPos;
- mNodeSize = nodeSize;
- mStartIndexOfCharacters = 0;
- mEndIndexOfCharacters = characters.length;
- }
- }
+ private DictBuffer mFrequencyBuffer;
+ private DictBuffer mTerminalAddressTableBuffer;
+ private DictBuffer mBigramBuffer;
+ private DictBuffer mShortcutBuffer;
+ private SparseTable mBigramAddressTable;
+ private SparseTable mShortcutAddressTable;
@UsedForTesting
/* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) {
mDictDirectory = dictDirectory;
- mDictBuffer = mHeaderBuffer = mFrequencyBuffer = null;
+ mDictBuffer = mFrequencyBuffer = null;
if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) {
mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
@@ -102,16 +76,13 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
/* package */ Ver4DictDecoder(final File dictDirectory, final DictionaryBufferFactory factory) {
mDictDirectory = dictDirectory;
mBufferFactory = factory;
- mDictBuffer = mHeaderBuffer = mFrequencyBuffer = null;
+ mDictBuffer = mFrequencyBuffer = null;
}
- protected File getFile(final int fileType) throws UnsupportedFormatException {
+ private File getFile(final int fileType) {
if (fileType == FILETYPE_TRIE) {
return new File(mDictDirectory,
mDictDirectory.getName() + FormatSpec.TRIE_FILE_EXTENSION);
- } else if (fileType == FILETYPE_HEADER) {
- return new File(mDictDirectory,
- mDictDirectory.getName() + FormatSpec.HEADER_FILE_EXTENSION);
} else if (fileType == FILETYPE_FREQUENCY) {
return new File(mDictDirectory,
mDictDirectory.getName() + FormatSpec.FREQ_FILE_EXTENSION);
@@ -127,27 +98,20 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION
+ FormatSpec.SHORTCUT_CONTENT_ID);
} else {
- throw new UnsupportedFormatException("Unsupported kind of file : " + fileType);
+ throw new RuntimeException("Unsupported kind of file : " + fileType);
}
}
@Override
- public void openDictBuffer() throws FileNotFoundException, IOException,
- UnsupportedFormatException {
- if (!mDictDirectory.isDirectory()) {
- throw new UnsupportedFormatException("Format 4 dictionary needs a directory");
- }
- mHeaderBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_HEADER));
+ public void openDictBuffer() throws FileNotFoundException, IOException {
+ // Opens the trie, frequency and terminal address table buffers through mBufferFactory,
+ // then the bigram and shortcut content buffers together with their address tables.
mDictBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_TRIE));
mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY));
mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer(
getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
- mBigramReader = new BigramContentReader(mDictDirectory.getName(),
- mDictDirectory, mBufferFactory, false);
- mBigramReader.openBuffers();
- mShortcutReader = new ShortcutContentReader(mDictDirectory.getName(), mDictDirectory,
- mBufferFactory);
- mShortcutReader.openBuffers();
+ mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ));
+ loadBigramAddressSparseTable();
+ mShortcutBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_SHORTCUT));
+ loadShortcutAddressSparseTable();
}
@Override
@@ -155,134 +119,46 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
return mDictBuffer != null;
}
- @UsedForTesting
- /* package */ DictBuffer getHeaderBuffer() {
- return mHeaderBuffer;
- }
-
- @UsedForTesting
+ // Returns the buffer opened for the trie file; it is null until openDictBuffer() has run
+ // (both constructors initialise it to null).
/* package */ DictBuffer getDictBuffer() {
return mDictBuffer;
}
@Override
public FileHeader readHeader() throws IOException, UnsupportedFormatException {
- if (mHeaderBuffer == null) {
+ // The header is read from the trie buffer itself; open the buffers lazily if needed.
+ if (mDictBuffer == null) {
openDictBuffer();
}
- mHeaderBuffer.position(0);
- final FileHeader header = super.readHeader(mHeaderBuffer);
+ // NOTE(review): unlike the removed code, the buffer is not explicitly repositioned to 0
+ // before reading - confirm that super.readHeader() does not depend on the current
+ // position of mDictBuffer (deleteWord() calls readHeader() on an already-open buffer).
+ final FileHeader header = super.readHeader(mDictBuffer);
final int version = header.mFormatOptions.mVersion;
- if (version != FormatSpec.VERSION4) {
+ // Only format version 4 is accepted by this decoder.
+ if (version != 4) {
throw new UnsupportedFormatException("File header has a wrong version : " + version);
}
return header;
}
- /**
- * An auxiliary class for reading bigrams.
- */
- protected static class BigramContentReader extends SparseTableContentReader {
- public BigramContentReader(final String name, final File baseDir,
- final DictionaryBufferFactory factory, final boolean hasTimestamp) {
- super(name + FormatSpec.BIGRAM_FILE_EXTENSION,
- FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
- getContentFilenames(name, hasTimestamp), getContentIds(hasTimestamp), factory);
- }
-
- // TODO: Consolidate this method and BigramContentWriter.getContentFilenames.
- protected static String[] getContentFilenames(final String name,
- final boolean hasTimestamp) {
- final String[] contentFilenames;
- if (hasTimestamp) {
- contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION,
- name + FormatSpec.BIGRAM_FILE_EXTENSION };
- } else {
- contentFilenames = new String[] { name + FormatSpec.BIGRAM_FILE_EXTENSION };
- }
- return contentFilenames;
- }
-
- // TODO: Consolidate this method and BigramContentWriter.getContentIds.
- protected static String[] getContentIds(final boolean hasTimestamp) {
- final String[] contentIds;
- if (hasTimestamp) {
- contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID,
- FormatSpec.BIGRAM_TIMESTAMP_CONTENT_ID };
- } else {
- contentIds = new String[] { FormatSpec.BIGRAM_FREQ_CONTENT_ID };
- }
- return contentIds;
- }
-
- public ArrayList<PendingAttribute> readTargetsAndFrequencies(final int terminalId,
- final DictBuffer terminalAddressTableBuffer) {
- final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList();
- read(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
- new SparseTableContentReaderInterface() {
- @Override
- public void read(final DictBuffer buffer) {
- while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
- // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE,
- // remaining bigram entries are ignored.
- final int bigramFlags = buffer.readUnsignedByte();
- final int targetTerminalId = buffer.readUnsignedInt24();
- terminalAddressTableBuffer.position(targetTerminalId
- * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
- final int targetAddress =
- terminalAddressTableBuffer.readUnsignedInt24();
- bigrams.add(new PendingAttribute(bigramFlags
- & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
- targetAddress));
- if (0 == (bigramFlags
- & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) {
- break;
- }
- }
- if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
- throw new RuntimeException("Too many bigrams in a PtNode ("
- + bigrams.size() + " but max is "
- + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")");
- }
- }
- });
- if (bigrams.isEmpty()) return null;
- return bigrams;
- }
+ /**
+  * Loads mBigramAddressTable from the bigram lookup table file and the bigram frequency
+  * content table file, both located in mDictDirectory.
+  *
+  * @throws IOException propagated from SparseTable.readFromFiles.
+  */
+ private void loadBigramAddressSparseTable() throws IOException {
+     // Every bigram file name starts with "<directory name><bigram extension>".
+     final String bigramBaseName = mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION;
+     final File lookupTable = new File(mDictDirectory,
+             bigramBaseName + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
+     final File[] contentTables = new File[] {
+         new File(mDictDirectory, bigramBaseName + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
+                 + FormatSpec.BIGRAM_FREQ_CONTENT_ID)
+     };
+     mBigramAddressTable = SparseTable.readFromFiles(lookupTable, contentTables,
+             FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
}
- /**
- * An auxiliary class for reading shortcuts.
- */
- protected static class ShortcutContentReader extends SparseTableContentReader {
- public ShortcutContentReader(final String name, final File baseDir,
- final DictionaryBufferFactory factory) {
- super(name + FormatSpec.SHORTCUT_FILE_EXTENSION,
- FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
- new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
- new String[] { FormatSpec.SHORTCUT_CONTENT_ID }, factory);
- }
-
- public ArrayList<WeightedString> readShortcuts(final int terminalId) {
- final ArrayList<WeightedString> shortcuts = CollectionUtils.newArrayList();
- read(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId,
- new SparseTableContentReaderInterface() {
- @Override
- public void read(final DictBuffer buffer) {
- while (true) {
- final int flags = buffer.readUnsignedByte();
- final String word = CharEncoding.readString(buffer);
- shortcuts.add(new WeightedString(word,
- flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
- if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) {
- break;
- }
- }
- }
- });
- if (shortcuts.isEmpty()) return null;
- return shortcuts;
- }
+ // TODO: Let's have something like SparseTableContentsReader in this class.
+ // Loads mShortcutAddressTable from the shortcut lookup table file and two content table
+ // files located in mDictDirectory.
+ private void loadShortcutAddressSparseTable() throws IOException {
+ final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName()
+ + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
+ final File contentFile = new File(mDictDirectory, mDictDirectory.getName()
+ + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
+ + FormatSpec.SHORTCUT_CONTENT_ID);
+ // NOTE(review): timestampsFile is built from exactly the same components as contentFile
+ // (both end with FormatSpec.SHORTCUT_CONTENT_ID), so the two array entries below name the
+ // same file. Confirm whether a distinct timestamp content id was intended.
+ final File timestampsFile = new File(mDictDirectory, mDictDirectory.getName()
+ + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
+ + FormatSpec.SHORTCUT_CONTENT_ID);
+ mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile,
+ new File[] { contentFile, timestampsFile },
+ FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE);
}
protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader {
@@ -296,82 +172,102 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
}
}
- private final int[] mCharacterBufferForReadingVer4PtNodeInfo
- = new int[FormatSpec.MAX_WORD_LENGTH];
+ /**
+  * Reads the shortcut targets stored for a terminal.
+  *
+  * @param terminalId the terminal id to look up in the shortcut address table.
+  * @return the shortcut targets in stored order, or null when the address table has no
+  *         entry for this terminal id.
+  */
+ private ArrayList<WeightedString> readShortcuts(final int terminalId) {
+     // Look the position up once, using the same named content index for the existence
+     // check and for the read, instead of probing with a literal 0 and looking up again.
+     final int posOfShortcuts = mShortcutAddressTable.get(FormatSpec.SHORTCUT_CONTENT_INDEX,
+             terminalId);
+     if (posOfShortcuts == SparseTable.NOT_EXIST) return null;
+
+     final ArrayList<WeightedString> shortcuts = CollectionUtils.newArrayList();
+     mShortcutBuffer.position(posOfShortcuts);
+     int flags;
+     do {
+         // Each entry is a flags byte followed by the target string; the flags carry both
+         // the frequency bits and the "has next" marker that terminates the list.
+         flags = mShortcutBuffer.readUnsignedByte();
+         final String word = CharEncoding.readString(mShortcutBuffer);
+         shortcuts.add(new WeightedString(word,
+                 flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
+     } while (0 != (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT));
+     return shortcuts;
+ }
- /**
- * Reads PtNode from ptNodePos in the trie file and returns Ver4PtNodeInfo.
- *
- * @param ptNodePos the position of PtNode.
- * @param options the format options.
- * @return Ver4PtNodeInfo.
- */
// TODO: Make this buffer thread safe.
// TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH.
- protected Ver4PtNodeInfo readVer4PtNodeInfo(final int ptNodePos, final FormatOptions options) {
- int readingPos = ptNodePos;
+ private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
+ @Override
+ public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) {
+ // Decodes one PtNode by reading sequentially from mDictBuffer. addressPointer starts at
+ // ptNodePos and is advanced by the size of each field read, so it ends up as the
+ // position just after the node.
+ // NOTE(review): mDictBuffer is not repositioned to ptNodePos in this method; presumably
+ // the caller has already positioned it there - confirm.
+ int addressPointer = ptNodePos;
final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
- readingPos += FormatSpec.PTNODE_FLAGS_SIZE;
+ addressPointer += FormatSpec.PTNODE_FLAGS_SIZE;
- final int parentPos = PtNodeReader.readParentAddress(mDictBuffer, options);
+ final int parentAddress = PtNodeReader.readParentAddress(mDictBuffer, options);
if (BinaryDictIOUtils.supportsDynamicUpdate(options)) {
- readingPos += FormatSpec.PARENT_ADDRESS_SIZE;
+ addressPointer += FormatSpec.PARENT_ADDRESS_SIZE;
}
final int characters[];
if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) {
int index = 0;
int character = CharEncoding.readChar(mDictBuffer);
- readingPos += CharEncoding.getCharSize(character);
+ addressPointer += CharEncoding.getCharSize(character);
while (FormatSpec.INVALID_CHARACTER != character
&& index < FormatSpec.MAX_WORD_LENGTH) {
- mCharacterBufferForReadingVer4PtNodeInfo[index++] = character;
+ mCharacterBuffer[index++] = character;
character = CharEncoding.readChar(mDictBuffer);
- readingPos += CharEncoding.getCharSize(character);
+ addressPointer += CharEncoding.getCharSize(character);
}
- characters = Arrays.copyOfRange(mCharacterBufferForReadingVer4PtNodeInfo, 0, index);
+ characters = Arrays.copyOfRange(mCharacterBuffer, 0, index);
} else {
final int character = CharEncoding.readChar(mDictBuffer);
- readingPos += CharEncoding.getCharSize(character);
+ addressPointer += CharEncoding.getCharSize(character);
characters = new int[] { character };
}
final int terminalId;
if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
terminalId = PtNodeReader.readTerminalId(mDictBuffer);
- readingPos += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
+ addressPointer += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
} else {
terminalId = PtNode.NOT_A_TERMINAL;
}
- int childrenPos = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options);
- if (childrenPos != FormatSpec.NO_CHILDREN_ADDRESS) {
- childrenPos += readingPos;
- }
- readingPos += BinaryDictIOUtils.getChildrenAddressSize(flags, options);
-
- return new Ver4PtNodeInfo(flags, characters, terminalId, childrenPos, parentPos,
- readingPos - ptNodePos);
- }
-
- @Override
- public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) {
- final Ver4PtNodeInfo nodeInfo = readVer4PtNodeInfo(ptNodePos, options);
-
final int frequency;
- if (0 != (FormatSpec.FLAG_IS_TERMINAL & nodeInfo.mFlags)) {
- frequency = PtNodeReader.readFrequency(mFrequencyBuffer, nodeInfo.mTerminalId);
+ if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
+ frequency = PtNodeReader.readFrequency(mFrequencyBuffer, terminalId);
} else {
frequency = PtNode.NOT_A_TERMINAL;
}
-
- final ArrayList<WeightedString> shortcutTargets = mShortcutReader.readShortcuts(
- nodeInfo.mTerminalId);
- final ArrayList<PendingAttribute> bigrams = mBigramReader.readTargetsAndFrequencies(
- nodeInfo.mTerminalId, mTerminalAddressTableBuffer);
-
- return new PtNodeInfo(ptNodePos, ptNodePos + nodeInfo.mNodeSize, nodeInfo.mFlags,
- nodeInfo.mCharacters, frequency, nodeInfo.mParentPos, nodeInfo.mChildrenPos,
- shortcutTargets, bigrams);
+ // The stored children address is relative to the position of the address field itself,
+ // hence the adjustment by addressPointer before that field's size is added.
+ int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options);
+ if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
+ childrenAddress += addressPointer;
+ }
+ addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options);
+ // Called for every node; terminalId is PtNode.NOT_A_TERMINAL for non-terminal nodes.
+ final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId);
+
+ final ArrayList<PendingAttribute> bigrams;
+ if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
+ bigrams = new ArrayList<PendingAttribute>();
+ final int posOfBigrams = mBigramAddressTable.get(0 /* contentTableIndex */, terminalId);
+ mBigramBuffer.position(posOfBigrams);
+ while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
+ // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE,
+ // remaining bigram entries are ignored.
+ final int bigramFlags = mBigramBuffer.readUnsignedByte();
+ final int targetTerminalId = mBigramBuffer.readUnsignedInt24();
+ mTerminalAddressTableBuffer.position(
+ targetTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
+ final int targetAddress = mTerminalAddressTableBuffer.readUnsignedInt24();
+ bigrams.add(new PendingAttribute(
+ bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
+ targetAddress));
+ if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
+ }
+ // NOTE(review): this check also fires when exactly MAX_BIGRAMS_IN_A_PTNODE entries were
+ // read and the list ended normally, and it throws rather than ignoring the remaining
+ // entries as the comment inside the loop says - confirm the intended behavior.
+ if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
+ throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size()
+ + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")");
+ }
+ } else {
+ bigrams = null;
+ }
+ return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, frequency,
+ parentAddress, childrenAddress, shortcutTargets, bigrams);
}
private void deleteDictFiles() {
@@ -422,14 +318,10 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
@Override
public boolean readAndFollowForwardLink() {
- final int forwardLinkPos = mDictBuffer.position();
- int nextRelativePos = BinaryDictDecoderUtils.readSInt24(mDictBuffer);
- if (nextRelativePos != FormatSpec.NO_FORWARD_LINK_ADDRESS) {
- final int nextPos = forwardLinkPos + nextRelativePos;
- if (nextPos >= 0 && nextPos < mDictBuffer.limit()) {
- mDictBuffer.position(nextPos);
- return true;
- }
+ // Reads the forward link at the current position, treats it as an absolute position in
+ // mDictBuffer, and moves there; returns true only if that position is inside the buffer.
+ // NOTE(review): the removed version treated the value as a signed offset relative to the
+ // link and checked FormatSpec.NO_FORWARD_LINK_ADDRESS explicitly. Confirm that the
+ // "no link" marker is rejected by the range check below.
+ final int nextAddress = mDictBuffer.readUnsignedInt24();
+ if (nextAddress >= 0 && nextAddress < mDictBuffer.limit()) {
+ mDictBuffer.position(nextAddress);
+ return true;
}
return false;
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
index 8b80ebe63..8d5b48a9b 100644
--- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
+++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java
@@ -25,8 +25,6 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
-import com.android.inputmethod.latin.utils.CollectionUtils;
-import com.android.inputmethod.latin.utils.FileUtils;
import java.io.File;
import java.io.FileNotFoundException;
@@ -34,8 +32,6 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
import java.util.Iterator;
/**
@@ -46,8 +42,8 @@ public class Ver4DictEncoder implements DictEncoder {
private final File mDictPlacedDir;
private byte[] mTrieBuf;
private int mTriePos;
+ private int mHeaderSize;
private OutputStream mTrieOutStream;
- private OutputStream mHeaderOutStream;
private OutputStream mFreqOutStream;
private OutputStream mUnigramTimestampOutStream;
private OutputStream mTerminalAddressTableOutStream;
@@ -61,6 +57,62 @@ public class Ver4DictEncoder implements DictEncoder {
mDictPlacedDir = dictPlacedDir;
}
+ // Callback used by SparseTableContentWriter.write(): it receives the output stream of the
+ // selected content file and writes one entry's content to it.
+ private interface SparseTableContentWriterInterface {
+ public void write(final OutputStream outStream) throws IOException;
+ }
+
+ /**
+  * Writes one or more content files plus the sparse table (lookup table and per-content
+  * address tables) that maps an index to the position of its content in each file.
+  */
+ private static class SparseTableContentWriter {
+     private final int mContentCount;
+     private final SparseTable mSparseTable;
+     private final File mLookupTableFile;
+     protected final File mBaseDir;
+     private final File[] mAddressTableFiles;
+     private final File[] mContentFiles;
+     protected final OutputStream[] mContentOutStreams;
+
+     /**
+      * @param name base name of the lookup table and address table files.
+      * @param initialCapacity initial capacity passed to the SparseTable.
+      * @param blockSize block size passed to the SparseTable.
+      * @param baseDir directory in which all files are created.
+      * @param contentFilenames base name of each content file.
+      * @param contentIds suffix of each content file and address table file; must have the
+      *        same length as contentFilenames.
+      */
+     public SparseTableContentWriter(final String name, final int initialCapacity,
+             final int blockSize, final File baseDir, final String[] contentFilenames,
+             final String[] contentIds) {
+         if (contentFilenames.length != contentIds.length) {
+             throw new RuntimeException("The length of contentFilenames and the length of"
+                     + " contentIds are different " + contentFilenames.length + ", "
+                     + contentIds.length);
+         }
+         mContentCount = contentFilenames.length;
+         mSparseTable = new SparseTable(initialCapacity, blockSize, mContentCount);
+         mLookupTableFile = new File(baseDir, name + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
+         mAddressTableFiles = new File[mContentCount];
+         mContentFiles = new File[mContentCount];
+         mBaseDir = baseDir;
+         for (int i = 0; i < mContentCount; ++i) {
+             mAddressTableFiles[i] = new File(mBaseDir,
+                     name + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + contentIds[i]);
+             mContentFiles[i] = new File(mBaseDir, contentFilenames[i] + contentIds[i]);
+         }
+         mContentOutStreams = new OutputStream[mContentCount];
+     }
+
+     /** Opens one output stream per content file. */
+     public void openStreams() throws FileNotFoundException {
+         for (int i = 0; i < mContentCount; ++i) {
+             mContentOutStreams[i] = new FileOutputStream(mContentFiles[i]);
+         }
+     }
+
+     /**
+      * Records the current end of the content file as the position for {@code index}, then
+      * lets {@code writer} append the content.
+      */
+     protected void write(final int contentIndex, final int index,
+             final SparseTableContentWriterInterface writer) throws IOException {
+         // The position is taken from the on-disk file length, so the stream is flushed
+         // after every write to keep that length in sync with what has been written.
+         mSparseTable.set(contentIndex, index, (int) mContentFiles[contentIndex].length());
+         writer.write(mContentOutStreams[contentIndex]);
+         mContentOutStreams[contentIndex].flush();
+     }
+
+     /**
+      * Writes the sparse table files and closes every content stream.
+      *
+      * The streams are closed even if writing the table fails, and a failure to close one
+      * stream does not prevent the others from being closed.
+      */
+     public void closeStreams() throws IOException {
+         try {
+             mSparseTable.writeToFiles(mLookupTableFile, mAddressTableFiles);
+         } finally {
+             closeContentStreams();
+         }
+     }
+
+     /** Closes all opened content streams, rethrowing the first close failure at the end. */
+     private void closeContentStreams() throws IOException {
+         IOException firstFailure = null;
+         for (int i = 0; i < mContentCount; ++i) {
+             final OutputStream stream = mContentOutStreams[i];
+             // A slot stays null if openStreams() was never called or failed part-way.
+             if (stream == null) continue;
+             try {
+                 stream.close();
+             } catch (final IOException e) {
+                 if (firstFailure == null) firstFailure = e;
+             }
+         }
+         if (firstFailure != null) throw firstFailure;
+     }
+ }
+
private static class BigramContentWriter extends SparseTableContentWriter {
private final boolean mWriteTimestamp;
@@ -186,21 +238,16 @@ public class Ver4DictEncoder implements DictEncoder {
mBaseFilename = header.getId() + "." + header.getVersion();
mDictDir = new File(mDictPlacedDir, mBaseFilename);
final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION);
- final File headerFile = new File(mDictDir,
- mBaseFilename + FormatSpec.HEADER_FILE_EXTENSION);
final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
final File timestampFile = new File(mDictDir,
mBaseFilename + FormatSpec.UNIGRAM_TIMESTAMP_FILE_EXTENSION);
final File terminalAddressTableFile = new File(mDictDir,
mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
if (!mDictDir.isDirectory()) {
- if (mDictDir.exists()) {
- FileUtils.deleteRecursively(mDictDir);
- }
+ if (mDictDir.exists()) mDictDir.delete();
mDictDir.mkdirs();
}
mTrieOutStream = new FileOutputStream(trieFile);
- mHeaderOutStream = new FileOutputStream(headerFile);
mFreqOutStream = new FileOutputStream(freqFile);
mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
if (formatOptions.mHasTimestamp) {
@@ -213,9 +260,6 @@ public class Ver4DictEncoder implements DictEncoder {
if (mTrieOutStream != null) {
mTrieOutStream.close();
}
- if (mHeaderOutStream != null) {
- mHeaderOutStream.close();
- }
if (mFreqOutStream != null) {
mFreqOutStream.close();
}
@@ -227,7 +271,6 @@ public class Ver4DictEncoder implements DictEncoder {
}
} finally {
mTrieOutStream = null;
- mHeaderOutStream = null;
mFreqOutStream = null;
mTerminalAddressTableOutStream = null;
}
@@ -248,34 +291,16 @@ public class Ver4DictEncoder implements DictEncoder {
openStreams(formatOptions, dict.mOptions);
}
- BinaryDictEncoderUtils.writeDictionaryHeader(mHeaderOutStream, dict, formatOptions);
+ mHeaderSize = BinaryDictEncoderUtils.writeDictionaryHeader(mTrieOutStream, dict,
+ formatOptions);
MakedictLog.i("Flattening the tree...");
ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
int terminalCount = 0;
- final ArrayList<PtNode> nodes = CollectionUtils.newArrayList();
for (final PtNodeArray array : flatNodes) {
for (final PtNode node : array.mData) {
- if (node.isTerminal()) {
- nodes.add(node);
- node.mTerminalId = terminalCount++;
- }
- }
- }
- Collections.sort(nodes, new Comparator<PtNode>() {
- @Override
- public int compare(final PtNode lhs, final PtNode rhs) {
- if (lhs.mFrequency != rhs.mFrequency) {
- return lhs.mFrequency < rhs.mFrequency ? -1 : 1;
- }
- if (lhs.mTerminalId < rhs.mTerminalId) return -1;
- if (lhs.mTerminalId > rhs.mTerminalId) return 1;
- return 0;
+ if (node.isTerminal()) node.mTerminalId = terminalCount++;
}
- });
- int count = 0;
- for (final PtNode node : nodes) {
- node.mTerminalId = count++;
}
MakedictLog.i("Computing addresses...");
@@ -312,7 +337,7 @@ public class Ver4DictEncoder implements DictEncoder {
@Override
public void setPosition(int position) {
- if (mTrieBuf == null || position < 0 || position > mTrieBuf.length) return;
+ // Ignore the request when no buffer is allocated or the position lies outside
+ // [0, mTrieBuf.length]. The bound must be compared against the length itself:
+ // "position >- mTrieBuf.length" parses as "position > -length", which is true for every
+ // non-negative position once the buffer is non-empty, so the position was never set.
+ if (mTrieBuf == null || position < 0 || position > mTrieBuf.length) return;
mTriePos = position;
}
@@ -365,7 +390,7 @@ public class Ver4DictEncoder implements DictEncoder {
private void writeChildrenPosition(PtNode ptNode, FormatOptions formatOptions) {
final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
- if (formatOptions.supportsDynamicUpdate()) {
+ if (formatOptions.mSupportsDynamicUpdate) {
mTriePos += BinaryDictEncoderUtils.writeSignedChildrenPosition(mTrieBuf,
mTriePos, childrenPos);
} else {
@@ -432,7 +457,7 @@ public class Ver4DictEncoder implements DictEncoder {
ptNode.mFrequency, FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
BinaryDictEncoderUtils.writeUIntToBuffer(terminalAddressTableBuf,
ptNode.mTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE,
- ptNode.mCachedAddressAfterUpdate,
+ ptNode.mCachedAddressAfterUpdate + mHeaderSize,
FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
}
}
diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java
index c46bc36bb..3d8f186ba 100644
--- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java
+++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictUpdater.java
@@ -17,130 +17,29 @@
package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.annotations.UsedForTesting;
-import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
-import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
-import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
-import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
-import com.android.inputmethod.latin.utils.CollectionUtils;
-
-import android.util.Log;
import java.io.File;
-import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.OutputStream;
import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
/**
* An implementation of DictUpdater for version 4 binary dictionary.
*/
@UsedForTesting
public class Ver4DictUpdater extends Ver4DictDecoder implements DictUpdater {
- private static final String TAG = Ver4DictUpdater.class.getSimpleName();
-
- private OutputStream mDictStream;
- private final File mFrequencyFile;
@UsedForTesting
- public Ver4DictUpdater(final File dictDirectory, final int factoryType)
- throws UnsupportedFormatException {
+ public Ver4DictUpdater(final File dictDirectory, final int factoryType) {
// DictUpdater must have an updatable DictBuffer.
super(dictDirectory, ((factoryType & MASK_DICTBUFFER) == USE_BYTEARRAY)
? USE_BYTEARRAY : USE_WRITABLE_BYTEBUFFER);
- mFrequencyFile = getFile(FILETYPE_FREQUENCY);
- }
-
- private static class BigramContentUpdater extends SparseTableContentUpdater {
- public BigramContentUpdater(final String name, final File baseDir,
- final boolean hasTimestamp) {
- super(name + FormatSpec.BIGRAM_FILE_EXTENSION,
- FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
- BigramContentReader.getContentFilenames(name, hasTimestamp),
- BigramContentReader.getContentIds(hasTimestamp),
- new DictionaryBufferFromWritableByteBufferFactory());
- }
-
- public void insertBigramEntries(final int terminalId, final int frequency,
- final ArrayList<PendingAttribute> entries) throws IOException {
- if (terminalId < 0) {
- throw new RuntimeException("Invalid terminal id : " + terminalId);
- }
- openStreamsAndBuffers();
-
- if (entries == null || entries.isEmpty()) {
- setContentValue(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId,
- SparseTable.NOT_EXIST);
- return;
- }
- final int positionOfEntries =
- (int) mContentFiles[FormatSpec.BIGRAM_FREQ_CONTENT_INDEX].length();
- setContentValue(FormatSpec.BIGRAM_FREQ_CONTENT_INDEX, terminalId, positionOfEntries);
-
- final Iterator<PendingAttribute> bigramIterator = entries.iterator();
- while (bigramIterator.hasNext()) {
- final PendingAttribute entry = bigramIterator.next();
- final int flags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
- 0 /* offset */, entry.mFrequency, frequency, "" /* word */);
- BinaryDictEncoderUtils.writeUIntToStream(
- mContentOutStreams[FormatSpec.BIGRAM_FREQ_CONTENT_INDEX], flags,
- FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
- BinaryDictEncoderUtils.writeUIntToStream(
- mContentOutStreams[FormatSpec.BIGRAM_FREQ_CONTENT_INDEX], entry.mAddress,
- FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
- }
- close();
- }
- }
-
- private static class ShortcutContentUpdater extends SparseTableContentUpdater {
- public ShortcutContentUpdater(final String name, final File baseDir) {
- super(name + FormatSpec.SHORTCUT_FILE_EXTENSION,
- FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
- new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
- new String[] { FormatSpec.SHORTCUT_CONTENT_ID },
- new DictionaryBufferFromWritableByteBufferFactory());
- }
-
- public void insertShortcuts(final int terminalId,
- final ArrayList<WeightedString> shortcuts) throws IOException {
- if (terminalId < 0) {
- throw new RuntimeException("Invalid terminal id : " + terminalId);
- }
- openStreamsAndBuffers();
- if (shortcuts == null || shortcuts.isEmpty()) {
- setContentValue(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId,
- SparseTable.NOT_EXIST);
- return;
- }
-
- final int positionOfShortcuts =
- (int) mContentFiles[FormatSpec.SHORTCUT_CONTENT_INDEX].length();
- setContentValue(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, positionOfShortcuts);
-
- final Iterator<WeightedString> shortcutIterator = shortcuts.iterator();
- while (shortcutIterator.hasNext()) {
- final WeightedString target = shortcutIterator.next();
- final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags(
- shortcutIterator.hasNext(), target.mFrequency);
- BinaryDictEncoderUtils.writeUIntToStream(
- mContentOutStreams[FormatSpec.SHORTCUT_CONTENT_INDEX], shortcutFlags,
- FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
- CharEncoding.writeString(mContentOutStreams[FormatSpec.SHORTCUT_CONTENT_INDEX],
- target.mWord);
- }
- close();
- }
}
@Override
public void deleteWord(final String word) throws IOException, UnsupportedFormatException {
- if (mDictBuffer == null) {
- openDictBuffer();
- readHeader();
- }
+ if (mDictBuffer == null) openDictBuffer();
+ readHeader();
final int wordPos = getTerminalPosition(word);
if (wordPos != FormatSpec.NOT_VALID_WORD) {
mDictBuffer.position(wordPos);
@@ -150,623 +49,11 @@ public class Ver4DictUpdater extends Ver4DictDecoder implements DictUpdater {
}
}
- private int getNewTerminalId() {
- // The size of frequency file is FormatSpec.FREQUENCY_AND_FLAGS_SIZE * number of terminals
- // because each terminal always has a frequency.
- // So we can get a fresh terminal id by this logic.
- // CAVEAT: we are reading the file size from the disk each time: beware of race conditions,
- // even on one thread.
- return (int) (mFrequencyFile.length() / FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
- }
-
- private void updateParentPosIfNotMoved(final int nodePos, final int newParentPos,
- final FormatOptions formatOptions) {
- final int originalPos = getPosition();
- setPosition(nodePos);
- final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
- if (!BinaryDictIOUtils.isMovedPtNode(flags, formatOptions)) {
- final int parentOffset = newParentPos - nodePos;
- BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, parentOffset);
- }
- setPosition(originalPos);
- }
-
- private void updateParentPositions(final int nodeArrayPos, final int newParentPos,
- final FormatOptions formatOptions) {
- final int originalPos = mDictBuffer.position();
- mDictBuffer.position(nodeArrayPos);
- int jumpCount = 0;
- do {
- final int count = readPtNodeCount();
- for (int i = 0; i < count; ++i) {
- updateParentPosIfNotMoved(getPosition(), newParentPos, formatOptions);
- skipPtNode(formatOptions);
- }
- if (!readAndFollowForwardLink()) break;
- } while (jumpCount++ < DynamicBinaryDictIOUtils.MAX_JUMPS);
- setPosition(originalPos);
- }
-
- private void updateChildrenPos(final int nodePos, final int newChildrenPos,
- final FormatOptions options) {
- final int originalPos = getPosition();
- setPosition(nodePos);
- final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
- PtNodeReader.readParentAddress(mDictBuffer, options);
- BinaryDictIOUtils.skipString(mDictBuffer,
- (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
- if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) PtNodeReader.readTerminalId(mDictBuffer);
- final int basePos = getPosition();
- BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, newChildrenPos - basePos);
- setPosition(originalPos);
- }
-
- private void updateTerminalPosition(final int terminalId, final int position) {
- if (terminalId == PtNode.NOT_A_TERMINAL
- || terminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE
- >= mTerminalAddressTableBuffer.limit()) return;
- mTerminalAddressTableBuffer.position(terminalId
- * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
- BinaryDictEncoderUtils.writeUIntToDictBuffer(mTerminalAddressTableBuffer, position,
- FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
- }
-
- private void updateForwardLink(final int nodeArrayPos, final int newForwardLink,
- final FormatOptions formatOptions) {
- final int originalPos = getPosition();
- setPosition(nodeArrayPos);
- int jumpCount = 0;
- while (jumpCount++ < DynamicBinaryDictIOUtils.MAX_JUMPS) {
- final int ptNodeCount = readPtNodeCount();
- for (int i = 0; i < ptNodeCount; ++i) {
- skipPtNode(formatOptions);
- }
- final int forwardLinkPos = getPosition();
- if (!readAndFollowForwardLink()) {
- setPosition(forwardLinkPos);
- BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, newForwardLink - forwardLinkPos);
- break;
- }
- }
- setPosition(originalPos);
- }
-
- private void markPtNodeAsMoved(final int nodePos, final int newNodePos,
- final FormatOptions options) {
- final int originalPos = getPosition();
- updateParentPosIfNotMoved(nodePos, newNodePos, options);
- setPosition(nodePos);
- final int currentFlags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
- setPosition(nodePos);
- mDictBuffer.put((byte) (FormatSpec.FLAG_IS_MOVED
- | (currentFlags & (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG))));
- final int offset = newNodePos - nodePos;
- BinaryDictIOUtils.writeSInt24ToBuffer(mDictBuffer, offset);
- setPosition(originalPos);
- }
-
- /**
- * Writes a PtNode to an output stream from a Ver4PtNodeInfo.
- *
- * @param nodePos the position of the head of the PtNode.
- * @param info the PtNode info to be written.
- * @return the size written, in bytes.
- */
- private int writePtNode(final int nodePos, final Ver4PtNodeInfo info) throws IOException {
- int written = 0;
-
- // Write flags.
- mDictStream.write((byte) (info.mFlags & 0xFF));
- written += FormatSpec.PTNODE_FLAGS_SIZE;
-
- // Write the parent position.
- final int parentOffset = info.mParentPos == FormatSpec.NO_PARENT_ADDRESS ?
- FormatSpec.NO_PARENT_ADDRESS : info.mParentPos - nodePos;
- BinaryDictIOUtils.writeSInt24ToStream(mDictStream, parentOffset);
- written += FormatSpec.PARENT_ADDRESS_SIZE;
-
- // Write a string.
- if (((info.mFlags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0)
- != (info.mEndIndexOfCharacters - info.mStartIndexOfCharacters > 1)) {
- throw new RuntimeException("Inconsistent flags : hasMultipleChars = "
- + ((info.mFlags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0) + ", length = "
- + (info.mEndIndexOfCharacters - info.mStartIndexOfCharacters));
- }
- written += CharEncoding.writeCodePoints(mDictStream, info.mCharacters,
- info.mStartIndexOfCharacters, info.mEndIndexOfCharacters);
-
- // Write the terminal id.
- if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
- BinaryDictEncoderUtils.writeUIntToStream(mDictStream, info.mTerminalId,
- FormatSpec.PTNODE_TERMINAL_ID_SIZE);
- written += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
- }
-
- // Write the children position.
- final int childrenOffset = info.mChildrenPos == FormatSpec.NO_CHILDREN_ADDRESS
- ? 0 : info.mChildrenPos - (nodePos + written);
- BinaryDictIOUtils.writeSInt24ToStream(mDictStream, childrenOffset);
- written += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
-
- return written;
- }
-
- /**
- * Helper method to split and move PtNode.
- *
- * @param ptNodeArrayPos the position of PtNodeArray which contains the split and moved PtNode.
- * @param splittedPtNodeToMovePos the position of the split and moved PtNode.
- * @param newParent the parent PtNode after splitting.
- * @param newChildren the children PtNodes after splitting.
- * @param newParentStartPos where to write the new parent.
- * @param formatOptions the format options.
- */
- private void writeSplittedPtNodes(final int ptNodeArrayPos, final int splittedPtNodeToMovePos,
- final Ver4PtNodeInfo newParent, final Ver4PtNodeInfo[] newChildren,
- final int newParentStartPos,
- final FormatOptions formatOptions) throws IOException {
- updateTerminalPosition(newParent.mTerminalId,
- newParentStartPos + 1 /* size of PtNodeCount */);
- int written = writePtNodeArray(newParentStartPos, new Ver4PtNodeInfo[] { newParent },
- FormatSpec.NO_FORWARD_LINK_ADDRESS);
- final int childrenStartPos = newParentStartPos + written;
- writePtNodeArray(childrenStartPos, newChildren, FormatSpec.NO_FORWARD_LINK_ADDRESS);
- int childrenNodePos = childrenStartPos + 1 /* size of PtNodeCount */;
- for (final Ver4PtNodeInfo info : newChildren) {
- updateTerminalPosition(info.mTerminalId, childrenNodePos);
- childrenNodePos += computePtNodeSize(info.mCharacters, info.mStartIndexOfCharacters,
- info.mEndIndexOfCharacters,
- (info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0);
- }
-
- // Mark as moved.
- markPtNodeAsMoved(splittedPtNodeToMovePos, newParentStartPos + 1 /* size of PtNodeCount */,
- formatOptions);
- updateForwardLink(ptNodeArrayPos, newParentStartPos, formatOptions);
- }
-
- /**
- * Writes a node array to the stream.
- *
- * @param nodeArrayPos the position of the head of the node array.
- * @param infos an array of Ver4PtNodeInfo to be written.
- * @return the written length in bytes.
- */
- private int writePtNodeArray(final int nodeArrayPos, final Ver4PtNodeInfo[] infos,
- final int forwardLink) throws IOException {
- int written = BinaryDictIOUtils.writePtNodeCount(mDictStream, infos.length);
- for (int i = 0; i < infos.length; ++i) {
- written += writePtNode(nodeArrayPos + written, infos[i]);
- }
- BinaryDictIOUtils.writeSInt24ToStream(mDictStream, forwardLink);
- written += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
- return written;
- }
-
- private int computePtNodeSize(final int[] codePoints, final int startIndex, final int endIndex,
- final boolean isTerminal) {
- return FormatSpec.PTNODE_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
- + CharEncoding.getCharArraySize(codePoints, startIndex, endIndex)
- + (endIndex - startIndex > 1 ? FormatSpec.PTNODE_TERMINATOR_SIZE : 0)
- + (isTerminal ? FormatSpec.PTNODE_TERMINAL_ID_SIZE : 0)
- + FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
- }
-
- private void writeNewSinglePtNodeWithAttributes(final int[] codePoints,
- final boolean hasShortcuts, final int terminalId, final boolean hasBigrams,
- final boolean isNotAWord, final boolean isBlackListEntry, final int parentPos,
- final FormatOptions formatOptions) throws IOException {
- final int newNodeArrayPos = mDictBuffer.limit();
- final int newNodeFlags = BinaryDictEncoderUtils.makePtNodeFlags(codePoints.length > 1,
- terminalId != PtNode.NOT_A_TERMINAL, FormatSpec.FLAG_IS_NOT_MOVED, hasShortcuts,
- hasBigrams, isNotAWord, isBlackListEntry, formatOptions);
- final Ver4PtNodeInfo info = new Ver4PtNodeInfo(newNodeFlags, codePoints, terminalId,
- FormatSpec.NO_CHILDREN_ADDRESS, parentPos, 0 /* nodeSize */);
- writePtNodeArray(newNodeArrayPos, new Ver4PtNodeInfo[] { info },
- FormatSpec.NO_FORWARD_LINK_ADDRESS);
- }
-
- private int setMultipleCharsInFlags(final int currentFlags, final boolean hasMultipleChars) {
- final int flags;
- if (hasMultipleChars) {
- flags = currentFlags | FormatSpec.FLAG_HAS_MULTIPLE_CHARS;
- } else {
- flags = currentFlags & (~FormatSpec.FLAG_HAS_MULTIPLE_CHARS);
- }
- return flags;
- }
-
- private int setIsNotAWordInFlags(final int currentFlags, final boolean isNotAWord) {
- final int flags;
- if (isNotAWord) {
- flags = currentFlags | FormatSpec.FLAG_IS_NOT_A_WORD;
- } else {
- flags = currentFlags & (~FormatSpec.FLAG_IS_NOT_A_WORD);
- }
- return flags;
- }
-
- private int setIsBlackListEntryInFlags(final int currentFlags, final boolean isBlackListEntry) {
- final int flags;
- if (isBlackListEntry) {
- flags = currentFlags | FormatSpec.FLAG_IS_BLACKLISTED;
- } else {
- flags = currentFlags & (~FormatSpec.FLAG_IS_BLACKLISTED);
- }
- return flags;
- }
-
- /**
- * Splits a PtNode.
- *
- * abcd - ef
- *
- * -> inserting "abc"
- *
- * abc - d - ef
- *
- * @param nodeArrayToSplitPos the position of PtNodeArray which contains the PtNode to split.
- * @param nodeToSplitPos the position of the PtNode to split.
- * @param nodeToSplitInfo the information of the PtNode to split.
- * @param indexToSplit the index where to split in the code points array.
- * @param parentOfNodeToSplitPos the absolute position of a parent of the node to split.
- * @param newTerminalId the terminal id of the inserted node (corresponds to "d").
- * @param hasShortcuts whether the inserted word should have shortcuts.
- * @param hasBigrams whether the inserted word should have bigrams.
- * @param isNotAWord whether the inserted word should be not a word.
- * @param isBlackListEntry whether the inserted word should be a black list entry.
- * @param formatOptions the format options.
- */
- private void splitOnly(final int nodeArrayToSplitPos, final int nodeToSplitPos,
- final Ver4PtNodeInfo nodeToSplitInfo, final int indexToSplit,
- final int parentOfNodeToSplitPos, final int newTerminalId, final boolean hasShortcuts,
- final boolean hasBigrams, final boolean isNotAWord, final boolean isBlackListEntry,
- final FormatOptions formatOptions) throws IOException {
- final int parentNodeArrayStartPos = mDictBuffer.limit();
- final int parentNodeStartPos = parentNodeArrayStartPos + 1 /* size of PtNodeCount */;
- final int parentFlags = BinaryDictEncoderUtils.makePtNodeFlags(indexToSplit > 1,
- true /* isTerminal */, FormatSpec.FLAG_IS_NOT_MOVED, hasShortcuts, hasBigrams,
- isNotAWord, isBlackListEntry, formatOptions);
- final Ver4PtNodeInfo parentInfo = new Ver4PtNodeInfo(parentFlags,
- nodeToSplitInfo.mCharacters, newTerminalId, parentNodeStartPos
- + computePtNodeSize(nodeToSplitInfo.mCharacters, 0, indexToSplit, true)
- + FormatSpec.FORWARD_LINK_ADDRESS_SIZE,
- parentOfNodeToSplitPos, 0 /* nodeSize */);
- parentInfo.mStartIndexOfCharacters = 0;
- parentInfo.mEndIndexOfCharacters = indexToSplit;
-
- // Write the child.
- final int childrenFlags = setMultipleCharsInFlags(nodeToSplitInfo.mFlags,
- nodeToSplitInfo.mCharacters.length - indexToSplit > 1);
- final Ver4PtNodeInfo childrenInfo = new Ver4PtNodeInfo(childrenFlags,
- nodeToSplitInfo.mCharacters, nodeToSplitInfo.mTerminalId,
- nodeToSplitInfo.mChildrenPos, parentNodeStartPos, 0 /* nodeSize */);
- childrenInfo.mStartIndexOfCharacters = indexToSplit;
- childrenInfo.mEndIndexOfCharacters = nodeToSplitInfo.mCharacters.length;
- if (nodeToSplitInfo.mChildrenPos != FormatSpec.NO_CHILDREN_ADDRESS) {
- updateParentPositions(nodeToSplitInfo.mChildrenPos,
- parentInfo.mChildrenPos + 1 /* size of PtNodeCount */, formatOptions);
- }
-
- writeSplittedPtNodes(nodeArrayToSplitPos, nodeToSplitPos, parentInfo,
- new Ver4PtNodeInfo[] { childrenInfo }, parentNodeArrayStartPos, formatOptions);
- }
-
- /**
- * Split and branch a PtNode.
- *
- * ab - cd
- *
- * -> inserting "ac"
- *
- * a - b - cd
- * |
- * - c
- *
- * @param nodeArrayToSplitPos the position of PtNodeArray which contains the PtNode to split.
- * @param nodeToSplitPos the position of the PtNode to split.
- * @param nodeToSplitInfo the information of the PtNode to split.
- * @param indexToSplit the index where to split in the code points array.
- * @param parentOfNodeToSplitPos the absolute position of parent of the node to split.
- * @param newWordSuffixCodePoints the suffix of the newly inserted word (corresponds to "c").
- * @param startIndexOfNewWordSuffixCodePoints the start index in newWordSuffixCodePoints where
- * the suffix starts.
- * @param newTerminalId the terminal id of the inserted node (correspond to "c").
- * @param hasShortcuts whether the inserted word should have shortcuts.
- * @param hasBigrams whether the inserted word should have bigrams.
- * @param isNotAWord whether the inserted word should be not a word.
- * @param isBlackListEntry whether the inserted word should be a black list entry.
- * @param formatOptions the format options.
- */
- private void splitAndBranch(final int nodeArrayToSplitPos, final int nodeToSplitPos,
- final Ver4PtNodeInfo nodeToSplitInfo, final int indexToSplit,
- final int parentOfNodeToSplitPos, final int[] newWordSuffixCodePoints,
- final int startIndexOfNewWordSuffixCodePoints,
- final int newTerminalId,
- final boolean hasShortcuts, final boolean hasBigrams, final boolean isNotAWord,
- final boolean isBlackListEntry, final FormatOptions formatOptions) throws IOException {
- final int parentNodeArrayStartPos = mDictBuffer.limit();
- final int parentNodeStartPos = parentNodeArrayStartPos + 1 /* size of PtNodeCount */;
- final int parentFlags = BinaryDictEncoderUtils.makePtNodeFlags(
- indexToSplit > 1,
- false /* isTerminal */, FormatSpec.FLAG_IS_NOT_MOVED,
- false /* hasShortcut */, false /* hasBigrams */,
- false /* isNotAWord */, false /* isBlackListEntry */, formatOptions);
- final Ver4PtNodeInfo parentInfo = new Ver4PtNodeInfo(parentFlags,
- nodeToSplitInfo.mCharacters, PtNode.NOT_A_TERMINAL,
- parentNodeStartPos
- + computePtNodeSize(nodeToSplitInfo.mCharacters, 0, indexToSplit, false)
- + FormatSpec.FORWARD_LINK_ADDRESS_SIZE,
- parentOfNodeToSplitPos, 0 /* nodeSize */);
- parentInfo.mStartIndexOfCharacters = 0;
- parentInfo.mEndIndexOfCharacters = indexToSplit;
-
- final int childrenNodeArrayStartPos = parentNodeStartPos
- + computePtNodeSize(nodeToSplitInfo.mCharacters, 0, indexToSplit, false)
- + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
- final int firstChildrenFlags = BinaryDictEncoderUtils.makePtNodeFlags(
- newWordSuffixCodePoints.length - startIndexOfNewWordSuffixCodePoints > 1,
- true /* isTerminal */, FormatSpec.FLAG_IS_NOT_MOVED, hasShortcuts, hasBigrams,
- isNotAWord, isBlackListEntry, formatOptions);
- final Ver4PtNodeInfo firstChildrenInfo = new Ver4PtNodeInfo(firstChildrenFlags,
- newWordSuffixCodePoints, newTerminalId,
- FormatSpec.NO_CHILDREN_ADDRESS, parentNodeStartPos,
- 0 /* nodeSize */);
- firstChildrenInfo.mStartIndexOfCharacters = startIndexOfNewWordSuffixCodePoints;
- firstChildrenInfo.mEndIndexOfCharacters = newWordSuffixCodePoints.length;
-
- final int secondChildrenStartPos = childrenNodeArrayStartPos + 1 /* size of ptNodeCount */
- + computePtNodeSize(newWordSuffixCodePoints, startIndexOfNewWordSuffixCodePoints,
- newWordSuffixCodePoints.length, true /* isTerminal */);
- final int secondChildrenFlags = setMultipleCharsInFlags(nodeToSplitInfo.mFlags,
- nodeToSplitInfo.mCharacters.length - indexToSplit > 1);
- final Ver4PtNodeInfo secondChildrenInfo = new Ver4PtNodeInfo(secondChildrenFlags,
- nodeToSplitInfo.mCharacters, nodeToSplitInfo.mTerminalId,
- nodeToSplitInfo.mChildrenPos, parentNodeStartPos, 0 /* nodeSize */);
- secondChildrenInfo.mStartIndexOfCharacters = indexToSplit;
- secondChildrenInfo.mEndIndexOfCharacters = nodeToSplitInfo.mCharacters.length;
- if (nodeToSplitInfo.mChildrenPos != FormatSpec.NO_CHILDREN_ADDRESS) {
- updateParentPositions(nodeToSplitInfo.mChildrenPos, secondChildrenStartPos,
- formatOptions);
- }
-
- writeSplittedPtNodes(nodeArrayToSplitPos, nodeToSplitPos, parentInfo,
- new Ver4PtNodeInfo[] { firstChildrenInfo, secondChildrenInfo },
- parentNodeArrayStartPos, formatOptions);
- }
-
- /**
- * Inserts a word into the trie file and returns the position of inserted terminal node.
- * If the insertion is failed, returns FormatSpec.NOT_VALID_WORD.
- */
- @UsedForTesting
- private int insertWordToTrie(final String word, final int newTerminalId,
- final boolean isNotAWord, final boolean isBlackListEntry, final boolean hasBigrams,
- final boolean hasShortcuts) throws IOException, UnsupportedFormatException {
- setPosition(0);
- final FileHeader header = readHeader();
-
- final int[] codePoints = FusionDictionary.getCodePoints(word);
- final int wordLen = codePoints.length;
-
- int wordPos = 0;
- for (int depth = 0; depth < FormatSpec.MAX_WORD_LENGTH; /* nop */) {
- final int nodeArrayPos = getPosition();
- final int ptNodeCount = readPtNodeCount();
- boolean goToChildren = false;
- int parentPos = FormatSpec.NO_PARENT_ADDRESS;
- for (int i = 0; i < ptNodeCount; ++i) {
- final int nodePos = getPosition();
- final Ver4PtNodeInfo nodeInfo = readVer4PtNodeInfo(nodePos, header.mFormatOptions);
- if (BinaryDictIOUtils.isMovedPtNode(nodeInfo.mFlags, header.mFormatOptions)) {
- continue;
- }
- if (nodeInfo.mParentPos != FormatSpec.NO_PARENT_ADDRESS) {
- parentPos = nodePos + nodeInfo.mParentPos;
- }
-
- final boolean firstCharacterMatched =
- codePoints[wordPos] == nodeInfo.mCharacters[0];
- boolean allCharactersMatched = true;
- int firstDifferentCharacterIndex = -1;
- for (int p = 0; p < nodeInfo.mCharacters.length; ++p) {
- if (wordPos + p >= codePoints.length) break;
- if (codePoints[wordPos + p] != nodeInfo.mCharacters[p]) {
- if (firstDifferentCharacterIndex == -1) {
- firstDifferentCharacterIndex = p;
- }
- allCharactersMatched = false;
- }
- }
-
- if (!firstCharacterMatched) {
- // Go to the next sibling node.
- continue;
- }
-
- if (!allCharactersMatched) {
- final int parentNodeArrayStartPos = mDictBuffer.limit();
- splitAndBranch(nodeArrayPos, nodePos, nodeInfo, firstDifferentCharacterIndex,
- parentPos, codePoints, wordPos + firstDifferentCharacterIndex,
- newTerminalId, hasShortcuts, hasBigrams, isNotAWord,
- isBlackListEntry, header.mFormatOptions);
-
- return parentNodeArrayStartPos + computePtNodeSize(codePoints, wordPos,
- wordPos + firstDifferentCharacterIndex, false)
- + FormatSpec.FORWARD_LINK_ADDRESS_SIZE + 1 /* size of PtNodeCount */;
- }
-
- if (wordLen - wordPos < nodeInfo.mCharacters.length) {
- final int parentNodeArrayStartPos = mDictBuffer.limit();
- splitOnly(nodeArrayPos, nodePos, nodeInfo, wordLen - wordPos, parentPos,
- newTerminalId, hasShortcuts, hasBigrams, isNotAWord, isBlackListEntry,
- header.mFormatOptions);
-
- // Return the position of the inserted word.
- return parentNodeArrayStartPos + 1 /* size of PtNodeCount */;
- }
-
- wordPos += nodeInfo.mCharacters.length;
- if (wordPos == wordLen) {
- // This dictionary already contains the word.
- Log.e(TAG, "Something went wrong. If the word is already contained, "
- + " there is no need to insert new PtNode.");
- return FormatSpec.NOT_VALID_WORD;
- }
- if (nodeInfo.mChildrenPos == FormatSpec.NO_CHILDREN_ADDRESS) {
- // There are no children.
- // We need to add a new node as a child of this node.
- final int newNodeArrayPos = mDictBuffer.limit();
- final int[] newNodeCodePoints = Arrays.copyOfRange(codePoints, wordPos,
- codePoints.length);
- writeNewSinglePtNodeWithAttributes(newNodeCodePoints, hasShortcuts,
- newTerminalId, hasBigrams, isNotAWord, isBlackListEntry, nodePos,
- header.mFormatOptions);
- updateChildrenPos(nodePos, newNodeArrayPos, header.mFormatOptions);
- return newNodeArrayPos + 1 /* size of PtNodeCount */;
- } else {
- // Found the matched node.
- // Go to the children of this node.
- setPosition(nodeInfo.mChildrenPos);
- goToChildren = true;
- depth++;
- break;
- }
- }
-
- if (goToChildren) continue;
- if (!readAndFollowForwardLink()) {
- // Add a new node that contains [wordPos, word.length()-1].
- // and update the forward link.
- final int newNodeArrayPos = mDictBuffer.limit();
- final int[] newCodePoints = Arrays.copyOfRange(codePoints, wordPos,
- codePoints.length);
- writeNewSinglePtNodeWithAttributes(newCodePoints, hasShortcuts, newTerminalId,
- hasBigrams, isNotAWord, isBlackListEntry, parentPos, header.mFormatOptions);
- updateForwardLink(nodeArrayPos, newNodeArrayPos, header.mFormatOptions);
- return newNodeArrayPos + 1 /* size of PtNodeCount */;
- }
- }
- return FormatSpec.NOT_VALID_WORD;
- }
-
- private void updateFrequency(final int terminalId, final int frequency) {
- mFrequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
- BinaryDictEncoderUtils.writeUIntToDictBuffer(mFrequencyBuffer, frequency,
- FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
- }
-
- private void insertFrequency(final int frequency) throws IOException {
- final OutputStream frequencyStream = new FileOutputStream(mFrequencyFile,
- true /* append */);
- BinaryDictEncoderUtils.writeUIntToStream(frequencyStream, frequency,
- FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
- frequencyStream.close();
- }
-
- private void insertTerminalPosition(final int posOfTerminal) throws IOException,
- UnsupportedFormatException {
- final OutputStream terminalPosStream = new FileOutputStream(
- getFile(FILETYPE_TERMINAL_ADDRESS_TABLE), true /* append */);
- BinaryDictEncoderUtils.writeUIntToStream(terminalPosStream, posOfTerminal,
- FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
- terminalPosStream.close();
- }
-
- private void insertBigrams(final int terminalId, final int frequency,
- final ArrayList<PendingAttribute> bigramAddresses)
- throws IOException, UnsupportedFormatException {
- openDictBuffer();
- final BigramContentUpdater updater = new BigramContentUpdater(mDictDirectory.getName(),
- mDictDirectory, false);
-
- // Convert addresses to terminal ids.
- final ArrayList<PendingAttribute> bigrams = CollectionUtils.newArrayList();
- mDictBuffer.position(0);
- final FileHeader header = readHeader();
- for (PendingAttribute attr : bigramAddresses) {
- mDictBuffer.position(attr.mAddress);
- final Ver4PtNodeInfo info = readVer4PtNodeInfo(attr.mAddress, header.mFormatOptions);
- if (info.mTerminalId == PtNode.NOT_A_TERMINAL) {
- throw new RuntimeException("We can't have a bigram target that's not a terminal.");
- }
- bigrams.add(new PendingAttribute(frequency, info.mTerminalId));
- }
- updater.insertBigramEntries(terminalId, frequency, bigrams);
- close();
- }
-
- private void insertShortcuts(final int terminalId, final ArrayList<WeightedString> shortcuts)
- throws IOException {
- final ShortcutContentUpdater updater = new ShortcutContentUpdater(mDictDirectory.getName(),
- mDictDirectory);
- updater.insertShortcuts(terminalId, shortcuts);
- }
-
- private void openBuffersAndStream() throws IOException, UnsupportedFormatException {
- openDictBuffer();
- mDictStream = new FileOutputStream(getFile(FILETYPE_TRIE), true /* append */);
- }
-
- private void close() throws IOException {
- if (mDictStream != null) {
- mDictStream.close();
- mDictStream = null;
- }
- mDictBuffer = null;
- mFrequencyBuffer = null;
- mTerminalAddressTableBuffer = null;
- }
-
- private void updateAttributes(final int posOfWord, final int frequency,
- final ArrayList<WeightedString> bigramStrings,
- final ArrayList<WeightedString> shortcuts, final boolean isNotAWord,
- final boolean isBlackListEntry) throws IOException, UnsupportedFormatException {
- mDictBuffer.position(0);
- final FileHeader header = readHeader();
- mDictBuffer.position(posOfWord);
- final Ver4PtNodeInfo info = readVer4PtNodeInfo(posOfWord, header.mFormatOptions);
- final int terminalId = info.mTerminalId;
-
- // Update the flags.
- final int newFlags = setIsNotAWordInFlags(
- setIsBlackListEntryInFlags(info.mFlags, isBlackListEntry), isNotAWord);
- mDictBuffer.position(posOfWord);
- mDictBuffer.put((byte) newFlags);
-
- updateFrequency(terminalId, frequency);
- insertBigrams(terminalId, frequency,
- DynamicBinaryDictIOUtils.resolveBigramPositions(this, bigramStrings));
- insertShortcuts(terminalId, shortcuts);
- }
-
- @Override @UsedForTesting
+ @Override
public void insertWord(final String word, final int frequency,
final ArrayList<WeightedString> bigramStrings, final ArrayList<WeightedString> shortcuts,
final boolean isNotAWord, final boolean isBlackListEntry)
throws IOException, UnsupportedFormatException {
- final int newTerminalId = getNewTerminalId();
-
- openBuffersAndStream();
- final int posOfWord = getTerminalPosition(word);
- if (posOfWord != FormatSpec.NOT_VALID_WORD) {
- // The word is already contained in the dictionary.
- updateAttributes(posOfWord, frequency, bigramStrings, shortcuts, isNotAWord,
- isBlackListEntry);
- close();
- return;
- }
-
- // Insert new PtNode into trie.
- final int posOfTerminal = insertWordToTrie(word, newTerminalId, isNotAWord,
- isBlackListEntry, bigramStrings != null && !bigramStrings.isEmpty(),
- shortcuts != null && !shortcuts.isEmpty());
- insertFrequency(frequency);
- insertTerminalPosition(posOfTerminal);
- close();
-
- insertBigrams(newTerminalId, frequency,
- DynamicBinaryDictIOUtils.resolveBigramPositions(this, bigramStrings));
- insertShortcuts(newTerminalId, shortcuts);
+ // TODO: Implement this method.
}
}