diff options
23 files changed, 529 insertions, 359 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java new file mode 100644 index 000000000..9f7f502ea --- /dev/null +++ b/java/src/com/android/inputmethod/latin/makedict/AbstractDictDecoder.java @@ -0,0 +1,206 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.makedict; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; +import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; +import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.TreeMap; + +/** + * A base class of the binary dictionary decoder. + */ +public abstract class AbstractDictDecoder implements DictDecoder { + protected FileHeader readHeader(final DictBuffer dictBuffer) + throws IOException, UnsupportedFormatException { + if (dictBuffer == null) { + openDictBuffer(); + } + + final int version = HeaderReader.readVersion(dictBuffer); + if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION + || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) { + throw new UnsupportedFormatException("Unsupported version : " + version); + } + // TODO: Remove this field. + final int optionsFlags = HeaderReader.readOptionFlags(dictBuffer); + + final int headerSize = HeaderReader.readHeaderSize(dictBuffer); + + if (headerSize < 0) { + throw new UnsupportedFormatException("header size can't be negative."); + } + + final HashMap<String, String> attributes = HeaderReader.readAttributes(dictBuffer, + headerSize); + + final FileHeader header = new FileHeader(headerSize, + new FusionDictionary.DictionaryOptions(attributes, + 0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG), + 0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)), + new FormatOptions(version, + 0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE))); + return header; + } + + @Override @UsedForTesting + public int getTerminalPosition(final String word) + throws IOException, UnsupportedFormatException { + if (!isDictBufferOpen()) { + openDictBuffer(); + } + return BinaryDictIOUtils.getTerminalPosition(this, word); + } + + @Override @UsedForTesting + public void readUnigramsAndBigramsBinary(final TreeMap<Integer, String> words, + final TreeMap<Integer, Integer> frequencies, + final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams) + throws IOException, UnsupportedFormatException { + if (!isDictBufferOpen()) { + openDictBuffer(); + } + BinaryDictIOUtils.readUnigramsAndBigramsBinary(this, words, frequencies, bigrams); + } + + /** + * A utility class for reading a file header. + */ + protected static class HeaderReader { + protected static int readVersion(final DictBuffer dictBuffer) + throws IOException, UnsupportedFormatException { + return BinaryDictDecoderUtils.checkFormatVersion(dictBuffer); + } + + protected static int readOptionFlags(final DictBuffer dictBuffer) { + return dictBuffer.readUnsignedShort(); + } + + protected static int readHeaderSize(final DictBuffer dictBuffer) { + return dictBuffer.readInt(); + } + + protected static HashMap<String, String> readAttributes(final DictBuffer dictBuffer, + final int headerSize) { + final HashMap<String, String> attributes = new HashMap<String, String>(); + while (dictBuffer.position() < headerSize) { + // We can avoid an infinite loop here since dictBuffer.position() is always + // increased by calling CharEncoding.readString. + final String key = CharEncoding.readString(dictBuffer); + final String value = CharEncoding.readString(dictBuffer); + attributes.put(key, value); + } + dictBuffer.position(headerSize); + return attributes; + } + } + + /** + * A utility class for reading a PtNode. + */ + protected static class PtNodeReader { + protected static int readPtNodeOptionFlags(final DictBuffer dictBuffer) { + return dictBuffer.readUnsignedByte(); + } + + protected static int readParentAddress(final DictBuffer dictBuffer, + final FormatOptions formatOptions) { + if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) { + return BinaryDictDecoderUtils.readSInt24(dictBuffer); + } else { + return FormatSpec.NO_PARENT_ADDRESS; + } + } + + protected static int readChildrenAddress(final DictBuffer dictBuffer, final int optionFlags, + final FormatOptions formatOptions) { + if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) { + final int address = BinaryDictDecoderUtils.readSInt24(dictBuffer); + if (address == 0) return FormatSpec.NO_CHILDREN_ADDRESS; + return address; + } else { + switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: + return dictBuffer.readUnsignedByte(); + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES: + return dictBuffer.readUnsignedShort(); + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES: + return dictBuffer.readUnsignedInt24(); + case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS: + default: + return FormatSpec.NO_CHILDREN_ADDRESS; + } + } + } + + // Reads shortcuts and returns the read length. + protected static int readShortcut(final DictBuffer dictBuffer, + final ArrayList<WeightedString> shortcutTargets) { + final int pointerBefore = dictBuffer.position(); + dictBuffer.readUnsignedShort(); // skip the size + while (true) { + final int targetFlags = dictBuffer.readUnsignedByte(); + final String word = CharEncoding.readString(dictBuffer); + shortcutTargets.add(new WeightedString(word, + targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); + if (0 == (targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; + } + return dictBuffer.position() - pointerBefore; + } + + protected static int readBigramAddresses(final DictBuffer dictBuffer, + final ArrayList<PendingAttribute> bigrams, final int baseAddress) { + int readLength = 0; + int bigramCount = 0; + while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { + final int bigramFlags = dictBuffer.readUnsignedByte(); + ++readLength; + final int sign = 0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE) + ? 1 : -1; + int bigramAddress = baseAddress + readLength; + switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) { + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE: + bigramAddress += sign * dictBuffer.readUnsignedByte(); + readLength += 1; + break; + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES: + bigramAddress += sign * dictBuffer.readUnsignedShort(); + readLength += 2; + break; + case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES: + bigramAddress += sign * dictBuffer.readUnsignedInt24(); + readLength += 3; + break; + default: + throw new RuntimeException("Has bigrams with no address"); + } + bigrams.add(new PendingAttribute( + bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, + bigramAddress)); + if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; + } + return readLength; + } + } +} diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java index 2c3d1346f..216492b4d 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java @@ -23,11 +23,11 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; -import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; +import java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.util.ArrayList; @@ -219,14 +219,14 @@ public final class BinaryDictDecoderUtils { } /** - * Writes a string with our character format to a ByteArrayOutputStream. + * Writes a string with our character format to an OutputStream. * * This will also write the terminator byte. * - * @param buffer the ByteArrayOutputStream to write to. + * @param buffer the OutputStream to write to. * @param word the string to write. */ - static void writeString(final ByteArrayOutputStream buffer, final String word) { + static void writeString(final OutputStream buffer, final String word) throws IOException { final int length = word.length(); for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { final int codePoint = word.codePointAt(i); diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java index b6024243f..f761829de 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java @@ -383,8 +383,8 @@ public class BinaryDictEncoderUtils { nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, nodeSize + size, ptNode.mChildren)); } - nodeSize += getShortcutListSize(ptNode.mShortcutTargets); if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) { + nodeSize += getShortcutListSize(ptNode.mShortcutTargets); if (null != ptNode.mBigrams) { for (WeightedString bigram : ptNode.mBigrams) { final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, diff --git a/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java index e251f7df7..3dbeee099 100644 --- a/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/DictDecoder.java @@ -17,11 +17,9 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; -import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.utils.ByteArrayDictBuffer; import java.io.File; @@ -32,50 +30,17 @@ import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.util.ArrayList; -import java.util.HashMap; import java.util.TreeMap; /** - * The base class of binary dictionary decoders. + * An interface of binary dictionary decoders. */ -public abstract class DictDecoder { - - protected FileHeader readHeader(final DictBuffer dictBuffer) - throws IOException, UnsupportedFormatException { - if (dictBuffer == null) { - openDictBuffer(); - } - - final int version = HeaderReader.readVersion(dictBuffer); - if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION - || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) { - throw new UnsupportedFormatException("Unsupported version : " + version); - } - // TODO: Remove this field. - final int optionsFlags = HeaderReader.readOptionFlags(dictBuffer); - - final int headerSize = HeaderReader.readHeaderSize(dictBuffer); - - if (headerSize < 0) { - throw new UnsupportedFormatException("header size can't be negative."); - } - - final HashMap<String, String> attributes = HeaderReader.readAttributes(dictBuffer, - headerSize); - - final FileHeader header = new FileHeader(headerSize, - new FusionDictionary.DictionaryOptions(attributes, - 0 != (optionsFlags & FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG), - 0 != (optionsFlags & FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG)), - new FormatOptions(version, - 0 != (optionsFlags & FormatSpec.SUPPORTS_DYNAMIC_UPDATE))); - return header; - } +public interface DictDecoder { /** * Reads and returns the file header. */ - public abstract FileHeader readHeader() throws IOException, UnsupportedFormatException; + public FileHeader readHeader() throws IOException, UnsupportedFormatException; /** * Reads PtNode from nodeAddress. @@ -83,7 +48,7 @@ public abstract class DictDecoder { * @param formatOptions the format options. * @return PtNodeInfo. */ - public abstract PtNodeInfo readPtNode(final int ptNodePos, final FormatOptions formatOptions); + public PtNodeInfo readPtNode(final int ptNodePos, final FormatOptions formatOptions); /** * Reads a buffer and returns the memory representation of the dictionary. @@ -98,7 +63,7 @@ public abstract class DictDecoder { * @return the created (or merged) dictionary. */ @UsedForTesting - public abstract FusionDictionary readDictionaryBinary(final FusionDictionary dict, + public FusionDictionary readDictionaryBinary(final FusionDictionary dict, final boolean deleteDictIfBroken) throws FileNotFoundException, IOException, UnsupportedFormatException; @@ -113,12 +78,7 @@ public abstract class DictDecoder { */ @UsedForTesting public int getTerminalPosition(final String word) - throws IOException, UnsupportedFormatException { - if (!isDictBufferOpen()) { - openDictBuffer(); - } - return BinaryDictIOUtils.getTerminalPosition(this, word); - } + throws IOException, UnsupportedFormatException; /** * Reads unigrams and bigrams from the binary file. @@ -134,47 +94,42 @@ public abstract class DictDecoder { public void readUnigramsAndBigramsBinary(final TreeMap<Integer, String> words, final TreeMap<Integer, Integer> frequencies, final TreeMap<Integer, ArrayList<PendingAttribute>> bigrams) - throws IOException, UnsupportedFormatException { - if (!isDictBufferOpen()) { - openDictBuffer(); - } - BinaryDictIOUtils.readUnigramsAndBigramsBinary(this, words, frequencies, bigrams); - } + throws IOException, UnsupportedFormatException; /** * Sets the position of the buffer to the given value. * * @param newPos the new position */ - public abstract void setPosition(final int newPos); + public void setPosition(final int newPos); /** * Gets the position of the buffer. * * @return the position */ - public abstract int getPosition(); + public int getPosition(); /** * Reads and returns the PtNode count out of a buffer and forwards the pointer. */ - public abstract int readPtNodeCount(); + public int readPtNodeCount(); /** * Reads the forward link and advances the position. * * @return true if this method moves the file pointer, false otherwise. */ - public abstract boolean readAndFollowForwardLink(); - public abstract boolean hasNextPtNodeArray(); + public boolean readAndFollowForwardLink(); + public boolean hasNextPtNodeArray(); /** * Opens the dictionary file and makes DictBuffer. */ @UsedForTesting - public abstract void openDictBuffer() throws FileNotFoundException, IOException; + public void openDictBuffer() throws FileNotFoundException, IOException; @UsedForTesting - public abstract boolean isDictBufferOpen(); + public boolean isDictBufferOpen(); // Constants for DictionaryBufferFactory. public static final int USE_READONLY_BYTEBUFFER = 0x01000000; @@ -272,125 +227,5 @@ public abstract class DictDecoder { } } - /** - * A utility class for reading a file header. - */ - protected static class HeaderReader { - protected static int readVersion(final DictBuffer dictBuffer) - throws IOException, UnsupportedFormatException { - return BinaryDictDecoderUtils.checkFormatVersion(dictBuffer); - } - - protected static int readOptionFlags(final DictBuffer dictBuffer) { - return dictBuffer.readUnsignedShort(); - } - - protected static int readHeaderSize(final DictBuffer dictBuffer) { - return dictBuffer.readInt(); - } - - protected static HashMap<String, String> readAttributes(final DictBuffer dictBuffer, - final int headerSize) { - final HashMap<String, String> attributes = new HashMap<String, String>(); - while (dictBuffer.position() < headerSize) { - // We can avoid an infinite loop here since dictBuffer.position() is always - // increased by calling CharEncoding.readString. - final String key = CharEncoding.readString(dictBuffer); - final String value = CharEncoding.readString(dictBuffer); - attributes.put(key, value); - } - dictBuffer.position(headerSize); - return attributes; - } - } - - /** - * A utility class for reading a PtNode. - */ - protected static class PtNodeReader { - protected static int readPtNodeOptionFlags(final DictBuffer dictBuffer) { - return dictBuffer.readUnsignedByte(); - } - - protected static int readParentAddress(final DictBuffer dictBuffer, - final FormatOptions formatOptions) { - if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) { - return BinaryDictDecoderUtils.readSInt24(dictBuffer); - } else { - return FormatSpec.NO_PARENT_ADDRESS; - } - } - - protected static int readChildrenAddress(final DictBuffer dictBuffer, final int optionFlags, - final FormatOptions formatOptions) { - if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) { - final int address = BinaryDictDecoderUtils.readSInt24(dictBuffer); - if (address == 0) return FormatSpec.NO_CHILDREN_ADDRESS; - return address; - } else { - switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { - case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: - return dictBuffer.readUnsignedByte(); - case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES: - return dictBuffer.readUnsignedShort(); - case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES: - return dictBuffer.readUnsignedInt24(); - case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS: - default: - return FormatSpec.NO_CHILDREN_ADDRESS; - } - } - } - - // Reads shortcuts and returns the read length. - protected static int readShortcut(final DictBuffer dictBuffer, - final ArrayList<WeightedString> shortcutTargets) { - final int pointerBefore = dictBuffer.position(); - dictBuffer.readUnsignedShort(); // skip the size - while (true) { - final int targetFlags = dictBuffer.readUnsignedByte(); - final String word = CharEncoding.readString(dictBuffer); - shortcutTargets.add(new WeightedString(word, - targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); - if (0 == (targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; - } - return dictBuffer.position() - pointerBefore; - } - - protected static int readBigramAddresses(final DictBuffer dictBuffer, - final ArrayList<PendingAttribute> bigrams, final int baseAddress) { - int readLength = 0; - int bigramCount = 0; - while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { - final int bigramFlags = dictBuffer.readUnsignedByte(); - ++readLength; - final int sign = 0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE) - ? 1 : -1; - int bigramAddress = baseAddress + readLength; - switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) { - case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE: - bigramAddress += sign * dictBuffer.readUnsignedByte(); - readLength += 1; - break; - case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES: - bigramAddress += sign * dictBuffer.readUnsignedShort(); - readLength += 2; - break; - case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES: - bigramAddress += sign * dictBuffer.readUnsignedInt24(); - readLength += 3; - break; - default: - throw new RuntimeException("Has bigrams with no address"); - } - bigrams.add(new PendingAttribute( - bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, - bigramAddress)); - if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; - } - return readLength; - } - } - - public abstract void skipPtNode(final FormatOptions formatOptions); + public void skipPtNode(final FormatOptions formatOptions); } diff --git a/java/src/com/android/inputmethod/latin/makedict/DictUpdater.java b/java/src/com/android/inputmethod/latin/makedict/DictUpdater.java index 413d0301c..709ea3310 100644 --- a/java/src/com/android/inputmethod/latin/makedict/DictUpdater.java +++ b/java/src/com/android/inputmethod/latin/makedict/DictUpdater.java @@ -24,7 +24,7 @@ import java.util.ArrayList; /** * An interface of a binary dictionary updater. */ -public interface DictUpdater { +public interface DictUpdater extends DictDecoder { /** * Deletes the word from the binary dictionary. diff --git a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java index a5516bd41..5a5d7af6b 100644 --- a/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +++ b/java/src/com/android/inputmethod/latin/makedict/FormatSpec.java @@ -266,15 +266,28 @@ public final class FormatSpec { // tat = Terminal Address Table static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; static final String BIGRAM_FILE_EXTENSION = ".bigram"; + static final String SHORTCUT_FILE_EXTENSION = ".shortcut"; static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup"; static final String CONTENT_TABLE_FILE_SUFFIX = "_index"; static final int FREQUENCY_AND_FLAGS_SIZE = 2; static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; + + // With the English main dictionary as of October 2013, the size of bigram address table is + // is 584KB with the block size being 4. + // This is 91% of that of full address table. static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4; static final int BIGRAM_CONTENT_COUNT = 1; static final int BIGRAM_FREQ_CONTENT_INDEX = 0; static final String BIGRAM_FREQ_CONTENT_ID = "_freq"; + static final int SHORTCUT_CONTENT_COUNT = 1; + static final int SHORTCUT_CONTENT_INDEX = 0; + // With the English main dictionary as of October 2013, the size of shortcut address table is + // 29KB with the block size being 64. + // This is only 4.4% of that of full address table. + static final int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; + static final String SHORTCUT_CONTENT_ID = "_shortcut"; + static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; static final int NO_PARENT_ADDRESS = 0; static final int NO_FORWARD_LINK_ADDRESS = 0; diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java index b87259c38..acab4f8a5 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver3DictDecoder.java @@ -37,7 +37,7 @@ import java.util.Arrays; * An implementation of DictDecoder for version 3 binary dictionary. */ @UsedForTesting -public class Ver3DictDecoder extends DictDecoder { +public class Ver3DictDecoder extends AbstractDictDecoder { private static final String TAG = Ver3DictDecoder.class.getSimpleName(); static { @@ -47,7 +47,7 @@ public class Ver3DictDecoder extends DictDecoder { // TODO: implement something sensical instead of just a phony method private static native int doNothing(); - protected static class PtNodeReader extends DictDecoder.PtNodeReader { + protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader { private static int readFrequency(final DictBuffer dictBuffer) { return dictBuffer.readUnsignedByte(); } diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java index 5089687da..bab24e301 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java @@ -23,6 +23,7 @@ import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; +import com.android.inputmethod.latin.utils.CollectionUtils; import android.util.Log; @@ -36,13 +37,14 @@ import java.util.Arrays; * An implementation of binary dictionary decoder for version 4 binary dictionary. */ @UsedForTesting -public class Ver4DictDecoder extends DictDecoder { +public class Ver4DictDecoder extends AbstractDictDecoder { private static final String TAG = Ver4DictDecoder.class.getSimpleName(); private static final int FILETYPE_TRIE = 1; private static final int FILETYPE_FREQUENCY = 2; private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; private static final int FILETYPE_BIGRAM_FREQ = 4; + private static final int FILETYPE_SHORTCUT = 5; private final File mDictDirectory; private final DictionaryBufferFactory mBufferFactory; @@ -50,7 +52,9 @@ public class Ver4DictDecoder extends DictDecoder { private DictBuffer mFrequencyBuffer; private DictBuffer mTerminalAddressTableBuffer; private DictBuffer mBigramBuffer; + private DictBuffer mShortcutBuffer; private SparseTable mBigramAddressTable; + private SparseTable mShortcutAddressTable; @UsedForTesting /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) { @@ -89,6 +93,10 @@ public class Ver4DictDecoder extends DictDecoder { return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.BIGRAM_FREQ_CONTENT_ID); + } else if (fileType == FILETYPE_SHORTCUT) { + return new File(mDictDirectory, + mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION + + FormatSpec.SHORTCUT_CONTENT_ID); } else { throw new RuntimeException("Unsupported kind of file : " + fileType); } @@ -102,6 +110,8 @@ public class Ver4DictDecoder extends DictDecoder { getFile(FILETYPE_TERMINAL_ADDRESS_TABLE)); mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ)); loadBigramAddressSparseTable(); + mShortcutBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_SHORTCUT)); + loadShortcutAddressSparseTable(); } @Override @@ -136,7 +146,18 @@ public class Ver4DictDecoder extends DictDecoder { FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE); } - protected static class PtNodeReader extends DictDecoder.PtNodeReader { + // TODO: Let's have something like SparseTableContentsReader in this class. + private void loadShortcutAddressSparseTable() throws IOException { + final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); + final File contentFile = new File(mDictDirectory, mDictDirectory.getName() + + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + + FormatSpec.SHORTCUT_CONTENT_ID); + mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile, + new File[] { contentFile }, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE); + } + + protected static class PtNodeReader extends AbstractDictDecoder.PtNodeReader { protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) { frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1); return frequencyBuffer.readUnsignedByte(); @@ -147,6 +168,23 @@ public class Ver4DictDecoder extends DictDecoder { } } + private ArrayList<WeightedString> readShortcuts(final int terminalId) { + if (mShortcutAddressTable.get(0, terminalId) == SparseTable.NOT_EXIST) return null; + + final ArrayList<WeightedString> ret = CollectionUtils.newArrayList(); + final int posOfShortcuts = mShortcutAddressTable.get(FormatSpec.SHORTCUT_CONTENT_INDEX, + terminalId); + mShortcutBuffer.position(posOfShortcuts); + while (true) { + final int flags = mShortcutBuffer.readUnsignedByte(); + final String word = CharEncoding.readString(mShortcutBuffer); + ret.add(new WeightedString(word, + flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); + if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; + } + return ret; + } + // TODO: Make this buffer thread safe. // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH. private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; @@ -197,14 +235,7 @@ public class Ver4DictDecoder extends DictDecoder { childrenAddress += addressPointer; } addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); - final ArrayList<WeightedString> shortcutTargets; - if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) { - // readShortcut will add shortcuts to shortcutTargets. - shortcutTargets = new ArrayList<WeightedString>(); - addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets); - } else { - shortcutTargets = null; - } + final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId); final ArrayList<PendingAttribute> bigrams; if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { diff --git a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java index b38c33019..f9dcacf77 100644 --- a/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java +++ b/java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java @@ -49,6 +49,7 @@ public class Ver4DictEncoder implements DictEncoder { private File mDictDir; private String mBaseFilename; private BigramContentWriter mBigramWriter; + private ShortcutContentWriter mShortcutWriter; @UsedForTesting public Ver4DictEncoder(final File dictPlacedDir) { @@ -152,6 +153,39 @@ public class Ver4DictEncoder implements DictEncoder { } } + private static class ShortcutContentWriter extends SparseTableContentWriter { + public ShortcutContentWriter(final String name, final int initialCapacity, + final File baseDir) { + super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, FormatSpec.SHORTCUT_CONTENT_COUNT, + initialCapacity, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir, + new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION }, + new String[] { FormatSpec.SHORTCUT_CONTENT_ID }); + } + + public void writeShortcutForOneWord(final int terminalId, + final Iterator<WeightedString> shortcutIterator) throws IOException { + write(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, + new SparseTableContentWriterInterface() { + @Override + public void write(final OutputStream outStream) throws IOException { + writeShortcutForOneWordInternal(outStream, shortcutIterator); + } + }); + } + + private void writeShortcutForOneWordInternal(final OutputStream outStream, + final Iterator<WeightedString> shortcutIterator) throws IOException { + while (shortcutIterator.hasNext()) { + final WeightedString target = shortcutIterator.next(); + final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( + shortcutIterator.hasNext(), target.mFrequency); + BinaryDictEncoderUtils.writeUIntToStream(outStream, shortcutFlags, + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + CharEncoding.writeString(outStream, target.mWord); + } + } + } + private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions) throws FileNotFoundException, IOException { final FileHeader header = new FileHeader(0, dictOptions, formatOptions); @@ -225,6 +259,8 @@ public class Ver4DictEncoder implements DictEncoder { writeTerminalData(flatNodes, terminalCount); mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir); writeBigrams(flatNodes, dict); + mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir); + writeShortcuts(flatNodes); final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; @@ -306,29 +342,6 @@ public class Ver4DictEncoder implements DictEncoder { } } - private void writeShortcuts(ArrayList<WeightedString> shortcuts) { - if (null == shortcuts || shortcuts.isEmpty()) return; - - final int indexOfShortcutByteSize = mTriePos; - mTriePos += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; - final Iterator<WeightedString> shortcutIterator = shortcuts.iterator(); - while (shortcutIterator.hasNext()) { - final WeightedString target = shortcutIterator.next(); - final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( - shortcutIterator.hasNext(), target.mFrequency); - mTrieBuf[mTriePos++] = (byte)shortcutFlags; - final int shortcutShift = CharEncoding.writeString(mTrieBuf, mTriePos, - target.mWord); - mTriePos += shortcutShift; - } - final int shortcutByteSize = mTriePos - indexOfShortcutByteSize; - if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) { - throw new RuntimeException("Shortcut list too large : " + shortcutByteSize); - } - BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, indexOfShortcutByteSize, - shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); - } - private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict) throws IOException { mBigramWriter.openStreams(); @@ -343,6 +356,19 @@ public class Ver4DictEncoder implements DictEncoder { mBigramWriter.closeStreams(); } + private void writeShortcuts(final ArrayList<PtNodeArray> flatNodes) throws IOException { + mShortcutWriter.openStreams(); + for (final PtNodeArray nodeArray : flatNodes) { + for (final PtNode ptNode : nodeArray.mData) { + if (ptNode.mShortcutTargets != null && !ptNode.mShortcutTargets.isEmpty()) { + mShortcutWriter.writeShortcutForOneWord(ptNode.mTerminalId, + ptNode.mShortcutTargets.iterator()); + } + } + } + mShortcutWriter.closeStreams(); + } + @Override public void writeForwardLinkAddress(int forwardLinkAddress) { mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, @@ -359,7 +385,6 @@ public class Ver4DictEncoder implements DictEncoder { writeTerminalId(ptNode.mTerminalId); } writeChildrenPosition(ptNode, formatOptions); - writeShortcuts(ptNode.mShortcutTargets); } private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes, diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h index a6829b476..5492c6070 100644 --- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h @@ -37,6 +37,8 @@ class DictionaryHeaderStructurePolicy { virtual float getMultiWordCostMultiplier() const = 0; + virtual int getLastDecayedTime() const = 0; + virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const = 0; diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp index 8753c6eb0..b1170e251 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp @@ -360,13 +360,13 @@ int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos( } bool DynamicBigramListPolicy::updateProbabilityForDecay( - BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos, + const BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const { *outRemoved = false; if (mIsDecayingDict) { // Update bigram probability for decaying. const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave( - BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags)); + BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags), mHeaderPolicy); if (ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { // Write new probability. const BigramListReadWriteUtils::BigramFlags updatedBigramFlags = diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h index b358b4ed5..0504b59d5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h @@ -27,6 +27,7 @@ namespace latinime { class BufferWithExtendableBuffer; +class DictionaryHeaderStructurePolicy; class DictionaryShortcutsStructurePolicy; /* @@ -34,10 +35,12 @@ class DictionaryShortcutsStructurePolicy; */ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { public: - DynamicBigramListPolicy(BufferWithExtendableBuffer *const buffer, + DynamicBigramListPolicy(const DictionaryHeaderStructurePolicy *const headerPolicy, + BufferWithExtendableBuffer *const buffer, const DictionaryShortcutsStructurePolicy *const shortcutPolicy, const bool isDecayingDict) - : mBuffer(buffer), mShortcutPolicy(shortcutPolicy), mIsDecayingDict(isDecayingDict) {} + : mHeaderPolicy(headerPolicy), mBuffer(buffer), mShortcutPolicy(shortcutPolicy), + mIsDecayingDict(isDecayingDict) {} ~DynamicBigramListPolicy() {} @@ -74,6 +77,7 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { static const int CONTINUING_BIGRAM_LINK_COUNT_LIMIT; static const int BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT; + const DictionaryHeaderStructurePolicy *const mHeaderPolicy; BufferWithExtendableBuffer *const mBuffer; const DictionaryShortcutsStructurePolicy *const mShortcutPolicy; const bool mIsDecayingDict; @@ -81,7 +85,7 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { // Follow bigram link and return the position of bigram target PtNode that is currently valid. int followBigramLinkAndGetCurrentBigramPtNodePos(const int originalBigramPos) const; - bool updateProbabilityForDecay(BigramListReadWriteUtils::BigramFlags bigramFlags, + bool updateProbabilityForDecay(const BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const; }; } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp index 324b53062..a17a0acf6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp @@ -16,6 +16,7 @@ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" namespace latinime { @@ -29,7 +30,8 @@ bool DynamicPatriciaTrieGcEventListeners bool isUselessPtNode = !node->isTerminal(); if (node->isTerminal() && mIsDecayingDict) { const int newProbability = - ForgettingCurveUtils::getEncodedProbabilityToSave(node->getProbability()); + ForgettingCurveUtils::getEncodedProbabilityToSave(node->getProbability(), + mHeaderPolicy); int writingPos = node->getProbabilityFieldPos(); // Update probability. if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition( diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h index 463715af5..3ca2f2a01 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h @@ -29,6 +29,8 @@ namespace latinime { +class DictionaryHeaderStructurePolicy; + class DynamicPatriciaTrieGcEventListeners { public: // Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or @@ -38,10 +40,12 @@ class DynamicPatriciaTrieGcEventListeners { : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { public: TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + const DictionaryHeaderStructurePolicy *const headerPolicy, DynamicPatriciaTrieWritingHelper *const writingHelper, BufferWithExtendableBuffer *const buffer, const bool isDecayingDict) - : mWritingHelper(writingHelper), mBuffer(buffer), mIsDecayingDict(isDecayingDict), - mValueStack(), mChildrenValue(0), mValidUnigramCount(0) {} + : mHeaderPolicy(headerPolicy), mWritingHelper(writingHelper), mBuffer(buffer), + mIsDecayingDict(isDecayingDict), mValueStack(), mChildrenValue(0), + mValidUnigramCount(0) {} ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {}; @@ -72,9 +76,10 @@ class DynamicPatriciaTrieGcEventListeners { DISALLOW_IMPLICIT_CONSTRUCTORS( TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted); + const DictionaryHeaderStructurePolicy *const mHeaderPolicy; DynamicPatriciaTrieWritingHelper *const mWritingHelper; BufferWithExtendableBuffer *const mBuffer; - const int mIsDecayingDict; + const bool mIsDecayingDict; std::vector<int> mValueStack; int mChildrenValue; int mValidUnigramCount; @@ -85,7 +90,8 @@ class DynamicPatriciaTrieGcEventListeners { class TraversePolicyToUpdateBigramProbability : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { public: - TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy) + TraversePolicyToUpdateBigramProbability( + DynamicBigramListPolicy *const bigramPolicy) : mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {} bool onAscend() { return true; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp index 60d0db0c0..31e3fb42f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp @@ -42,7 +42,6 @@ const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUER const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024; const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024; -const int DynamicPatriciaTriePolicy::DECAY_INTERVAL_FOR_DECAYING_DICTS = 2 * 60 * 60; void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const { @@ -314,15 +313,15 @@ void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); return; } - const bool runGCwithDecay = needsToDecay(); - DynamicBigramListPolicy bigramListPolicyForGC(&mBufferWithExtendableBuffer, - &mShortcutListPolicy, runGCwithDecay); + const bool needsToDecay = mHeaderPolicy.isDecayingDict() + && (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( + false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy)); + DynamicBigramListPolicy bigramListPolicyForGC(&mHeaderPolicy, &mBufferWithExtendableBuffer, + &mShortcutListPolicy, needsToDecay); DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &bigramListPolicyForGC, &mShortcutListPolicy, runGCwithDecay); + &bigramListPolicyForGC, &mShortcutListPolicy, needsToDecay); writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy); - if (runGCwithDecay) { - mNeedsToDecayForTesting = false; - } + mNeedsToDecayForTesting = false; } bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { @@ -344,16 +343,8 @@ bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { // Needs to reduce dictionary size. return true; } else if (mHeaderPolicy.isDecayingDict()) { - if (mUnigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) { - // Unigram count exceeds the limit. - return true; - } else if (mBigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) { - // Bigram count exceeds the limit. - return true; - } else if (mindsBlockByGC && needsToDecay()) { - // Time to update probabilities for decaying. - return true; - } + return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( + mindsBlockByGC, mUnigramCount, mBigramCount, &mHeaderPolicy); } return false; } @@ -369,9 +360,4 @@ void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const } } -bool DynamicPatriciaTriePolicy::needsToDecay() const { - return mHeaderPolicy.isDecayingDict() && (mNeedsToDecayForTesting - || mHeaderPolicy.getLastDecayedTime() + DECAY_INTERVAL_FOR_DECAYING_DICTS < time(0)); -} - } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h index c3bbe9977..903f65e8e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h @@ -37,7 +37,7 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(), mBuffer->getBufferSize() - mHeaderPolicy.getSize()), mShortcutListPolicy(&mBufferWithExtendableBuffer), - mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy, + mBigramListPolicy(&mHeaderPolicy, &mBufferWithExtendableBuffer, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()), mUnigramCount(mHeaderPolicy.getUnigramCount()), mBigramCount(mHeaderPolicy.getBigramCount()), mNeedsToDecayForTesting(false) {} @@ -105,7 +105,6 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY; static const int MAX_DICT_EXTENDED_REGION_SIZE; static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; - static const int DECAY_INTERVAL_FOR_DECAYING_DICTS; const MmappedBuffer *const mBuffer; const HeaderPolicy mHeaderPolicy; @@ -115,8 +114,6 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { int mUnigramCount; int mBigramCount; int mNeedsToDecayForTesting; - - bool needsToDecay() const; }; } // namespace latinime #endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp index 70a9ee564..067c8ec98 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp @@ -165,7 +165,10 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNod MAX_DICTIONARY_SIZE); int unigramCount = 0; int bigramCount = 0; - if (!runGC(rootPtNodeArrayPos, &newDictBuffer, &unigramCount, &bigramCount)) { + if (mNeedsToDecay) { + ForgettingCurveUtils::sTimeKeeper.setCurrentTime(); + } + if (!runGC(rootPtNodeArrayPos, headerPolicy, &newDictBuffer, &unigramCount, &bigramCount)) { return; } BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); @@ -481,14 +484,14 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes( } bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, - BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount, - int *const outBigramCount) { + const HeaderPolicy *const headerPolicy, BufferWithExtendableBuffer *const bufferToWrite, + int *const outUnigramCount, int *const outBigramCount) { DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); DynamicPatriciaTrieGcEventListeners ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( - this, mBuffer, mNeedsToDecay); + headerPolicy, this, mBuffer, mNeedsToDecay); if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { return false; @@ -505,7 +508,6 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, &traversePolicyToUpdateBigramProbability)) { return false; } - if (mNeedsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount() > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) { // TODO: Remove more bigrams. @@ -524,7 +526,7 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, // Create policy instance for the GCed dictionary. DynamicShortcutListPolicy newDictShortcutPolicy(bufferToWrite); - DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy, + DynamicBigramListPolicy newDictBigramPolicy(headerPolicy, bufferToWrite, &newDictShortcutPolicy, mNeedsToDecay); // Create reading helper for the GCed dictionary. DynamicPatriciaTrieReadingHelper newDictReadingHelper(bufferToWrite, &newDictBigramPolicy, diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h index 0caf29120..ca8664729 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h @@ -128,8 +128,9 @@ class DynamicPatriciaTrieWritingHelper { const int probabilityOfNewPtNode, const int *const newNodeCodePoints, const int newNodeCodePointCount); - bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite, - int *const outUnigramCount, int *const outBigramCount); + bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, + BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount, + int *const outBigramCount); int getUpdatedProbability(const int originalProbability, const int newProbability); }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp index b502fe25d..19ca35481 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp @@ -15,10 +15,12 @@ */ #include <cmath> +#include <ctime> #include <stdlib.h> #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h" namespace latinime { @@ -35,8 +37,14 @@ const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1; // Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected // duration of the decay is approximately 66hours. const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f; +const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; +ForgettingCurveUtils::TimeKeeper ForgettingCurveUtils::sTimeKeeper; + +void ForgettingCurveUtils::TimeKeeper::setCurrentTime() { + mCurrentTime = time(0); +} /* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability, const int encodedBigramProbability) { @@ -76,19 +84,44 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY; } -/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability) { - const int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0); +/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability, + const DictionaryHeaderStructurePolicy *const headerPolicy) { + const int elapsedTime = sTimeKeeper.peekCurrentTime() - headerPolicy->getLastDecayedTime(); + const int decayIterationCount = max(elapsedTime / DECAY_INTERVAL_SECONDS, 1); + int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0); // TODO: Implement the decay in more proper way. - const float currentRate = static_cast<float>(currentEncodedProbability) - / static_cast<float>(MAX_ENCODED_PROBABILITY); - const float thresholdToDecay = MIN_PROBABILITY_TO_DECAY - + (1.0f - MIN_PROBABILITY_TO_DECAY) * (1.0f - currentRate); - const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX); - if (thresholdToDecay < randValue) { - return max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, 0); - } else { - return currentEncodedProbability; + for (int i = 0; i < decayIterationCount; ++i) { + const float currentRate = static_cast<float>(currentEncodedProbability) + / static_cast<float>(MAX_ENCODED_PROBABILITY); + const float thresholdToDecay = MIN_PROBABILITY_TO_DECAY + + (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate; + const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX); + if (thresholdToDecay < randValue) { + currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, + 0); + } + } + return currentEncodedProbability; +} + +/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, + const int unigramCount, const int bigramCount, + const DictionaryHeaderStructurePolicy *const headerPolicy) { + if (unigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) { + // Unigram count exceeds the limit. + return true; + } else if (bigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) { + // Bigram count exceeds the limit. + return true; + } + if (mindsBlockByDecay) { + return false; + } + if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS < time(0)) { + // Time to decay. + return true; } + return false; } /* static */ int ForgettingCurveUtils::decodeProbability(const int encodedProbability) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h index d666f22aa..2ad423874 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h @@ -23,16 +23,32 @@ namespace latinime { +class DictionaryHeaderStructurePolicy; + // TODO: Check the elapsed time and decrease the probability depending on the time. Time field is // required to introduced to each terminal PtNode and bigram entry. // TODO: Quit using bigram probability to indicate the delta. class ForgettingCurveUtils { public: + class TimeKeeper { + public: + TimeKeeper() : mCurrentTime(0) {} + void setCurrentTime(); + int peekCurrentTime() const { return mCurrentTime; }; + + private: + DISALLOW_COPY_AND_ASSIGN(TimeKeeper); + + int mCurrentTime; + }; + static const int MAX_UNIGRAM_COUNT; static const int MAX_UNIGRAM_COUNT_AFTER_GC; static const int MAX_BIGRAM_COUNT; static const int MAX_BIGRAM_COUNT_AFTER_GC; + static TimeKeeper sTimeKeeper; + static int getProbability(const int encodedUnigramProbability, const int encodedBigramProbability); @@ -41,7 +57,11 @@ class ForgettingCurveUtils { static int isValidEncodedProbability(const int encodedProbability); - static int getEncodedProbabilityToSave(const int encodedProbability); + static int getEncodedProbabilityToSave(const int encodedProbability, + const DictionaryHeaderStructurePolicy *const headerPolicy); + + static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount, + const int bigramCount, const DictionaryHeaderStructurePolicy *const headerPolicy); private: DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils); @@ -68,6 +88,7 @@ class ForgettingCurveUtils { static const int MIN_VALID_ENCODED_PROBABILITY; static const int ENCODED_PROBABILITY_STEP; static const float MIN_PROBABILITY_TO_DECAY; + static const int DECAY_INTERVAL_SECONDS; static const ProbabilityTable sProbabilityTable; diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp index ecceb60d3..66637ac4b 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp @@ -27,30 +27,30 @@ const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE = 170; const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT = 310; const int ScoringParams::THRESHOLD_SHORT_WORD_LENGTH = 4; -const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.132f; -const float ScoringParams::PROXIMITY_COST = 0.095f; -const float ScoringParams::FIRST_CHAR_PROXIMITY_COST = 0.102f; -const float ScoringParams::FIRST_PROXIMITY_COST = 0.019f; -const float ScoringParams::OMISSION_COST = 0.458f; -const float ScoringParams::OMISSION_COST_SAME_CHAR = 0.491f; -const float ScoringParams::OMISSION_COST_FIRST_CHAR = 0.582f; -const float ScoringParams::INSERTION_COST = 0.730f; -const float ScoringParams::TERMINAL_INSERTION_COST = 0.93f; -const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.586f; -const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.70f; -const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.623f; -const float ScoringParams::TRANSPOSITION_COST = 0.526f; -const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.319f; -const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.380f; -const float ScoringParams::SUBSTITUTION_COST = 0.383f; -const float ScoringParams::COST_NEW_WORD = 0.042f; -const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.25f; -const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.123f; -const float ScoringParams::COST_FIRST_LOOKAHEAD = 0.545f; -const float ScoringParams::COST_LOOKAHEAD = 0.073f; -const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.093f; -const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.041f; -const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.447f; +const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.1524f; +const float ScoringParams::PROXIMITY_COST = 0.0694f; +const float ScoringParams::FIRST_CHAR_PROXIMITY_COST = 0.072f; +const float ScoringParams::FIRST_PROXIMITY_COST = 0.07788f; +const float ScoringParams::OMISSION_COST = 0.4676f; +const float ScoringParams::OMISSION_COST_SAME_CHAR = 0.399f; +const float ScoringParams::OMISSION_COST_FIRST_CHAR = 0.5256f; +const float ScoringParams::INSERTION_COST = 0.7248f; +const float ScoringParams::TERMINAL_INSERTION_COST = 0.9828f; +const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.5508f; +const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.674f; +const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.639f; +const float ScoringParams::TRANSPOSITION_COST = 0.5608f; +const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.339f; +const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.4576f; +const float ScoringParams::SUBSTITUTION_COST = 0.3806f; +const float ScoringParams::COST_NEW_WORD = 0.0292f; +const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.3224f; +const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.1214f; +const float ScoringParams::COST_FIRST_LOOKAHEAD = 0.4786f; +const float ScoringParams::COST_LOOKAHEAD = 0.00624f; +const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.06836f; +const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.0362f; +const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.4182f; const float ScoringParams::TYPING_BASE_OUTPUT_SCORE = 1.0f; const float ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT = 0.1f; const float ScoringParams::NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT = 0.045f; diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java index b2d31c21f..ded8eaa97 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java @@ -50,8 +50,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { } private void forcePassingShortTime(final BinaryDictionary binaryDictionary) { - // Entries having low probability would be suppressed once in 2 GCs. - final int count = 2; + // Entries having low probability would be suppressed once in 3 GCs. + final int count = 3; for (int i = 0; i < count; i++) { binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY); binaryDictionary.flushWithGC(); diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java index 6a21522f9..5b8f0e977 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java @@ -18,6 +18,7 @@ package com.android.inputmethod.latin; import android.test.AndroidTestCase; import android.test.suitebuilder.annotation.LargeTest; +import android.text.TextUtils; import android.util.Pair; import com.android.inputmethod.latin.makedict.CodePointUtils; @@ -126,7 +127,7 @@ public class BinaryDictionaryTests extends AndroidTestCase { public void testRandomlyAddUnigramWord() { final int wordCount = 1000; final int codePointSetSize = 50; - final int seed = 123456789; + final long seed = System.currentTimeMillis(); File dictFile = null; try { @@ -223,7 +224,8 @@ public class BinaryDictionaryTests extends AndroidTestCase { final int wordCount = 100; final int bigramCount = 1000; final int codePointSetSize = 50; - final int seed = 11111; + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); File dictFile = null; try { @@ -234,43 +236,42 @@ public class BinaryDictionaryTests extends AndroidTestCase { BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + final ArrayList<String> words = new ArrayList<String>(); - // Test a word that isn't contained within the dictionary. - final Random random = new Random(seed); + final ArrayList<Pair<String, String>> bigramWords = new ArrayList<Pair<String,String>>(); final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); - final int[] unigramProbabilities = new int[wordCount]; + final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>(); + final HashMap<Pair<String, String>, Integer> bigramProbabilities = + new HashMap<Pair<String, String>, Integer>(); + for (int i = 0; i < wordCount; ++i) { final String word = CodePointUtils.generateWord(random, codePointSet); words.add(word); final int unigramProbability = random.nextInt(0xFF); - unigramProbabilities[i] = unigramProbability; + unigramProbabilities.put(word, unigramProbability); binaryDictionary.addUnigramWord(word, unigramProbability); } - final int[][] probabilities = new int[wordCount][wordCount]; - - for (int i = 0; i < wordCount; ++i) { - for (int j = 0; j < wordCount; ++j) { - probabilities[i][j] = Dictionary.NOT_A_PROBABILITY; - } - } - for (int i = 0; i < bigramCount; i++) { - final int word0Index = random.nextInt(wordCount); - final int word1Index = random.nextInt(wordCount); - final String word0 = words.get(word0Index); - final String word1 = words.get(word1Index); + final String word0 = words.get(random.nextInt(wordCount)); + final String word1 = words.get(random.nextInt(wordCount)); + if (TextUtils.equals(word0, word1)) { + continue; + } + final Pair<String, String> bigram = new Pair<String, String>(word0, word1); + bigramWords.add(bigram); final int bigramProbability = random.nextInt(0xF); - probabilities[word0Index][word1Index] = binaryDictionary.calculateProbability( - unigramProbabilities[word1Index], bigramProbability); + bigramProbabilities.put(bigram, bigramProbability); binaryDictionary.addBigramWords(word0, word1, bigramProbability); } - for (int i = 0; i < words.size(); i++) { - for (int j = 0; j < words.size(); j++) { - assertEquals(probabilities[i][j], - binaryDictionary.getBigramProbability(words.get(i), words.get(j))); - } + for (final Pair<String, String> bigram : bigramWords) { + final int unigramProbability = unigramProbabilities.get(bigram.second); + final int bigramProbability = bigramProbabilities.get(bigram); + final int probability = binaryDictionary.calculateProbability(unigramProbability, + bigramProbability); + assertEquals(probability, + binaryDictionary.getBigramProbability(bigram.first, bigram.second)); } dictFile.delete(); @@ -419,8 +420,8 @@ public class BinaryDictionaryTests extends AndroidTestCase { final int wordCount = 100; final int bigramCount = 1000; final int codePointSetSize = 30; - // TODO: Use various seeds such as a current timestamp to make this test more random. - final int seed = 314159265; + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); File dictFile = null; try { @@ -432,35 +433,32 @@ public class BinaryDictionaryTests extends AndroidTestCase { BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + final ArrayList<String> words = new ArrayList<String>(); - // Test a word that isn't contained within the dictionary. - final Random random = new Random(seed); + final ArrayList<Pair<String, String>> bigramWords = new ArrayList<Pair<String,String>>(); final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); - final int[] unigramProbabilities = new int[wordCount]; + final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>(); + final HashMap<Pair<String, String>, Integer> bigramProbabilities = + new HashMap<Pair<String, String>, Integer>(); + for (int i = 0; i < wordCount; ++i) { final String word = CodePointUtils.generateWord(random, codePointSet); words.add(word); final int unigramProbability = random.nextInt(0xFF); - unigramProbabilities[i] = unigramProbability; + unigramProbabilities.put(word, unigramProbability); binaryDictionary.addUnigramWord(word, unigramProbability); } - final int[][] probabilities = new int[wordCount][wordCount]; - - for (int i = 0; i < wordCount; ++i) { - for (int j = 0; j < wordCount; ++j) { - probabilities[i][j] = Dictionary.NOT_A_PROBABILITY; - } - } - for (int i = 0; i < bigramCount; i++) { - final int word0Index = random.nextInt(wordCount); - final int word1Index = random.nextInt(wordCount); - final String word0 = words.get(word0Index); - final String word1 = words.get(word1Index); + final String word0 = words.get(random.nextInt(wordCount)); + final String word1 = words.get(random.nextInt(wordCount)); + if (TextUtils.equals(word0, word1)) { + continue; + } + final Pair<String, String> bigram = new Pair<String, String>(word0, word1); + bigramWords.add(bigram); final int bigramProbability = random.nextInt(0xF); - probabilities[word0Index][word1Index] = binaryDictionary.calculateProbability( - unigramProbabilities[word1Index], bigramProbability); + bigramProbabilities.put(bigram, bigramProbability); binaryDictionary.addBigramWords(word0, word1, bigramProbability); } @@ -470,12 +468,15 @@ public class BinaryDictionaryTests extends AndroidTestCase { 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); - for (int i = 0; i < words.size(); i++) { - for (int j = 0; j < words.size(); j++) { - assertEquals(probabilities[i][j], - binaryDictionary.getBigramProbability(words.get(i), words.get(j))); - } + for (final Pair<String, String> bigram : bigramWords) { + final int unigramProbability = unigramProbabilities.get(bigram.second); + final int bigramProbability = bigramProbabilities.get(bigram); + final int probability = binaryDictionary.calculateProbability(unigramProbability, + bigramProbability); + assertEquals(probability, + binaryDictionary.getBigramProbability(bigram.first, bigram.second)); } + dictFile.delete(); } @@ -487,8 +488,8 @@ public class BinaryDictionaryTests extends AndroidTestCase { final float addBigramProb = 0.8f; final float removeBigramProb = 0.2f; final int codePointSetSize = 30; - final int seed = 141421356; + final long seed = System.currentTimeMillis(); final Random random = new Random(seed); File dictFile = null; @@ -539,6 +540,9 @@ public class BinaryDictionaryTests extends AndroidTestCase { } final String word0 = words.get(word0Index); final String word1 = words.get(word1Index); + if (TextUtils.equals(word0, word1)) { + continue; + } final int bigramProbability = random.nextInt(0xF); final Pair<String, String> bigram = new Pair<String, String>(word0, word1); bigramWords.add(bigram); @@ -586,8 +590,8 @@ public class BinaryDictionaryTests extends AndroidTestCase { public void testAddManyUnigramsAndFlushWithGC() { final int flashWithGCIterationCount = 3; final int codePointSetSize = 50; - final int seed = 22360679; + final long seed = System.currentTimeMillis(); final Random random = new Random(seed); File dictFile = null; @@ -632,8 +636,7 @@ public class BinaryDictionaryTests extends AndroidTestCase { final int codePointSetSize = 50; final int unigramCountPerIteration = 1000; final int bigramCountPerIteration = 2000; - final int seed = 1123581321; - + final long seed = System.currentTimeMillis(); final Random random = new Random(seed); File dictFile = null; @@ -661,6 +664,9 @@ public class BinaryDictionaryTests extends AndroidTestCase { for (int j = 0; j < bigramCountPerIteration; j++) { final String word0 = words.get(random.nextInt(words.size())); final String word1 = words.get(random.nextInt(words.size())); + if (TextUtils.equals(word0, word1)) { + continue; + } bigrams.add(new Pair<String, String>(word0, word1)); final int bigramProbability = random.nextInt(0xF); binaryDictionary.addBigramWords(word0, word1, bigramProbability); |