aboutsummaryrefslogtreecommitdiffstats
path: root/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
diff options
context:
space:
mode:
authorKeisuke Kuroyanagi <ksk@google.com>2014-03-27 15:30:32 +0900
committerKeisuke Kuroyanagi <ksk@google.com>2014-03-27 15:30:32 +0900
commit93cda5bb396c22f1781e390debaf75d54cf7c0dc (patch)
tree391903a9f8875520980da23f1bc5911c8fa78fc2 /tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
parent37b9562fd7b593c90d7ab383ec650f39a7c0f621 (diff)
downloadlatinime-93cda5bb396c22f1781e390debaf75d54cf7c0dc.tar.gz
latinime-93cda5bb396c22f1781e390debaf75d54cf7c0dc.tar.xz
latinime-93cda5bb396c22f1781e390debaf75d54cf7c0dc.zip
Move code only used for dicttool and tests under tests.
Bug: 13035567 Change-Id: I13c6df013ef2b67c9bf67455d9c32d283bf9ea2e
Diffstat (limited to 'tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java')
-rw-r--r--tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java306
1 files changed, 306 insertions, 0 deletions
diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
new file mode 100644
index 000000000..42a50be66
--- /dev/null
+++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictIOUtils.java
@@ -0,0 +1,306 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.inputmethod.latin.makedict;
+
+import com.android.inputmethod.annotations.UsedForTesting;
+import com.android.inputmethod.latin.Constants;
+import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.Stack;
+
+public final class BinaryDictIOUtils {
+ private static final boolean DBG = false;
+
+ private BinaryDictIOUtils() {
+ // This utility class is not publicly instantiable.
+ }
+
+ /**
+ * Returns new dictionary decoder.
+ *
+ * @param dictFile the dictionary file.
+ * @param bufferType The type of buffer, as one of USE_* in DictDecoder.
+ * @return new dictionary decoder if the dictionary file exists, otherwise null.
+ */
+ public static DictDecoder getDictDecoder(final File dictFile, final long offset,
+ final long length, final int bufferType) {
+ if (dictFile.isDirectory()) {
+ return new Ver4DictDecoder(dictFile, bufferType);
+ } else if (dictFile.isFile()) {
+ return new Ver2DictDecoder(dictFile, offset, length, bufferType);
+ }
+ return null;
+ }
+
+ public static DictDecoder getDictDecoder(final File dictFile, final long offset,
+ final long length, final DictionaryBufferFactory factory) {
+ if (dictFile.isDirectory()) {
+ return new Ver4DictDecoder(dictFile, factory);
+ } else if (dictFile.isFile()) {
+ return new Ver2DictDecoder(dictFile, offset, length, factory);
+ }
+ return null;
+ }
+
+ public static DictDecoder getDictDecoder(final File dictFile, final long offset,
+ final long length) {
+ return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER);
+ }
+
+ private static final class Position {
+ public static final int NOT_READ_PTNODE_COUNT = -1;
+
+ public int mAddress;
+ public int mNumOfPtNode;
+ public int mPosition;
+ public int mLength;
+
+ public Position(int address, int length) {
+ mAddress = address;
+ mLength = length;
+ mNumOfPtNode = NOT_READ_PTNODE_COUNT;
+ }
+ }
+
+ /**
+ * Retrieves all node arrays without recursive call.
+ */
+ private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder,
+ final int bodyOffset, final Map<Integer, String> words,
+ final Map<Integer, Integer> frequencies,
+ final Map<Integer, ArrayList<PendingAttribute>> bigrams) {
+ int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1];
+
+ Stack<Position> stack = new Stack<Position>();
+ int index = 0;
+
+ Position initPos = new Position(bodyOffset, 0);
+ stack.push(initPos);
+
+ while (!stack.empty()) {
+ Position p = stack.peek();
+
+ if (DBG) {
+ MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" +
+ p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength);
+ }
+
+ if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress);
+ if (index != p.mLength) index = p.mLength;
+
+ if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) {
+ p.mNumOfPtNode = dictDecoder.readPtNodeCount();
+ p.mAddress = dictDecoder.getPosition();
+ p.mPosition = 0;
+ }
+ if (p.mNumOfPtNode == 0) {
+ stack.pop();
+ continue;
+ }
+ final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress);
+ for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) {
+ pushedChars[index++] = ptNodeInfo.mCharacters[i];
+ }
+ p.mPosition++;
+ if (ptNodeInfo.isTerminal()) {// found word
+ words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index));
+ frequencies.put(
+ ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability);
+ if (ptNodeInfo.mBigrams != null) {
+ bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams);
+ }
+ }
+
+ if (p.mPosition == p.mNumOfPtNode) {
+ stack.pop();
+ } else {
+ // The PtNode array has more PtNodes.
+ p.mAddress = dictDecoder.getPosition();
+ }
+
+ if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) {
+ final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index);
+ stack.push(childrenPos);
+ }
+ }
+ }
+
+ /**
+ * Reads unigrams and bigrams from the binary file.
+ * Doesn't store a full memory representation of the dictionary.
+ *
+ * @param dictDecoder the dict decoder.
+ * @param words the map to store the address as a key and the word as a value.
+ * @param frequencies the map to store the address as a key and the frequency as a value.
+ * @param bigrams the map to store the address as a key and the list of address as a value.
+ * @throws IOException if the file can't be read.
+ * @throws UnsupportedFormatException if the format of the file is not recognized.
+ */
+ /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder,
+ final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
+ final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
+ UnsupportedFormatException {
+ // Read header
+ final DictionaryHeader header = dictDecoder.readHeader();
+ readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words,
+ frequencies, bigrams);
+ }
+
+ /**
+ * Gets the address of the last PtNode of the exact matching word in the dictionary.
+ * If no match is found, returns NOT_VALID_WORD.
+ *
+ * @param dictDecoder the dict decoder.
+ * @param word the word we search for.
+ * @return the address of the terminal node.
+ * @throws IOException if the file can't be read.
+ * @throws UnsupportedFormatException if the format of the file is not recognized.
+ */
+ @UsedForTesting
+ /* package */ static int getTerminalPosition(final DictDecoder dictDecoder,
+ final String word) throws IOException, UnsupportedFormatException {
+ if (word == null) return FormatSpec.NOT_VALID_WORD;
+ dictDecoder.setPosition(0);
+ dictDecoder.readHeader();
+ int wordPos = 0;
+ final int wordLen = word.codePointCount(0, word.length());
+ for (int depth = 0; depth < Constants.DICTIONARY_MAX_WORD_LENGTH; ++depth) {
+ if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
+
+ do {
+ final int ptNodeCount = dictDecoder.readPtNodeCount();
+ boolean foundNextPtNode = false;
+ for (int i = 0; i < ptNodeCount; ++i) {
+ final int ptNodePos = dictDecoder.getPosition();
+ final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos);
+ boolean same = true;
+ for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
+ p < currentInfo.mCharacters.length;
+ ++p, j = word.offsetByCodePoints(j, 1)) {
+ if (wordPos + p >= wordLen
+ || word.codePointAt(j) != currentInfo.mCharacters[p]) {
+ same = false;
+ break;
+ }
+ }
+
+ if (same) {
+ // found the PtNode matches the word.
+ if (wordPos + currentInfo.mCharacters.length == wordLen) {
+ if (!currentInfo.isTerminal()) {
+ return FormatSpec.NOT_VALID_WORD;
+ } else {
+ return ptNodePos;
+ }
+ }
+ wordPos += currentInfo.mCharacters.length;
+ if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
+ return FormatSpec.NOT_VALID_WORD;
+ }
+ foundNextPtNode = true;
+ dictDecoder.setPosition(currentInfo.mChildrenAddress);
+ break;
+ }
+ }
+ if (foundNextPtNode) break;
+ return FormatSpec.NOT_VALID_WORD;
+ } while(true);
+ }
+ return FormatSpec.NOT_VALID_WORD;
+ }
+
+ /**
+ * Writes a PtNodeCount to the stream.
+ *
+ * @param destination the stream to write.
+ * @param ptNodeCount the count.
+ * @return the size written in bytes.
+ */
+ @UsedForTesting
+ static int writePtNodeCount(final OutputStream destination, final int ptNodeCount)
+ throws IOException {
+ final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
+ // the count must fit on one byte or two bytes.
+ // Please see comments in FormatSpec.
+ if (countSize != 1 && countSize != 2) {
+ throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
+ }
+ final int encodedPtNodeCount = (countSize == 2) ?
+ (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
+ BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize);
+ return countSize;
+ }
+
+ /**
+ * Helper method to hide the actual value of the no children address.
+ */
+ public static boolean hasChildrenAddress(final int address) {
+ return FormatSpec.NO_CHILDREN_ADDRESS != address;
+ }
+
+ /**
+ * Compute the binary size of the node count
+ * @param count the node count
+ * @return the size of the node count, either 1 or 2 bytes.
+ */
+ public static int getPtNodeCountSize(final int count) {
+ if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) {
+ return 1;
+ } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) {
+ return 2;
+ } else {
+ throw new RuntimeException("Can't have more than "
+ + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found "
+ + count + ")");
+ }
+ }
+
+ static int getChildrenAddressSize(final int optionFlags) {
+ switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
+ case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
+ return 1;
+ case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
+ return 2;
+ case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
+ return 3;
+ case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
+ default:
+ return 0;
+ }
+ }
+
+ /**
+ * Calculate bigram frequency from compressed value
+ *
+ * @param unigramFrequency
+ * @param bigramFrequency compressed frequency
+ * @return approximate bigram frequency
+ */
+ @UsedForTesting
+ public static int reconstructBigramFrequency(final int unigramFrequency,
+ final int bigramFrequency) {
+ final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
+ / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
+ final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f);
+ return (int)resultFreqFloat;
+ }
+}