Move code only used for dicttool and tests under tests.

Bug: 13035567 Change-Id: I13c6df013ef2b67c9bf67455d9c32d283bf9ea2e
author: Keisuke Kuroyanagi <ksk@google.com> 2014-03-27 15:30:32 +0900
committer: Keisuke Kuroyanagi <ksk@google.com> 2014-03-27 15:30:32 +0900
commit: 93cda5bb396c22f1781e390debaf75d54cf7c0dc (patch)
tree: 391903a9f8875520980da23f1bc5911c8fa78fc2 /tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
parent: 37b9562fd7b593c90d7ab383ec650f39a7c0f621 (diff)
download: latinime-93cda5bb396c22f1781e390debaf75d54cf7c0dc.tar.gz
latinime-93cda5bb396c22f1781e390debaf75d54cf7c0dc.tar.xz
latinime-93cda5bb396c22f1781e390debaf75d54cf7c0dc.zip
1 files changed, 364 insertions, 0 deletions
diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
new file mode 100644
index 000000000..6f8b07a34
--- /dev/null
+++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
@@ -0,0 +1,364 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.inputmethod.latin.makedict;
+
+import com.android.inputmethod.annotations.UsedForTesting;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+
+/**
+ * Decodes binary files for a FusionDictionary.
+ *
+ * All the methods in this class are static.
+ *
+ * TODO: Move this file to makedict/internal.
+ * TODO: Rename this class to DictDecoderUtils.
+ */
+public final class BinaryDictDecoderUtils {
+
+    private static final boolean DBG = MakedictLog.DBG;
+
+    private BinaryDictDecoderUtils() {
+        // This utility class is not publicly instantiable.
+    }
+
+    @UsedForTesting
+    public interface DictBuffer {
+        public int readUnsignedByte();
+        public int readUnsignedShort();
+        public int readUnsignedInt24();
+        public int readInt();
+        public int position();
+        public void position(int newPosition);
+        @UsedForTesting
+        public void put(final byte b);
+        public int limit();
+        @UsedForTesting
+        public int capacity();
+    }
+
+    public static final class ByteBufferDictBuffer implements DictBuffer {
+        private ByteBuffer mBuffer;
+
+        public ByteBufferDictBuffer(final ByteBuffer buffer) {
+            mBuffer = buffer;
+        }
+
+        @Override
+        public int readUnsignedByte() {
+            return mBuffer.get() & 0xFF;
+        }
+
+        @Override
+        public int readUnsignedShort() {
+            return mBuffer.getShort() & 0xFFFF;
+        }
+
+        @Override
+        public int readUnsignedInt24() {
+            final int retval = readUnsignedByte();
+            return (retval << 16) + readUnsignedShort();
+        }
+
+        @Override
+        public int readInt() {
+            return mBuffer.getInt();
+        }
+
+        @Override
+        public int position() {
+            return mBuffer.position();
+        }
+
+        @Override
+        public void position(int newPos) {
+            mBuffer.position(newPos);
+        }
+
+        @Override
+        public void put(final byte b) {
+            mBuffer.put(b);
+        }
+
+        @Override
+        public int limit() {
+            return mBuffer.limit();
+        }
+
+        @Override
+        public int capacity() {
+            return mBuffer.capacity();
+        }
+    }
+
+    /**
+     * A class grouping utility function for our specific character encoding.
+     */
+    static final class CharEncoding {
+        private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
+        private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;
+
+        /**
+         * Helper method to find out whether this code fits on one byte
+         */
+        private static boolean fitsOnOneByte(final int character) {
+            return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE
+                    && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
+        }
+
+        /**
+         * Compute the size of a character given its character code.
+         *
+         * Char format is:
+         * 1 byte = bbbbbbbb match
+         * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
+         * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
+         *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
+         *       00011111 would be outside unicode.
+         * else: iso-latin-1 code
+         * This allows for the whole unicode range to be encoded, including chars outside of
+         * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
+         * characters which should never happen anyway (and still work, but take 3 bytes).
+         *
+         * @param character the character code.
+         * @return the size in binary encoded-form, either 1 or 3 bytes.
+         */
+        static int getCharSize(final int character) {
+            // See char encoding in FusionDictionary.java
+            if (fitsOnOneByte(character)) return 1;
+            if (FormatSpec.INVALID_CHARACTER == character) return 1;
+            return 3;
+        }
+
+        /**
+         * Compute the byte size of a character array.
+         */
+        static int getCharArraySize(final int[] chars) {
+            int size = 0;
+            for (int character : chars) size += getCharSize(character);
+            return size;
+        }
+
+        /**
+         * Writes a char array to a byte buffer.
+         *
+         * @param codePoints the code point array to write.
+         * @param buffer the byte buffer to write to.
+         * @param index the index in buffer to write the character array to.
+         * @return the index after the last character.
+         */
+        static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) {
+            for (int codePoint : codePoints) {
+                if (1 == getCharSize(codePoint)) {
+                    buffer[index++] = (byte)codePoint;
+                } else {
+                    buffer[index++] = (byte)(0xFF & (codePoint >> 16));
+                    buffer[index++] = (byte)(0xFF & (codePoint >> 8));
+                    buffer[index++] = (byte)(0xFF & codePoint);
+                }
+            }
+            return index;
+        }
+
+        /**
+         * Writes a string with our character format to a byte buffer.
+         *
+         * This will also write the terminator byte.
+         *
+         * @param buffer the byte buffer to write to.
+         * @param origin the offset to write from.
+         * @param word the string to write.
+         * @return the size written, in bytes.
+         */
+        static int writeString(final byte[] buffer, final int origin, final String word) {
+            final int length = word.length();
+            int index = origin;
+            for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
+                final int codePoint = word.codePointAt(i);
+                if (1 == getCharSize(codePoint)) {
+                    buffer[index++] = (byte)codePoint;
+                } else {
+                    buffer[index++] = (byte)(0xFF & (codePoint >> 16));
+                    buffer[index++] = (byte)(0xFF & (codePoint >> 8));
+                    buffer[index++] = (byte)(0xFF & codePoint);
+                }
+            }
+            buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
+            return index - origin;
+        }
+
+        /**
+         * Writes a string with our character format to an OutputStream.
+         *
+         * This will also write the terminator byte.
+         *
+         * @param stream the OutputStream to write to.
+         * @param word the string to write.
+         * @return the size written, in bytes.
+         */
+        static int writeString(final OutputStream stream, final String word) throws IOException {
+            final int length = word.length();
+            int written = 0;
+            for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
+                final int codePoint = word.codePointAt(i);
+                final int charSize = getCharSize(codePoint);
+                if (1 == charSize) {
+                    stream.write((byte) codePoint);
+                } else {
+                    stream.write((byte) (0xFF & (codePoint >> 16)));
+                    stream.write((byte) (0xFF & (codePoint >> 8)));
+                    stream.write((byte) (0xFF & codePoint));
+                }
+                written += charSize;
+            }
+            stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
+            written += FormatSpec.PTNODE_TERMINATOR_SIZE;
+            return written;
+        }
+
+        /**
+         * Reads a string from a DictBuffer. This is the converse of the above method.
+         */
+        static String readString(final DictBuffer dictBuffer) {
+            final StringBuilder s = new StringBuilder();
+            int character = readChar(dictBuffer);
+            while (character != FormatSpec.INVALID_CHARACTER) {
+                s.appendCodePoint(character);
+                character = readChar(dictBuffer);
+            }
+            return s.toString();
+        }
+
+        /**
+         * Reads a character from the buffer.
+         *
+         * This follows the character format documented earlier in this source file.
+         *
+         * @param dictBuffer the buffer, positioned over an encoded character.
+         * @return the character code.
+         */
+        static int readChar(final DictBuffer dictBuffer) {
+            int character = dictBuffer.readUnsignedByte();
+            if (!fitsOnOneByte(character)) {
+                if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) {
+                    return FormatSpec.INVALID_CHARACTER;
+                }
+                character <<= 16;
+                character += dictBuffer.readUnsignedShort();
+            }
+            return character;
+        }
+    }
+
+    /**
+     * Reads and returns the PtNode count out of a buffer and forwards the pointer.
+     */
+    /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) {
+        final int msb = dictBuffer.readUnsignedByte();
+        if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) {
+            return msb;
+        } else {
+            return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8)
+                    + dictBuffer.readUnsignedByte();
+        }
+    }
+
+    /**
+     * Finds, as a string, the word at the position passed as an argument.
+     *
+     * @param dictDecoder the dict decoder.
+     * @param headerSize the size of the header.
+     * @param pos the position to seek.
+     * @return the word with its frequency, as a weighted string.
+     */
+    @UsedForTesting
+    /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder,
+            final int headerSize, final int pos) {
+        final WeightedString result;
+        final int originalPos = dictDecoder.getPosition();
+        dictDecoder.setPosition(pos);
+        result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos);
+        dictDecoder.setPosition(originalPos);
+        return result;
+    }
+
+    private static WeightedString getWordAtPositionWithoutParentAddress(
+            final DictDecoder dictDecoder, final int headerSize, final int pos) {
+        dictDecoder.setPosition(headerSize);
+        final int count = dictDecoder.readPtNodeCount();
+        int groupPos = dictDecoder.getPosition();
+        final StringBuilder builder = new StringBuilder();
+        WeightedString result = null;
+
+        PtNodeInfo last = null;
+        for (int i = count - 1; i >= 0; --i) {
+            PtNodeInfo info = dictDecoder.readPtNode(groupPos);
+            groupPos = info.mEndAddress;
+            if (info.mOriginalAddress == pos) {
+                builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
+                result = new WeightedString(builder.toString(), info.mProbabilityInfo);
+                break; // and return
+            }
+            if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) {
+                if (info.mChildrenAddress > pos) {
+                    if (null == last) continue;
+                    builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
+                    dictDecoder.setPosition(last.mChildrenAddress);
+                    i = dictDecoder.readPtNodeCount();
+                    groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
+                    last = null;
+                    continue;
+                }
+                last = info;
+            }
+            if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) {
+                builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
+                dictDecoder.setPosition(last.mChildrenAddress);
+                i = dictDecoder.readPtNodeCount();
+                groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
+                last = null;
+                continue;
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Helper method to pass a file name instead of a File object to isBinaryDictionary.
+     */
+    public static boolean isBinaryDictionary(final String filename) {
+        final File file = new File(filename);
+        return isBinaryDictionary(file);
+    }
+
+    /**
+     * Basic test to find out whether the file is a binary dictionary or not.
+     *
+     * @param file The file to test.
+     * @return true if it's a binary dictionary, false otherwise
+     */
+    public static boolean isBinaryDictionary(final File file) {
+        final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
+        if (dictDecoder == null) {
+            return false;
+        }
+        return dictDecoder.hasValidRawBinaryDictionary();
+    }
+}
author	Keisuke Kuroyanagi <ksk@google.com>	2014-03-27 15:30:32 +0900
committer	Keisuke Kuroyanagi <ksk@google.com>	2014-03-27 15:30:32 +0900
commit	93cda5bb396c22f1781e390debaf75d54cf7c0dc (patch)
tree	391903a9f8875520980da23f1bc5911c8fa78fc2 /tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
parent	37b9562fd7b593c90d7ab383ec650f39a7c0f621 (diff)
download	latinime-93cda5bb396c22f1781e390debaf75d54cf7c0dc.tar.gz latinime-93cda5bb396c22f1781e390debaf75d54cf7c0dc.tar.xz latinime-93cda5bb396c22f1781e390debaf75d54cf7c0dc.zip