diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java')
-rw-r--r-- | java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java | 365 |
1 files changed, 0 insertions, 365 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java deleted file mode 100644 index b534bcb09..000000000 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java +++ /dev/null @@ -1,365 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.inputmethod.latin.makedict; - -import com.android.inputmethod.annotations.UsedForTesting; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStream; -import java.nio.ByteBuffer; - -/** - * Decodes binary files for a FusionDictionary. - * - * All the methods in this class are static. - * - * TODO: Remove calls from classes except Ver3DictDecoder - * TODO: Move this file to makedict/internal. - * TODO: Rename this class to DictDecoderUtils. - */ -public final class BinaryDictDecoderUtils { - - private static final boolean DBG = MakedictLog.DBG; - - private BinaryDictDecoderUtils() { - // This utility class is not publicly instantiable. - } - - @UsedForTesting - public interface DictBuffer { - public int readUnsignedByte(); - public int readUnsignedShort(); - public int readUnsignedInt24(); - public int readInt(); - public int position(); - public void position(int newPosition); - @UsedForTesting - public void put(final byte b); - public int limit(); - @UsedForTesting - public int capacity(); - } - - public static final class ByteBufferDictBuffer implements DictBuffer { - private ByteBuffer mBuffer; - - public ByteBufferDictBuffer(final ByteBuffer buffer) { - mBuffer = buffer; - } - - @Override - public int readUnsignedByte() { - return mBuffer.get() & 0xFF; - } - - @Override - public int readUnsignedShort() { - return mBuffer.getShort() & 0xFFFF; - } - - @Override - public int readUnsignedInt24() { - final int retval = readUnsignedByte(); - return (retval << 16) + readUnsignedShort(); - } - - @Override - public int readInt() { - return mBuffer.getInt(); - } - - @Override - public int position() { - return mBuffer.position(); - } - - @Override - public void position(int newPos) { - mBuffer.position(newPos); - } - - @Override - public void put(final byte b) { - mBuffer.put(b); - } - - @Override - public int limit() { - return mBuffer.limit(); - } - - @Override - public int capacity() { - return mBuffer.capacity(); - } - } - - /** - * A class grouping utility function for our specific character encoding. - */ - static final class CharEncoding { - private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; - private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; - - /** - * Helper method to find out whether this code fits on one byte - */ - private static boolean fitsOnOneByte(final int character) { - return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE - && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; - } - - /** - * Compute the size of a character given its character code. - * - * Char format is: - * 1 byte = bbbbbbbb match - * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte - * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because - * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with - * 00011111 would be outside unicode. - * else: iso-latin-1 code - * This allows for the whole unicode range to be encoded, including chars outside of - * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control - * characters which should never happen anyway (and still work, but take 3 bytes). - * - * @param character the character code. - * @return the size in binary encoded-form, either 1 or 3 bytes. - */ - static int getCharSize(final int character) { - // See char encoding in FusionDictionary.java - if (fitsOnOneByte(character)) return 1; - if (FormatSpec.INVALID_CHARACTER == character) return 1; - return 3; - } - - /** - * Compute the byte size of a character array. - */ - static int getCharArraySize(final int[] chars) { - int size = 0; - for (int character : chars) size += getCharSize(character); - return size; - } - - /** - * Writes a char array to a byte buffer. - * - * @param codePoints the code point array to write. - * @param buffer the byte buffer to write to. - * @param index the index in buffer to write the character array to. - * @return the index after the last character. - */ - static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) { - for (int codePoint : codePoints) { - if (1 == getCharSize(codePoint)) { - buffer[index++] = (byte)codePoint; - } else { - buffer[index++] = (byte)(0xFF & (codePoint >> 16)); - buffer[index++] = (byte)(0xFF & (codePoint >> 8)); - buffer[index++] = (byte)(0xFF & codePoint); - } - } - return index; - } - - /** - * Writes a string with our character format to a byte buffer. - * - * This will also write the terminator byte. - * - * @param buffer the byte buffer to write to. - * @param origin the offset to write from. - * @param word the string to write. - * @return the size written, in bytes. - */ - static int writeString(final byte[] buffer, final int origin, final String word) { - final int length = word.length(); - int index = origin; - for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { - final int codePoint = word.codePointAt(i); - if (1 == getCharSize(codePoint)) { - buffer[index++] = (byte)codePoint; - } else { - buffer[index++] = (byte)(0xFF & (codePoint >> 16)); - buffer[index++] = (byte)(0xFF & (codePoint >> 8)); - buffer[index++] = (byte)(0xFF & codePoint); - } - } - buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; - return index - origin; - } - - /** - * Writes a string with our character format to an OutputStream. - * - * This will also write the terminator byte. - * - * @param stream the OutputStream to write to. - * @param word the string to write. - * @return the size written, in bytes. - */ - static int writeString(final OutputStream stream, final String word) throws IOException { - final int length = word.length(); - int written = 0; - for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { - final int codePoint = word.codePointAt(i); - final int charSize = getCharSize(codePoint); - if (1 == charSize) { - stream.write((byte) codePoint); - } else { - stream.write((byte) (0xFF & (codePoint >> 16))); - stream.write((byte) (0xFF & (codePoint >> 8))); - stream.write((byte) (0xFF & codePoint)); - } - written += charSize; - } - stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR); - written += FormatSpec.PTNODE_TERMINATOR_SIZE; - return written; - } - - /** - * Reads a string from a DictBuffer. This is the converse of the above method. - */ - static String readString(final DictBuffer dictBuffer) { - final StringBuilder s = new StringBuilder(); - int character = readChar(dictBuffer); - while (character != FormatSpec.INVALID_CHARACTER) { - s.appendCodePoint(character); - character = readChar(dictBuffer); - } - return s.toString(); - } - - /** - * Reads a character from the buffer. - * - * This follows the character format documented earlier in this source file. - * - * @param dictBuffer the buffer, positioned over an encoded character. - * @return the character code. - */ - static int readChar(final DictBuffer dictBuffer) { - int character = dictBuffer.readUnsignedByte(); - if (!fitsOnOneByte(character)) { - if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) { - return FormatSpec.INVALID_CHARACTER; - } - character <<= 16; - character += dictBuffer.readUnsignedShort(); - } - return character; - } - } - - /** - * Reads and returns the PtNode count out of a buffer and forwards the pointer. - */ - /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) { - final int msb = dictBuffer.readUnsignedByte(); - if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) { - return msb; - } else { - return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8) - + dictBuffer.readUnsignedByte(); - } - } - - /** - * Finds, as a string, the word at the position passed as an argument. - * - * @param dictDecoder the dict decoder. - * @param headerSize the size of the header. - * @param pos the position to seek. - * @return the word with its frequency, as a weighted string. - */ - @UsedForTesting - /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder, - final int headerSize, final int pos) { - final WeightedString result; - final int originalPos = dictDecoder.getPosition(); - dictDecoder.setPosition(pos); - result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos); - dictDecoder.setPosition(originalPos); - return result; - } - - private static WeightedString getWordAtPositionWithoutParentAddress( - final DictDecoder dictDecoder, final int headerSize, final int pos) { - dictDecoder.setPosition(headerSize); - final int count = dictDecoder.readPtNodeCount(); - int groupPos = dictDecoder.getPosition(); - final StringBuilder builder = new StringBuilder(); - WeightedString result = null; - - PtNodeInfo last = null; - for (int i = count - 1; i >= 0; --i) { - PtNodeInfo info = dictDecoder.readPtNode(groupPos); - groupPos = info.mEndAddress; - if (info.mOriginalAddress == pos) { - builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); - result = new WeightedString(builder.toString(), info.mProbabilityInfo); - break; // and return - } - if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) { - if (info.mChildrenAddress > pos) { - if (null == last) continue; - builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); - dictDecoder.setPosition(last.mChildrenAddress); - i = dictDecoder.readPtNodeCount(); - groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); - last = null; - continue; - } - last = info; - } - if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) { - builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); - dictDecoder.setPosition(last.mChildrenAddress); - i = dictDecoder.readPtNodeCount(); - groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); - last = null; - continue; - } - } - return result; - } - - /** - * Helper method to pass a file name instead of a File object to isBinaryDictionary. - */ - public static boolean isBinaryDictionary(final String filename) { - final File file = new File(filename); - return isBinaryDictionary(file); - } - - /** - * Basic test to find out whether the file is a binary dictionary or not. - * - * @param file The file to test. - * @return true if it's a binary dictionary, false otherwise - */ - public static boolean isBinaryDictionary(final File file) { - final DictDecoder dictDecoder = FormatSpec.getDictDecoder(file, 0, file.length()); - if (dictDecoder == null) { - return false; - } - return dictDecoder.hasValidRawBinaryDictionary(); - } -} |