From e9a0e66716dab4dd3184d009d8920de1961efdfa Mon Sep 17 00:00:00 2001 From: Amin Bandali Date: Mon, 16 Dec 2024 21:45:41 -0500 Subject: Rename to Kelar Keyboard (org.kelar.inputmethod.latin) --- .../latin/makedict/BinaryDictDecoderUtils.java | 425 +++++++++++++++++++++ 1 file changed, 425 insertions(+) create mode 100644 tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java (limited to 'tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java') diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java new file mode 100644 index 000000000..18199ae21 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java @@ -0,0 +1,425 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.annotations.UsedForTesting; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.LinkedList; + +import javax.annotation.Nonnull; + +/** + * Decodes binary files for a FusionDictionary. + * + * All the methods in this class are static. + * + * TODO: Move this file to makedict/internal. + * TODO: Rename this class to DictDecoderUtils. + */ +public final class BinaryDictDecoderUtils { + private BinaryDictDecoderUtils() { + // This utility class is not publicly instantiable. + } + + @UsedForTesting + public interface DictBuffer { + public int readUnsignedByte(); + public int readUnsignedShort(); + public int readUnsignedInt24(); + public int readInt(); + public int position(); + public void position(int newPosition); + @UsedForTesting + public void put(final byte b); + public int limit(); + @UsedForTesting + public int capacity(); + } + + public static final class ByteBufferDictBuffer implements DictBuffer { + private ByteBuffer mBuffer; + + public ByteBufferDictBuffer(final ByteBuffer buffer) { + mBuffer = buffer; + } + + @Override + public int readUnsignedByte() { + return mBuffer.get() & 0xFF; + } + + @Override + public int readUnsignedShort() { + return mBuffer.getShort() & 0xFFFF; + } + + @Override + public int readUnsignedInt24() { + final int retval = readUnsignedByte(); + return (retval << 16) + readUnsignedShort(); + } + + @Override + public int readInt() { + return mBuffer.getInt(); + } + + @Override + public int position() { + return mBuffer.position(); + } + + @Override + public void position(int newPos) { + mBuffer.position(newPos); + } + + @Override + public void put(final byte b) { + mBuffer.put(b); + } + + @Override + public int limit() { + return mBuffer.limit(); + } + + @Override + public int capacity() { + return mBuffer.capacity(); + } + } + + /** + * A class grouping utility function for our specific character encoding. + */ + static final class CharEncoding { + + /** + * Helper method to find out whether this code fits on one byte + */ + private static boolean fitsOnOneByte(final int character, + final HashMap codePointToOneByteCodeMap) { + int codePoint = character; + if (codePointToOneByteCodeMap != null) { + if (codePointToOneByteCodeMap.containsKey(character)) { + codePoint = codePointToOneByteCodeMap.get(character); + } + } + return codePoint >= FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE + && codePoint <= FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE; + } + + /** + * Compute the size of a character given its character code. + * + * Char format is: + * 1 byte = bbbbbbbb match + * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte + * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because + * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with + * 00011111 would be outside unicode. + * else: iso-latin-1 code + * This allows for the whole unicode range to be encoded, including chars outside of + * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control + * characters which should never happen anyway (and still work, but take 3 bytes). + * + * @param character the character code. + * @return the size in binary encoded-form, either 1 or 3 bytes. + */ + static int getCharSize(final int character, + final HashMap codePointToOneByteCodeMap) { + // See char encoding in FusionDictionary.java + if (fitsOnOneByte(character, codePointToOneByteCodeMap)) return 1; + if (FormatSpec.INVALID_CHARACTER == character) return 1; + return 3; + } + + /** + * Compute the byte size of a character array. + */ + static int getCharArraySize(final int[] chars, + final HashMap codePointToOneByteCodeMap) { + int size = 0; + for (int character : chars) size += getCharSize(character, codePointToOneByteCodeMap); + return size; + } + + /** + * Writes a char array to a byte buffer. + * + * @param codePoints the code point array to write. + * @param buffer the byte buffer to write to. + * @param fromIndex the index in buffer to write the character array to. + * @param codePointToOneByteCodeMap the map to convert the code point. + * @return the index after the last character. + */ + static int writeCharArray(final int[] codePoints, final byte[] buffer, final int fromIndex, + final HashMap codePointToOneByteCodeMap) { + int index = fromIndex; + for (int codePoint : codePoints) { + if (codePointToOneByteCodeMap != null) { + if (codePointToOneByteCodeMap.containsKey(codePoint)) { + // Convert code points + codePoint = codePointToOneByteCodeMap.get(codePoint); + } + } + if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) { + buffer[index++] = (byte)codePoint; + } else { + buffer[index++] = (byte)(0xFF & (codePoint >> 16)); + buffer[index++] = (byte)(0xFF & (codePoint >> 8)); + buffer[index++] = (byte)(0xFF & codePoint); + } + } + return index; + } + + /** + * Writes a string with our character format to a byte buffer. + * + * This will also write the terminator byte. + * + * @param buffer the byte buffer to write to. + * @param origin the offset to write from. + * @param word the string to write. + * @return the size written, in bytes. + */ + static int writeString(final byte[] buffer, final int origin, final String word, + final HashMap codePointToOneByteCodeMap) { + final int length = word.length(); + int index = origin; + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + int codePoint = word.codePointAt(i); + if (codePointToOneByteCodeMap != null) { + if (codePointToOneByteCodeMap.containsKey(codePoint)) { + // Convert code points + codePoint = codePointToOneByteCodeMap.get(codePoint); + } + } + if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) { + buffer[index++] = (byte)codePoint; + } else { + buffer[index++] = (byte)(0xFF & (codePoint >> 16)); + buffer[index++] = (byte)(0xFF & (codePoint >> 8)); + buffer[index++] = (byte)(0xFF & codePoint); + } + } + buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; + return index - origin; + } + + /** + * Writes a string with our character format to an OutputStream. + * + * This will also write the terminator byte. + * + * @param stream the OutputStream to write to. + * @param word the string to write. + * @return the size written, in bytes. + */ + static int writeString(final OutputStream stream, final String word, + final HashMap codePointToOneByteCodeMap) throws IOException { + final int length = word.length(); + int written = 0; + for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { + final int codePoint = word.codePointAt(i); + final int charSize = getCharSize(codePoint, codePointToOneByteCodeMap); + if (1 == charSize) { + stream.write((byte) codePoint); + } else { + stream.write((byte) (0xFF & (codePoint >> 16))); + stream.write((byte) (0xFF & (codePoint >> 8))); + stream.write((byte) (0xFF & codePoint)); + } + written += charSize; + } + stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR); + written += FormatSpec.PTNODE_TERMINATOR_SIZE; + return written; + } + + /** + * Reads a string from a DictBuffer. This is the converse of the above method. + */ + static String readString(final DictBuffer dictBuffer) { + final StringBuilder s = new StringBuilder(); + int character = readChar(dictBuffer); + while (character != FormatSpec.INVALID_CHARACTER) { + s.appendCodePoint(character); + character = readChar(dictBuffer); + } + return s.toString(); + } + + /** + * Reads a character from the buffer. + * + * This follows the character format documented earlier in this source file. + * + * @param dictBuffer the buffer, positioned over an encoded character. + * @return the character code. + */ + static int readChar(final DictBuffer dictBuffer) { + int character = dictBuffer.readUnsignedByte(); + if (!fitsOnOneByte(character, null)) { + if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) { + return FormatSpec.INVALID_CHARACTER; + } + character <<= 16; + character += dictBuffer.readUnsignedShort(); + } + return character; + } + } + + /** + * Reads and returns the PtNode count out of a buffer and forwards the pointer. + */ + /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) { + final int msb = dictBuffer.readUnsignedByte(); + if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) { + return msb; + } + return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8) + + dictBuffer.readUnsignedByte(); + } + + /** + * Finds, as a string, the word at the position passed as an argument. + * + * @param dictDecoder the dict decoder. + * @param headerSize the size of the header. + * @param pos the position to seek. + * @return the word with its frequency, as a weighted string. + */ + @UsedForTesting + /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder, + final int headerSize, final int pos) { + final WeightedString result; + final int originalPos = dictDecoder.getPosition(); + dictDecoder.setPosition(pos); + result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos); + dictDecoder.setPosition(originalPos); + return result; + } + + private static WeightedString getWordAtPositionWithoutParentAddress( + final DictDecoder dictDecoder, final int headerSize, final int pos) { + dictDecoder.setPosition(headerSize); + final int count = dictDecoder.readPtNodeCount(); + int groupPos = dictDecoder.getPosition(); + final StringBuilder builder = new StringBuilder(); + WeightedString result = null; + + PtNodeInfo last = null; + for (int i = count - 1; i >= 0; --i) { + PtNodeInfo info = dictDecoder.readPtNode(groupPos); + groupPos = info.mEndAddress; + if (info.mOriginalAddress == pos) { + builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); + result = new WeightedString(builder.toString(), info.mProbabilityInfo); + break; // and return + } + if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) { + if (info.mChildrenAddress > pos) { + if (null == last) continue; + builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); + dictDecoder.setPosition(last.mChildrenAddress); + i = dictDecoder.readPtNodeCount(); + groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); + last = null; + continue; + } + last = info; + } + if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) { + builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); + dictDecoder.setPosition(last.mChildrenAddress); + i = dictDecoder.readPtNodeCount(); + groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); + last = null; + continue; + } + } + return result; + } + + /** + * Helper method that brutally decodes a header from a byte array. + * + * @param headerBuffer a buffer containing the bytes of the header. + * @return a hashmap of the attributes stored in the header + */ + @Nonnull + public static HashMap decodeHeaderAttributes(@Nonnull final byte[] headerBuffer) + throws UnsupportedFormatException { + final StringBuilder sb = new StringBuilder(); + final LinkedList keyValues = new LinkedList<>(); + int index = 0; + while (index < headerBuffer.length) { + if (headerBuffer[index] == FormatSpec.PTNODE_CHARACTERS_TERMINATOR) { + keyValues.add(sb.toString()); + sb.setLength(0); + } else if (CharEncoding.fitsOnOneByte(headerBuffer[index] & 0xFF, + null /* codePointTable */)) { + sb.appendCodePoint(headerBuffer[index] & 0xFF); + } else { + sb.appendCodePoint(((headerBuffer[index] & 0xFF) << 16) + + ((headerBuffer[index + 1] & 0xFF) << 8) + + (headerBuffer[index + 2] & 0xFF)); + index += 2; + } + index += 1; + } + if ((keyValues.size() & 1) != 0) { + throw new UnsupportedFormatException("Odd number of attributes"); + } + final HashMap attributes = new HashMap<>(); + for (int i = 0; i < keyValues.size(); i += 2) { + attributes.put(keyValues.get(i), keyValues.get(i + 1)); + } + return attributes; + } + + /** + * Helper method to pass a file name instead of a File object to isBinaryDictionary. + */ + public static boolean isBinaryDictionary(final String filename) { + final File file = new File(filename); + return isBinaryDictionary(file); + } + + /** + * Basic test to find out whether the file is a binary dictionary or not. + * + * @param file The file to test. + * @return true if it's a binary dictionary, false otherwise + */ + public static boolean isBinaryDictionary(final File file) { + final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length()); + if (dictDecoder == null) { + return false; + } + return dictDecoder.hasValidRawBinaryDictionary(); + } +} -- cgit v1.2.3-83-g751a