aboutsummaryrefslogtreecommitdiffstats
path: root/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
diff options
context:
space:
mode:
authorAmin Bandali <bandali@kelar.org>2024-12-16 21:45:41 -0500
committerAmin Bandali <bandali@kelar.org>2025-01-11 14:17:35 -0500
commite9a0e66716dab4dd3184d009d8920de1961efdfa (patch)
tree02dcc096643d74645bf28459c2834c3d4a2ad7f2 /tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
parentfb3b9360d70596d7e921de8bf7d3ca99564a077e (diff)
downloadlatinime-e9a0e66716dab4dd3184d009d8920de1961efdfa.tar.gz
latinime-e9a0e66716dab4dd3184d009d8920de1961efdfa.tar.xz
latinime-e9a0e66716dab4dd3184d009d8920de1961efdfa.zip
Rename to Kelar Keyboard (org.kelar.inputmethod.latin)
Diffstat (limited to 'tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java')
-rw-r--r--tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java425
1 files changed, 425 insertions, 0 deletions
diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
new file mode 100644
index 000000000..18199ae21
--- /dev/null
+++ b/tests/src/org/kelar/inputmethod/latin/makedict/BinaryDictDecoderUtils.java
@@ -0,0 +1,425 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.kelar.inputmethod.latin.makedict;
+
+import org.kelar.inputmethod.annotations.UsedForTesting;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.util.HashMap;
+import java.util.LinkedList;
+
+import javax.annotation.Nonnull;
+
+/**
+ * Decodes binary files for a FusionDictionary.
+ *
+ * All the methods in this class are static.
+ *
+ * TODO: Move this file to makedict/internal.
+ * TODO: Rename this class to DictDecoderUtils.
+ */
+public final class BinaryDictDecoderUtils {
+ private BinaryDictDecoderUtils() {
+ // This utility class is not publicly instantiable.
+ }
+
+ @UsedForTesting
+ public interface DictBuffer {
+ public int readUnsignedByte();
+ public int readUnsignedShort();
+ public int readUnsignedInt24();
+ public int readInt();
+ public int position();
+ public void position(int newPosition);
+ @UsedForTesting
+ public void put(final byte b);
+ public int limit();
+ @UsedForTesting
+ public int capacity();
+ }
+
+ public static final class ByteBufferDictBuffer implements DictBuffer {
+ private ByteBuffer mBuffer;
+
+ public ByteBufferDictBuffer(final ByteBuffer buffer) {
+ mBuffer = buffer;
+ }
+
+ @Override
+ public int readUnsignedByte() {
+ return mBuffer.get() & 0xFF;
+ }
+
+ @Override
+ public int readUnsignedShort() {
+ return mBuffer.getShort() & 0xFFFF;
+ }
+
+ @Override
+ public int readUnsignedInt24() {
+ final int retval = readUnsignedByte();
+ return (retval << 16) + readUnsignedShort();
+ }
+
+ @Override
+ public int readInt() {
+ return mBuffer.getInt();
+ }
+
+ @Override
+ public int position() {
+ return mBuffer.position();
+ }
+
+ @Override
+ public void position(int newPos) {
+ mBuffer.position(newPos);
+ }
+
+ @Override
+ public void put(final byte b) {
+ mBuffer.put(b);
+ }
+
+ @Override
+ public int limit() {
+ return mBuffer.limit();
+ }
+
+ @Override
+ public int capacity() {
+ return mBuffer.capacity();
+ }
+ }
+
+ /**
+ * A class grouping utility function for our specific character encoding.
+ */
+ static final class CharEncoding {
+
+ /**
+ * Helper method to find out whether this code fits on one byte
+ */
+ private static boolean fitsOnOneByte(final int character,
+ final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
+ int codePoint = character;
+ if (codePointToOneByteCodeMap != null) {
+ if (codePointToOneByteCodeMap.containsKey(character)) {
+ codePoint = codePointToOneByteCodeMap.get(character);
+ }
+ }
+ return codePoint >= FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE
+ && codePoint <= FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
+ }
+
+ /**
+ * Compute the size of a character given its character code.
+ *
+ * Char format is:
+ * 1 byte = bbbbbbbb match
+ * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
+ * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
+ * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
+ * 00011111 would be outside unicode.
+ * else: iso-latin-1 code
+ * This allows for the whole unicode range to be encoded, including chars outside of
+ * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
+ * characters which should never happen anyway (and still work, but take 3 bytes).
+ *
+ * @param character the character code.
+ * @return the size in binary encoded-form, either 1 or 3 bytes.
+ */
+ static int getCharSize(final int character,
+ final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
+ // See char encoding in FusionDictionary.java
+ if (fitsOnOneByte(character, codePointToOneByteCodeMap)) return 1;
+ if (FormatSpec.INVALID_CHARACTER == character) return 1;
+ return 3;
+ }
+
+ /**
+ * Compute the byte size of a character array.
+ */
+ static int getCharArraySize(final int[] chars,
+ final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
+ int size = 0;
+ for (int character : chars) size += getCharSize(character, codePointToOneByteCodeMap);
+ return size;
+ }
+
+ /**
+ * Writes a char array to a byte buffer.
+ *
+ * @param codePoints the code point array to write.
+ * @param buffer the byte buffer to write to.
+ * @param fromIndex the index in buffer to write the character array to.
+ * @param codePointToOneByteCodeMap the map to convert the code point.
+ * @return the index after the last character.
+ */
+ static int writeCharArray(final int[] codePoints, final byte[] buffer, final int fromIndex,
+ final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
+ int index = fromIndex;
+ for (int codePoint : codePoints) {
+ if (codePointToOneByteCodeMap != null) {
+ if (codePointToOneByteCodeMap.containsKey(codePoint)) {
+ // Convert code points
+ codePoint = codePointToOneByteCodeMap.get(codePoint);
+ }
+ }
+ if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
+ buffer[index++] = (byte)codePoint;
+ } else {
+ buffer[index++] = (byte)(0xFF & (codePoint >> 16));
+ buffer[index++] = (byte)(0xFF & (codePoint >> 8));
+ buffer[index++] = (byte)(0xFF & codePoint);
+ }
+ }
+ return index;
+ }
+
+ /**
+ * Writes a string with our character format to a byte buffer.
+ *
+ * This will also write the terminator byte.
+ *
+ * @param buffer the byte buffer to write to.
+ * @param origin the offset to write from.
+ * @param word the string to write.
+ * @return the size written, in bytes.
+ */
+ static int writeString(final byte[] buffer, final int origin, final String word,
+ final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
+ final int length = word.length();
+ int index = origin;
+ for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
+ int codePoint = word.codePointAt(i);
+ if (codePointToOneByteCodeMap != null) {
+ if (codePointToOneByteCodeMap.containsKey(codePoint)) {
+ // Convert code points
+ codePoint = codePointToOneByteCodeMap.get(codePoint);
+ }
+ }
+ if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
+ buffer[index++] = (byte)codePoint;
+ } else {
+ buffer[index++] = (byte)(0xFF & (codePoint >> 16));
+ buffer[index++] = (byte)(0xFF & (codePoint >> 8));
+ buffer[index++] = (byte)(0xFF & codePoint);
+ }
+ }
+ buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
+ return index - origin;
+ }
+
+ /**
+ * Writes a string with our character format to an OutputStream.
+ *
+ * This will also write the terminator byte.
+ *
+ * @param stream the OutputStream to write to.
+ * @param word the string to write.
+ * @return the size written, in bytes.
+ */
+ static int writeString(final OutputStream stream, final String word,
+ final HashMap<Integer, Integer> codePointToOneByteCodeMap) throws IOException {
+ final int length = word.length();
+ int written = 0;
+ for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
+ final int codePoint = word.codePointAt(i);
+ final int charSize = getCharSize(codePoint, codePointToOneByteCodeMap);
+ if (1 == charSize) {
+ stream.write((byte) codePoint);
+ } else {
+ stream.write((byte) (0xFF & (codePoint >> 16)));
+ stream.write((byte) (0xFF & (codePoint >> 8)));
+ stream.write((byte) (0xFF & codePoint));
+ }
+ written += charSize;
+ }
+ stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
+ written += FormatSpec.PTNODE_TERMINATOR_SIZE;
+ return written;
+ }
+
+ /**
+ * Reads a string from a DictBuffer. This is the converse of the above method.
+ */
+ static String readString(final DictBuffer dictBuffer) {
+ final StringBuilder s = new StringBuilder();
+ int character = readChar(dictBuffer);
+ while (character != FormatSpec.INVALID_CHARACTER) {
+ s.appendCodePoint(character);
+ character = readChar(dictBuffer);
+ }
+ return s.toString();
+ }
+
+ /**
+ * Reads a character from the buffer.
+ *
+ * This follows the character format documented earlier in this source file.
+ *
+ * @param dictBuffer the buffer, positioned over an encoded character.
+ * @return the character code.
+ */
+ static int readChar(final DictBuffer dictBuffer) {
+ int character = dictBuffer.readUnsignedByte();
+ if (!fitsOnOneByte(character, null)) {
+ if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) {
+ return FormatSpec.INVALID_CHARACTER;
+ }
+ character <<= 16;
+ character += dictBuffer.readUnsignedShort();
+ }
+ return character;
+ }
+ }
+
+ /**
+ * Reads and returns the PtNode count out of a buffer and forwards the pointer.
+ */
+ /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) {
+ final int msb = dictBuffer.readUnsignedByte();
+ if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) {
+ return msb;
+ }
+ return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8)
+ + dictBuffer.readUnsignedByte();
+ }
+
+ /**
+ * Finds, as a string, the word at the position passed as an argument.
+ *
+ * @param dictDecoder the dict decoder.
+ * @param headerSize the size of the header.
+ * @param pos the position to seek.
+ * @return the word with its frequency, as a weighted string.
+ */
+ @UsedForTesting
+ /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder,
+ final int headerSize, final int pos) {
+ final WeightedString result;
+ final int originalPos = dictDecoder.getPosition();
+ dictDecoder.setPosition(pos);
+ result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos);
+ dictDecoder.setPosition(originalPos);
+ return result;
+ }
+
+ private static WeightedString getWordAtPositionWithoutParentAddress(
+ final DictDecoder dictDecoder, final int headerSize, final int pos) {
+ dictDecoder.setPosition(headerSize);
+ final int count = dictDecoder.readPtNodeCount();
+ int groupPos = dictDecoder.getPosition();
+ final StringBuilder builder = new StringBuilder();
+ WeightedString result = null;
+
+ PtNodeInfo last = null;
+ for (int i = count - 1; i >= 0; --i) {
+ PtNodeInfo info = dictDecoder.readPtNode(groupPos);
+ groupPos = info.mEndAddress;
+ if (info.mOriginalAddress == pos) {
+ builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
+ result = new WeightedString(builder.toString(), info.mProbabilityInfo);
+ break; // and return
+ }
+ if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) {
+ if (info.mChildrenAddress > pos) {
+ if (null == last) continue;
+ builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
+ dictDecoder.setPosition(last.mChildrenAddress);
+ i = dictDecoder.readPtNodeCount();
+ groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
+ last = null;
+ continue;
+ }
+ last = info;
+ }
+ if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) {
+ builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
+ dictDecoder.setPosition(last.mChildrenAddress);
+ i = dictDecoder.readPtNodeCount();
+ groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
+ last = null;
+ continue;
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Helper method that brutally decodes a header from a byte array.
+ *
+ * @param headerBuffer a buffer containing the bytes of the header.
+ * @return a hashmap of the attributes stored in the header
+ */
+ @Nonnull
+ public static HashMap<String, String> decodeHeaderAttributes(@Nonnull final byte[] headerBuffer)
+ throws UnsupportedFormatException {
+ final StringBuilder sb = new StringBuilder();
+ final LinkedList<String> keyValues = new LinkedList<>();
+ int index = 0;
+ while (index < headerBuffer.length) {
+ if (headerBuffer[index] == FormatSpec.PTNODE_CHARACTERS_TERMINATOR) {
+ keyValues.add(sb.toString());
+ sb.setLength(0);
+ } else if (CharEncoding.fitsOnOneByte(headerBuffer[index] & 0xFF,
+ null /* codePointTable */)) {
+ sb.appendCodePoint(headerBuffer[index] & 0xFF);
+ } else {
+ sb.appendCodePoint(((headerBuffer[index] & 0xFF) << 16)
+ + ((headerBuffer[index + 1] & 0xFF) << 8)
+ + (headerBuffer[index + 2] & 0xFF));
+ index += 2;
+ }
+ index += 1;
+ }
+ if ((keyValues.size() & 1) != 0) {
+ throw new UnsupportedFormatException("Odd number of attributes");
+ }
+ final HashMap<String, String> attributes = new HashMap<>();
+ for (int i = 0; i < keyValues.size(); i += 2) {
+ attributes.put(keyValues.get(i), keyValues.get(i + 1));
+ }
+ return attributes;
+ }
+
+ /**
+ * Helper method to pass a file name instead of a File object to isBinaryDictionary.
+ */
+ public static boolean isBinaryDictionary(final String filename) {
+ final File file = new File(filename);
+ return isBinaryDictionary(file);
+ }
+
+ /**
+ * Basic test to find out whether the file is a binary dictionary or not.
+ *
+ * @param file The file to test.
+ * @return true if it's a binary dictionary, false otherwise
+ */
+ public static boolean isBinaryDictionary(final File file) {
+ final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
+ if (dictDecoder == null) {
+ return false;
+ }
+ return dictDecoder.hasValidRawBinaryDictionary();
+ }
+}