diff options
author | 2024-12-16 21:45:41 -0500 | |
---|---|---|
committer | 2025-01-11 14:17:35 -0500 | |
commit | e9a0e66716dab4dd3184d009d8920de1961efdfa (patch) | |
tree | 02dcc096643d74645bf28459c2834c3d4a2ad7f2 /tests/src/org/kelar/inputmethod/latin/makedict/Ver2DictEncoder.java | |
parent | fb3b9360d70596d7e921de8bf7d3ca99564a077e (diff) | |
download | latinime-e9a0e66716dab4dd3184d009d8920de1961efdfa.tar.gz latinime-e9a0e66716dab4dd3184d009d8920de1961efdfa.tar.xz latinime-e9a0e66716dab4dd3184d009d8920de1961efdfa.zip |
Rename to Kelar Keyboard (org.kelar.inputmethod.latin)
Diffstat (limited to 'tests/src/org/kelar/inputmethod/latin/makedict/Ver2DictEncoder.java')
-rw-r--r-- | tests/src/org/kelar/inputmethod/latin/makedict/Ver2DictEncoder.java | 280 |
1 files changed, 280 insertions, 0 deletions
diff --git a/tests/src/org/kelar/inputmethod/latin/makedict/Ver2DictEncoder.java b/tests/src/org/kelar/inputmethod/latin/makedict/Ver2DictEncoder.java new file mode 100644 index 000000000..8b3636208 --- /dev/null +++ b/tests/src/org/kelar/inputmethod/latin/makedict/Ver2DictEncoder.java @@ -0,0 +1,280 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kelar.inputmethod.latin.makedict; + +import org.kelar.inputmethod.annotations.UsedForTesting; +import org.kelar.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; +import org.kelar.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable; +import org.kelar.inputmethod.latin.makedict.FormatSpec.FormatOptions; +import org.kelar.inputmethod.latin.makedict.FusionDictionary.PtNode; +import org.kelar.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map.Entry; +import java.util.Objects; + +/** + * An implementation of DictEncoder for version 2 binary dictionary. + */ +@UsedForTesting +public class Ver2DictEncoder implements DictEncoder { + + private final File mDictFile; + private OutputStream mOutStream; + private byte[] mBuffer; + private int mPosition; + private final int mCodePointTableMode; + public static final int CODE_POINT_TABLE_OFF = 0; + public static final int CODE_POINT_TABLE_ON = 1; + + @UsedForTesting + public Ver2DictEncoder(final File dictFile, final int codePointTableMode) { + mDictFile = dictFile; + mOutStream = null; + mBuffer = null; + mCodePointTableMode = codePointTableMode; + } + + // This constructor is used only by BinaryDictOffdeviceUtilsTests. + // If you want to use this in the production code, you should consider keeping consistency of + // the interface of Ver3DictDecoder by using factory. + @UsedForTesting + public Ver2DictEncoder(final OutputStream outStream) { + mDictFile = null; + mOutStream = outStream; + mCodePointTableMode = CODE_POINT_TABLE_OFF; + } + + private void openStream() throws FileNotFoundException { + mOutStream = new FileOutputStream(mDictFile); + } + + private void close() throws IOException { + if (mOutStream != null) { + mOutStream.close(); + mOutStream = null; + } + } + + // Package for testing + static CodePointTable makeCodePointTable(final FusionDictionary dict) { + final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>(); + for (final WordProperty word : dict) { + // Store per code point occurrence + final String wordString = word.mWord; + for (int i = 0; i < wordString.length(); ++i) { + final int codePoint = Character.codePointAt(wordString, i); + if (codePointOccurrenceCounts.containsKey(codePoint)) { + codePointOccurrenceCounts.put(codePoint, + codePointOccurrenceCounts.get(codePoint) + 1); + } else { + codePointOccurrenceCounts.put(codePoint, 1); + } + } + } + final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray = + new ArrayList<>(codePointOccurrenceCounts.entrySet()); + // Descending order sort by occurrence (value side) + Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() { + @Override + public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) { + if (!Objects.equals(a.getValue(), b.getValue())) { + return b.getValue().compareTo(a.getValue()); + } + return b.getKey().compareTo(a.getKey()); + } + }); + int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE; + // Temporary map for writing of nodes + final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>(); + for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) { + // Put a relation from the original code point to the one byte code. + codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex); + if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) { + break; + } + } + // codePointToOneByteCodeMap for writing the trie + // codePointOccurrenceArray for writing the header + return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray); + } + + @Override + public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) + throws IOException, UnsupportedFormatException { + // We no longer support anything but the latest version of v2. + if (formatOptions.mVersion != FormatSpec.VERSION202) { + throw new UnsupportedFormatException( + "The given format options has wrong version number : " + + formatOptions.mVersion); + } + + if (mOutStream == null) { + openStream(); + } + + // Make code point conversion table ordered by occurrence of code points + // Version 201 or later have codePointTable + final CodePointTable codePointTable; + if (mCodePointTableMode == CODE_POINT_TABLE_OFF || formatOptions.mVersion + < FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE) { + codePointTable = new CodePointTable(); + } else { + codePointTable = makeCodePointTable(dict); + } + + BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions, + codePointTable.mCodePointOccurrenceArray); + + // Addresses are limited to 3 bytes, but since addresses can be relative to each node + // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding + // the order of the PtNode arrays becomes a quite complicated problem, because though the + // dictionary itself does not have a size limit, each node array must still be within 16MB + // of all its children and parents. As long as this is ensured, the dictionary file may + // grow to any size. + + // Leave the choice of the optimal node order to the flattenTree function. + MakedictLog.i("Flattening the tree..."); + ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); + + MakedictLog.i("Computing addresses..."); + BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, + codePointTable.mCodePointToOneByteCodeMap); + MakedictLog.i("Checking PtNode array..."); + if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); + + // Create a buffer that matches the final dictionary size. + final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); + final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; + mBuffer = new byte[bufferSize]; + + MakedictLog.i("Writing file..."); + + for (PtNodeArray nodeArray : flatNodes) { + BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, + codePointTable.mCodePointToOneByteCodeMap); + } + if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes); + mOutStream.write(mBuffer, 0, mPosition); + + MakedictLog.i("Done"); + close(); + } + + @Override + public void setPosition(final int position) { + if (mBuffer == null || position < 0 || position >= mBuffer.length) return; + mPosition = position; + } + + @Override + public int getPosition() { + return mPosition; + } + + @Override + public void writePtNodeCount(final int ptNodeCount) { + final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); + if (countSize != 1 && countSize != 2) { + throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); + } + final int encodedPtNodeCount = (countSize == 2) ? + (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount, + countSize); + } + + private void writePtNodeFlags(final PtNode ptNode, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, + codePointToOneByteCodeMap); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, + BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos), + FormatSpec.PTNODE_FLAGS_SIZE); + } + + private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition, + codePointToOneByteCodeMap); + if (hasSeveralChars) { + mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; + } + } + + private void writeFrequency(final int frequency) { + if (frequency >= 0) { + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency, + FormatSpec.PTNODE_FREQUENCY_SIZE); + } + } + + private void writeChildrenPosition(final PtNode ptNode, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, + codePointToOneByteCodeMap); + mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, + childrenPos); + } + + /** + * Write a bigram attributes list to mBuffer. + * + * @param bigrams the bigram attributes list. + * @param dict the dictionary the node array is a part of (for relative offsets). + */ + private void writeBigrams(final ArrayList<WeightedString> bigrams, + final FusionDictionary dict) { + if (bigrams == null) return; + + final Iterator<WeightedString> bigramIterator = bigrams.iterator(); + while (bigramIterator.hasNext()) { + final WeightedString bigram = bigramIterator.next(); + final PtNode target = + FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); + final int addressOfBigram = target.mCachedAddressAfterUpdate; + final int unigramFrequencyForThisWord = target.getProbability(); + final int offset = addressOfBigram + - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), + offset, bigram.getProbability(), unigramFrequencyForThisWord, bigram.mWord); + mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags, + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); + mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, + Math.abs(offset)); + } + } + + @Override + public void writePtNode(final PtNode ptNode, final FusionDictionary dict, + final HashMap<Integer, Integer> codePointToOneByteCodeMap) { + writePtNodeFlags(ptNode, codePointToOneByteCodeMap); + writeCharacters(ptNode.mChars, ptNode.hasSeveralChars(), codePointToOneByteCodeMap); + writeFrequency(ptNode.getProbability()); + writeChildrenPosition(ptNode, codePointToOneByteCodeMap); + writeBigrams(ptNode.mBigrams, dict); + } +} |