aboutsummaryrefslogtreecommitdiffstats
path: root/tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java
blob: c63b972eb87e72051a671964d2591de0b4c6fd68 (about) (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;

/**
 * An implementation of DictEncoder for version 2 binary dictionary.
 */
@UsedForTesting
public class Ver2DictEncoder implements DictEncoder {

    private final File mDictFile;
    private OutputStream mOutStream;
    private byte[] mBuffer;
    private int mPosition;
    private final int mCodePointTableMode;
    public static final int CODE_POINT_TABLE_OFF = 0;
    public static final int CODE_POINT_TABLE_ON = 1;

    @UsedForTesting
    public Ver2DictEncoder(final File dictFile, final int codePointTableMode) {
        mDictFile = dictFile;
        mOutStream = null;
        mBuffer = null;
        mCodePointTableMode = codePointTableMode;
    }

    // This constructor is used only by BinaryDictOffdeviceUtilsTests.
    // If you want to use this in the production code, you should consider keeping consistency of
    // the interface of Ver3DictDecoder by using factory.
    @UsedForTesting
    public Ver2DictEncoder(final OutputStream outStream) {
        mDictFile = null;
        mOutStream = outStream;
        mCodePointTableMode = CODE_POINT_TABLE_OFF;
    }

    private void openStream() throws FileNotFoundException {
        mOutStream = new FileOutputStream(mDictFile);
    }

    private void close() throws IOException {
        if (mOutStream != null) {
            mOutStream.close();
            mOutStream = null;
        }
    }

    // Package for testing
    static CodePointTable makeCodePointTable(final FusionDictionary dict) {
        final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>();
        for (final WordProperty word : dict) {
            // Store per code point occurrence
            final String wordString = word.mWord;
            for (int i = 0; i < wordString.length(); ++i) {
                final int codePoint = Character.codePointAt(wordString, i);
                if (codePointOccurrenceCounts.containsKey(codePoint)) {
                    codePointOccurrenceCounts.put(codePoint,
                            codePointOccurrenceCounts.get(codePoint) + 1);
                } else {
                    codePointOccurrenceCounts.put(codePoint, 1);
                }
            }
        }
        final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray =
                new ArrayList<>(codePointOccurrenceCounts.entrySet());
        // Descending order sort by occurrence (value side)
        Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() {
            @Override
            public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) {
                if (a.getValue() != b.getValue()) {
                    return b.getValue().compareTo(a.getValue());
                }
                return b.getKey().compareTo(a.getKey());
            }
        });
        int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE;
        // Temporary map for writing of nodes
        final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>();
        for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) {
            // Put a relation from the original code point to the one byte code.
            codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex);
            if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) {
                break;
            }
        }
        // codePointToOneByteCodeMap for writing the trie
        // codePointOccurrenceArray for writing the header
        return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray);
    }

    @Override
    public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
            throws IOException, UnsupportedFormatException {
        // We no longer support anything but the latest version of v2.
        if (formatOptions.mVersion != FormatSpec.VERSION202) {
            throw new UnsupportedFormatException(
                    "The given format options has wrong version number : "
                    + formatOptions.mVersion);
        }

        if (mOutStream == null) {
            openStream();
        }

        // Make code point conversion table ordered by occurrence of code points
        // Version 201 or later have codePointTable
        final CodePointTable codePointTable;
        if (mCodePointTableMode == CODE_POINT_TABLE_OFF || formatOptions.mVersion
                < FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE) {
            codePointTable = new CodePointTable();
        } else {
            codePointTable = makeCodePointTable(dict);
        }

        BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions,
                codePointTable.mCodePointOccurrenceArray);

        // Addresses are limited to 3 bytes, but since addresses can be relative to each node
        // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding
        // the order of the PtNode arrays becomes a quite complicated problem, because though the
        // dictionary itself does not have a size limit, each node array must still be within 16MB
        // of all its children and parents. As long as this is ensured, the dictionary file may
        // grow to any size.

        // Leave the choice of the optimal node order to the flattenTree function.
        MakedictLog.i("Flattening the tree...");
        ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);

        MakedictLog.i("Computing addresses...");
        BinaryDictEncoderUtils.computeAddresses(dict, flatNodes,
                codePointTable.mCodePointToOneByteCodeMap);
        MakedictLog.i("Checking PtNode array...");
        if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);

        // Create a buffer that matches the final dictionary size.
        final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
        final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
        mBuffer = new byte[bufferSize];

        MakedictLog.i("Writing file...");

        for (PtNodeArray nodeArray : flatNodes) {
            BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray,
                    codePointTable.mCodePointToOneByteCodeMap);
        }
        if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes);
        mOutStream.write(mBuffer, 0, mPosition);

        MakedictLog.i("Done");
        close();
    }

    @Override
    public void setPosition(final int position) {
        if (mBuffer == null || position < 0 || position >= mBuffer.length) return;
        mPosition = position;
    }

    @Override
    public int getPosition() {
        return mPosition;
    }

    @Override
    public void writePtNodeCount(final int ptNodeCount) {
        final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
        if (countSize != 1 && countSize != 2) {
            throw new RuntimeException("Strange size from getGroupCountSize : " + countSize);
        }
        final int encodedPtNodeCount = (countSize == 2) ?
                (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
        mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount,
                countSize);
    }

    private void writePtNodeFlags(final PtNode ptNode,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode,
                codePointToOneByteCodeMap);
        mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition,
                BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos),
                FormatSpec.PTNODE_FLAGS_SIZE);
    }

    private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition,
                codePointToOneByteCodeMap);
        if (hasSeveralChars) {
            mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
        }
    }

    private void writeFrequency(final int frequency) {
        if (frequency >= 0) {
            mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency,
                    FormatSpec.PTNODE_FREQUENCY_SIZE);
        }
    }

    private void writeChildrenPosition(final PtNode ptNode,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode,
                codePointToOneByteCodeMap);
        mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
                childrenPos);
    }

    /**
     * Write a bigram attributes list to mBuffer.
     *
     * @param bigrams the bigram attributes list.
     * @param dict the dictionary the node array is a part of (for relative offsets).
     */
    private void writeBigrams(final ArrayList<WeightedString> bigrams,
            final FusionDictionary dict) {
        if (bigrams == null) return;

        final Iterator<WeightedString> bigramIterator = bigrams.iterator();
        while (bigramIterator.hasNext()) {
            final WeightedString bigram = bigramIterator.next();
            final PtNode target =
                    FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
            final int addressOfBigram = target.mCachedAddressAfterUpdate;
            final int unigramFrequencyForThisWord = target.getProbability();
            final int offset = addressOfBigram
                    - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
            final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
                    offset, bigram.getProbability(), unigramFrequencyForThisWord, bigram.mWord);
            mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags,
                    FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
            mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
                    Math.abs(offset));
        }
    }

    @Override
    public void writePtNode(final PtNode ptNode, final FusionDictionary dict,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        writePtNodeFlags(ptNode, codePointToOneByteCodeMap);
        writeCharacters(ptNode.mChars, ptNode.hasSeveralChars(), codePointToOneByteCodeMap);
        writeFrequency(ptNode.getProbability());
        writeChildrenPosition(ptNode, codePointToOneByteCodeMap);
        writeBigrams(ptNode.mBigrams, dict);
    }
}