1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
|
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.Constants;
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
/**
* Dictionary File Format Specification.
*/
public final class FormatSpec {
/*
* File header layout is as follows:
*
* v |
* e | MAGIC_NUMBER + version of the file format, 2 bytes.
* r |
* sion
*
* o |
* p | not used 4 bits
* t | has bigrams ? 1 bit, 1 = yes, 0 = no : CONTAINS_BIGRAMS_FLAG
* i | FRENCH_LIGATURE_PROCESSING_FLAG
* o | supports dynamic updates ? 1 bit, 1 = yes, 0 = no : SUPPORTS_DYNAMIC_UPDATE
* n | GERMAN_UMLAUT_PROCESSING_FLAG
* f |
* lags
*
* h |
* e | size of the file header, 4bytes
* a | including the size of the magic number, the option flags and the header size
* d |
* ersize
*
* | attributes list
*
* attributes list is:
* <key> = | string of characters at the char format described below, with the terminator used
* | to signal the end of the string.
* <value> = | string of characters at the char format described below, with the terminator used
* | to signal the end of the string.
* if the size of already read < headersize, goto key.
*
*/
/*
* Node array (FusionDictionary.PtNodeArray) layout is as follows:
*
* n |
* o | the number of PtNodes, 1 or 2 bytes.
* d | 1 byte = bbbbbbbb match
* e | case 1xxxxxxx => xxxxxxx << 8 + next byte
* c | otherwise => bbbbbbbb
* o |
* unt
*
* n |
* o | sequence of PtNodes,
* d | the layout of each PtNode is described below.
* e |
* s
*
* f |
* o | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header)
* r | forward link address, 3byte
* w | 1 byte = bbbbbbbb match
* a | case 1xxxxxxx => -((xxxxxxx << 16) + (next byte << 8) + next byte)
* r | otherwise => (xxxxxxx << 16) + (next byte << 8) + next byte
* d |
* linkaddress
*/
/* Node (FusionDictionary.PtNode) layout is as follows:
* | IF !SUPPORTS_DYNAMIC_UPDATE
* | addressType xx : mask with MASK_CHILDREN_ADDRESS_TYPE
* | 2 bits, 00 = no children : FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS
* f | 01 = 1 byte : FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE
* l | 10 = 2 bytes : FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES
* a | 11 = 3 bytes : FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES
* g | ELSE
* s | is moved ? 2 bits, 11 = no : FLAG_IS_NOT_MOVED
* | This must be the same as FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES
* | 01 = yes : FLAG_IS_MOVED
* | the new address is stored in the same place as the parent address
* | is deleted? 10 = yes : FLAG_IS_DELETED
* | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS
* | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL
* | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
* | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS
* | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD
* | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED
*
* p |
* a | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header)
* r | parent address, 3byte
* e | 1 byte = bbbbbbbb match
* n | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte)
* t | otherwise => (bbbbbbbb << 16) + (next byte << 8) + next byte
* a | This address is relative to the head of the PtNode.
* d | If the node doesn't have a parent, this field is set to 0.
* d |
* ress
*
* c | IF FLAG_HAS_MULTIPLE_CHARS
* h | char, char, char, char n * (1 or 3 bytes) : use PtNodeInfo for i/o helpers
* a | end 1 byte, = 0
* r | ELSE
* s | char 1 or 3 bytes
* | END
*
* f |
* r | IF FLAG_IS_TERMINAL
* e | frequency 1 byte
* q |
*
* c | IF SUPPORTS_DYNAMIC_UPDATE
* h | children address, 3 bytes
* i | 1 byte = bbbbbbbb match
* l | case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte)
* d | otherwise => (bbbbbbbb<<16) + (next byte << 8) + next byte
* r | if this node doesn't have children, this field is set to 0.
* e | (see BinaryDictEncoderUtils#writeVariableSignedAddress)
* n | ELSIF 00 = FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS == addressType
* a | // nothing
* d | ELSIF 01 = FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE == addressType
* d | children address, 1 byte
* r | ELSIF 10 = FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES == addressType
* e | children address, 2 bytes
* s | ELSE // 11 = FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES = addressType
* s | children address, 3 bytes
* | END
* | This address is relative to the position of this field.
*
* | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS
* | shortcut string list
* | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS
* | bigrams address list
*
* Char format is:
* 1 byte = bbbbbbbb match
* case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
* else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
* unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
* 00011111 would be outside unicode.
* else: iso-latin-1 code
* This allows for the whole unicode range to be encoded, including chars outside of
* the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
* characters which should never happen anyway (and still work, but take 3 bytes).
*
* bigram address list is:
* <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT
* | addressSign = 1 bit, : FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE
* | 1 = must take -address, 0 = must take +address
* | xx : mask with MASK_BIGRAM_ATTR_ADDRESS_TYPE
* | addressFormat = 2 bits, 00 = unused : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE
* | 01 = 1 byte : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE
* | 10 = 2 bytes : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES
* | 11 = 3 bytes : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES
* | 4 bits : frequency : mask with FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY
* <address> | IF (01 == FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE == addressFormat)
* | read 1 byte, add top 4 bits
* | ELSIF (10 == FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES == addressFormat)
* | read 2 bytes, add top 4 bits
* | ELSE // 11 == FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES == addressFormat
* | read 3 bytes, add top 4 bits
* | END
* | if (FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE) then address = -address
* if (FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT) goto bigram_and_shortcut_address_list_is
*
* shortcut string list is:
* <byte size> = PTNODE_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes.
* <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT
* | reserved = 3 bits, must be 0
* | 4 bits : frequency : mask with FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY
* <shortcut> = | string of characters at the char format described above, with the terminator
* | used to signal the end of the string.
* if (FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT goto flags
*/
public static final int MAGIC_NUMBER = 0x9BC13AFE;
static final int MINIMUM_SUPPORTED_VERSION = 2;
static final int MAXIMUM_SUPPORTED_VERSION = 3;
static final int NOT_A_VERSION_NUMBER = -1;
static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3;
// These options need to be the same numeric values as the one in the native reading code.
static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
// TODO: Make the native reading code read this variable.
static final int SUPPORTS_DYNAMIC_UPDATE = 0x2;
static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
static final int CONTAINS_BIGRAMS_FLAG = 0x8;
// TODO: Make this value adaptative to content data, store it in the header, and
// use it in the reading code.
static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH;
static final int PARENT_ADDRESS_SIZE = 3;
static final int FORWARD_LINK_ADDRESS_SIZE = 3;
// These flags are used only in the static dictionary.
static final int MASK_CHILDREN_ADDRESS_TYPE = 0xC0;
static final int FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS = 0x00;
static final int FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE = 0x40;
static final int FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES = 0x80;
static final int FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES = 0xC0;
static final int FLAG_HAS_MULTIPLE_CHARS = 0x20;
static final int FLAG_IS_TERMINAL = 0x10;
static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
static final int FLAG_HAS_BIGRAMS = 0x04;
static final int FLAG_IS_NOT_A_WORD = 0x02;
static final int FLAG_IS_BLACKLISTED = 0x01;
// These flags are used only in the dynamic dictionary.
static final int MASK_MOVE_AND_DELETE_FLAG = 0xC0;
static final int FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE = 0x40;
static final int FLAG_IS_MOVED = 0x00 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE;
static final int FLAG_IS_NOT_MOVED = 0x80 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE;
static final int FLAG_IS_DELETED = 0x80;
static final int FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT = 0x80;
static final int FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE = 0x40;
static final int MASK_BIGRAM_ATTR_ADDRESS_TYPE = 0x30;
static final int FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE = 0x10;
static final int FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES = 0x20;
static final int FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES = 0x30;
static final int FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY = 0x0F;
static final int PTNODE_CHARACTERS_TERMINATOR = 0x1F;
static final int PTNODE_TERMINATOR_SIZE = 1;
static final int PTNODE_FLAGS_SIZE = 1;
static final int PTNODE_FREQUENCY_SIZE = 1;
static final int PTNODE_MAX_ADDRESS_SIZE = 3;
static final int PTNODE_ATTRIBUTE_FLAGS_SIZE = 1;
static final int PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
static final int PTNODE_SHORTCUT_LIST_SIZE_SIZE = 2;
static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
static final int NO_PARENT_ADDRESS = 0;
static final int NO_FORWARD_LINK_ADDRESS = 0;
static final int INVALID_CHARACTER = -1;
static final int MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT = 0x7F; // 127
static final int MAX_PTNODES_IN_A_PT_NODE_ARRAY = 0x7FFF; // 32767
static final int MAX_BIGRAMS_IN_A_PTNODE = 10000;
static final int MAX_TERMINAL_FREQUENCY = 255;
static final int MAX_BIGRAM_FREQUENCY = 15;
public static final int SHORTCUT_WHITELIST_FREQUENCY = 15;
// This option needs to be the same numeric value as the one in binary_format.h.
static final int NOT_VALID_WORD = -99;
static final int SIGNED_CHILDREN_ADDRESS_SIZE = 3;
static final int UINT8_MAX = 0xFF;
static final int UINT16_MAX = 0xFFFF;
static final int UINT24_MAX = 0xFFFFFF;
static final int SINT24_MAX = 0x7FFFFF;
static final int MSB8 = 0x80;
static final int MSB24 = 0x800000;
/**
* Options about file format.
*/
public static final class FormatOptions {
public final int mVersion;
public final boolean mSupportsDynamicUpdate;
@UsedForTesting
public FormatOptions(final int version) {
this(version, false);
}
@UsedForTesting
public FormatOptions(final int version, final boolean supportsDynamicUpdate) {
mVersion = version;
if (version < FIRST_VERSION_WITH_DYNAMIC_UPDATE && supportsDynamicUpdate) {
throw new RuntimeException("Dynamic updates are only supported with versions "
+ FIRST_VERSION_WITH_DYNAMIC_UPDATE + " and ulterior.");
}
mSupportsDynamicUpdate = supportsDynamicUpdate;
}
}
/**
* Class representing file header.
*/
public static final class FileHeader {
public final int mHeaderSize;
public final DictionaryOptions mDictionaryOptions;
public final FormatOptions mFormatOptions;
private static final String DICTIONARY_VERSION_ATTRIBUTE = "version";
private static final String DICTIONARY_LOCALE_ATTRIBUTE = "locale";
private static final String DICTIONARY_ID_ATTRIBUTE = "dictionary";
private static final String DICTIONARY_DESCRIPTION_ATTRIBUTE = "description";
public FileHeader(final int headerSize, final DictionaryOptions dictionaryOptions,
final FormatOptions formatOptions) {
mHeaderSize = headerSize;
mDictionaryOptions = dictionaryOptions;
mFormatOptions = formatOptions;
}
// Helper method to get the locale as a String
public String getLocaleString() {
return mDictionaryOptions.mAttributes.get(FileHeader.DICTIONARY_LOCALE_ATTRIBUTE);
}
// Helper method to get the version String
public String getVersion() {
return mDictionaryOptions.mAttributes.get(FileHeader.DICTIONARY_VERSION_ATTRIBUTE);
}
// Helper method to get the dictionary ID as a String
public String getId() {
return mDictionaryOptions.mAttributes.get(FileHeader.DICTIONARY_ID_ATTRIBUTE);
}
// Helper method to get the description
public String getDescription() {
// TODO: Right now each dictionary file comes with a description in its own language.
// It will display as is no matter the device's locale. It should be internationalized.
return mDictionaryOptions.mAttributes.get(FileHeader.DICTIONARY_DESCRIPTION_ATTRIBUTE);
}
}
private FormatSpec() {
// This utility class is not publicly instantiable.
}
}
|