aboutsummaryrefslogtreecommitdiffstats
path: root/native/src/binary_format.h
blob: e9f108e25de7d693d5fac477627badfe63490c58 (about) (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LATINIME_BINARY_FORMAT_H
#define LATINIME_BINARY_FORMAT_H

namespace latinime {

// Stateless helpers for reading a dictionary stored as a raw byte buffer.
// Every function takes the buffer (dict) and/or a position in it (pos).
// The "...AndForwardPointer" functions take the position by pointer and advance
// it past what they read; the "skip..." functions return the position that
// follows the skipped data; the remaining ones only read.
class BinaryFormat {
public:
    // Returns the byte at *pos (the group count) and advances *pos by one.
    static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
    // Returns the byte at *pos (a flags byte) and advances *pos by one.
    static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
    // Decodes the one- or three-byte character at *pos and advances *pos past it.
    // Returns NOT_A_CHARACTER when the byte at *pos is the array terminator.
    static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
    // Returns the byte at pos (the frequency).
    static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
    // Returns the position just past the terminator of the character array at pos.
    static int skipOtherCharacters(const uint8_t* const dict, const int pos);
    // Returns the position just past the attribute list starting at pos.
    static int skipAttributes(const uint8_t* const dict, const int pos);
    // Returns pos advanced by the size of the children address selected by flags.
    static int skipChildrenPosition(const uint8_t flags, const int pos);
    // Returns pos + 1 if flags has FLAG_IS_TERMINAL set, pos otherwise.
    static int skipFrequency(const uint8_t flags, const int pos);
    // Skips the attribute list at pos if flags has FLAG_HAS_BIGRAMS set.
    static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
    // skipChildrenPosition() followed by skipAllAttributes().
    static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
            const int pos);
    // Returns pos plus the children offset stored at pos, or -1 if flags
    // indicates no children address.
    static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos);
    // Returns true unless the group address type in flags is "no address".
    static bool hasChildrenInFlags(const uint8_t flags);
    // Reads the attribute address stored at *pos, advances *pos past it and
    // returns the position it designates.
    static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
            int* pos);

private:
    // First bytes below this value do not encode a one-byte character.
    static const int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
    // Byte value that ends a character array.
    static const int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
    // Number of bytes that follow the first byte of a multiple-byte character.
    static const int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
};

// Returns the byte stored at dict[*pos] (the group count) and advances *pos by one.
inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
    const int groupCount = dict[*pos];
    ++(*pos);
    return groupCount;
}

// Returns the byte stored at dict[*pos] (a flags byte) and advances *pos by one.
inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
    const uint8_t flags = dict[*pos];
    ++(*pos);
    return flags;
}

// Decodes the character stored at dict[*pos] and advances *pos past it.
//  - A first byte >= MINIMAL_ONE_BYTE_CHARACTER_VALUE is the character itself
//    (one byte consumed).
//  - A first byte equal to CHARACTER_ARRAY_TERMINATOR yields NOT_A_CHARACTER
//    (one byte consumed).
//  - Any other first byte starts a three-byte, big-endian character code
//    (three bytes consumed).
inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {
    const int start = *pos;
    const int32_t firstByte = dict[start];
    if (firstByte >= MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
        *pos = start + 1;
        return firstByte;
    }
    if (CHARACTER_ARRAY_TERMINATOR == firstByte) {
        *pos = start + 1;
        return NOT_A_CHARACTER;
    }
    // Multiple-byte character: the first byte holds the high-order bits.
    *pos = start + 3;
    const int32_t highBits = firstByte << 16;
    const int32_t middleBits = dict[start + 1] << 8;
    return highBits + middleBits + dict[start + 2];
}

// Returns the byte stored at dict[pos] (the frequency). The position is taken
// by value, so the caller's cursor is not affected.
inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
        const int pos) {
    const uint8_t frequency = dict[pos];
    return frequency;
}

// Walks the character array starting at pos and returns the position just past
// its CHARACTER_ARRAY_TERMINATOR byte. A first byte below
// MINIMAL_ONE_BYTE_CHARACTER_VALUE (other than the terminator) marks a
// multiple-byte character, whose additional bytes are skipped as well.
// The scan only stops at a terminator; there is no bound on the buffer size.
inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {
    int cursor = pos;
    for (;;) {
        const int32_t firstByte = dict[cursor++];
        if (CHARACTER_ARRAY_TERMINATOR == firstByte) {
            return cursor;
        }
        if (firstByte < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
            cursor += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
        }
    }
}

// Returns the attribute address-type bits of flags, shifted down to the low
// bits, which is used by callers as the size in bytes of the attribute address.
//
// Note: this is a value-dependent optimization of what may probably be more
// readably written this way:
//   switch (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
//   case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
//   case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
//   case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: return 3;
//   default: return 0;
//   }
static inline int attributeAddressSize(const uint8_t flags) {
    const int addressTypeShift = 4;
    return (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) >> addressTypeShift;
}

// Returns the position just past the attribute list that starts at pos.
// Each attribute is a flags byte followed by an address whose size is given by
// attributeAddressSize(flags); the list continues for as long as the flags
// byte has FLAG_ATTRIBUTE_HAS_NEXT set.
inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
    int cursor = pos;
    uint8_t attributeFlags;
    do {
        attributeFlags = getFlagsAndForwardPointer(dict, &cursor);
        cursor += attributeAddressSize(attributeFlags);
    } while (attributeFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
    return cursor;
}

// Returns the group address-type bits of flags, shifted down to the low bits,
// which is used by callers as the size in bytes of the children address.
// Like attributeAddressSize, this is a value-dependent shortcut: it relies on
// the address-type flag values (defined in UnigramDictionary) being laid out so
// that the shifted bits equal the byte count.
static inline int childrenAddressSize(const uint8_t flags) {
    const int addressTypeShift = 6;
    return (flags & UnigramDictionary::MASK_GROUP_ADDRESS_TYPE) >> addressTypeShift;
}

// Returns pos advanced by the number of bytes that the children address
// occupies according to flags.
inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
    const int addressSize = childrenAddressSize(flags);
    return pos + addressSize;
}

// Returns the position after the frequency byte: pos + 1 when flags has
// FLAG_IS_TERMINAL set, pos unchanged otherwise.
inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
    if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
        return pos + 1;
    }
    return pos;
}

// Returns the position after every attribute of a character group.
// The format leaves room for other kinds of attributes in the future (notably
// shortcuts), but for now bigrams are the only attributes a character group
// may contain, so only the bigram flag is examined in this version.
inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
        const int pos) {
    if (!(UnigramDictionary::FLAG_HAS_BIGRAMS & flags)) {
        // No bigram list: nothing to skip.
        return pos;
    }
    return skipAttributes(dict, pos);
}

// Skips the children address located at pos, then the attributes that follow
// it, and returns the resulting position.
inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
        const uint8_t flags, const int pos) {
    const int afterChildrenAddress = skipChildrenPosition(flags, pos);
    return skipAllAttributes(dict, flags, afterChildrenAddress);
}

// Reads the children address stored at pos and returns the position it
// designates. The address is a one-, two- or three-byte big-endian offset
// (selected by the group address type in flags) that is added to pos itself.
// Returns -1 when flags carries none of those address types, which means the
// children of a word with no children were requested.
inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
        const int pos) {
    int addressBytes;
    switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
        case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
            addressBytes = 1;
            break;
        case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
            addressBytes = 2;
            break;
        case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
            addressBytes = 3;
            break;
        default:
            return -1;
    }
    // Fold the bytes most-significant first.
    int childrenOffset = 0;
    for (int i = 0; i < addressBytes; ++i) {
        childrenOffset = (childrenOffset << 8) + dict[pos + i];
    }
    return pos + childrenOffset;
}

// Returns true when the group address type encoded in flags is anything other
// than FLAG_GROUP_ADDRESS_TYPE_NOADDRESS.
inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
    return (flags & UnigramDictionary::MASK_GROUP_ADDRESS_TYPE)
            != UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS;
}

// Reads the attribute address stored at *pos, advances *pos past it and
// returns the position it designates.
// The address is a one-, two- or three-byte big-endian offset, selected by the
// attribute address type in flags. It is relative to the initial value of *pos
// and is subtracted instead of added when flags has
// FLAG_ATTRIBUTE_OFFSET_NEGATIVE set. If flags carries none of the three
// address types, nothing is read, *pos is left untouched and the initial
// position is returned.
inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict,
        const uint8_t flags, int* pos) {
    const int start = *pos;
    int addressBytes = 0;
    switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
            addressBytes = 1;
            break;
        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
            addressBytes = 2;
            break;
        case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
            addressBytes = 3;
            break;
    }
    // Fold the bytes most-significant first.
    int distance = 0;
    for (int i = 0; i < addressBytes; ++i) {
        distance = (distance << 8) + dict[start + i];
    }
    if (0 != addressBytes) {
        // Only move the caller's cursor when an address was actually present.
        *pos = start + addressBytes;
    }
    if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
        return start - distance;
    }
    return start + distance;
}

} // namespace latinime

#endif // LATINIME_BINARY_FORMAT_H