aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni/src/binary_format.h
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src/binary_format.h')
-rw-r--r--native/jni/src/binary_format.h98
1 files changed, 65 insertions, 33 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index 4155ef401..fdc13bfb9 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -18,13 +18,47 @@
#define LATINIME_BINARY_FORMAT_H
#include <limits>
+#include <map>
#include "bloom_filter.h"
#include "char_utils.h"
-#include "unigram_dictionary.h"
namespace latinime {
class BinaryFormat {
+ public:
+ // Mask and flags for children address type selection.
+ static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
+ static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
+ static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
+ static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
+ static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
+
+ // Flag for single/multiple char group
+ static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
+
+ // Flag for terminal groups
+ static const int FLAG_IS_TERMINAL = 0x10;
+
+ // Flag for shortcut targets presence
+ static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
+ // Flag for bigram presence
+ static const int FLAG_HAS_BIGRAMS = 0x04;
+
+ // Attribute (bigram/shortcut) related flags:
+ // Flag for presence of more attributes
+ static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
+ // Flag for sign of offset. If this flag is set, the offset value must be negated.
+ static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
+
+ // Mask for attribute frequency, stored on 4 bits inside the flags byte.
+ static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
+
+ // Mask and flags for attribute address type selection.
+ static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
+ static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
+ static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
+ static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
+
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
@@ -174,13 +208,13 @@ inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const in
static inline int attributeAddressSize(const uint8_t flags) {
static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
- return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
+ return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
/* Note: this is a value-dependant optimization of what may probably be
more readably written this way:
- switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
+ switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) {
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
default: return 0;
}
*/
@@ -189,7 +223,7 @@ static inline int attributeAddressSize(const uint8_t flags) {
static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) {
int currentPos = pos;
uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
- while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
+ while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) {
currentPos += attributeAddressSize(flags);
flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
}
@@ -199,7 +233,7 @@ static inline int skipExistingBigrams(const uint8_t *const dict, const int pos)
static inline int childrenAddressSize(const uint8_t flags) {
static const int CHILDREN_ADDRESS_SHIFT = 6;
- return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
+ return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
/* See the note in attributeAddressSize. The same applies here */
}
@@ -212,12 +246,12 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos
}
inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
- return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
+ return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
}
inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
const int pos) {
- if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
+ if (FLAG_HAS_SHORTCUT_TARGETS & flags) {
return pos + shortcutByteSize(dict, pos);
} else {
return pos;
@@ -226,7 +260,7 @@ inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t
inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
const int pos) {
- if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
+ if (FLAG_HAS_BIGRAMS & flags) {
return skipExistingBigrams(dict, pos);
} else {
return pos;
@@ -253,15 +287,15 @@ inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict,
inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags,
const int pos) {
int offset = 0;
- switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
- case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
+ switch (MASK_GROUP_ADDRESS_TYPE & flags) {
+ case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
offset = dict[pos];
break;
- case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
+ case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
offset = dict[pos] << 8;
offset += dict[pos + 1];
break;
- case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
+ case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
offset = dict[pos] << 16;
offset += dict[pos + 1] << 8;
offset += dict[pos + 2];
@@ -275,32 +309,31 @@ inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const u
}
inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
- return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
- != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
+ return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags));
}
inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
const uint8_t flags, int *pos) {
int offset = 0;
const int origin = *pos;
- switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
+ switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
offset = dict[origin];
*pos = origin + 1;
break;
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
offset = dict[origin] << 8;
offset += dict[origin + 1];
*pos = origin + 2;
break;
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
offset = dict[origin] << 16;
offset += dict[origin + 1] << 8;
offset += dict[origin + 2];
*pos = origin + 3;
break;
}
- if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
+ if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
return origin - offset;
} else {
return origin + offset;
@@ -332,7 +365,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
// char within a node, so either we found our match in this node, or there is
// no match and we can return NOT_VALID_WORD. So we will check all the characters
// in this character group indeed does match.
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & flags) {
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
while (NOT_A_CHARACTER != character) {
++wordPos;
@@ -350,14 +383,13 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
// If we don't match the length AND don't have children, then a word in the
// dictionary fully matches a prefix of the searched word but not the full word.
++wordPos;
- if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
+ if (FLAG_IS_TERMINAL & flags) {
if (wordPos == length) {
return charGroupPos;
}
- pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
+ pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
}
- if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
- == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
+ if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) {
return NOT_VALID_WORD;
}
// We have children and we are still shorter than the word we are searching for, so
@@ -367,7 +399,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
break;
} else {
// This chargroup does not match, so skip the remaining part and go to the next.
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & flags) {
pos = BinaryFormat::skipOtherCharacters(root, pos);
}
pos = BinaryFormat::skipFrequency(flags, pos);
@@ -420,7 +452,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
// We found the address. Copy the rest of the word in the buffer and return
// the length.
outWord[wordPos] = character;
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & flags) {
int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
// We count chars in order to avoid infinite loops if the file is broken or
// if there is some other bug
@@ -435,7 +467,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
}
// We need to skip past this char group, so skip any remaining chars after the
// first and possibly the frequency.
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & flags) {
pos = skipOtherCharacters(root, pos);
}
pos = skipFrequency(flags, pos);
@@ -443,8 +475,8 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
// The fact that this group has children is very important. Since we already know
// that this group does not match, if it has no children we know it is irrelevant
// to what we are searching for.
- const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
- (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
+ const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
+ (MASK_GROUP_ADDRESS_TYPE & flags));
// We will write in `found' whether we have passed the children address we are
// searching for. For example if we search for "beer", the children of b are less
// than the address we are searching for and the children of c are greater. When we
@@ -484,7 +516,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
// We copy all the characters in this group to the buffer
outWord[wordPos] = lastChar;
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
int32_t nextChar =
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
int charCount = maxDepth;