diff options
Diffstat (limited to 'native/jni/src/binary_format.h')
-rw-r--r-- | native/jni/src/binary_format.h | 98 |
1 files changed, 65 insertions, 33 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index 4155ef401..fdc13bfb9 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -18,13 +18,47 @@ #define LATINIME_BINARY_FORMAT_H #include <limits> +#include <map> #include "bloom_filter.h" #include "char_utils.h" -#include "unigram_dictionary.h" namespace latinime { class BinaryFormat { + public: + // Mask and flags for children address type selection. + static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; + static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; + static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; + static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; + static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; + + // Flag for single/multiple char group + static const int FLAG_HAS_MULTIPLE_CHARS = 0x20; + + // Flag for terminal groups + static const int FLAG_IS_TERMINAL = 0x10; + + // Flag for shortcut targets presence + static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08; + // Flag for bigram presence + static const int FLAG_HAS_BIGRAMS = 0x04; + + // Attribute (bigram/shortcut) related flags: + // Flag for presence of more attributes + static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; + // Flag for sign of offset. If this flag is set, the offset value must be negated. + static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; + + // Mask for attribute frequency, stored on 4 bits inside the flags byte. + static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F; + + // Mask and flags for attribute address type selection. + static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; + private: DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat); const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; @@ -174,13 +208,13 @@ inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const in static inline int attributeAddressSize(const uint8_t flags) { static const int ATTRIBUTE_ADDRESS_SHIFT = 4; - return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; + return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; /* Note: this is a value-dependant optimization of what may probably be more readably written this way: - switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) { - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; + switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; default: return 0; } */ @@ -189,7 +223,7 @@ static inline int attributeAddressSize(const uint8_t flags) { static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) { int currentPos = pos; uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); - while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) { + while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) { currentPos += attributeAddressSize(flags); flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); } @@ -199,7 +233,7 @@ static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) static inline int childrenAddressSize(const uint8_t flags) { static const int CHILDREN_ADDRESS_SHIFT = 6; - return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; + return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; /* See the note in attributeAddressSize. The same applies here */ } @@ -212,12 +246,12 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos } inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { - return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos; + return FLAG_IS_TERMINAL & flags ? pos + 1 : pos; } inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos) { - if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) { + if (FLAG_HAS_SHORTCUT_TARGETS & flags) { return pos + shortcutByteSize(dict, pos); } else { return pos; @@ -226,7 +260,7 @@ inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos) { - if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { + if (FLAG_HAS_BIGRAMS & flags) { return skipExistingBigrams(dict, pos); } else { return pos; @@ -253,15 +287,15 @@ inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict, inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos) { int offset = 0; - switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) { - case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: + switch (MASK_GROUP_ADDRESS_TYPE & flags) { + case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: offset = dict[pos]; break; - case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: + case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: offset = dict[pos] << 8; offset += dict[pos + 1]; break; - case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: + case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: offset = dict[pos] << 16; offset += dict[pos + 1] << 8; offset += dict[pos + 2]; @@ -275,32 +309,31 @@ inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const u } inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { - return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS - != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); + return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags)); } inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags, int *pos) { int offset = 0; const int origin = *pos; - switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: offset = dict[origin]; *pos = origin + 1; break; - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: offset = dict[origin] << 8; offset += dict[origin + 1]; *pos = origin + 2; break; - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: offset = dict[origin] << 16; offset += dict[origin + 1] << 8; offset += dict[origin + 2]; *pos = origin + 3; break; } - if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { + if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { return origin - offset; } else { return origin + offset; @@ -332,7 +365,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root, // char within a node, so either we found our match in this node, or there is // no match and we can return NOT_VALID_WORD. So we will check all the characters // in this character group indeed does match. - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { + if (FLAG_HAS_MULTIPLE_CHARS & flags) { character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); while (NOT_A_CHARACTER != character) { ++wordPos; @@ -350,14 +383,13 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root, // If we don't match the length AND don't have children, then a word in the // dictionary fully matches a prefix of the searched word but not the full word. ++wordPos; - if (UnigramDictionary::FLAG_IS_TERMINAL & flags) { + if (FLAG_IS_TERMINAL & flags) { if (wordPos == length) { return charGroupPos; } - pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos); + pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos); } - if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS - == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) { + if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) { return NOT_VALID_WORD; } // We have children and we are still shorter than the word we are searching for, so @@ -367,7 +399,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root, break; } else { // This chargroup does not match, so skip the remaining part and go to the next. - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { + if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = BinaryFormat::skipOtherCharacters(root, pos); } pos = BinaryFormat::skipFrequency(flags, pos); @@ -420,7 +452,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a // We found the address. Copy the rest of the word in the buffer and return // the length. outWord[wordPos] = character; - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { + if (FLAG_HAS_MULTIPLE_CHARS & flags) { int32_t nextChar = getCharCodeAndForwardPointer(root, &pos); // We count chars in order to avoid infinite loops if the file is broken or // if there is some other bug @@ -435,7 +467,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a } // We need to skip past this char group, so skip any remaining chars after the // first and possibly the frequency. - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { + if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = skipOtherCharacters(root, pos); } pos = skipFrequency(flags, pos); @@ -443,8 +475,8 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a // The fact that this group has children is very important. Since we already know // that this group does not match, if it has no children we know it is irrelevant // to what we are searching for. - const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != - (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); + const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != + (MASK_GROUP_ADDRESS_TYPE & flags)); // We will write in `found' whether we have passed the children address we are // searching for. For example if we search for "beer", the children of b are less // than the address we are searching for and the children of c are greater. When we @@ -484,7 +516,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); // We copy all the characters in this group to the buffer outWord[wordPos] = lastChar; - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) { + if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) { int32_t nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); int charCount = maxDepth; |