diff options
Diffstat (limited to 'native/jni/src/binary_format.h')
-rw-r--r-- | native/jni/src/binary_format.h | 178 |
1 files changed, 104 insertions, 74 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index 474c854fe..2ee4077c1 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -18,13 +18,47 @@ #define LATINIME_BINARY_FORMAT_H #include <limits> +#include <map> #include "bloom_filter.h" #include "char_utils.h" -#include "unigram_dictionary.h" namespace latinime { class BinaryFormat { + public: + // Mask and flags for children address type selection. + static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; + static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; + static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; + static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; + static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; + + // Flag for single/multiple char group + static const int FLAG_HAS_MULTIPLE_CHARS = 0x20; + + // Flag for terminal groups + static const int FLAG_IS_TERMINAL = 0x10; + + // Flag for shortcut targets presence + static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08; + // Flag for bigram presence + static const int FLAG_HAS_BIGRAMS = 0x04; + + // Attribute (bigram/shortcut) related flags: + // Flag for presence of more attributes + static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; + // Flag for sign of offset. If this flag is set, the offset value must be negated. + static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; + + // Mask for attribute frequency, stored on 4 bits inside the flags byte. + static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F; + + // Mask and flags for attribute address type selection. + static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; + private: DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat); const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; @@ -46,29 +80,29 @@ class BinaryFormat { const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1; const static int SHORTCUT_LIST_SIZE_SIZE = 2; - static int detectFormat(const uint8_t* const dict); - static unsigned int getHeaderSize(const uint8_t* const dict); - static unsigned int getFlags(const uint8_t* const dict); - static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); - static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); - static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); - static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos); - static int skipOtherCharacters(const uint8_t* const dict, const int pos); + static int detectFormat(const uint8_t *const dict); + static unsigned int getHeaderSize(const uint8_t *const dict); + static unsigned int getFlags(const uint8_t *const dict); + static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos); + static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos); + static int32_t getCharCodeAndForwardPointer(const uint8_t *const dict, int *pos); + static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos); + static int skipOtherCharacters(const uint8_t *const dict, const int pos); static int skipChildrenPosition(const uint8_t flags, const int pos); static int skipFrequency(const uint8_t flags, const int pos); - static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos); - static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos); - static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos); - static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags, + static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos); + static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos); + static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos); + static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags, const int pos); - static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos); + static int readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos); static bool hasChildrenInFlags(const uint8_t flags); - static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags, + static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags, int *pos); - static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord, + static int getTerminalPosition(const uint8_t *const root, const int32_t *const inWord, const int length, const bool forceLowerCaseSearch); - static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, - uint16_t* outWord, int* outUnigramFrequency); + static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, + uint16_t *outWord, int *outUnigramFrequency); static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); static int getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq); @@ -83,7 +117,7 @@ class BinaryFormat { const static unsigned int NO_FLAGS = 0; }; -inline int BinaryFormat::detectFormat(const uint8_t* const dict) { +inline int BinaryFormat::detectFormat(const uint8_t *const dict) { // The magic number is stored big-endian. const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3]; switch (magicNumber) { @@ -105,7 +139,7 @@ inline int BinaryFormat::detectFormat(const uint8_t* const dict) { } } -inline unsigned int BinaryFormat::getFlags(const uint8_t* const dict) { +inline unsigned int BinaryFormat::getFlags(const uint8_t *const dict) { switch (detectFormat(dict)) { case 1: return NO_FLAGS; @@ -114,7 +148,7 @@ inline unsigned int BinaryFormat::getFlags(const uint8_t* const dict) { } } -inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) { +inline unsigned int BinaryFormat::getHeaderSize(const uint8_t *const dict) { switch (detectFormat(dict)) { case 1: return FORMAT_VERSION_1_HEADER_SIZE; @@ -126,17 +160,17 @@ inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) { } } -inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { +inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos) { const int msb = dict[(*pos)++]; if (msb < 0x80) return msb; return ((msb & 0x7F) << 8) | dict[(*pos)++]; } -inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) { +inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t *const dict, int *pos) { return dict[(*pos)++]; } -inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) { +inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t *const dict, int *pos) { const int origin = *pos; const int32_t character = dict[origin]; if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { @@ -155,12 +189,12 @@ inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const d } } -inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict, +inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos) { return dict[pos]; } -inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) { +inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const int pos) { int currentPos = pos; int32_t character = dict[currentPos++]; while (CHARACTER_ARRAY_TERMINATOR != character) { @@ -174,22 +208,22 @@ inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const in static inline int attributeAddressSize(const uint8_t flags) { static const int ATTRIBUTE_ADDRESS_SHIFT = 4; - return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; + return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; /* Note: this is a value-dependant optimization of what may probably be more readably written this way: - switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) { - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; + switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; default: return 0; } */ } -static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) { +static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) { int currentPos = pos; uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); - while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) { + while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) { currentPos += attributeAddressSize(flags); flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); } @@ -199,11 +233,11 @@ static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) static inline int childrenAddressSize(const uint8_t flags) { static const int CHILDREN_ADDRESS_SHIFT = 6; - return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; + return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; /* See the note in attributeAddressSize. The same applies here */ } -static inline int shortcutByteSize(const uint8_t* const dict, const int pos) { +static inline int shortcutByteSize(const uint8_t *const dict, const int pos) { return ((int)(dict[pos] << 8)) + (dict[pos + 1]); } @@ -212,28 +246,28 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos } inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { - return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos; + return FLAG_IS_TERMINAL & flags ? pos + 1 : pos; } -inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags, +inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos) { - if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) { + if (FLAG_HAS_SHORTCUT_TARGETS & flags) { return pos + shortcutByteSize(dict, pos); } else { return pos; } } -inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags, +inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos) { - if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { + if (FLAG_HAS_BIGRAMS & flags) { return skipExistingBigrams(dict, pos); } else { return pos; } } -inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags, +inline int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos) { // This function skips all attributes: shortcuts and bigrams. int newPos = pos; @@ -242,7 +276,7 @@ inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint return newPos; } -inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict, +inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags, const int pos) { int currentPos = pos; currentPos = skipChildrenPosition(flags, currentPos); @@ -250,18 +284,18 @@ inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict, return currentPos; } -inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags, +inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos) { int offset = 0; - switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) { - case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: + switch (MASK_GROUP_ADDRESS_TYPE & flags) { + case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: offset = dict[pos]; break; - case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: + case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: offset = dict[pos] << 8; offset += dict[pos + 1]; break; - case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: + case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: offset = dict[pos] << 16; offset += dict[pos + 1] << 8; offset += dict[pos + 2]; @@ -275,32 +309,31 @@ inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const u } inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { - return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS - != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); + return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags)); } -inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict, +inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags, int *pos) { int offset = 0; const int origin = *pos; - switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: offset = dict[origin]; *pos = origin + 1; break; - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: offset = dict[origin] << 8; offset += dict[origin + 1]; *pos = origin + 2; break; - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: offset = dict[origin] << 16; offset += dict[origin + 1] << 8; offset += dict[origin + 2]; *pos = origin + 3; break; } - if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { + if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { return origin - offset; } else { return origin + offset; @@ -309,8 +342,8 @@ inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* con // This function gets the byte position of the last chargroup of the exact matching word in the // dictionary. If no match is found, it returns NOT_VALID_WORD. -inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, - const int32_t* const inWord, const int length, const bool forceLowerCaseSearch) { +inline int BinaryFormat::getTerminalPosition(const uint8_t *const root, + const int32_t *const inWord, const int length, const bool forceLowerCaseSearch) { int pos = 0; int wordPos = 0; @@ -332,7 +365,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, // char within a node, so either we found our match in this node, or there is // no match and we can return NOT_VALID_WORD. So we will check all the characters // in this character group indeed does match. - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { + if (FLAG_HAS_MULTIPLE_CHARS & flags) { character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); while (NOT_A_CHARACTER != character) { ++wordPos; @@ -350,14 +383,13 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, // If we don't match the length AND don't have children, then a word in the // dictionary fully matches a prefix of the searched word but not the full word. ++wordPos; - if (UnigramDictionary::FLAG_IS_TERMINAL & flags) { + if (FLAG_IS_TERMINAL & flags) { if (wordPos == length) { return charGroupPos; } - pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos); + pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos); } - if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS - == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) { + if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) { return NOT_VALID_WORD; } // We have children and we are still shorter than the word we are searching for, so @@ -367,7 +399,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, break; } else { // This chargroup does not match, so skip the remaining part and go to the next. - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { + if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = BinaryFormat::skipOtherCharacters(root, pos); } pos = BinaryFormat::skipFrequency(flags, pos); @@ -396,8 +428,8 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, * outUnigramFrequency: a pointer to an int to write the frequency into. * Return value : the length of the word, of 0 if the word was not found. */ -inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address, - const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) { +inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address, + const int maxDepth, uint16_t *outWord, int *outUnigramFrequency) { int pos = 0; int wordPos = 0; @@ -420,7 +452,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a // We found the address. Copy the rest of the word in the buffer and return // the length. outWord[wordPos] = character; - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { + if (FLAG_HAS_MULTIPLE_CHARS & flags) { int32_t nextChar = getCharCodeAndForwardPointer(root, &pos); // We count chars in order to avoid infinite loops if the file is broken or // if there is some other bug @@ -435,7 +467,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a } // We need to skip past this char group, so skip any remaining chars after the // first and possibly the frequency. - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { + if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = skipOtherCharacters(root, pos); } pos = skipFrequency(flags, pos); @@ -443,8 +475,8 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a // The fact that this group has children is very important. Since we already know // that this group does not match, if it has no children we know it is irrelevant // to what we are searching for. - const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != - (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); + const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != + (MASK_GROUP_ADDRESS_TYPE & flags)); // We will write in `found' whether we have passed the children address we are // searching for. For example if we search for "beer", the children of b are less // than the address we are searching for and the children of c are greater. When we @@ -484,7 +516,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); // We copy all the characters in this group to the buffer outWord[wordPos] = lastChar; - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) { + if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) { int32_t nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); int charCount = maxDepth; @@ -540,8 +572,8 @@ inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const // 0 for the bigram frequency represents the middle of the 16th step from the top, // while a value of 15 represents the middle of the top step. // See makedict.BinaryDictInputOutput for details. - const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); - return (int)(unigramFreq + (bigramFreq + 1) * stepSize); + const float stepSize = (static_cast<float>(MAX_FREQ) - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); + return static_cast<int>(unigramFreq + (bigramFreq + 1) * stepSize); } // This returns a probability in log space. @@ -557,7 +589,5 @@ inline int BinaryFormat::getProbability(const int position, const std::map<int, return backoff(unigramFreq); } } - } // namespace latinime - #endif // LATINIME_BINARY_FORMAT_H |