diff options
Diffstat (limited to 'native/jni/src/binary_format.h')
-rw-r--r-- | native/jni/src/binary_format.h | 114 |
1 files changed, 58 insertions, 56 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index eec52e323..9a8c315f7 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -28,10 +28,6 @@ class BinaryFormat { public: // Mask and flags for children address type selection. static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; - static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; - static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; - static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; - static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; // Flag for single/multiple char group static const int FLAG_HAS_MULTIPLE_CHARS = 0x20; @@ -61,36 +57,21 @@ class BinaryFormat { // Mask and flags for attribute address type selection. static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; - static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; - static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; - static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; - const static int UNKNOWN_FORMAT = -1; - // Originally, format version 1 had a 16-bit magic number, then the version number `01' - // then options that must be 0. Hence the first 32-bits of the format are always as follow - // and it's okay to consider them a magic number as a whole. - const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100; - const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5; - // The versions of Latin IME that only handle format version 1 only test for the magic - // number, so we had to change it so that version 2 files would be rejected by older - // implementations. On this occasion, we made the magic number 32 bits long. - const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; - - const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1; - const static int SHORTCUT_LIST_SIZE_SIZE = 2; + static const int UNKNOWN_FORMAT = -1; + static const int SHORTCUT_LIST_SIZE_SIZE = 2; static int detectFormat(const uint8_t *const dict); static unsigned int getHeaderSize(const uint8_t *const dict); static unsigned int getFlags(const uint8_t *const dict); static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos); static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos); - static int32_t getCodePointAndForwardPointer(const uint8_t *const dict, int *pos); + static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos); static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos); static int skipOtherCharacters(const uint8_t *const dict, const int pos); static int skipChildrenPosition(const uint8_t flags, const int pos); static int skipFrequency(const uint8_t flags, const int pos); static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos); - static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos); static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags, const int pos); static int readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos); @@ -98,10 +79,10 @@ class BinaryFormat { static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags, int *pos); static int getAttributeFrequencyFromFlags(const int flags); - static int getTerminalPosition(const uint8_t *const root, const int32_t *const inWord, + static int getTerminalPosition(const uint8_t *const root, const int *const inWord, const int length, const bool forceLowerCaseSearch); static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, - uint16_t *outWord, int *outUnigramFrequency); + int *outWord, int *outUnigramFrequency); static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); static int getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq); @@ -113,17 +94,37 @@ class BinaryFormat { REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1, REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4 }; - const static unsigned int NO_FLAGS = 0; private: DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat); - const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; - const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; - const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; + static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; + static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; + static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; + static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; + + // Originally, format version 1 had a 16-bit magic number, then the version number `01' + // then options that must be 0. Hence the first 32-bits of the format are always as follow + // and it's okay to consider them a magic number as a whole. + static const uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100; + static const unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5; + // The versions of Latin IME that only handle format version 1 only test for the magic + // number, so we had to change it so that version 2 files would be rejected by older + // implementations. On this occasion, we made the magic number 32 bits long. + static const uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; + + static const int CHARACTER_ARRAY_TERMINATOR_SIZE = 1; + static const int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; + static const int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; + static const int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; + static const unsigned int NO_FLAGS = 0; static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos); + static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos); }; -inline int BinaryFormat::detectFormat(const uint8_t *const dict) { +AK_FORCE_INLINE int BinaryFormat::detectFormat(const uint8_t *const dict) { // The magic number is stored big-endian. const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3]; switch (magicNumber) { @@ -148,7 +149,7 @@ inline int BinaryFormat::detectFormat(const uint8_t *const dict) { inline unsigned int BinaryFormat::getFlags(const uint8_t *const dict) { switch (detectFormat(dict)) { case 1: - return NO_FLAGS; + return NO_FLAGS; // TODO: NO_FLAGS is unused anywhere else? default: return (dict[6] << 8) + dict[7]; } @@ -166,7 +167,8 @@ inline unsigned int BinaryFormat::getHeaderSize(const uint8_t *const dict) { } } -inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos) { +AK_FORCE_INLINE int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict, + int *pos) { const int msb = dict[(*pos)++]; if (msb < 0x80) return msb; return ((msb & 0x7F) << 8) | dict[(*pos)++]; @@ -176,17 +178,18 @@ inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t *const dict return dict[(*pos)++]; } -inline int32_t BinaryFormat::getCodePointAndForwardPointer(const uint8_t *const dict, int *pos) { +AK_FORCE_INLINE int BinaryFormat::getCodePointAndForwardPointer(const uint8_t *const dict, + int *pos) { const int origin = *pos; - const int32_t codePoint = dict[origin]; + const int codePoint = dict[origin]; if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { if (codePoint == CHARACTER_ARRAY_TERMINATOR) { *pos = origin + 1; return NOT_A_CODE_POINT; } else { *pos = origin + 3; - const int32_t char_1 = codePoint << 16; - const int32_t char_2 = char_1 + (dict[origin + 1] << 8); + const int char_1 = codePoint << 16; + const int char_2 = char_1 + (dict[origin + 1] << 8); return char_2 + dict[origin + 2]; } } else { @@ -200,9 +203,9 @@ inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const return dict[pos]; } -inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const int pos) { +AK_FORCE_INLINE int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const int pos) { int currentPos = pos; - int32_t character = dict[currentPos++]; + int character = dict[currentPos++]; while (CHARACTER_ARRAY_TERMINATOR != character) { if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE; @@ -226,7 +229,7 @@ static inline int attributeAddressSize(const uint8_t flags) { */ } -static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) { +static AK_FORCE_INLINE int skipExistingBigrams(const uint8_t *const dict, const int pos) { int currentPos = pos; uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) { @@ -255,7 +258,7 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { return FLAG_IS_TERMINAL & flags ? pos + 1 : pos; } -inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags, +AK_FORCE_INLINE int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos) { if (FLAG_HAS_SHORTCUT_TARGETS & flags) { return pos + shortcutByteSize(dict, pos); @@ -264,7 +267,7 @@ inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t } } -inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags, +AK_FORCE_INLINE int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos) { if (FLAG_HAS_BIGRAMS & flags) { return skipExistingBigrams(dict, pos); @@ -273,7 +276,7 @@ inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t fl } } -inline int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint8_t flags, +AK_FORCE_INLINE int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos) { // This function skips all attributes: shortcuts and bigrams. int newPos = pos; @@ -282,7 +285,7 @@ inline int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint return newPos; } -inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict, +AK_FORCE_INLINE int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags, const int pos) { int currentPos = pos; currentPos = skipChildrenPosition(flags, currentPos); @@ -290,8 +293,8 @@ inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict, return currentPos; } -inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags, - const int pos) { +AK_FORCE_INLINE int BinaryFormat::readChildrenPosition(const uint8_t *const dict, + const uint8_t flags, const int pos) { int offset = 0; switch (MASK_GROUP_ADDRESS_TYPE & flags) { case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: @@ -318,7 +321,7 @@ inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags)); } -inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict, +AK_FORCE_INLINE int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags, int *pos) { int offset = 0; const int origin = *pos; @@ -352,8 +355,8 @@ inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) { // This function gets the byte position of the last chargroup of the exact matching word in the // dictionary. If no match is found, it returns NOT_VALID_WORD. -inline int BinaryFormat::getTerminalPosition(const uint8_t *const root, - const int32_t *const inWord, const int length, const bool forceLowerCaseSearch) { +AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, + const int *const inWord, const int length, const bool forceLowerCaseSearch) { int pos = 0; int wordPos = 0; @@ -362,14 +365,14 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root, // there was no match (or we would have found it). if (wordPos >= length) return NOT_VALID_WORD; int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); - const int32_t wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos]; + const int wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos]; while (true) { // If there are no more character groups in this node, it means we could not // find a matching character for this depth, therefore there is no match. if (0 >= charGroupCount) return NOT_VALID_WORD; const int charGroupPos = pos; const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - int32_t character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); + int character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); if (character == wChar) { // This is the correct node. Only one character group may start with the same // char within a node, so either we found our match in this node, or there is @@ -438,8 +441,8 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root, * outUnigramFrequency: a pointer to an int to write the frequency into. * Return value : the length of the word, of 0 if the word was not found. */ -inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address, - const int maxDepth, uint16_t *outWord, int *outUnigramFrequency) { +AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address, + const int maxDepth, int *outWord, int *outUnigramFrequency) { int pos = 0; int wordPos = 0; @@ -457,13 +460,13 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a --charGroupCount) { const int startPos = pos; const uint8_t flags = getFlagsAndForwardPointer(root, &pos); - const int32_t character = getCodePointAndForwardPointer(root, &pos); + const int character = getCodePointAndForwardPointer(root, &pos); if (address == startPos) { // We found the address. Copy the rest of the word in the buffer and return // the length. outWord[wordPos] = character; if (FLAG_HAS_MULTIPLE_CHARS & flags) { - int32_t nextChar = getCodePointAndForwardPointer(root, &pos); + int nextChar = getCodePointAndForwardPointer(root, &pos); // We count chars in order to avoid infinite loops if the file is broken or // if there is some other bug int charCount = maxDepth; @@ -522,13 +525,12 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a if (0 != lastCandidateGroupPos) { const uint8_t lastFlags = getFlagsAndForwardPointer(root, &lastCandidateGroupPos); - const int32_t lastChar = + const int lastChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos); // We copy all the characters in this group to the buffer outWord[wordPos] = lastChar; if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) { - int32_t nextChar = - getCodePointAndForwardPointer(root, &lastCandidateGroupPos); + int nextChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos); int charCount = maxDepth; while (-1 != nextChar && --charCount > 0) { outWord[++wordPos] = nextChar; |