diff options
Diffstat (limited to 'native/src/binary_format.h')
-rw-r--r-- | native/src/binary_format.h | 69 |
1 files changed, 54 insertions, 15 deletions
diff --git a/native/src/binary_format.h b/native/src/binary_format.h index 6f65088db..ab033ad90 100644 --- a/native/src/binary_format.h +++ b/native/src/binary_format.h @@ -17,22 +17,31 @@ #ifndef LATINIME_BINARY_FORMAT_H #define LATINIME_BINARY_FORMAT_H +#include <limits> #include "unigram_dictionary.h" namespace latinime { class BinaryFormat { -private: + private: const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; -public: + public: const static int UNKNOWN_FORMAT = -1; - const static int FORMAT_VERSION_1 = 1; - const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1; + // Originally, format version 1 had a 16-bit magic number, then the version number `01' + // then options that must be 0. Hence the first 32 bits of the format are always as follows + // and it's okay to consider them a magic number as a whole. + const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100; + const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5; + // The versions of Latin IME that only handle format version 1 only test for the magic + // number, so we had to change it so that version 2 files would be rejected by older + // implementations. On this occasion, we made the magic number 32 bits long. 
+ const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; static int detectFormat(const uint8_t* const dict); + static unsigned int getHeaderSize(const uint8_t* const dict); static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); @@ -55,13 +64,43 @@ public: }; inline int BinaryFormat::detectFormat(const uint8_t* const dict) { - const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian - if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1; - return UNKNOWN_FORMAT; + // The magic number is stored big-endian. + const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3]; + switch (magicNumber) { + case FORMAT_VERSION_1_MAGIC_NUMBER: + // Format 1 header is exactly 5 bytes long and looks like: + // Magic number (2 bytes) 0x78 0xB1 + // Version number (1 byte) 0x01 + // Options (2 bytes) must be 0x00 0x00 + return 1; + case FORMAT_VERSION_2_MAGIC_NUMBER: + // Format 2 header is as follows: + // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE + // Version number (2 bytes) 0x00 0x02 + // Options (2 bytes) must be 0x00 0x00 + // Header size (4 bytes) : integer, big endian + return (dict[4] << 8) + dict[5]; + default: + return UNKNOWN_FORMAT; + } +} + +inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) { + switch (detectFormat(dict)) { + case 1: + return FORMAT_VERSION_1_HEADER_SIZE; + case 2: + // See the format of the header in the comment in detectFormat() above + return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11]; + default: + return std::numeric_limits<unsigned int>::max(); + } } inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { - return dict[(*pos)++]; + const int msb = dict[(*pos)++]; + if (msb < 0x80) return msb; + return ((msb & 
0x7F) << 8) | dict[(*pos)++]; } inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) { @@ -145,15 +184,15 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos) { - // This function skips all attributes. The format makes provision for future extension - // with other attributes (notably shortcuts) but for the time being, bigrams are the - // only attributes that may be found in a character group, so we only look at bigrams - // in this version. + // This function skips all attributes: shortcuts and bigrams. + int newPos = pos; + if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) { + newPos = skipAttributes(dict, newPos); + } if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { - return skipAttributes(dict, pos); - } else { - return pos; + newPos = skipAttributes(dict, newPos); } + return newPos; } inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict, |