diff options
Diffstat (limited to 'native/jni/src/binary_format.h')
-rw-r--r-- | native/jni/src/binary_format.h | 61 |
1 files changed, 51 insertions, 10 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index ab033ad90..f59302460 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -40,16 +40,21 @@ class BinaryFormat { // implementations. On this occasion, we made the magic number 32 bits long. const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; + const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1; + const static int SHORTCUT_LIST_SIZE_SIZE = 2; + static int detectFormat(const uint8_t* const dict); static unsigned int getHeaderSize(const uint8_t* const dict); + static unsigned int getFlags(const uint8_t* const dict); static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos); static int skipOtherCharacters(const uint8_t* const dict, const int pos); - static int skipAttributes(const uint8_t* const dict, const int pos); static int skipChildrenPosition(const uint8_t flags, const int pos); static int skipFrequency(const uint8_t flags, const int pos); + static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos); + static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos); static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos); static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags, const int pos); @@ -61,6 +66,15 @@ class BinaryFormat { const int length); static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, uint16_t* outWord); + + // Flags for special processing + // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or + // something very bad (like, the apocalypse) will happen. 
Please update both at the same time. + enum { + REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1, + REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4 + }; + const static unsigned int NO_FLAGS = 0; }; inline int BinaryFormat::detectFormat(const uint8_t* const dict) { @@ -77,7 +91,7 @@ inline int BinaryFormat::detectFormat(const uint8_t* const dict) { // Format 2 header is as follows: // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE // Version number (2 bytes) 0x00 0x02 - // Options (2 bytes) must be 0x00 0x00 + // Options (2 bytes) // Header size (4 bytes) : integer, big endian return (dict[4] << 8) + dict[5]; default: @@ -85,6 +99,15 @@ inline int BinaryFormat::detectFormat(const uint8_t* const dict) { } } +inline unsigned int BinaryFormat::getFlags(const uint8_t* const dict) { + switch (detectFormat(dict)) { + case 1: + return NO_FLAGS; + default: + return (dict[6] << 8) + dict[7]; + } +} + inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) { switch (detectFormat(dict)) { case 1: @@ -157,12 +180,12 @@ static inline int attributeAddressSize(const uint8_t flags) { */ } -inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) { +static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) { int currentPos = pos; - uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos); + uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos); while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) { currentPos += attributeAddressSize(flags); - flags = getFlagsAndForwardPointer(dict, &currentPos); + flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos); } currentPos += attributeAddressSize(flags); return currentPos; @@ -174,6 +197,10 @@ static inline int childrenAddressSize(const uint8_t flags) { /* See the note in attributeAddressSize. 
The same applies here */ } +static inline int shortcutByteSize(const uint8_t* const dict, const int pos) { + return ((int)(dict[pos] << 8)) + (dict[pos + 1]); +} + inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) { return pos + childrenAddressSize(flags); } @@ -182,16 +209,30 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos; } -inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags, +inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos) { - // This function skips all attributes: shortcuts and bigrams. - int newPos = pos; if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) { - newPos = skipAttributes(dict, newPos); + return pos + shortcutByteSize(dict, pos); + } else { + return pos; } +} + +inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags, + const int pos) { if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { - newPos = skipAttributes(dict, newPos); + return skipExistingBigrams(dict, pos); + } else { + return pos; } +} + +inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags, + const int pos) { + // This function skips all attributes: shortcuts and bigrams. + int newPos = pos; + newPos = skipShortcuts(dict, flags, newPos); + newPos = skipBigrams(dict, flags, newPos); return newPos; } |