1 files changed, 125 insertions, 57 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index eec52e323..400389615 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -17,6 +17,7 @@
 #ifndef LATINIME_BINARY_FORMAT_H
 #define LATINIME_BINARY_FORMAT_H
 
+#include <cstdlib>
 #include <limits>
 #include <map>
 #include "bloom_filter.h"
@@ -28,10 +29,6 @@ class BinaryFormat {
  public:
     // Mask and flags for children address type selection.
     static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
-    static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
-    static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
-    static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
-    static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
 
     // Flag for single/multiple char group
     static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
@@ -61,36 +58,24 @@ class BinaryFormat {
 
     // Mask and flags for attribute address type selection.
     static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
-    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
-    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
-    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
-
-    const static int UNKNOWN_FORMAT = -1;
-    // Originally, format version 1 had a 16-bit magic number, then the version number `01'
-    // then options that must be 0. Hence the first 32-bits of the format are always as follow
-    // and it's okay to consider them a magic number as a whole.
-    const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
-    const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5;
-    // The versions of Latin IME that only handle format version 1 only test for the magic
-    // number, so we had to change it so that version 2 files would be rejected by older
-    // implementations. On this occasion, we made the magic number 32 bits long.
-    const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
 
-    const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
-    const static int SHORTCUT_LIST_SIZE_SIZE = 2;
+    static const int UNKNOWN_FORMAT = -1;
+    static const int SHORTCUT_LIST_SIZE_SIZE = 2;
 
     static int detectFormat(const uint8_t *const dict);
     static unsigned int getHeaderSize(const uint8_t *const dict);
     static unsigned int getFlags(const uint8_t *const dict);
+    static void readHeaderValue(const uint8_t *const dict, const char *const key,
+            int *outValue, const int outValueSize);
+    static int readHeaderValueInt(const uint8_t *const dict, const char *const key);
     static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos);
     static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos);
-    static int32_t getCodePointAndForwardPointer(const uint8_t *const dict, int *pos);
+    static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos);
     static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos);
     static int skipOtherCharacters(const uint8_t *const dict, const int pos);
     static int skipChildrenPosition(const uint8_t flags, const int pos);
     static int skipFrequency(const uint8_t flags, const int pos);
     static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos);
-    static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos);
     static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags,
             const int pos);
     static int readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos);
@@ -98,10 +83,10 @@ class BinaryFormat {
     static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags,
             int *pos);
     static int getAttributeFrequencyFromFlags(const int flags);
-    static int getTerminalPosition(const uint8_t *const root, const int32_t *const inWord,
+    static int getTerminalPosition(const uint8_t *const root, const int *const inWord,
             const int length, const bool forceLowerCaseSearch);
     static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth,
-            uint16_t *outWord, int *outUnigramFrequency);
+            int *outWord, int *outUnigramFrequency);
     static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
     static int getProbability(const int position, const std::map<int, int> *bigramMap,
             const uint8_t *bigramFilter, const int unigramFreq);
@@ -113,17 +98,37 @@ class BinaryFormat {
         REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1,
         REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4
     };
-    const static unsigned int NO_FLAGS = 0;
 
  private:
     DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
-    const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
-    const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
-    const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
+    static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
+    static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
+    static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
+    static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
+    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
+    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
+    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
+
+    // Originally, format version 1 had a 16-bit magic number, then the version number `01'
+    // then options that must be 0. Hence the first 32 bits of the format are always as follows
+    // and it's okay to consider them a magic number as a whole.
+    static const uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
+    static const unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5;
+    // The versions of Latin IME that only handle format version 1 only test for the magic
+    // number, so we had to change it so that version 2 files would be rejected by older
+    // implementations. On this occasion, we made the magic number 32 bits long.
+    static const uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
+
+    static const int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
+    static const int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
+    static const int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
+    static const int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
+    static const unsigned int NO_FLAGS = 0;
     static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos);
+    static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos);
 };
 
-inline int BinaryFormat::detectFormat(const uint8_t *const dict) {
+AK_FORCE_INLINE int BinaryFormat::detectFormat(const uint8_t *const dict) {
     // The magic number is stored big-endian.
     const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3];
     switch (magicNumber) {
@@ -148,7 +153,7 @@ inline int BinaryFormat::detectFormat(const uint8_t *const dict) {
 inline unsigned int BinaryFormat::getFlags(const uint8_t *const dict) {
     switch (detectFormat(dict)) {
     case 1:
-        return NO_FLAGS;
+        return NO_FLAGS; // TODO: NO_FLAGS is unused anywhere else?
     default:
         return (dict[6] << 8) + dict[7];
     }
@@ -166,7 +171,70 @@ inline unsigned int BinaryFormat::getHeaderSize(const uint8_t *const dict) {
     }
 }
 
-inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos) {
+// Looks up `key` in the header's {key,value} pairs and copies the value's code points into
+// outValue (at most outValueSize - 1 of them), then 0-terminates it. Unknown key: empty value.
+inline void BinaryFormat::readHeaderValue(const uint8_t *const dict, const char *const key,
+        int *outValue, const int outValueSize) {
+    int outValueIndex = 0;
+    // Only format 2 and above have header attributes as {key,value} string pairs. For prior
+    // formats, we just return an empty string, as if the key wasn't found.
+    if (2 <= detectFormat(dict)) {
+        const int headerOptionsOffset = 4 /* magic number */
+                + 2 /* dictionary version */ + 2 /* flags */;
+        const int headerSize =
+                (dict[headerOptionsOffset] << 24) + (dict[headerOptionsOffset + 1] << 16)
+                + (dict[headerOptionsOffset + 2] << 8) + dict[headerOptionsOffset + 3];
+        const int headerEnd = headerOptionsOffset + 4 + headerSize;
+        int index = headerOptionsOffset + 4;
+        while (index < headerEnd) {
+            int keyIndex = 0;
+            int codePoint = getCodePointAndForwardPointer(dict, &index);
+            while (codePoint != NOT_A_CODE_POINT) {
+                if (0 == key[keyIndex] || codePoint != key[keyIndex++]) {
+                    break; // Mismatch, or `key` ended first (never read past its terminator).
+                }
+                codePoint = getCodePointAndForwardPointer(dict, &index);
+            }
+            if (codePoint == NOT_A_CODE_POINT && key[keyIndex] == 0) {
+                // Both strings ended together: exact match. Copy as much of the value as fits.
+                codePoint = getCodePointAndForwardPointer(dict, &index);
+                while (codePoint != NOT_A_CODE_POINT && outValueIndex < outValueSize) {
+                    outValue[outValueIndex++] = codePoint;
+                    codePoint = getCodePointAndForwardPointer(dict, &index);
+                }
+                break; // Finished copying. Break to go to the termination code.
+            }
+            // We didn't find the key, skip the remainder of it and its value
+            while (codePoint != NOT_A_CODE_POINT) {
+                codePoint = getCodePointAndForwardPointer(dict, &index);
+            }
+            codePoint = getCodePointAndForwardPointer(dict, &index);
+            while (codePoint != NOT_A_CODE_POINT) {
+                codePoint = getCodePointAndForwardPointer(dict, &index);
+            }
+        }
+        // We couldn't find it - fall through and return an empty value.
+    }
+    // Put a terminator 0 if possible at all (always unless outValueSize is <= 0)
+    if (outValueIndex >= outValueSize) outValueIndex = outValueSize - 1;
+    if (outValueIndex >= 0) outValue[outValueIndex] = 0;
+}
+
+inline int BinaryFormat::readHeaderValueInt(const uint8_t *const dict, const char *const key) {
+    const int bufferSize = LARGEST_INT_DIGIT_COUNT;
+    int intBuffer[bufferSize];
+    char charBuffer[bufferSize];
+    BinaryFormat::readHeaderValue(dict, key, intBuffer, bufferSize);
+    int len = 0; // Number of leading ASCII digits; tested as code points, before narrowing.
+    for (; len < bufferSize - 1 && '0' <= intBuffer[len] && intBuffer[len] <= '9'; ++len) {
+        charBuffer[len] = static_cast<char>(intBuffer[len]);
+    }
+    charBuffer[len] = 0;
+    return (0 == len) ? S_INT_MIN : atoi(charBuffer); // S_INT_MIN means "not a number".
+}
+
+AK_FORCE_INLINE int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict,
+        int *pos) {
     const int msb = dict[(*pos)++];
     if (msb < 0x80) return msb;
     return ((msb & 0x7F) << 8) | dict[(*pos)++];
@@ -176,17 +244,18 @@ inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t *const dict
     return dict[(*pos)++];
 }
 
-inline int32_t BinaryFormat::getCodePointAndForwardPointer(const uint8_t *const dict, int *pos) {
+AK_FORCE_INLINE int BinaryFormat::getCodePointAndForwardPointer(const uint8_t *const dict,
+        int *pos) {
     const int origin = *pos;
-    const int32_t codePoint = dict[origin];
+    const int codePoint = dict[origin];
     if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
         if (codePoint == CHARACTER_ARRAY_TERMINATOR) {
             *pos = origin + 1;
             return NOT_A_CODE_POINT;
         } else {
             *pos = origin + 3;
-            const int32_t char_1 = codePoint << 16;
-            const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
+            const int char_1 = codePoint << 16;
+            const int char_2 = char_1 + (dict[origin + 1] << 8);
             return char_2 + dict[origin + 2];
         }
     } else {
@@ -200,9 +269,9 @@ inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const
     return dict[pos];
 }
 
-inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const int pos) {
+AK_FORCE_INLINE int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const int pos) {
     int currentPos = pos;
-    int32_t character = dict[currentPos++];
+    int character = dict[currentPos++];
     while (CHARACTER_ARRAY_TERMINATOR != character) {
         if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
             currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
@@ -226,7 +295,7 @@ static inline int attributeAddressSize(const uint8_t flags) {
     */
 }
 
-static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) {
+static AK_FORCE_INLINE int skipExistingBigrams(const uint8_t *const dict, const int pos) {
     int currentPos = pos;
     uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
     while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) {
@@ -243,7 +312,7 @@ static inline int childrenAddressSize(const uint8_t flags) {
     /* See the note in attributeAddressSize. The same applies here */
 }
 
-static inline int shortcutByteSize(const uint8_t *const dict, const int pos) {
+static AK_FORCE_INLINE int shortcutByteSize(const uint8_t *const dict, const int pos) {
     return ((int)(dict[pos] << 8)) + (dict[pos + 1]);
 }
 
@@ -255,7 +324,7 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
     return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
 }
 
-inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
+AK_FORCE_INLINE int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
         const int pos) {
     if (FLAG_HAS_SHORTCUT_TARGETS & flags) {
         return pos + shortcutByteSize(dict, pos);
@@ -264,7 +333,7 @@ inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t
     }
 }
 
-inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
+AK_FORCE_INLINE int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
         const int pos) {
     if (FLAG_HAS_BIGRAMS & flags) {
         return skipExistingBigrams(dict, pos);
@@ -273,7 +342,7 @@ inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t fl
     }
 }
 
-inline int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint8_t flags,
+AK_FORCE_INLINE int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint8_t flags,
         const int pos) {
     // This function skips all attributes: shortcuts and bigrams.
     int newPos = pos;
@@ -282,7 +351,7 @@ inline int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint
     return newPos;
 }
 
-inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict,
+AK_FORCE_INLINE int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict,
         const uint8_t flags, const int pos) {
     int currentPos = pos;
     currentPos = skipChildrenPosition(flags, currentPos);
@@ -290,8 +359,8 @@ inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict,
     return currentPos;
 }
 
-inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags,
-        const int pos) {
+AK_FORCE_INLINE int BinaryFormat::readChildrenPosition(const uint8_t *const dict,
+        const uint8_t flags, const int pos) {
     int offset = 0;
     switch (MASK_GROUP_ADDRESS_TYPE & flags) {
         case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
@@ -318,7 +387,7 @@ inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
     return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags));
 }
 
-inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
+AK_FORCE_INLINE int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
         const uint8_t flags, int *pos) {
     int offset = 0;
     const int origin = *pos;
@@ -352,8 +421,8 @@ inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) {
 
 // This function gets the byte position of the last chargroup of the exact matching word in the
 // dictionary. If no match is found, it returns NOT_VALID_WORD.
-inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
-        const int32_t *const inWord, const int length, const bool forceLowerCaseSearch) {
+AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root,
+        const int *const inWord, const int length, const bool forceLowerCaseSearch) {
     int pos = 0;
     int wordPos = 0;
 
@@ -362,14 +431,14 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
         // there was no match (or we would have found it).
         if (wordPos >= length) return NOT_VALID_WORD;
         int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
-        const int32_t wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos];
+        const int wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos];
         while (true) {
             // If there are no more character groups in this node, it means we could not
             // find a matching character for this depth, therefore there is no match.
             if (0 >= charGroupCount) return NOT_VALID_WORD;
             const int charGroupPos = pos;
             const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
-            int32_t character = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
+            int character = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
             if (character == wChar) {
                 // This is the correct node. Only one character group may start with the same
                 // char within a node, so either we found our match in this node, or there is
@@ -438,8 +507,8 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
  * outUnigramFrequency: a pointer to an int to write the frequency into.
  * Return value : the length of the word, of 0 if the word was not found.
  */
-inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address,
-        const int maxDepth, uint16_t *outWord, int *outUnigramFrequency) {
+AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address,
+        const int maxDepth, int *outWord, int *outUnigramFrequency) {
     int pos = 0;
     int wordPos = 0;
 
@@ -457,13 +526,13 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
                  --charGroupCount) {
             const int startPos = pos;
             const uint8_t flags = getFlagsAndForwardPointer(root, &pos);
-            const int32_t character = getCodePointAndForwardPointer(root, &pos);
+            const int character = getCodePointAndForwardPointer(root, &pos);
             if (address == startPos) {
                 // We found the address. Copy the rest of the word in the buffer and return
                 // the length.
                 outWord[wordPos] = character;
                 if (FLAG_HAS_MULTIPLE_CHARS & flags) {
-                    int32_t nextChar = getCodePointAndForwardPointer(root, &pos);
+                    int nextChar = getCodePointAndForwardPointer(root, &pos);
                     // We count chars in order to avoid infinite loops if the file is broken or
                     // if there is some other bug
                     int charCount = maxDepth;
@@ -522,13 +591,12 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
                 if (0 != lastCandidateGroupPos) {
                     const uint8_t lastFlags =
                             getFlagsAndForwardPointer(root, &lastCandidateGroupPos);
-                    const int32_t lastChar =
+                    const int lastChar =
                             getCodePointAndForwardPointer(root, &lastCandidateGroupPos);
                     // We copy all the characters in this group to the buffer
                     outWord[wordPos] = lastChar;
                     if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
-                        int32_t nextChar =
-                                getCodePointAndForwardPointer(root, &lastCandidateGroupPos);
+                        int nextChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos);
                         int charCount = maxDepth;
                         while (-1 != nextChar && --charCount > 0) {
                             outWord[++wordPos] = nextChar;