aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni/src/binary_format.h
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src/binary_format.h')
-rw-r--r--native/jni/src/binary_format.h114
1 files changed, 58 insertions, 56 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index eec52e323..9a8c315f7 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -28,10 +28,6 @@ class BinaryFormat {
public:
// Mask and flags for children address type selection.
static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
- static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
- static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
- static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
- static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
// Flag for single/multiple char group
static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
@@ -61,36 +57,21 @@ class BinaryFormat {
// Mask and flags for attribute address type selection.
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
- static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
- static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
- static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
- const static int UNKNOWN_FORMAT = -1;
- // Originally, format version 1 had a 16-bit magic number, then the version number `01'
- // then options that must be 0. Hence the first 32-bits of the format are always as follow
- // and it's okay to consider them a magic number as a whole.
- const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
- const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5;
- // The versions of Latin IME that only handle format version 1 only test for the magic
- // number, so we had to change it so that version 2 files would be rejected by older
- // implementations. On this occasion, we made the magic number 32 bits long.
- const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
-
- const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
- const static int SHORTCUT_LIST_SIZE_SIZE = 2;
+ static const int UNKNOWN_FORMAT = -1;
+ static const int SHORTCUT_LIST_SIZE_SIZE = 2;
static int detectFormat(const uint8_t *const dict);
static unsigned int getHeaderSize(const uint8_t *const dict);
static unsigned int getFlags(const uint8_t *const dict);
static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos);
static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos);
- static int32_t getCodePointAndForwardPointer(const uint8_t *const dict, int *pos);
+ static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos);
static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos);
static int skipOtherCharacters(const uint8_t *const dict, const int pos);
static int skipChildrenPosition(const uint8_t flags, const int pos);
static int skipFrequency(const uint8_t flags, const int pos);
static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos);
- static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos);
static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags,
const int pos);
static int readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos);
@@ -98,10 +79,10 @@ class BinaryFormat {
static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags,
int *pos);
static int getAttributeFrequencyFromFlags(const int flags);
- static int getTerminalPosition(const uint8_t *const root, const int32_t *const inWord,
+ static int getTerminalPosition(const uint8_t *const root, const int *const inWord,
const int length, const bool forceLowerCaseSearch);
static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth,
- uint16_t *outWord, int *outUnigramFrequency);
+ int *outWord, int *outUnigramFrequency);
static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
static int getProbability(const int position, const std::map<int, int> *bigramMap,
const uint8_t *bigramFilter, const int unigramFreq);
@@ -113,17 +94,37 @@ class BinaryFormat {
REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1,
REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4
};
- const static unsigned int NO_FLAGS = 0;
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
- const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
- const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
- const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
+ static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
+ static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
+ static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
+ static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
+ static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
+ static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
+ static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
+
+ // Originally, format version 1 had a 16-bit magic number, then the version number `01'
+ // then options that must be 0. Hence the first 32-bits of the format are always as follow
+ // and it's okay to consider them a magic number as a whole.
+ static const uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
+ static const unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5;
+ // The versions of Latin IME that only handle format version 1 only test for the magic
+ // number, so we had to change it so that version 2 files would be rejected by older
+ // implementations. On this occasion, we made the magic number 32 bits long.
+ static const uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
+
+ static const int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
+ static const int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
+ static const int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
+ static const int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
+ static const unsigned int NO_FLAGS = 0;
static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos);
+ static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos);
};
-inline int BinaryFormat::detectFormat(const uint8_t *const dict) {
+AK_FORCE_INLINE int BinaryFormat::detectFormat(const uint8_t *const dict) {
// The magic number is stored big-endian.
const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3];
switch (magicNumber) {
@@ -148,7 +149,7 @@ inline int BinaryFormat::detectFormat(const uint8_t *const dict) {
inline unsigned int BinaryFormat::getFlags(const uint8_t *const dict) {
switch (detectFormat(dict)) {
case 1:
- return NO_FLAGS;
+ return NO_FLAGS; // TODO: NO_FLAGS is unused anywhere else?
default:
return (dict[6] << 8) + dict[7];
}
@@ -166,7 +167,8 @@ inline unsigned int BinaryFormat::getHeaderSize(const uint8_t *const dict) {
}
}
-inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos) {
+AK_FORCE_INLINE int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict,
+ int *pos) {
const int msb = dict[(*pos)++];
if (msb < 0x80) return msb;
return ((msb & 0x7F) << 8) | dict[(*pos)++];
@@ -176,17 +178,18 @@ inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t *const dict
return dict[(*pos)++];
}
-inline int32_t BinaryFormat::getCodePointAndForwardPointer(const uint8_t *const dict, int *pos) {
+AK_FORCE_INLINE int BinaryFormat::getCodePointAndForwardPointer(const uint8_t *const dict,
+ int *pos) {
const int origin = *pos;
- const int32_t codePoint = dict[origin];
+ const int codePoint = dict[origin];
if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
if (codePoint == CHARACTER_ARRAY_TERMINATOR) {
*pos = origin + 1;
return NOT_A_CODE_POINT;
} else {
*pos = origin + 3;
- const int32_t char_1 = codePoint << 16;
- const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
+ const int char_1 = codePoint << 16;
+ const int char_2 = char_1 + (dict[origin + 1] << 8);
return char_2 + dict[origin + 2];
}
} else {
@@ -200,9 +203,9 @@ inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const
return dict[pos];
}
-inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const int pos) {
+AK_FORCE_INLINE int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const int pos) {
int currentPos = pos;
- int32_t character = dict[currentPos++];
+ int character = dict[currentPos++];
while (CHARACTER_ARRAY_TERMINATOR != character) {
if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE;
@@ -226,7 +229,7 @@ static inline int attributeAddressSize(const uint8_t flags) {
*/
}
-static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) {
+static AK_FORCE_INLINE int skipExistingBigrams(const uint8_t *const dict, const int pos) {
int currentPos = pos;
uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) {
@@ -255,7 +258,7 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
}
-inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
+AK_FORCE_INLINE int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
const int pos) {
if (FLAG_HAS_SHORTCUT_TARGETS & flags) {
return pos + shortcutByteSize(dict, pos);
@@ -264,7 +267,7 @@ inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t
}
}
-inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
+AK_FORCE_INLINE int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
const int pos) {
if (FLAG_HAS_BIGRAMS & flags) {
return skipExistingBigrams(dict, pos);
@@ -273,7 +276,7 @@ inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t fl
}
}
-inline int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint8_t flags,
+AK_FORCE_INLINE int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint8_t flags,
const int pos) {
// This function skips all attributes: shortcuts and bigrams.
int newPos = pos;
@@ -282,7 +285,7 @@ inline int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint
return newPos;
}
-inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict,
+AK_FORCE_INLINE int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict,
const uint8_t flags, const int pos) {
int currentPos = pos;
currentPos = skipChildrenPosition(flags, currentPos);
@@ -290,8 +293,8 @@ inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict,
return currentPos;
}
-inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags,
- const int pos) {
+AK_FORCE_INLINE int BinaryFormat::readChildrenPosition(const uint8_t *const dict,
+ const uint8_t flags, const int pos) {
int offset = 0;
switch (MASK_GROUP_ADDRESS_TYPE & flags) {
case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
@@ -318,7 +321,7 @@ inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags));
}
-inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
+AK_FORCE_INLINE int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
const uint8_t flags, int *pos) {
int offset = 0;
const int origin = *pos;
@@ -352,8 +355,8 @@ inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) {
// This function gets the byte position of the last chargroup of the exact matching word in the
// dictionary. If no match is found, it returns NOT_VALID_WORD.
-inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
- const int32_t *const inWord, const int length, const bool forceLowerCaseSearch) {
+AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root,
+ const int *const inWord, const int length, const bool forceLowerCaseSearch) {
int pos = 0;
int wordPos = 0;
@@ -362,14 +365,14 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
// there was no match (or we would have found it).
if (wordPos >= length) return NOT_VALID_WORD;
int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
- const int32_t wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos];
+ const int wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos];
while (true) {
// If there are no more character groups in this node, it means we could not
// find a matching character for this depth, therefore there is no match.
if (0 >= charGroupCount) return NOT_VALID_WORD;
const int charGroupPos = pos;
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
- int32_t character = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
+ int character = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
if (character == wChar) {
// This is the correct node. Only one character group may start with the same
// char within a node, so either we found our match in this node, or there is
@@ -438,8 +441,8 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
* outUnigramFrequency: a pointer to an int to write the frequency into.
* Return value : the length of the word, of 0 if the word was not found.
*/
-inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address,
- const int maxDepth, uint16_t *outWord, int *outUnigramFrequency) {
+AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address,
+ const int maxDepth, int *outWord, int *outUnigramFrequency) {
int pos = 0;
int wordPos = 0;
@@ -457,13 +460,13 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
--charGroupCount) {
const int startPos = pos;
const uint8_t flags = getFlagsAndForwardPointer(root, &pos);
- const int32_t character = getCodePointAndForwardPointer(root, &pos);
+ const int character = getCodePointAndForwardPointer(root, &pos);
if (address == startPos) {
// We found the address. Copy the rest of the word in the buffer and return
// the length.
outWord[wordPos] = character;
if (FLAG_HAS_MULTIPLE_CHARS & flags) {
- int32_t nextChar = getCodePointAndForwardPointer(root, &pos);
+ int nextChar = getCodePointAndForwardPointer(root, &pos);
// We count chars in order to avoid infinite loops if the file is broken or
// if there is some other bug
int charCount = maxDepth;
@@ -522,13 +525,12 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
if (0 != lastCandidateGroupPos) {
const uint8_t lastFlags =
getFlagsAndForwardPointer(root, &lastCandidateGroupPos);
- const int32_t lastChar =
+ const int lastChar =
getCodePointAndForwardPointer(root, &lastCandidateGroupPos);
// We copy all the characters in this group to the buffer
outWord[wordPos] = lastChar;
if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
- int32_t nextChar =
- getCodePointAndForwardPointer(root, &lastCandidateGroupPos);
+ int nextChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos);
int charCount = maxDepth;
while (-1 != nextChar && --charCount > 0) {
outWord[++wordPos] = nextChar;