aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni/src/binary_format.h
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src/binary_format.h')
-rw-r--r--native/jni/src/binary_format.h200
1 files changed, 119 insertions, 81 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index 51bf8ebbc..d8f3e83dd 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -18,18 +18,49 @@
#define LATINIME_BINARY_FORMAT_H
#include <limits>
+#include <map>
#include "bloom_filter.h"
-#include "unigram_dictionary.h"
+#include "char_utils.h"
namespace latinime {
class BinaryFormat {
- private:
- const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
- const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
- const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
-
public:
+ // Mask and flags for children address type selection.
+ static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
+ static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
+ static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
+ static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
+ static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
+
+ // Flag for single/multiple char group
+ static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
+
+ // Flag for terminal groups
+ static const int FLAG_IS_TERMINAL = 0x10;
+
+ // Flag for shortcut targets presence
+ static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
+ // Flag for bigram presence
+ static const int FLAG_HAS_BIGRAMS = 0x04;
+
+ // Attribute (bigram/shortcut) related flags:
+ // Flag for presence of more attributes
+ static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
+ // Flag for sign of offset. If this flag is set, the offset value must be negated.
+ static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
+
+ // Mask for attribute frequency, stored on 4 bits inside the flags byte.
+ static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
+ // The numeric value of the shortcut frequency that means 'whitelist'.
+ static const int WHITELIST_SHORTCUT_FREQUENCY = 15;
+
+ // Mask and flags for attribute address type selection.
+ static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
+ static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
+ static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
+ static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
+
const static int UNKNOWN_FORMAT = -1;
// Originally, format version 1 had a 16-bit magic number, then the version number `01'
// then options that must be 0. Hence the first 32-bits of the format are always as follow
@@ -44,29 +75,29 @@ class BinaryFormat {
const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
const static int SHORTCUT_LIST_SIZE_SIZE = 2;
- static int detectFormat(const uint8_t* const dict);
- static unsigned int getHeaderSize(const uint8_t* const dict);
- static unsigned int getFlags(const uint8_t* const dict);
- static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
- static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
- static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
- static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
- static int skipOtherCharacters(const uint8_t* const dict, const int pos);
+ static int detectFormat(const uint8_t *const dict);
+ static unsigned int getHeaderSize(const uint8_t *const dict);
+ static unsigned int getFlags(const uint8_t *const dict);
+ static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos);
+ static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos);
+ static int32_t getCharCodeAndForwardPointer(const uint8_t *const dict, int *pos);
+ static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos);
+ static int skipOtherCharacters(const uint8_t *const dict, const int pos);
static int skipChildrenPosition(const uint8_t flags, const int pos);
static int skipFrequency(const uint8_t flags, const int pos);
- static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos);
- static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos);
- static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
- static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
+ static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos);
+ static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos);
+ static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags,
const int pos);
- static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos);
+ static int readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos);
static bool hasChildrenInFlags(const uint8_t flags);
- static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
+ static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags,
int *pos);
- static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord,
- const int length);
- static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
- uint16_t* outWord, int* outUnigramFrequency);
+ static int getAttributeFrequencyFromFlags(const int flags);
+ static int getTerminalPosition(const uint8_t *const root, const int32_t *const inWord,
+ const int length, const bool forceLowerCaseSearch);
+ static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth,
+ uint16_t *outWord, int *outUnigramFrequency);
static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
static int getProbability(const int position, const std::map<int, int> *bigramMap,
const uint8_t *bigramFilter, const int unigramFreq);
@@ -79,9 +110,16 @@ class BinaryFormat {
REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4
};
const static unsigned int NO_FLAGS = 0;
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
+ const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
+ const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F;
+ const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2;
+ static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos);
};
-inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
+inline int BinaryFormat::detectFormat(const uint8_t *const dict) {
// The magic number is stored big-endian.
const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3];
switch (magicNumber) {
@@ -103,7 +141,7 @@ inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
}
}
-inline unsigned int BinaryFormat::getFlags(const uint8_t* const dict) {
+inline unsigned int BinaryFormat::getFlags(const uint8_t *const dict) {
switch (detectFormat(dict)) {
case 1:
return NO_FLAGS;
@@ -112,7 +150,7 @@ inline unsigned int BinaryFormat::getFlags(const uint8_t* const dict) {
}
}
-inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) {
+inline unsigned int BinaryFormat::getHeaderSize(const uint8_t *const dict) {
switch (detectFormat(dict)) {
case 1:
return FORMAT_VERSION_1_HEADER_SIZE;
@@ -124,17 +162,17 @@ inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) {
}
}
-inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
+inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos) {
const int msb = dict[(*pos)++];
if (msb < 0x80) return msb;
return ((msb & 0x7F) << 8) | dict[(*pos)++];
}
-inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) {
+inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t *const dict, int *pos) {
return dict[(*pos)++];
}
-inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) {
+inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t *const dict, int *pos) {
const int origin = *pos;
const int32_t character = dict[origin];
if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
@@ -153,12 +191,12 @@ inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const d
}
}
-inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict,
+inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const dict,
const int pos) {
return dict[pos];
}
-inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) {
+inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const int pos) {
int currentPos = pos;
int32_t character = dict[currentPos++];
while (CHARACTER_ARRAY_TERMINATOR != character) {
@@ -172,22 +210,22 @@ inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const in
static inline int attributeAddressSize(const uint8_t flags) {
static const int ATTRIBUTE_ADDRESS_SHIFT = 4;
- return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
+ return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
/* Note: this is a value-dependant optimization of what may probably be
more readably written this way:
- switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) {
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
+ switch (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) {
+ case BinaryFormat::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
+ case BinaryFormat::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
+ case BinaryFormat::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: return 3;
default: return 0;
}
*/
}
-static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) {
+static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) {
int currentPos = pos;
uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
- while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
+ while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) {
currentPos += attributeAddressSize(flags);
flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
}
@@ -197,11 +235,11 @@ static inline int skipExistingBigrams(const uint8_t* const dict, const int pos)
static inline int childrenAddressSize(const uint8_t flags) {
static const int CHILDREN_ADDRESS_SHIFT = 6;
- return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
+ return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT;
/* See the note in attributeAddressSize. The same applies here */
}
-static inline int shortcutByteSize(const uint8_t* const dict, const int pos) {
+static inline int shortcutByteSize(const uint8_t *const dict, const int pos) {
return ((int)(dict[pos] << 8)) + (dict[pos + 1]);
}
@@ -210,28 +248,28 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos
}
inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
- return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
+ return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
}
-inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags,
+inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags,
const int pos) {
- if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
+ if (FLAG_HAS_SHORTCUT_TARGETS & flags) {
return pos + shortcutByteSize(dict, pos);
} else {
return pos;
}
}
-inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags,
+inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags,
const int pos) {
- if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
+ if (FLAG_HAS_BIGRAMS & flags) {
return skipExistingBigrams(dict, pos);
} else {
return pos;
}
}
-inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
+inline int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint8_t flags,
const int pos) {
// This function skips all attributes: shortcuts and bigrams.
int newPos = pos;
@@ -240,7 +278,7 @@ inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint
return newPos;
}
-inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
+inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict,
const uint8_t flags, const int pos) {
int currentPos = pos;
currentPos = skipChildrenPosition(flags, currentPos);
@@ -248,18 +286,18 @@ inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict,
return currentPos;
}
-inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags,
+inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags,
const int pos) {
int offset = 0;
- switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) {
- case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
+ switch (MASK_GROUP_ADDRESS_TYPE & flags) {
+ case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
offset = dict[pos];
break;
- case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
+ case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
offset = dict[pos] << 8;
offset += dict[pos + 1];
break;
- case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
+ case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
offset = dict[pos] << 16;
offset += dict[pos + 1] << 8;
offset += dict[pos + 2];
@@ -273,42 +311,45 @@ inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const u
}
inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) {
- return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
- != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
+ return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags));
}
-inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict,
+inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict,
const uint8_t flags, int *pos) {
int offset = 0;
const int origin = *pos;
- switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
+ switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
offset = dict[origin];
*pos = origin + 1;
break;
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
offset = dict[origin] << 8;
offset += dict[origin + 1];
*pos = origin + 2;
break;
- case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
offset = dict[origin] << 16;
offset += dict[origin + 1] << 8;
offset += dict[origin + 2];
*pos = origin + 3;
break;
}
- if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
+ if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) {
return origin - offset;
} else {
return origin + offset;
}
}
+inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) {
+ return flags & MASK_ATTRIBUTE_FREQUENCY;
+}
+
// This function gets the byte position of the last chargroup of the exact matching word in the
// dictionary. If no match is found, it returns NOT_VALID_WORD.
-inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
- const int32_t* const inWord, const int length) {
+inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
+ const int32_t *const inWord, const int length, const bool forceLowerCaseSearch) {
int pos = 0;
int wordPos = 0;
@@ -317,7 +358,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
// there was no match (or we would have found it).
if (wordPos > length) return NOT_VALID_WORD;
int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
- const int32_t wChar = inWord[wordPos];
+ const int32_t wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos];
while (true) {
// If there are no more character groups in this node, it means we could not
// find a matching character for this depth, therefore there is no match.
@@ -330,7 +371,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
// char within a node, so either we found our match in this node, or there is
// no match and we can return NOT_VALID_WORD. So we will check all the characters
// in this character group indeed does match.
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & flags) {
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
while (NOT_A_CHARACTER != character) {
++wordPos;
@@ -348,14 +389,13 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
// If we don't match the length AND don't have children, then a word in the
// dictionary fully matches a prefix of the searched word but not the full word.
++wordPos;
- if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
+ if (FLAG_IS_TERMINAL & flags) {
if (wordPos == length) {
return charGroupPos;
}
- pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
+ pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
}
- if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
- == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
+ if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) {
return NOT_VALID_WORD;
}
// We have children and we are still shorter than the word we are searching for, so
@@ -365,7 +405,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
break;
} else {
// This chargroup does not match, so skip the remaining part and go to the next.
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & flags) {
pos = BinaryFormat::skipOtherCharacters(root, pos);
}
pos = BinaryFormat::skipFrequency(flags, pos);
@@ -394,8 +434,8 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
* outUnigramFrequency: a pointer to an int to write the frequency into.
* Return value : the length of the word, of 0 if the word was not found.
*/
-inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address,
- const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) {
+inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address,
+ const int maxDepth, uint16_t *outWord, int *outUnigramFrequency) {
int pos = 0;
int wordPos = 0;
@@ -418,7 +458,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
// We found the address. Copy the rest of the word in the buffer and return
// the length.
outWord[wordPos] = character;
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & flags) {
int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
// We count chars in order to avoid infinite loops if the file is broken or
// if there is some other bug
@@ -433,7 +473,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
}
// We need to skip past this char group, so skip any remaining chars after the
// first and possibly the frequency.
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & flags) {
pos = skipOtherCharacters(root, pos);
}
pos = skipFrequency(flags, pos);
@@ -441,8 +481,8 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
// The fact that this group has children is very important. Since we already know
// that this group does not match, if it has no children we know it is irrelevant
// to what we are searching for.
- const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
- (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags));
+ const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS !=
+ (MASK_GROUP_ADDRESS_TYPE & flags));
// We will write in `found' whether we have passed the children address we are
// searching for. For example if we search for "beer", the children of b are less
// than the address we are searching for and the children of c are greater. When we
@@ -482,7 +522,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
// We copy all the characters in this group to the buffer
outWord[wordPos] = lastChar;
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
+ if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
int32_t nextChar =
getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
int charCount = maxDepth;
@@ -538,8 +578,8 @@ inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const
// 0 for the bigram frequency represents the middle of the 16th step from the top,
// while a value of 15 represents the middle of the top step.
// See makedict.BinaryDictInputOutput for details.
- const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
- return (int)(unigramFreq + (bigramFreq + 1) * stepSize);
+ const float stepSize = (static_cast<float>(MAX_FREQ) - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
+ return static_cast<int>(unigramFreq + (bigramFreq + 1) * stepSize);
}
// This returns a probability in log space.
@@ -555,7 +595,5 @@ inline int BinaryFormat::getProbability(const int position, const std::map<int,
return backoff(unigramFreq);
}
}
-
} // namespace latinime
-
#endif // LATINIME_BINARY_FORMAT_H