aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni/src/binary_format.h
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src/binary_format.h')
-rw-r--r--native/jni/src/binary_format.h77
1 files changed, 77 insertions, 0 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index 2d2e19501..06f50dc7f 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -23,6 +23,7 @@
#include "bloom_filter.h"
#include "char_utils.h"
+#include "hash_map_compat.h"
namespace latinime {
@@ -66,6 +67,7 @@ class BinaryFormat {
static int detectFormat(const uint8_t *const dict);
static int getHeaderSize(const uint8_t *const dict);
static int getFlags(const uint8_t *const dict);
+ static bool hasBlacklistedOrNotAWordFlag(const int flags);
static void readHeaderValue(const uint8_t *const dict, const char *const key, int *outValue,
const int outValueSize);
static int readHeaderValueInt(const uint8_t *const dict, const char *const key);
@@ -92,7 +94,13 @@ class BinaryFormat {
const int unigramProbability, const int bigramProbability);
static int getProbability(const int position, const std::map<int, int> *bigramMap,
const uint8_t *bigramFilter, const int unigramProbability);
+ static int getBigramProbabilityFromHashMap(const int position,
+ const hash_map_compat<int, int> *bigramMap, const int unigramProbability);
static float getMultiWordCostMultiplier(const uint8_t *const dict);
+ static void fillBigramProbabilityToHashMap(const uint8_t *const root, int position,
+ hash_map_compat<int, int> *bigramMap);
+ static int getBigramProbability(const uint8_t *const root, int position,
+ const int nextPosition, const int unigramProbability);
// Flags for special processing
// Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
@@ -104,6 +112,8 @@ class BinaryFormat {
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
+ static int getBigramListPositionForWordPosition(const uint8_t *const root, int position);
+
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
@@ -162,6 +172,10 @@ inline int BinaryFormat::getFlags(const uint8_t *const dict) {
}
}
+// Tells whether |flags| has FLAG_IS_BLACKLISTED and/or FLAG_IS_NOT_A_WORD set.
+inline bool BinaryFormat::hasBlacklistedOrNotAWordFlag(const int flags) {
+    const int specialWordMask = FLAG_IS_BLACKLISTED | FLAG_IS_NOT_A_WORD;
+    return 0 != (flags & specialWordMask);
+}
+
inline int BinaryFormat::getHeaderSize(const uint8_t *const dict) {
switch (detectFormat(dict)) {
case 1:
@@ -682,5 +696,68 @@ inline int BinaryFormat::getProbability(const int position, const std::map<int,
}
return backoff(unigramProbability);
}
+
+// Looks |position| up in |bigramMap|. On a hit, the stored bigram probability is combined with
+// |unigramProbability| through computeProbabilityForBigram(). When |bigramMap| is null or holds
+// no entry for |position|, the result is backoff(unigramProbability).
+// This returns a probability in log space.
+inline int BinaryFormat::getBigramProbabilityFromHashMap(const int position,
+        const hash_map_compat<int, int> *bigramMap, const int unigramProbability) {
+    if (bigramMap) {
+        const hash_map_compat<int, int>::const_iterator entry = bigramMap->find(position);
+        if (entry != bigramMap->end()) {
+            return computeProbabilityForBigram(unigramProbability, entry->second);
+        }
+    }
+    return backoff(unigramProbability);
+}
+
+// Walks the bigram list of the word at |position| and stores one entry per bigram in
+// |bigramMap|: the key is the bigram's target position, the value is the probability bits
+// (MASK_ATTRIBUTE_PROBABILITY) of the bigram's flags byte. The map is left untouched when
+// getBigramListPositionForWordPosition() reports 0, i.e. there is no list to walk.
+AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap(
+        const uint8_t *const root, int position, hash_map_compat<int, int> *bigramMap) {
+    int listPos = getBigramListPositionForWordPosition(root, position);
+    if (0 == listPos) return;
+
+    for (;;) {
+        // Each entry is a flags byte followed by an address; both reads advance |listPos|.
+        const uint8_t entryFlags = getFlagsAndForwardPointer(root, &listPos);
+        const int entryProbability = MASK_ATTRIBUTE_PROBABILITY & entryFlags;
+        const int targetPos = getAttributeAddressAndForwardPointer(root, entryFlags, &listPos);
+        (*bigramMap)[targetPos] = entryProbability;
+        // The list always has at least one entry; stop once the "has next" bit is clear.
+        if (!(FLAG_ATTRIBUTE_HAS_NEXT & entryFlags)) break;
+    }
+}
+
+// Scans the bigram list of the word at |position| for an entry whose target is |nextPosition|.
+// A match yields computeProbabilityForBigram() applied to |unigramProbability| and the entry's
+// probability bits. If the word has no bigram list, or the list ends without a match, the
+// result is backoff(unigramProbability).
+AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root, int position,
+        const int nextPosition, const int unigramProbability) {
+    int listPos = getBigramListPositionForWordPosition(root, position);
+    if (0 != listPos) {
+        bool hasNext = true;
+        while (hasNext) {
+            // Flags first, then the target address; both reads advance |listPos|.
+            const uint8_t entryFlags = getFlagsAndForwardPointer(root, &listPos);
+            const int targetPos =
+                    getAttributeAddressAndForwardPointer(root, entryFlags, &listPos);
+            if (targetPos == nextPosition) {
+                return computeProbabilityForBigram(
+                        unigramProbability, MASK_ATTRIBUTE_PROBABILITY & entryFlags);
+            }
+            hasNext = 0 != (FLAG_ATTRIBUTE_HAS_NEXT & entryFlags);
+        }
+    }
+    return backoff(unigramProbability);
+}
+
+// Returns the position (an int used together with |root|, not a C++ pointer) at which the
+// bigram list of the word at |position| starts. Returns 0 when |position| is NOT_VALID_WORD or
+// when the word's flags do not have FLAG_HAS_BIGRAMS set.
+AK_FORCE_INLINE int BinaryFormat::getBigramListPositionForWordPosition(
+        const uint8_t *const root, int position) {
+    if (NOT_VALID_WORD == position) return 0;
+
+    int cursor = position;
+    const uint8_t wordFlags = getFlagsAndForwardPointer(root, &cursor);
+    if (0 == (wordFlags & FLAG_HAS_BIGRAMS)) return 0;
+
+    // Step over the character data: a single code point, or a multi-character run.
+    if (0 == (wordFlags & FLAG_HAS_MULTIPLE_CHARS)) {
+        getCodePointAndForwardPointer(root, &cursor);
+    } else {
+        cursor = skipOtherCharacters(root, cursor);
+    }
+
+    // Then step over the probability, the children position and the shortcuts, in that order.
+    cursor = skipProbability(wordFlags, cursor);
+    cursor = skipChildrenPosition(wordFlags, cursor);
+    return skipShortcuts(root, wordFlags, cursor);
+}
+
} // namespace latinime
#endif // LATINIME_BINARY_FORMAT_H