diff options
Diffstat (limited to 'native/jni/src/binary_format.h')
-rw-r--r-- | native/jni/src/binary_format.h | 84 |
1 files changed, 44 insertions, 40 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index 2d7c4b492..1c4061fd8 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -52,10 +52,10 @@ class BinaryFormat { // Flag for sign of offset. If this flag is set, the offset value must be negated. static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; - // Mask for attribute frequency, stored on 4 bits inside the flags byte. - static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F; - // The numeric value of the shortcut frequency that means 'whitelist'. - static const int WHITELIST_SHORTCUT_FREQUENCY = 15; + // Mask for attribute probability, stored on 4 bits inside the flags byte. + static const int MASK_ATTRIBUTE_PROBABILITY = 0x0F; + // The numeric value of the shortcut probability that means 'whitelist'. + static const int WHITELIST_SHORTCUT_PROBABILITY = 15; // Mask and flags for attribute address type selection. static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; @@ -72,10 +72,10 @@ class BinaryFormat { static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos); static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos); static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos); - static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos); + static int readProbabilityWithoutMovingPointer(const uint8_t *const dict, const int pos); static int skipOtherCharacters(const uint8_t *const dict, const int pos); static int skipChildrenPosition(const uint8_t flags, const int pos); - static int skipFrequency(const uint8_t flags, const int pos); + static int skipProbability(const uint8_t flags, const int pos); static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos); static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags, const int pos); @@ -83,14 +83,15 @@ class BinaryFormat { static bool hasChildrenInFlags(const uint8_t flags); static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags, int *pos); - static int getAttributeFrequencyFromFlags(const int flags); + static int getAttributeProbabilityFromFlags(const int flags); static int getTerminalPosition(const uint8_t *const root, const int *const inWord, const int length, const bool forceLowerCaseSearch); static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, - int *outWord, int *outUnigramFrequency); - static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); + int *outWord, int *outUnigramProbability); + static int computeProbabilityForBigram( + const int unigramProbability, const int bigramProbability); static int getProbability(const int position, const std::map<int, int> *bigramMap, - const uint8_t *bigramFilter, const int unigramFreq); + const uint8_t *bigramFilter, const int unigramProbability); // Flags for special processing // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or @@ -264,7 +265,7 @@ AK_FORCE_INLINE int BinaryFormat::getCodePointAndForwardPointer(const uint8_t *c } } -inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const dict, +inline int BinaryFormat::readProbabilityWithoutMovingPointer(const uint8_t *const dict, const int pos) { return dict[pos]; } @@ -320,7 +321,7 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos return pos + childrenAddressSize(flags); } -inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { +inline int BinaryFormat::skipProbability(const uint8_t flags, const int pos) { return FLAG_IS_TERMINAL & flags ? pos + 1 : pos; } @@ -415,8 +416,8 @@ AK_FORCE_INLINE int BinaryFormat::getAttributeAddressAndForwardPointer(const uin } } -inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) { - return flags & MASK_ATTRIBUTE_FREQUENCY; +inline int BinaryFormat::getAttributeProbabilityFromFlags(const int flags) { + return flags & MASK_ATTRIBUTE_PROBABILITY; } // This function gets the byte position of the last chargroup of the exact matching word in the @@ -466,7 +467,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, if (wordPos == length) { return charGroupPos; } - pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos); + pos = BinaryFormat::skipProbability(FLAG_IS_TERMINAL, pos); } if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) { return NOT_VALID_WORD; @@ -481,7 +482,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = BinaryFormat::skipOtherCharacters(root, pos); } - pos = BinaryFormat::skipFrequency(flags, pos); + pos = BinaryFormat::skipProbability(flags, pos); pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); } --charGroupCount; @@ -504,11 +505,11 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, * address: the byte position of the last chargroup of the word we are searching for (this is * what is stored as the "bigram address" in each bigram) * outword: an array to write the found word, with MAX_WORD_LENGTH size. - * outUnigramFrequency: a pointer to an int to write the frequency into. + * outUnigramProbability: a pointer to an int to write the probability into. * Return value : the length of the word, of 0 if the word was not found. */ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address, - const int maxDepth, int *outWord, int *outUnigramFrequency) { + const int maxDepth, int *outWord, int *outUnigramProbability) { int pos = 0; int wordPos = 0; @@ -541,15 +542,15 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co nextChar = getCodePointAndForwardPointer(root, &pos); } } - *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos); + *outUnigramProbability = readProbabilityWithoutMovingPointer(root, pos); return ++wordPos; } // We need to skip past this char group, so skip any remaining chars after the - // first and possibly the frequency. + // first and possibly the probability. if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = skipOtherCharacters(root, pos); } - pos = skipFrequency(flags, pos); + pos = skipProbability(flags, pos); // The fact that this group has children is very important. Since we already know // that this group does not match, if it has no children we know it is irrelevant @@ -604,9 +605,9 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co } } ++wordPos; - // Now we only need to branch to the children address. Skip the frequency if + // Now we only need to branch to the children address. Skip the probability if // it's there, read pos, and break to resume the search at pos. - lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos); + lastCandidateGroupPos = skipProbability(lastFlags, lastCandidateGroupPos); pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos); break; } else { @@ -635,36 +636,39 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co return 0; } -static inline int backoff(const int unigramFreq) { - return unigramFreq; +static inline int backoff(const int unigramProbability) { + return unigramProbability; // For some reason, applying the backoff weight gives bad results in tests. To apply the // backoff weight, we divide the probability by 2, which in our storing format means // decreasing the score by 8. // TODO: figure out what's wrong with this. - // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8); + // return unigramProbability > 8 ? unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); } -inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) { - // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the - // unigram frequency to be the median value of the 17th step from the top. A value of - // 0 for the bigram frequency represents the middle of the 16th step from the top, +inline int BinaryFormat::computeProbabilityForBigram( + const int unigramProbability, const int bigramProbability) { + // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want the + // unigram probability to be the median value of the 17th step from the top. A value of + // 0 for the bigram probability represents the middle of the 16th step from the top, // while a value of 15 represents the middle of the top step. // See makedict.BinaryDictInputOutput for details. - const float stepSize = static_cast<float>(MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); - return unigramFreq + static_cast<int>(static_cast<float>(bigramFreq + 1) * stepSize); + const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) + / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); + return unigramProbability + + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); } // This returns a probability in log space. inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, - const uint8_t *bigramFilter, const int unigramFreq) { - if (!bigramMap || !bigramFilter) return backoff(unigramFreq); - if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq); - const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position); - if (bigramFreqIt != bigramMap->end()) { - const int bigramFreq = bigramFreqIt->second; - return computeFrequencyForBigram(unigramFreq, bigramFreq); + const uint8_t *bigramFilter, const int unigramProbability) { + if (!bigramMap || !bigramFilter) return backoff(unigramProbability); + if (!isInFilter(bigramFilter, position)) return backoff(unigramProbability); + const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); + if (bigramProbabilityIt != bigramMap->end()) { + const int bigramProbability = bigramProbabilityIt->second; + return computeProbabilityForBigram(unigramProbability, bigramProbability); } - return backoff(unigramFreq); + return backoff(unigramProbability); } } // namespace latinime #endif // LATINIME_BINARY_FORMAT_H |