diff options
Diffstat (limited to 'native/jni/src/binary_format.h')
-rw-r--r-- | native/jni/src/binary_format.h | 27 |
1 files changed, 17 insertions, 10 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index 40f197619..51bf8ebbc 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -66,7 +66,8 @@ class BinaryFormat { static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord, const int length); static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, - uint16_t* outWord); + uint16_t* outWord, int* outUnigramFrequency); + static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); static int getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq); @@ -390,10 +391,11 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, * address: the byte position of the last chargroup of the word we are searching for (this is * what is stored as the "bigram address" in each bigram) * outword: an array to write the found word, with MAX_WORD_LENGTH size. + * outUnigramFrequency: a pointer to an int to write the frequency into. * Return value : the length of the word, of 0 if the word was not found. */ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address, - const int maxDepth, uint16_t* outWord) { + const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) { int pos = 0; int wordPos = 0; @@ -421,11 +423,12 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a // We count chars in order to avoid infinite loops if the file is broken or // if there is some other bug int charCount = maxDepth; - while (-1 != nextChar && --charCount > 0) { + while (NOT_A_CHARACTER != nextChar && --charCount > 0) { outWord[++wordPos] = nextChar; nextChar = getCharCodeAndForwardPointer(root, &pos); } } + *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos); return ++wordPos; } // We need to skip past this char group, so skip any remaining chars after the @@ -529,6 +532,16 @@ static inline int backoff(const int unigramFreq) { // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8); } +inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) { + // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the + // unigram frequency to be the median value of the 17th step from the top. A value of + // 0 for the bigram frequency represents the middle of the 16th step from the top, + // while a value of 15 represents the middle of the top step. + // See makedict.BinaryDictInputOutput for details. + const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); + return (int)(unigramFreq + (bigramFreq + 1) * stepSize); +} + // This returns a probability in log space. inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq) { @@ -537,13 +550,7 @@ inline int BinaryFormat::getProbability(const int position, const std::map<int, const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position); if (bigramFreqIt != bigramMap->end()) { const int bigramFreq = bigramFreqIt->second; - // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the - // unigram frequency to be the median value of the 17th step from the top. A value of - // 0 for the bigram frequency represents the middle of the 16th step from the top, - // while a value of 15 represents the middle of the top step. - // See makedict.BinaryDictInputOutput for details. - const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); - return (int)(unigramFreq + bigramFreq * stepSize); + return computeFrequencyForBigram(unigramFreq, bigramFreq); } else { return backoff(unigramFreq); } |