1 files changed, 17 insertions, 10 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index 40f197619..51bf8ebbc 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -66,7 +66,8 @@ class BinaryFormat {
     static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord,
             const int length);
     static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
-            uint16_t* outWord);
+            uint16_t* outWord, int* outUnigramFrequency);
+    static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
     static int getProbability(const int position, const std::map<int, int> *bigramMap,
             const uint8_t *bigramFilter, const int unigramFreq);
 
@@ -390,10 +391,11 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
  * address: the byte position of the last chargroup of the word we are searching for (this is
  *   what is stored as the "bigram address" in each bigram)
  * outword: an array to write the found word, with MAX_WORD_LENGTH size.
+ * outUnigramFrequency: a pointer to an int that receives the unigram frequency of the found word.
  * Return value : the length of the word, of 0 if the word was not found.
  */
 inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address,
-        const int maxDepth, uint16_t* outWord) {
+        const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) {
     int pos = 0;
     int wordPos = 0;
 
@@ -421,11 +423,12 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
                     // We count chars in order to avoid infinite loops if the file is broken or
                     // if there is some other bug
                     int charCount = maxDepth;
-                    while (-1 != nextChar && --charCount > 0) {
+                    while (NOT_A_CHARACTER != nextChar && --charCount > 0) {
                         outWord[++wordPos] = nextChar;
                         nextChar = getCharCodeAndForwardPointer(root, &pos);
                     }
                 }
+                *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos);
                 return ++wordPos;
             }
             // We need to skip past this char group, so skip any remaining chars after the
@@ -529,6 +532,16 @@ static inline int backoff(const int unigramFreq) {
     // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8);
 }
 
+inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) {
+    // Maps a bigram frequency onto the interval between unigramFreq and MAX_FREQ. That interval
+    // is cut into (MAX_BIGRAM_FREQ + 1.5) steps, i.e. 16.5 steps for a top bigram value of 15,
+    // so that bigramFreq 0 lands one full step above unigramFreq and the top value lands half
+    // a step below MAX_FREQ. See makedict.BinaryDictInputOutput for details.
+    const float range = static_cast<float>(MAX_FREQ) - unigramFreq;
+    const float step = range / (MAX_BIGRAM_FREQ + 1.5f);
+    return static_cast<int>(unigramFreq + step * (bigramFreq + 1));
+}
+
 // This returns a probability in log space.
 inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
         const uint8_t *bigramFilter, const int unigramFreq) {
@@ -537,13 +550,7 @@ inline int BinaryFormat::getProbability(const int position, const std::map<int,
     const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
     if (bigramFreqIt != bigramMap->end()) {
         const int bigramFreq = bigramFreqIt->second;
-        // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
-        // unigram frequency to be the median value of the 17th step from the top. A value of
-        // 0 for the bigram frequency represents the middle of the 16th step from the top,
-        // while a value of 15 represents the middle of the top step.
-        // See makedict.BinaryDictInputOutput for details.
-        const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
-        return (int)(unigramFreq + bigramFreq * stepSize);
+        return computeFrequencyForBigram(unigramFreq, bigramFreq);
     } else {
         return backoff(unigramFreq);
     }