6 files changed, 47 insertions, 19 deletions
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp
index ac2a26172..eb4bf8d1a 100644
--- a/native/jni/src/bigram_dictionary.cpp
+++ b/native/jni/src/bigram_dictionary.cpp
@@ -117,14 +117,22 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
     do {
         bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
         uint16_t bigramBuffer[MAX_WORD_LENGTH];
+        int unigramFreq;
         const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
                 &pos);
         const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH,
-                bigramBuffer);
+                bigramBuffer, &unigramFreq);
 
         // codesSize == 0 means we are trying to find bigram predictions.
         if (codesSize < 1 || checkFirstCharacter(bigramBuffer)) {
-            const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
+            const int bigramFreq = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
+            // Due to space constraints, the frequency for bigrams is approximate - the lower the
+            // unigram frequency, the worse the precision. The theoritical maximum error in
+            // resulting frequency is 8 - although in the practice it's never bigger than 3 or 4
+            // in very bad cases. This means that sometimes, we'll see some bigrams interverted
+            // here, but it can't get too bad.
+            const int frequency =
+                    BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreq);
             if (addWordBigram(bigramBuffer, length, frequency)) {
                 ++bigramCount;
             }
@@ -149,8 +157,8 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
     } else {
         pos = BinaryFormat::skipOtherCharacters(root, pos);
     }
-    pos = BinaryFormat::skipChildrenPosition(flags, pos);
     pos = BinaryFormat::skipFrequency(flags, pos);
+    pos = BinaryFormat::skipChildrenPosition(flags, pos);
     pos = BinaryFormat::skipShortcuts(root, flags, pos);
     return pos;
 }
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index 40f197619..51bf8ebbc 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -66,7 +66,8 @@ class BinaryFormat {
     static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord,
             const int length);
     static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
-            uint16_t* outWord);
+            uint16_t* outWord, int* outUnigramFrequency);
+    static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
     static int getProbability(const int position, const std::map<int, int> *bigramMap,
             const uint8_t *bigramFilter, const int unigramFreq);
 
@@ -390,10 +391,11 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
  * address: the byte position of the last chargroup of the word we are searching for (this is
  *   what is stored as the "bigram address" in each bigram)
  * outword: an array to write the found word, with MAX_WORD_LENGTH size.
+ * outUnigramFrequency: a pointer to an int to write the frequency into.
  * Return value : the length of the word, of 0 if the word was not found.
  */
 inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address,
-        const int maxDepth, uint16_t* outWord) {
+        const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) {
     int pos = 0;
     int wordPos = 0;
 
@@ -421,11 +423,12 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
                     // We count chars in order to avoid infinite loops if the file is broken or
                     // if there is some other bug
                     int charCount = maxDepth;
-                    while (-1 != nextChar && --charCount > 0) {
+                    while (NOT_A_CHARACTER != nextChar && --charCount > 0) {
                         outWord[++wordPos] = nextChar;
                         nextChar = getCharCodeAndForwardPointer(root, &pos);
                     }
                 }
+                *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos);
                 return ++wordPos;
             }
             // We need to skip past this char group, so skip any remaining chars after the
@@ -529,6 +532,16 @@ static inline int backoff(const int unigramFreq) {
     // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8);
 }
 
+inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) {
+    // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
+    // unigram frequency to be the median value of the 17th step from the top. A value of
+    // 0 for the bigram frequency represents the middle of the 16th step from the top,
+    // while a value of 15 represents the middle of the top step.
+    // See makedict.BinaryDictInputOutput for details.
+    const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
+    return (int)(unigramFreq + (bigramFreq + 1) * stepSize);
+}
+
 // This returns a probability in log space.
 inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
         const uint8_t *bigramFilter, const int unigramFreq) {
@@ -537,13 +550,7 @@ inline int BinaryFormat::getProbability(const int position, const std::map<int,
     const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
     if (bigramFreqIt != bigramMap->end()) {
         const int bigramFreq = bigramFreqIt->second;
-        // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
-        // unigram frequency to be the median value of the 17th step from the top. A value of
-        // 0 for the bigram frequency represents the middle of the 16th step from the top,
-        // while a value of 15 represents the middle of the top step.
-        // See makedict.BinaryDictInputOutput for details.
-        const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
-        return (int)(unigramFreq + bigramFreq * stepSize);
+        return computeFrequencyForBigram(unigramFreq, bigramFreq);
     } else {
         return backoff(unigramFreq);
     }
diff --git a/native/jni/src/dictionary.cpp b/native/jni/src/dictionary.cpp
index 65d0f73a3..1fb02478b 100644
--- a/native/jni/src/dictionary.cpp
+++ b/native/jni/src/dictionary.cpp
@@ -55,8 +55,8 @@ Dictionary::~Dictionary() {
     delete mBigramDictionary;
 }
 
-bool Dictionary::isValidWord(const int32_t *word, int length) {
-    return mUnigramDictionary->isValidWord(word, length);
+int Dictionary::getFrequency(const int32_t *word, int length) {
+    return mUnigramDictionary->getFrequency(word, length);
 }
 
 bool Dictionary::isValidBigram(const int32_t *word1, int length1, const int32_t *word2,
diff --git a/native/jni/src/dictionary.h b/native/jni/src/dictionary.h
index 87891ee4d..9f2367904 100644
--- a/native/jni/src/dictionary.h
+++ b/native/jni/src/dictionary.h
@@ -52,7 +52,7 @@ class Dictionary {
                 maxWordLength, maxBigrams);
     }
 
-    bool isValidWord(const int32_t *word, int length);
+    int getFrequency(const int32_t *word, int length);
     bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2);
     void *getDict() { return (void *)mDict; }
     int getDictSize() { return mDictSize; }
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp
index 828582848..efe9c4fe3 100644
--- a/native/jni/src/unigram_dictionary.cpp
+++ b/native/jni/src/unigram_dictionary.cpp
@@ -747,8 +747,21 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor
     return maxFreq;
 }
 
-bool UnigramDictionary::isValidWord(const int32_t* const inWord, const int length) const {
-    return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);
+int UnigramDictionary::getFrequency(const int32_t* const inWord, const int length) const {
+    const uint8_t* const root = DICT_ROOT;
+    int pos = BinaryFormat::getTerminalPosition(root, inWord, length);
+    if (NOT_VALID_WORD == pos) {
+        return NOT_A_PROBABILITY;
+    }
+    const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
+    const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags));
+    if (hasMultipleChars) {
+        pos = BinaryFormat::skipOtherCharacters(root, pos);
+    } else {
+        BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos);
+    }
+    const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
+    return unigramFreq;
 }
 
 // TODO: remove this function.
diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h
index b9233518f..b70894004 100644
--- a/native/jni/src/unigram_dictionary.h
+++ b/native/jni/src/unigram_dictionary.h
@@ -72,7 +72,7 @@ class UnigramDictionary {
 
     UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
             int fullWordMultiplier, int maxWordLength, int maxWords, const unsigned int flags);
-    bool isValidWord(const int32_t* const inWord, const int length) const;
+    int getFrequency(const int32_t* const inWord, const int length) const;
     int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
     int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool,
             Correction *correction, const int *xcoordinates, const int *ycoordinates,