diff options
Diffstat (limited to 'native/jni/src/bigram_dictionary.cpp')
-rw-r--r-- | native/jni/src/bigram_dictionary.cpp | 14 |
1 files changed, 11 insertions, 3 deletions
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index ac2a26172..eb4bf8d1a 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -117,14 +117,22 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); uint16_t bigramBuffer[MAX_WORD_LENGTH]; + int unigramFreq; const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, - bigramBuffer); + bigramBuffer, &unigramFreq); // codesSize == 0 means we are trying to find bigram predictions. if (codesSize < 1 || checkFirstCharacter(bigramBuffer)) { - const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + const int bigramFreq = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + // Due to space constraints, the frequency for bigrams is approximate - the lower the + // unigram frequency, the worse the precision. The theoretical maximum error in + // resulting frequency is 8 - although in practice it's never bigger than 3 or 4 + // in very bad cases. This means that sometimes, we'll see some bigrams inverted + // here, but it can't get too bad. + const int frequency = + BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreq); if (addWordBigram(bigramBuffer, length, frequency)) { ++bigramCount; } @@ -149,8 +157,8 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord, } else { pos = BinaryFormat::skipOtherCharacters(root, pos); } - pos = BinaryFormat::skipChildrenPosition(flags, pos); pos = BinaryFormat::skipFrequency(flags, pos); + pos = BinaryFormat::skipChildrenPosition(flags, pos); pos = BinaryFormat::skipShortcuts(root, flags, pos); return pos; } |