aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni/src/bigram_dictionary.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src/bigram_dictionary.cpp')
-rw-r--r--native/jni/src/bigram_dictionary.cpp14
1 files changed, 11 insertions, 3 deletions
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp
index ac2a26172..eb4bf8d1a 100644
--- a/native/jni/src/bigram_dictionary.cpp
+++ b/native/jni/src/bigram_dictionary.cpp
@@ -117,14 +117,22 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
do {
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
uint16_t bigramBuffer[MAX_WORD_LENGTH];
+ int unigramFreq;
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
&pos);
const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH,
- bigramBuffer);
+ bigramBuffer, &unigramFreq);
// codesSize == 0 means we are trying to find bigram predictions.
if (codesSize < 1 || checkFirstCharacter(bigramBuffer)) {
- const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
+ const int bigramFreq = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
+ // Due to space constraints, the frequency for bigrams is approximate - the lower the
+ // unigram frequency, the worse the precision. The theoritical maximum error in
+ // resulting frequency is 8 - although in the practice it's never bigger than 3 or 4
+ // in very bad cases. This means that sometimes, we'll see some bigrams interverted
+ // here, but it can't get too bad.
+ const int frequency =
+ BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreq);
if (addWordBigram(bigramBuffer, length, frequency)) {
++bigramCount;
}
@@ -149,8 +157,8 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
} else {
pos = BinaryFormat::skipOtherCharacters(root, pos);
}
- pos = BinaryFormat::skipChildrenPosition(flags, pos);
pos = BinaryFormat::skipFrequency(flags, pos);
+ pos = BinaryFormat::skipChildrenPosition(flags, pos);
pos = BinaryFormat::skipShortcuts(root, flags, pos);
return pos;
}