diff options
author | 2012-05-29 00:40:32 -0700 | |
---|---|---|
committer | 2012-05-29 00:40:32 -0700 | |
commit | 01fcf0dab0393ddf08b678d5e31560e7020335a4 (patch) | |
tree | 712d7ad40b7bba0267e9c9a3c979dd9796a8b9e8 | |
parent | 14c72f071ea951a4ad5ce068c7944b135e859e48 (diff) | |
parent | 18f650172d29800edb772d3798391b2d430426df (diff) | |
download | latinime-01fcf0dab0393ddf08b678d5e31560e7020335a4.tar.gz latinime-01fcf0dab0393ddf08b678d5e31560e7020335a4.tar.xz latinime-01fcf0dab0393ddf08b678d5e31560e7020335a4.zip |
am 18f65017: am bfba64bc: Merge "Compute the correct frequency for bigram prediction" into jb-dev
* commit '18f650172d29800edb772d3798391b2d430426df':
Compute the correct frequency for bigram prediction
-rw-r--r-- | native/jni/src/bigram_dictionary.cpp | 7 | ||||
-rw-r--r-- | native/jni/src/binary_format.h | 6 |
2 files changed, 9 insertions, 4 deletions
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index 8c73f4400..8d0c8597f 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -117,14 +117,17 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); uint16_t bigramBuffer[MAX_WORD_LENGTH]; + int unigramFreq; const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, - bigramBuffer); + bigramBuffer, &unigramFreq); // codesSize == 0 means we are trying to find bigram predictions. if (codesSize < 1 || checkFirstCharacter(bigramBuffer)) { - const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + const int bigramFreq = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + const int frequency = + BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreq); if (addWordBigram(bigramBuffer, length, frequency)) { ++bigramCount; } diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index 85fdd9418..51bf8ebbc 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -66,7 +66,7 @@ class BinaryFormat { static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord, const int length); static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, - uint16_t* outWord); + uint16_t* outWord, int* outUnigramFrequency); static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); static int getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq); @@ -391,10 +391,11 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, * address: the byte position of the last chargroup of the word we are searching for (this is * what is stored as the "bigram address" in each bigram) * outword: an array to write the found word, with MAX_WORD_LENGTH size. + * outUnigramFrequency: a pointer to an int to write the frequency into. * Return value : the length of the word, of 0 if the word was not found. */ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address, - const int maxDepth, uint16_t* outWord) { + const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) { int pos = 0; int wordPos = 0; @@ -427,6 +428,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a nextChar = getCharCodeAndForwardPointer(root, &pos); } } + *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos); return ++wordPos; } // We need to skip past this char group, so skip any remaining chars after the |