diff options
Diffstat (limited to 'native/jni/src')
-rw-r--r-- | native/jni/src/bigram_dictionary.cpp | 1 | ||||
-rw-r--r-- | native/jni/src/binary_format.h | 32 | ||||
-rw-r--r-- | native/jni/src/defines.h | 5 | ||||
-rw-r--r-- | native/jni/src/dictionary.cpp | 1 | ||||
-rw-r--r-- | native/jni/src/proximity_info.cpp | 1 | ||||
-rw-r--r-- | native/jni/src/unigram_dictionary.cpp | 1 |
6 files changed, 30 insertions, 11 deletions
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index 7ed4dc439..ac2a26172 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -22,6 +22,7 @@ #include "bigram_dictionary.h" #include "binary_format.h" #include "bloom_filter.h" +#include "defines.h" #include "dictionary.h" namespace latinime { diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index b87593ca9..40f197619 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -520,19 +520,33 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a return 0; } -// This should probably return a probability in log space. +static inline int backoff(const int unigramFreq) { + return unigramFreq; + // For some reason, applying the backoff weight gives bad results in tests. To apply the + // backoff weight, we divide the probability by 2, which in our storing format means + // decreasing the score by 8. + // TODO: figure out what's wrong with this. + // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8); +} + +// This returns a probability in log space. inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq) { - if (!bigramMap || !bigramFilter) return unigramFreq; - if (!isInFilter(bigramFilter, position)) return unigramFreq; - const std::map<int, int>::const_iterator bigramFreq = bigramMap->find(position); - if (bigramFreq != bigramMap->end()) { - // TODO: return the frequency in bigramFreq->second - return unigramFreq; + if (!bigramMap || !bigramFilter) return backoff(unigramFreq); + if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq); + const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position); + if (bigramFreqIt != bigramMap->end()) { + const int bigramFreq = bigramFreqIt->second; + // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the + // unigram frequency to be the median value of the 17th step from the top. A value of + // 0 for the bigram frequency represents the middle of the 16th step from the top, + // while a value of 15 represents the middle of the top step. + // See makedict.BinaryDictInputOutput for details. + const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); + return (int)(unigramFreq + bigramFreq * stepSize); } else { - return unigramFreq; + return backoff(unigramFreq); } - // TODO: if the unigram frequency is used, compute the actual probability } } // namespace latinime diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index c6ad66abe..dfc5238a0 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -207,6 +207,7 @@ static inline void prof_out(void) { #define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f #define HALF_SCORE_SQUARED_RADIUS 32.0f #define MAX_FREQ 255 +#define MAX_BIGRAM_FREQ 15 // This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java // This is only used for the size of array. Not to be used in c functions. @@ -225,8 +226,8 @@ static inline void prof_out(void) { #define MULTIPLE_WORDS_DEMOTION_RATE 80 #define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6 -#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39 -#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22 +#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.35 +#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.185 #define MAX_DEPTH_MULTIPLIER 3 diff --git a/native/jni/src/dictionary.cpp b/native/jni/src/dictionary.cpp index 8ea7c49fa..65d0f73a3 100644 --- a/native/jni/src/dictionary.cpp +++ b/native/jni/src/dictionary.cpp @@ -20,6 +20,7 @@ #define LOG_TAG "LatinIME: dictionary.cpp" #include "binary_format.h" +#include "defines.h" #include "dictionary.h" namespace latinime { diff --git a/native/jni/src/proximity_info.cpp b/native/jni/src/proximity_info.cpp index c00c4c20f..960d40119 100644 --- a/native/jni/src/proximity_info.cpp +++ b/native/jni/src/proximity_info.cpp @@ -21,6 +21,7 @@ #define LOG_TAG "LatinIME: proximity_info.cpp" #include "additional_proximity_chars.h" +#include "defines.h" #include "dictionary.h" #include "proximity_info.h" diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index ee8c49703..3c826e918 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -21,6 +21,7 @@ #define LOG_TAG "LatinIME: unigram_dictionary.cpp" #include "char_utils.h" +#include "defines.h" #include "dictionary.h" #include "unigram_dictionary.h" |