about | summary | refs | log | tree | commit | diff | stats
path: root/native/jni/src
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src')
-rw-r--r-- native/jni/src/bigram_dictionary.cpp | 1
-rw-r--r-- native/jni/src/binary_format.h | 32
-rw-r--r-- native/jni/src/defines.h | 5
-rw-r--r-- native/jni/src/dictionary.cpp | 1
-rw-r--r-- native/jni/src/proximity_info.cpp | 1
-rw-r--r-- native/jni/src/unigram_dictionary.cpp | 1
6 files changed, 30 insertions, 11 deletions
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp
index 7ed4dc439..ac2a26172 100644
--- a/native/jni/src/bigram_dictionary.cpp
+++ b/native/jni/src/bigram_dictionary.cpp
@@ -22,6 +22,7 @@
#include "bigram_dictionary.h"
#include "binary_format.h"
#include "bloom_filter.h"
+#include "defines.h"
#include "dictionary.h"
namespace latinime {
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index b87593ca9..40f197619 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -520,19 +520,33 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
return 0;
}
-// This should probably return a probability in log space.
+static inline int backoff(const int unigramFreq) {
+ return unigramFreq;
+ // For some reason, applying the backoff weight gives bad results in tests. To apply the
+ // backoff weight, we divide the probability by 2, which in our storing format means
+ // decreasing the score by 8.
+ // TODO: figure out what's wrong with this.
+ // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8);
+}
+
+// This returns a probability in log space.
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
const uint8_t *bigramFilter, const int unigramFreq) {
- if (!bigramMap || !bigramFilter) return unigramFreq;
- if (!isInFilter(bigramFilter, position)) return unigramFreq;
- const std::map<int, int>::const_iterator bigramFreq = bigramMap->find(position);
- if (bigramFreq != bigramMap->end()) {
- // TODO: return the frequency in bigramFreq->second
- return unigramFreq;
+ if (!bigramMap || !bigramFilter) return backoff(unigramFreq);
+ if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq);
+ const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
+ if (bigramFreqIt != bigramMap->end()) {
+ const int bigramFreq = bigramFreqIt->second;
+ // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
+ // unigram frequency to be the median value of the 17th step from the top. A value of
+ // 0 for the bigram frequency represents the middle of the 16th step from the top,
+ // while a value of 15 represents the middle of the top step.
+ // See makedict.BinaryDictInputOutput for details.
+ const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
+ return (int)(unigramFreq + bigramFreq * stepSize);
} else {
- return unigramFreq;
+ return backoff(unigramFreq);
}
- // TODO: if the unigram frequency is used, compute the actual probability
}
} // namespace latinime
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h
index c6ad66abe..dfc5238a0 100644
--- a/native/jni/src/defines.h
+++ b/native/jni/src/defines.h
@@ -207,6 +207,7 @@ static inline void prof_out(void) {
#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
#define HALF_SCORE_SQUARED_RADIUS 32.0f
#define MAX_FREQ 255
+#define MAX_BIGRAM_FREQ 15
// This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions.
@@ -225,8 +226,8 @@ static inline void prof_out(void) {
#define MULTIPLE_WORDS_DEMOTION_RATE 80
#define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6
-#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
-#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22
+#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.35
+#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.185
#define MAX_DEPTH_MULTIPLIER 3
diff --git a/native/jni/src/dictionary.cpp b/native/jni/src/dictionary.cpp
index 8ea7c49fa..65d0f73a3 100644
--- a/native/jni/src/dictionary.cpp
+++ b/native/jni/src/dictionary.cpp
@@ -20,6 +20,7 @@
#define LOG_TAG "LatinIME: dictionary.cpp"
#include "binary_format.h"
+#include "defines.h"
#include "dictionary.h"
namespace latinime {
diff --git a/native/jni/src/proximity_info.cpp b/native/jni/src/proximity_info.cpp
index c00c4c20f..960d40119 100644
--- a/native/jni/src/proximity_info.cpp
+++ b/native/jni/src/proximity_info.cpp
@@ -21,6 +21,7 @@
#define LOG_TAG "LatinIME: proximity_info.cpp"
#include "additional_proximity_chars.h"
+#include "defines.h"
#include "dictionary.h"
#include "proximity_info.h"
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp
index ee8c49703..3c826e918 100644
--- a/native/jni/src/unigram_dictionary.cpp
+++ b/native/jni/src/unigram_dictionary.cpp
@@ -21,6 +21,7 @@
#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
#include "char_utils.h"
+#include "defines.h"
#include "dictionary.h"
#include "unigram_dictionary.h"