aboutsummaryrefslogtreecommitdiffstats
path: root/native/src/correction.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'native/src/correction.cpp')
-rw-r--r--native/src/correction.cpp211
1 files changed, 153 insertions, 58 deletions
diff --git a/native/src/correction.cpp b/native/src/correction.cpp
index 27dc40745..dc31bfae7 100644
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp
@@ -16,11 +16,13 @@
#include <assert.h>
#include <ctype.h>
+#include <math.h>
#include <stdio.h>
#include <string.h>
#define LOG_TAG "LatinIME: correction.cpp"
+#include "char_utils.h"
#include "correction.h"
#include "dictionary.h"
#include "proximity_info.h"
@@ -31,73 +33,49 @@ namespace latinime {
// edit distance funcitons //
/////////////////////////////
-#if 0 /* no longer used */
-inline static int editDistance(
- int* editDistanceTable, const unsigned short* input,
- const int inputLength, const unsigned short* output, const int outputLength) {
- // dp[li][lo] dp[a][b] = dp[ a * lo + b]
- int* dp = editDistanceTable;
- const int li = inputLength + 1;
- const int lo = outputLength + 1;
- for (int i = 0; i < li; ++i) {
- dp[lo * i] = i;
- }
- for (int i = 0; i < lo; ++i) {
- dp[i] = i;
- }
-
- for (int i = 0; i < li - 1; ++i) {
- for (int j = 0; j < lo - 1; ++j) {
- const uint32_t ci = Dictionary::toBaseLowerCase(input[i]);
- const uint32_t co = Dictionary::toBaseLowerCase(output[j]);
- const uint16_t cost = (ci == co) ? 0 : 1;
- dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
- min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
- if (i > 0 && j > 0 && ci == Dictionary::toBaseLowerCase(output[j - 1])
- && co == Dictionary::toBaseLowerCase(input[i - 1])) {
- dp[(i + 1) * lo + (j + 1)] = min(
- dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
- }
- }
+inline static void initEditDistance(int *editDistanceTable) {
+ for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) {
+ editDistanceTable[i] = i;
}
+}
- if (DEBUG_EDIT_DISTANCE) {
- LOGI("IN = %d, OUT = %d", inputLength, outputLength);
- for (int i = 0; i < li; ++i) {
- for (int j = 0; j < lo; ++j) {
- LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
+inline static void dumpEditDistance10ForDebug(int *editDistanceTable, const int inputLength,
+ const int outputLength) {
+ if (DEBUG_DICT) {
+ LOGI("EditDistanceTable");
+ for (int i = 0; i <= 10; ++i) {
+ int c[11];
+ for (int j = 0; j <= 10; ++j) {
+ if (j < inputLength + 1 && i < outputLength + 1) {
+ c[j] = (editDistanceTable + i * (inputLength + 1))[j];
+ } else {
+ c[j] = -1;
+ }
}
+ LOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]",
+ c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]);
}
}
- return dp[li * lo - 1];
-}
-#endif
-
-inline static void initEditDistance(int *editDistanceTable) {
- for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) {
- editDistanceTable[i] = i;
- }
}
inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigned short *input,
const int inputLength, const unsigned short *output, const int outputLength) {
+ // TODO: Make sure that editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] is not touched.
// Let dp[i][j] be editDistanceTable[i * (inputLength + 1) + j].
// Assuming that dp[0][0] ... dp[outputLength - 1][inputLength] are already calculated,
// and calculate dp[ouputLength][0] ... dp[outputLength][inputLength].
int *const current = editDistanceTable + outputLength * (inputLength + 1);
const int *const prev = editDistanceTable + (outputLength - 1) * (inputLength + 1);
const int *const prevprev =
- outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : NULL;
+ outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0;
current[0] = outputLength;
- const uint32_t co = Dictionary::toBaseLowerCase(output[outputLength - 1]);
- const uint32_t prevCO =
- outputLength >= 2 ? Dictionary::toBaseLowerCase(output[outputLength - 2]) : 0;
+ const uint32_t co = toBaseLowerCase(output[outputLength - 1]);
+ const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0;
for (int i = 1; i <= inputLength; ++i) {
- const uint32_t ci = Dictionary::toBaseLowerCase(input[i - 1]);
+ const uint32_t ci = toBaseLowerCase(input[i - 1]);
const uint16_t cost = (ci == co) ? 0 : 1;
current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost));
- if (i >= 2 && prevprev && ci == prevCO
- && co == Dictionary::toBaseLowerCase(input[i - 2])) {
+ if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) {
current[i] = min(current[i], prevprev[i - 2] + 1);
}
}
@@ -105,6 +83,9 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne
inline static int getCurrentEditDistance(
int *editDistanceTable, const int inputLength, const int outputLength) {
+ if (DEBUG_DICT) {
+ LOGI("getCurrentEditDistance %d, %d", inputLength, outputLength);
+ }
return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1];
}
@@ -133,6 +114,9 @@ void Correction::initCorrection(const ProximityInfo *pi, const int inputLength,
mInputLength = inputLength;
mMaxDepth = maxDepth;
mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
+ // TODO: This is not supposed to be required. Check what's going wrong with
+ // editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL]
+ initEditDistance(mEditDistanceTable);
}
void Correction::initCorrectionState(
@@ -146,7 +130,7 @@ void Correction::initCorrectionState(
void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
- const bool useFullEditDistance) {
+ const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) {
// TODO: remove
mTransposedPos = transposedPos;
mExcessivePos = excessivePos;
@@ -159,6 +143,8 @@ void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
mSpaceProximityPos = spaceProximityPos;
mMissingSpacePos = missingSpacePos;
mUseFullEditDistance = useFullEditDistance;
+ mDoAutoCompletion = doAutoCompletion;
+ mMaxErrors = maxErrors;
}
void Correction::checkState() {
@@ -280,7 +266,9 @@ void Correction::startToTraverseAllNodes() {
bool Correction::needsToPrune() const {
// TODO: use edit distance here
- return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance;
+ return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance
+ // Allow one char longer word for missing character
+ || (!mDoAutoCompletion && (mOutputIndex + 1 >= mInputLength));
}
void Correction::addCharToCurrentWord(const int32_t c) {
@@ -312,12 +300,17 @@ inline bool isEquivalentChar(ProximityInfo::ProximityType type) {
Correction::CorrectionType Correction::processCharAndCalcState(
const int32_t c, const bool isTerminal) {
const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
+ if (correctionCount > mMaxErrors) {
+ return UNRELATED;
+ }
+
// TODO: Change the limit if we'll allow two or more corrections
const bool noCorrectionsHappenedSoFar = correctionCount == 0;
const bool canTryCorrection = noCorrectionsHappenedSoFar;
int proximityIndex = 0;
mDistances[mOutputIndex] = NOT_A_DISTANCE;
+ // Skip checking this node
if (mNeedsToTraverseAllNodes || isQuote(c)) {
bool incremented = false;
if (mLastCharExceeded && mInputIndex == mInputLength - 1) {
@@ -342,6 +335,7 @@ Correction::CorrectionType Correction::processCharAndCalcState(
return processSkipChar(c, isTerminal, incremented);
}
+ // Check possible corrections.
if (mExcessivePos >= 0) {
if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
mExcessivePos = mOutputIndex;
@@ -392,7 +386,12 @@ Correction::CorrectionType Correction::processCharAndCalcState(
}
// TODO: Change the limit if we'll allow two or more proximity chars with corrections
- const bool checkProximityChars = noCorrectionsHappenedSoFar || mProximityCount == 0;
+ // Work around: When the mMaxErrors is 1, we only allow just one error
+ // including proximity correction.
+ const bool checkProximityChars = (mMaxErrors > 1)
+ ? (noCorrectionsHappenedSoFar || mProximityCount == 0)
+ : (noCorrectionsHappenedSoFar && mProximityCount == 0);
+
ProximityInfo::ProximityType matchedProximityCharId = secondTransposing
? ProximityInfo::EQUIVALENT_CHAR
: mProximityInfo->getMatchedProximityId(
@@ -607,13 +606,7 @@ inline static int getQuoteCount(const unsigned short* word, const int length) {
}
inline static bool isUpperCase(unsigned short c) {
- if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
- c = BASE_CHARS[c];
- }
- if (isupper(c)) {
- return true;
- }
- return false;
+ return isAsciiUpper(toBaseChar(c));
}
//////////////////////
@@ -654,6 +647,9 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
// TODO: Calculate edit distance for transposed and excessive
int ed = 0;
+ if (DEBUG_DICT_FULL) {
+ dumpEditDistance10ForDebug(editDistanceTable, inputLength, outputIndex + 1);
+ }
int adjustedProximityMatchedCount = proximityMatchedCount;
int finalFreq = freq;
@@ -938,4 +934,103 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
return totalFreq;
}
+/* Damerau-Levenshtein distance */
+inline static int editDistanceInternal(
+ int* editDistanceTable, const unsigned short* before,
+ const int beforeLength, const unsigned short* after, const int afterLength) {
+ // dp[li][lo] dp[a][b] = dp[ a * lo + b]
+ int* dp = editDistanceTable;
+ const int li = beforeLength + 1;
+ const int lo = afterLength + 1;
+ for (int i = 0; i < li; ++i) {
+ dp[lo * i] = i;
+ }
+ for (int i = 0; i < lo; ++i) {
+ dp[i] = i;
+ }
+
+ for (int i = 0; i < li - 1; ++i) {
+ for (int j = 0; j < lo - 1; ++j) {
+ const uint32_t ci = toBaseLowerCase(before[i]);
+ const uint32_t co = toBaseLowerCase(after[j]);
+ const uint16_t cost = (ci == co) ? 0 : 1;
+ dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
+ min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
+ if (i > 0 && j > 0 && ci == toBaseLowerCase(after[j - 1])
+ && co == toBaseLowerCase(before[i - 1])) {
+ dp[(i + 1) * lo + (j + 1)] = min(
+ dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
+ }
+ }
+ }
+
+ if (DEBUG_EDIT_DISTANCE) {
+ LOGI("IN = %d, OUT = %d", beforeLength, afterLength);
+ for (int i = 0; i < li; ++i) {
+ for (int j = 0; j < lo; ++j) {
+ LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
+ }
+ }
+ }
+ return dp[li * lo - 1];
+}
+
+int Correction::RankingAlgorithm::editDistance(const unsigned short* before,
+ const int beforeLength, const unsigned short* after, const int afterLength) {
+ int table[(beforeLength + 1) * (afterLength + 1)];
+ return editDistanceInternal(table, before, beforeLength, after, afterLength);
+}
+
+
+// In dictionary.cpp, getSuggestion() method,
+// suggestion scores are computed using the below formula.
+// original score
+// := pow(mTypedLetterMultiplier (this is defined 2),
+// (the number of matched characters between typed word and suggested word))
+// * (individual word's score which defined in the unigram dictionary,
+// and this score is defined in range [0, 255].)
+// Then, the following processing is applied.
+// - If the dictionary word is matched up to the point of the user entry
+// (full match up to min(before.length(), after.length())
+// => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2)
+// - If the word is a true full match except for differences in accents or
+// capitalization, then treat it as if the score was 255.
+// - If before.length() == after.length()
+// => multiply by mFullWordMultiplier (this is defined 2))
+// So, maximum original score is pow(2, min(before.length(), after.length())) * 255 * 2 * 1.2
+// For historical reasons we ignore the 1.2 modifier (because the measure for a good
+// autocorrection threshold was done at a time when it didn't exist). This doesn't change
+// the result.
+// So, we can normalize original score by dividing pow(2, min(b.l(),a.l())) * 255 * 2.
+
+/* static */
+double Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short* before,
+ const int beforeLength, const unsigned short* after, const int afterLength,
+ const int score) {
+ if (0 == beforeLength || 0 == afterLength) {
+ return 0;
+ }
+ const int distance = editDistance(before, beforeLength, after, afterLength);
+ int spaceCount = 0;
+ for (int i = 0; i < afterLength; ++i) {
+ if (after[i] == CODE_SPACE) {
+ ++spaceCount;
+ }
+ }
+
+ if (spaceCount == afterLength) {
+ return 0;
+ }
+
+ const double maxScore = score >= S_INT_MAX ? S_INT_MAX : MAX_INITIAL_SCORE
+ * pow((double)TYPED_LETTER_MULTIPLIER,
+ (double)min(beforeLength, afterLength - spaceCount)) * FULL_WORD_MULTIPLIER;
+
+ // add a weight based on edit distance.
+ // distance <= max(afterLength, beforeLength) == afterLength,
+ // so, 0 <= distance / afterLength <= 1
+ const double weight = 1.0 - (double) distance / afterLength;
+ return (score / maxScore) * weight;
+}
+
} // namespace latinime