aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--native/src/correction.cpp100
-rw-r--r--native/src/correction.h3
-rw-r--r--native/src/unigram_dictionary.cpp6
-rw-r--r--native/src/unigram_dictionary.h4
4 files changed, 65 insertions, 48 deletions
diff --git a/native/src/correction.cpp b/native/src/correction.cpp
index 364913f38..2da82dc3d 100644
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp
@@ -32,48 +32,6 @@ namespace latinime {
// edit distance funcitons //
/////////////////////////////
-#if 0 /* no longer used */
-inline static int editDistance(
- int* editDistanceTable, const unsigned short* input,
- const int inputLength, const unsigned short* output, const int outputLength) {
- // dp[li][lo] dp[a][b] = dp[ a * lo + b]
- int* dp = editDistanceTable;
- const int li = inputLength + 1;
- const int lo = outputLength + 1;
- for (int i = 0; i < li; ++i) {
- dp[lo * i] = i;
- }
- for (int i = 0; i < lo; ++i) {
- dp[i] = i;
- }
-
- for (int i = 0; i < li - 1; ++i) {
- for (int j = 0; j < lo - 1; ++j) {
- const uint32_t ci = toBaseLowerCase(input[i]);
- const uint32_t co = toBaseLowerCase(output[j]);
- const uint16_t cost = (ci == co) ? 0 : 1;
- dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
- min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
- if (i > 0 && j > 0 && ci == toBaseLowerCase(output[j - 1])
- && co == toBaseLowerCase(input[i - 1])) {
- dp[(i + 1) * lo + (j + 1)] = min(
- dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
- }
- }
- }
-
- if (DEBUG_EDIT_DISTANCE) {
- LOGI("IN = %d, OUT = %d", inputLength, outputLength);
- for (int i = 0; i < li; ++i) {
- for (int j = 0; j < lo; ++j) {
- LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
- }
- }
- }
- return dp[li * lo - 1];
-}
-#endif
-
inline static void initEditDistance(int *editDistanceTable) {
for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) {
editDistanceTable[i] = i;
@@ -145,7 +103,7 @@ void Correction::initCorrectionState(
void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
- const bool useFullEditDistance, const bool doAutoCompletion) {
+ const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) {
// TODO: remove
mTransposedPos = transposedPos;
mExcessivePos = excessivePos;
@@ -159,6 +117,7 @@ void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
mMissingSpacePos = missingSpacePos;
mUseFullEditDistance = useFullEditDistance;
mDoAutoCompletion = doAutoCompletion;
+ mMaxErrors = maxErrors;
}
void Correction::checkState() {
@@ -314,12 +273,17 @@ inline bool isEquivalentChar(ProximityInfo::ProximityType type) {
Correction::CorrectionType Correction::processCharAndCalcState(
const int32_t c, const bool isTerminal) {
const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
+ if (correctionCount > mMaxErrors) {
+ return UNRELATED;
+ }
+
// TODO: Change the limit if we'll allow two or more corrections
const bool noCorrectionsHappenedSoFar = correctionCount == 0;
const bool canTryCorrection = noCorrectionsHappenedSoFar;
int proximityIndex = 0;
mDistances[mOutputIndex] = NOT_A_DISTANCE;
+ // Skip checking this node
if (mNeedsToTraverseAllNodes || isQuote(c)) {
bool incremented = false;
if (mLastCharExceeded && mInputIndex == mInputLength - 1) {
@@ -344,6 +308,7 @@ Correction::CorrectionType Correction::processCharAndCalcState(
return processSkipChar(c, isTerminal, incremented);
}
+ // Check possible corrections.
if (mExcessivePos >= 0) {
if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
mExcessivePos = mOutputIndex;
@@ -394,7 +359,12 @@ Correction::CorrectionType Correction::processCharAndCalcState(
}
// TODO: Change the limit if we'll allow two or more proximity chars with corrections
- const bool checkProximityChars = noCorrectionsHappenedSoFar || mProximityCount == 0;
+ // Work around: When the mMaxErrors is 1, we only allow just one error
+ // including proximity correction.
+ const bool checkProximityChars = (mMaxErrors > 1)
+ ? (noCorrectionsHappenedSoFar || mProximityCount == 0)
+ : (noCorrectionsHappenedSoFar && mProximityCount == 0);
+
ProximityInfo::ProximityType matchedProximityCharId = secondTransposing
? ProximityInfo::EQUIVALENT_CHAR
: mProximityInfo->getMatchedProximityId(
@@ -934,4 +904,46 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
return totalFreq;
}
+#if 0 /* no longer used. keep just for reference */
+inline static int editDistance(
+ int* editDistanceTable, const unsigned short* input,
+ const int inputLength, const unsigned short* output, const int outputLength) {
+ // dp[li][lo] dp[a][b] = dp[ a * lo + b]
+ int* dp = editDistanceTable;
+ const int li = inputLength + 1;
+ const int lo = outputLength + 1;
+ for (int i = 0; i < li; ++i) {
+ dp[lo * i] = i;
+ }
+ for (int i = 0; i < lo; ++i) {
+ dp[i] = i;
+ }
+
+ for (int i = 0; i < li - 1; ++i) {
+ for (int j = 0; j < lo - 1; ++j) {
+ const uint32_t ci = toBaseLowerCase(input[i]);
+ const uint32_t co = toBaseLowerCase(output[j]);
+ const uint16_t cost = (ci == co) ? 0 : 1;
+ dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
+ min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
+ if (i > 0 && j > 0 && ci == toBaseLowerCase(output[j - 1])
+ && co == toBaseLowerCase(input[i - 1])) {
+ dp[(i + 1) * lo + (j + 1)] = min(
+ dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
+ }
+ }
+ }
+
+ if (DEBUG_EDIT_DISTANCE) {
+ LOGI("IN = %d, OUT = %d", inputLength, outputLength);
+ for (int i = 0; i < li; ++i) {
+ for (int j = 0; j < lo; ++j) {
+ LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
+ }
+ }
+ }
+ return dp[li * lo - 1];
+}
+#endif
+
} // namespace latinime
diff --git a/native/src/correction.h b/native/src/correction.h
index 4a8d1fab7..e55be8dd6 100644
--- a/native/src/correction.h
+++ b/native/src/correction.h
@@ -45,7 +45,7 @@ public:
// TODO: remove
void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos,
const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance,
- const bool doAutoCompletion);
+ const bool doAutoCompletion, const int maxErrors);
void checkState();
bool initProcessState(const int index);
@@ -118,6 +118,7 @@ private:
int mMissingSpacePos;
int mTerminalInputIndex;
int mTerminalOutputIndex;
+ int mMaxErrors;
// The following arrays are state buffer.
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 7cf191970..ac9f53ed2 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -261,7 +261,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
// TODO: Remove setCorrectionParams
correction->setCorrectionParams(0, 0, 0,
-1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
- true /* doAutoCompletion */);
+ true /* doAutoCompletion */, DEFAULT_MAX_ERRORS);
int rootPosition = ROOT_POS;
// Get the number of children of root, then increment the position
int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
@@ -296,7 +296,7 @@ void UnigramDictionary::getMissingSpaceWords(
Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) {
correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
-1 /* transposedPos */, -1 /* spaceProximityPos */, missingSpacePos,
- useFullEditDistance, true /* doAutoCompletion */);
+ useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue);
}
@@ -305,7 +305,7 @@ void UnigramDictionary::getMistypedSpaceWords(
Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) {
correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
-1 /* transposedPos */, spaceProximityPos, -1 /* missingSpacePos */,
- useFullEditDistance, true /* doAutoCompletion */);
+ useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue);
}
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 0b0126524..f5cb43860 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -61,6 +61,10 @@ public:
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
+ // Error tolerances
+ static const int DEFAULT_MAX_ERRORS = 2;
+ static const int MAX_ERRORS_FOR_TWO_WORDS = 1;
+
UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
const bool isLatestDictVersion);