aboutsummaryrefslogtreecommitdiffstats
path: root/native
diff options
context:
space:
mode:
Diffstat (limited to 'native')
-rw-r--r--native/src/correction.cpp274
-rw-r--r--native/src/correction.h69
-rw-r--r--native/src/correction_state.h57
-rw-r--r--native/src/defines.h17
-rw-r--r--native/src/proximity_info.cpp4
-rw-r--r--native/src/proximity_info.h5
-rw-r--r--native/src/unigram_dictionary.cpp47
-rw-r--r--native/src/unigram_dictionary.h11
8 files changed, 359 insertions, 125 deletions
diff --git a/native/src/correction.cpp b/native/src/correction.cpp
index 6d682c0c9..a4090a966 100644
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp
@@ -21,6 +21,7 @@
#define LOG_TAG "LatinIME: correction.cpp"
#include "correction.h"
+#include "dictionary.h"
#include "proximity_info.h"
namespace latinime {
@@ -49,12 +50,21 @@ void Correction::initCorrection(const ProximityInfo *pi, const int inputLength,
mInputLength = inputLength;
mMaxDepth = maxDepth;
mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
- mSkippedOutputIndex = -1;
+}
+
+void Correction::initCorrectionState(
+ const int rootPos, const int childCount, const bool traverseAll) {
+ latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll);
+ // TODO: remove
+ mCorrectionStates[0].mSkipPos = mSkipPos;
}
void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
const int transposedPos, const int spaceProximityPos, const int missingSpacePos) {
+ // TODO: remove
mSkipPos = skipPos;
+ // TODO: remove
+ mCorrectionStates[0].mSkipPos = skipPos;
mExcessivePos = excessivePos;
mTransposedPos = transposedPos;
mSpaceProximityPos = spaceProximityPos;
@@ -83,33 +93,37 @@ int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLen
if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) {
return -1;
}
+
*word = mWord;
const bool sameLength = (mExcessivePos == mInputLength - 1) ? (mInputLength == inputIndex + 2)
: (mInputLength == inputIndex + 1);
return Correction::RankingAlgorithm::calculateFinalFreq(
- inputIndex, outputIndex, mMatchedCharCount, freq, sameLength, this);
+ inputIndex, outputIndex, freq, sameLength, mEditDistanceTable, this);
}
-void Correction::initProcessState(const int matchCount, const int inputIndex,
- const int outputIndex, const bool traverseAllNodes, const int diffs) {
- mMatchedCharCount = matchCount;
- mInputIndex = inputIndex;
+bool Correction::initProcessState(const int outputIndex) {
+ if (mCorrectionStates[outputIndex].mChildCount <= 0) {
+ return false;
+ }
mOutputIndex = outputIndex;
- mTraverseAllNodes = traverseAllNodes;
- mDiffs = diffs;
-}
-
-void Correction::getProcessState(int *matchedCount, int *inputIndex, int *outputIndex,
- bool *traverseAllNodes, int *diffs) {
- *matchedCount = mMatchedCharCount;
- *inputIndex = mInputIndex;
- *outputIndex = mOutputIndex;
- *traverseAllNodes = mTraverseAllNodes;
- *diffs = mDiffs;
+ --(mCorrectionStates[outputIndex].mChildCount);
+ mInputIndex = mCorrectionStates[outputIndex].mInputIndex;
+ mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes;
+ mProximityCount = mCorrectionStates[outputIndex].mProximityCount;
+ mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount;
+ mSkipPos = mCorrectionStates[outputIndex].mSkipPos;
+ mSkipping = false;
+ mProximityMatching = false;
+ mMatching = false;
+ return true;
}
-void Correction::charMatched() {
- ++mMatchedCharCount;
+int Correction::goDownTree(
+ const int parentIndex, const int childCount, const int firstChildPos) {
+ mCorrectionStates[mOutputIndex].mParentIndex = parentIndex;
+ mCorrectionStates[mOutputIndex].mChildCount = childCount;
+ mCorrectionStates[mOutputIndex].mSiblingPos = firstChildPos;
+ return mOutputIndex;
}
// TODO: remove
@@ -123,8 +137,8 @@ int Correction::getInputIndex() {
}
// TODO: remove
-bool Correction::needsToTraverseAll() {
- return mTraverseAllNodes;
+bool Correction::needsToTraverseAllNodes() {
+ return mNeedsToTraverseAllNodes;
}
void Correction::incrementInputIndex() {
@@ -133,21 +147,32 @@ void Correction::incrementInputIndex() {
void Correction::incrementOutputIndex() {
++mOutputIndex;
+ mCorrectionStates[mOutputIndex].mParentIndex = mCorrectionStates[mOutputIndex - 1].mParentIndex;
+ mCorrectionStates[mOutputIndex].mChildCount = mCorrectionStates[mOutputIndex - 1].mChildCount;
+ mCorrectionStates[mOutputIndex].mSiblingPos = mCorrectionStates[mOutputIndex - 1].mSiblingPos;
+ mCorrectionStates[mOutputIndex].mInputIndex = mInputIndex;
+ mCorrectionStates[mOutputIndex].mNeedsToTraverseAllNodes = mNeedsToTraverseAllNodes;
+ mCorrectionStates[mOutputIndex].mProximityCount = mProximityCount;
+ mCorrectionStates[mOutputIndex].mSkippedCount = mSkippedCount;
+ mCorrectionStates[mOutputIndex].mSkipping = mSkipping;
+ mCorrectionStates[mOutputIndex].mSkipPos = mSkipPos;
+ mCorrectionStates[mOutputIndex].mMatching = mMatching;
+ mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching;
}
-void Correction::startTraverseAll() {
- mTraverseAllNodes = true;
+void Correction::startToTraverseAllNodes() {
+ mNeedsToTraverseAllNodes = true;
}
bool Correction::needsToPrune() const {
return (mOutputIndex - 1 >= (mTransposedPos >= 0 ? mInputLength - 1 : mMaxDepth)
- || mDiffs > mMaxEditDistance);
+ || mProximityCount > mMaxEditDistance);
}
Correction::CorrectionType Correction::processSkipChar(
const int32_t c, const bool isTerminal) {
mWord[mOutputIndex] = c;
- if (needsToTraverseAll() && isTerminal) {
+ if (needsToTraverseAllNodes() && isTerminal) {
mTerminalInputIndex = mInputIndex;
mTerminalOutputIndex = mOutputIndex;
incrementOutputIndex();
@@ -169,10 +194,31 @@ Correction::CorrectionType Correction::processCharAndCalcState(
bool skip = false;
if (mSkipPos >= 0) {
+ if (mSkippedCount == 0 && mSkipPos < mOutputIndex) {
+ if (DEBUG_DICT) {
+ assert(mSkipPos == mOutputIndex - 1);
+ }
+ ++mSkipPos;
+ }
skip = mSkipPos == mOutputIndex;
+ mSkipping = true;
}
- if (mTraverseAllNodes || isQuote(c)) {
+ if (mNeedsToTraverseAllNodes || isQuote(c)) {
+ const bool checkProximityChars =
+ !(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0);
+ // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
+ // proximity chars of "s", but it should rather be handled as a skipped char.
+ if (checkProximityChars
+ && mInputIndex > 0
+ && mCorrectionStates[mOutputIndex].mProximityMatching
+ && mCorrectionStates[mOutputIndex].mSkipping
+ && mProximityInfo->getMatchedProximityId(
+ mInputIndex - 1, c, false)
+ == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
+ ++mSkippedCount;
+ --mProximityCount;
+ }
return processSkipChar(c, isTerminal);
} else {
int inputIndexForProximity = mInputIndex;
@@ -186,40 +232,40 @@ Correction::CorrectionType Correction::processCharAndCalcState(
}
}
+ // TODO: sum counters
const bool checkProximityChars =
- !(mSkipPos >= 0 || mExcessivePos >= 0 || mTransposedPos >= 0);
+ !(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0);
int matchedProximityCharId = mProximityInfo->getMatchedProximityId(
inputIndexForProximity, c, checkProximityChars);
- const bool unrelated = ProximityInfo::UNRELATED_CHAR == matchedProximityCharId;
- if (unrelated) {
- if (skip) {
+ if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {
+ if (skip && mProximityCount == 0) {
// Skip this letter and continue deeper
- mSkippedOutputIndex = mOutputIndex;
+ ++mSkippedCount;
+ return processSkipChar(c, isTerminal);
+ } else if (checkProximityChars
+ && inputIndexForProximity > 0
+ && mCorrectionStates[mOutputIndex].mProximityMatching
+ && mCorrectionStates[mOutputIndex].mSkipping
+ && mProximityInfo->getMatchedProximityId(
+ inputIndexForProximity - 1, c, false)
+ == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
+ ++mSkippedCount;
+ --mProximityCount;
return processSkipChar(c, isTerminal);
} else {
return UNRELATED;
}
- }
-
- // No need to skip. Finish traversing and increment skipPos.
- // TODO: Remove this?
- if (skip) {
- mWord[mOutputIndex] = c;
- incrementOutputIndex();
- return TRAVERSE_ALL_NOT_ON_TERMINAL;
+ } else if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
+ // If inputIndex is greater than mInputLength, that means there is no
+ // proximity chars. So, we don't need to check proximity.
+ mMatching = true;
+ } else if (ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) {
+ mProximityMatching = true;
+ incrementProximityCount();
}
mWord[mOutputIndex] = c;
- // If inputIndex is greater than mInputLength, that means there is no
- // proximity chars. So, we don't need to check proximity.
- if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
- charMatched();
- }
-
- if (ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) {
- incrementDiffs();
- }
const bool isSameAsUserTypedLength = mInputLength
== getInputIndex() + 1
@@ -232,7 +278,7 @@ Correction::CorrectionType Correction::processCharAndCalcState(
}
// Start traversing all nodes after the index exceeds the user typed length
if (isSameAsUserTypedLength) {
- startTraverseAll();
+ startToTraverseAllNodes();
}
// Finally, we are ready to go to the next character, the next "virtual node".
@@ -298,26 +344,117 @@ inline static void multiplyRate(const int rate, int *freq) {
}
}
+/* static */
+inline static int editDistance(
+ int* editDistanceTable, const unsigned short* input,
+ const int inputLength, const unsigned short* output, const int outputLength) {
+ // dp[li][lo] dp[a][b] = dp[ a * lo + b]
+ int* dp = editDistanceTable;
+ const int li = inputLength + 1;
+ const int lo = outputLength + 1;
+ for (int i = 0; i < li; ++i) {
+ dp[lo * i] = i;
+ }
+ for (int i = 0; i < lo; ++i) {
+ dp[i] = i;
+ }
+
+ for (int i = 0; i < li - 1; ++i) {
+ for (int j = 0; j < lo - 1; ++j) {
+ const uint32_t ci = Dictionary::toBaseLowerCase(input[i]);
+ const uint32_t co = Dictionary::toBaseLowerCase(output[j]);
+ const uint16_t cost = (ci == co) ? 0 : 1;
+ dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
+ min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
+ if (li > 0 && lo > 0
+ && ci == Dictionary::toBaseLowerCase(output[j - 1])
+ && co == Dictionary::toBaseLowerCase(input[i - 1])) {
+ dp[(i + 1) * lo + (j + 1)] = min(
+ dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
+ }
+ }
+ }
+
+ if (DEBUG_EDIT_DISTANCE) {
+ LOGI("IN = %d, OUT = %d", inputLength, outputLength);
+ for (int i = 0; i < li; ++i) {
+ for (int j = 0; j < lo; ++j) {
+ LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
+ }
+ }
+ }
+ return dp[li * lo - 1];
+}
+
//////////////////////
// RankingAlgorithm //
//////////////////////
-int Correction::RankingAlgorithm::calculateFinalFreq(
- const int inputIndex, const int outputIndex,
- const int matchCount, const int freq, const bool sameLength,
+/* static */
+int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex,
+ const int freq, const bool sameLength, int* editDistanceTable,
const Correction* correction) {
- const int skipPos = correction->getSkipPos();
const int excessivePos = correction->getExcessivePos();
const int transposedPos = correction->getTransposedPos();
const int inputLength = correction->mInputLength;
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER;
const ProximityInfo *proximityInfo = correction->mProximityInfo;
- const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
+ const int skipCount = correction->mSkippedCount;
+ const int proximityMatchedCount = correction->mProximityCount;
+
+ // TODO: use mExcessiveCount
+ int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0);
+
+ const unsigned short* word = correction->mWord;
+ const bool skipped = skipCount > 0;
+
+ // ----- TODO: use edit distance here as follows? ---------------------- /
+ //if (!skipped && excessivePos < 0 && transposedPos < 0) {
+ // const int ed = editDistance(dp, proximityInfo->getInputWord(),
+ // inputLength, word, outputIndex + 1);
+ // matchCount = outputIndex + 1 - ed;
+ // if (ed == 1 && !sameLength) ++matchCount;
+ //}
+ // const int ed = editDistance(dp, proximityInfo->getInputWord(),
+ // inputLength, word, outputIndex + 1);
+ // if (ed == 1 && !sameLength) ++matchCount; ------------------------ /
+ int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
// TODO: Demote by edit distance
int finalFreq = freq * matchWeight;
- if (skipPos >= 0) {
+ // +1 +11/-12
+ /*if (inputLength == outputIndex && !skipped && excessivePos < 0 && transposedPos < 0) {
+ const int ed = editDistance(dp, proximityInfo->getInputWord(),
+ inputLength, word, outputIndex + 1);
+ if (ed == 1) {
+ multiplyRate(160, &finalFreq);
+ }
+ }*/
+ if (inputLength == outputIndex && excessivePos < 0 && transposedPos < 0
+ && (proximityMatchedCount > 0 || skipped)) {
+ const int ed = editDistance(editDistanceTable, proximityInfo->getPrimaryInputWord(),
+ inputLength, word, outputIndex + 1);
+ if (ed == 1) {
+ multiplyRate(160, &finalFreq);
+ }
+ }
+
+ // TODO: Promote properly?
+ //if (skipCount == 1 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex
+ // && !sameLength) {
+ // multiplyRate(150, &finalFreq);
+ //}
+ //if (skipCount == 0 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex
+ // && !sameLength) {
+ // multiplyRate(150, &finalFreq);
+ //}
+ //if (skipCount == 0 && excessivePos < 0 && transposedPos < 0
+ // && inputLength == outputIndex + 1) {
+ // multiplyRate(150, &finalFreq);
+ //}
+
+ if (skipped) {
if (inputLength >= 2) {
const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
* (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
@@ -351,10 +488,10 @@ int Correction::RankingAlgorithm::calculateFinalFreq(
}
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
}
- if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
+ if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) {
finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
}
- } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0
+ } else if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0
&& outputIndex > 0) {
// A word with proximity corrections
if (DEBUG_DICT) {
@@ -363,13 +500,34 @@ int Correction::RankingAlgorithm::calculateFinalFreq(
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
}
- if (DEBUG_DICT) {
+ if (DEBUG_DICT_FULL) {
LOGI("calc: %d, %d", outputIndex, sameLength);
}
if (sameLength) multiplyIntCapped(fullWordMultiplier, &finalFreq);
+
+ // TODO: check excessive count and transposed count
+ /*
+ If the last character of the user input word is the same as the next character
+ of the output word, and also all of characters of the user input are matched
+ to the output word, we'll promote that word a bit because
+ that word can be considered the combination of skipped and matched characters.
+ This means that the 'sm' pattern wins over the 'ma' pattern.
+ e.g.)
+ shel -> shell [mmmma] or [mmmsm]
+ hel -> hello [mmmaa] or [mmsma]
+ m ... matching
+ s ... skipping
+ a ... traversing all
+ */
+ if (matchCount == inputLength && matchCount >= 2 && !skipped
+ && word[matchCount] == word[matchCount - 1]) {
+ multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
+ }
+
return finalFreq;
}
+/* static */
int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
const int firstFreq, const int secondFreq, const Correction* correction) {
const int spaceProximityPos = correction->mSpaceProximityPos;
diff --git a/native/src/correction.h b/native/src/correction.h
index ae6c7a421..9d385a44e 100644
--- a/native/src/correction.h
+++ b/native/src/correction.h
@@ -18,6 +18,7 @@
#define LATINIME_CORRECTION_H
#include <stdint.h>
+#include "correction_state.h"
#include "defines.h"
@@ -39,16 +40,16 @@ public:
Correction(const int typedLetterMultiplier, const int fullWordMultiplier);
void initCorrection(
const ProximityInfo *pi, const int inputLength, const int maxWordLength);
+ void initCorrectionState(const int rootPos, const int childCount, const bool traverseAll);
+
+ // TODO: remove
void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos,
const int spaceProximityPos, const int missingSpacePos);
void checkState();
- void initProcessState(const int matchCount, const int inputIndex, const int outputIndex,
- const bool traverseAllNodes, const int diffs);
- void getProcessState(int *matchedCount, int *inputIndex, int *outputIndex,
- bool *traverseAllNodes, int *diffs);
+ bool initProcessState(const int index);
+
int getOutputIndex();
int getInputIndex();
- bool needsToTraverseAll();
virtual ~Correction();
int getSpaceProximityPos() const {
@@ -77,52 +78,68 @@ public:
CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal);
- int getDiffs() const {
- return mDiffs;
+ /////////////////////////
+ // Tree helper methods
+ int goDownTree(const int parentIndex, const int childCount, const int firstChildPos);
+
+ inline int getTreeSiblingPos(const int index) const {
+ return mCorrectionStates[index].mSiblingPos;
+ }
+
+ inline void setTreeSiblingPos(const int index, const int pos) {
+ mCorrectionStates[index].mSiblingPos = pos;
+ }
+
+ inline int getTreeParentIndex(const int index) const {
+ return mCorrectionStates[index].mParentIndex;
}
private:
- void charMatched();
- void incrementInputIndex();
- void incrementOutputIndex();
- void startTraverseAll();
+ inline void incrementInputIndex();
+ inline void incrementOutputIndex();
+ inline bool needsToTraverseAllNodes();
+ inline void startToTraverseAllNodes();
+ inline bool isQuote(const unsigned short c);
+ inline CorrectionType processSkipChar(const int32_t c, const bool isTerminal);
// TODO: remove
-
- void incrementDiffs() {
- ++mDiffs;
+ inline void incrementProximityCount() {
+ ++mProximityCount;
}
const int TYPED_LETTER_MULTIPLIER;
const int FULL_WORD_MULTIPLIER;
-
const ProximityInfo *mProximityInfo;
int mMaxEditDistance;
int mMaxDepth;
int mInputLength;
- int mSkipPos;
- int mSkippedOutputIndex;
int mExcessivePos;
int mTransposedPos;
int mSpaceProximityPos;
int mMissingSpacePos;
-
- int mMatchedCharCount;
- int mInputIndex;
- int mOutputIndex;
int mTerminalInputIndex;
int mTerminalOutputIndex;
- int mDiffs;
- bool mTraverseAllNodes;
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
+ // Caveat: Do not create multiple tables per thread as this table eats up RAM a lot.
+ int mEditDistanceTable[MAX_WORD_LENGTH_INTERNAL * MAX_WORD_LENGTH_INTERNAL];
- inline bool isQuote(const unsigned short c);
- inline CorrectionType processSkipChar(const int32_t c, const bool isTerminal);
+ CorrectionState mCorrectionStates[MAX_WORD_LENGTH_INTERNAL];
+
+ // The following member variables are being used as cache values of the correction state.
+ int mOutputIndex;
+ int mInputIndex;
+ int mProximityCount;
+ int mSkippedCount;
+ int mSkipPos;
+ bool mNeedsToTraverseAllNodes;
+ bool mMatching;
+ bool mSkipping;
+ bool mProximityMatching;
class RankingAlgorithm {
public:
static int calculateFinalFreq(const int inputIndex, const int depth,
- const int matchCount, const int freq, const bool sameLength,
+ const int freq, const bool sameLength, int *editDistanceTable,
const Correction* correction);
static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
const Correction* correction);
diff --git a/native/src/correction_state.h b/native/src/correction_state.h
new file mode 100644
index 000000000..267deda9b
--- /dev/null
+++ b/native/src/correction_state.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_CORRECTION_STATE_H
+#define LATINIME_CORRECTION_STATE_H
+
+#include <stdint.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+struct CorrectionState {
+ int mParentIndex;
+ int mSiblingPos;
+ uint16_t mChildCount;
+ uint8_t mInputIndex;
+ uint8_t mProximityCount;
+ uint8_t mSkippedCount;
+ int8_t mSkipPos; // should be signed
+ bool mMatching;
+ bool mSkipping;
+ bool mProximityMatching;
+ bool mNeedsToTraverseAllNodes;
+
+};
+
+inline static void initCorrectionState(CorrectionState *state, const int rootPos,
+ const uint16_t childCount, const bool traverseAll) {
+ state->mParentIndex = -1;
+ state->mChildCount = childCount;
+ state->mInputIndex = 0;
+ state->mProximityCount = 0;
+ state->mSiblingPos = rootPos;
+ state->mSkippedCount = 0;
+ state->mMatching = false;
+ state->mSkipping = false;
+ state->mProximityMatching = false;
+ state->mNeedsToTraverseAllNodes = traverseAll;
+ state->mSkipPos = -1;
+}
+
+} // namespace latinime
+#endif // LATINIME_CORRECTION_STATE_H
diff --git a/native/src/defines.h b/native/src/defines.h
index 5a5d3ee0c..c1d08e695 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -94,20 +94,36 @@ static void prof_out(void) {
#endif
#define DEBUG_DICT true
#define DEBUG_DICT_FULL false
+#define DEBUG_EDIT_DISTANCE false
#define DEBUG_SHOW_FOUND_WORD DEBUG_DICT_FULL
#define DEBUG_NODE DEBUG_DICT_FULL
#define DEBUG_TRACE DEBUG_DICT_FULL
#define DEBUG_PROXIMITY_INFO true
+#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0)
+
+static char charBuf[50];
+
+static void dumpWord(const unsigned short* word, const int length) {
+ for (int i = 0; i < length; ++i) {
+ charBuf[i] = word[i];
+ }
+ charBuf[length] = 0;
+ LOGI("[ %s ]", charBuf);
+}
+
#else // FLAG_DBG
#define DEBUG_DICT false
#define DEBUG_DICT_FULL false
+#define DEBUG_EDIT_DISTANCE false
#define DEBUG_SHOW_FOUND_WORD false
#define DEBUG_NODE false
#define DEBUG_TRACE false
#define DEBUG_PROXIMITY_INFO false
+#define DUMP_WORD(word, length)
+
#endif // FLAG_DBG
#ifndef U_SHORT_MAX
@@ -160,6 +176,7 @@ static void prof_out(void) {
#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
#define FULL_MATCHED_WORDS_PROMOTION_RATE 120
#define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90
+#define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions.
diff --git a/native/src/proximity_info.cpp b/native/src/proximity_info.cpp
index d437e251a..361bdacbf 100644
--- a/native/src/proximity_info.cpp
+++ b/native/src/proximity_info.cpp
@@ -68,6 +68,10 @@ bool ProximityInfo::hasSpaceProximity(const int x, const int y) const {
void ProximityInfo::setInputParams(const int* inputCodes, const int inputLength) {
mInputCodes = inputCodes;
mInputLength = inputLength;
+ for (int i = 0; i < inputLength; ++i) {
+ mPrimaryInputWord[i] = getPrimaryCharAt(i);
+ }
+ mPrimaryInputWord[inputLength] = 0;
}
inline const int* ProximityInfo::getProximityCharsAt(const int index) const {
diff --git a/native/src/proximity_info.h b/native/src/proximity_info.h
index 5034c3b89..75fc8fb63 100644
--- a/native/src/proximity_info.h
+++ b/native/src/proximity_info.h
@@ -46,6 +46,10 @@ public:
ProximityType getMatchedProximityId(
const int index, const unsigned short c, const bool checkProximityChars) const;
bool sameAsTyped(const unsigned short *word, int length) const;
+ const unsigned short* getPrimaryInputWord() const {
+ return mPrimaryInputWord;
+ }
+
private:
int getStartIndexFromCoordinates(const int x, const int y) const;
const int MAX_PROXIMITY_CHARS_SIZE;
@@ -58,6 +62,7 @@ private:
const int *mInputCodes;
uint32_t *mProximityCharsArray;
int mInputLength;
+ unsigned short mPrimaryInputWord[MAX_WORD_LENGTH_INTERNAL];
};
} // namespace latinime
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index df1a2e273..6bc350505 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -187,20 +187,15 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
mCorrection->initCorrection(mProximityInfo, mInputLength, maxDepth);
PROF_END(0);
+ // TODO: remove
PROF_START(1);
- getSuggestionCandidates(-1, -1, -1);
+ // Note: This line is intentionally left blank
PROF_END(1);
PROF_START(2);
// Suggestion with missing character
- if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
- for (int i = 0; i < codesSize; ++i) {
- if (DEBUG_DICT) {
- LOGI("--- Suggest missing characters %d", i);
- }
- getSuggestionCandidates(i, -1, -1);
- }
- }
+ LOGI("--- Suggest missing characters");
+ getSuggestionCandidates(0, -1, -1);
PROF_END(2);
PROF_START(3);
@@ -352,44 +347,28 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
int rootPosition = ROOT_POS;
// Get the number of children of root, then increment the position
int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
- int depth = 0;
+ int outputIndex = 0;
- mStackChildCount[0] = childCount;
- mStackTraverseAll[0] = (mInputLength <= 0);
- mStackInputIndex[0] = 0;
- mStackDiffs[0] = 0;
- mStackSiblingPos[0] = rootPosition;
- mStackOutputIndex[0] = 0;
- mStackMatchedCount[0] = 0;
+ mCorrection->initCorrectionState(rootPosition, childCount, (mInputLength <= 0));
// Depth first search
- while (depth >= 0) {
- if (mStackChildCount[depth] > 0) {
- --mStackChildCount[depth];
- int siblingPos = mStackSiblingPos[depth];
+ while (outputIndex >= 0) {
+ if (mCorrection->initProcessState(outputIndex)) {
+ int siblingPos = mCorrection->getTreeSiblingPos(outputIndex);
int firstChildPos;
- mCorrection->initProcessState(
- mStackMatchedCount[depth], mStackInputIndex[depth], mStackOutputIndex[depth],
- mStackTraverseAll[depth], mStackDiffs[depth]);
- // needsToTraverseChildrenNodes should be false
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos,
mCorrection, &childCount, &firstChildPos, &siblingPos);
// Update next sibling pos
- mStackSiblingPos[depth] = siblingPos;
+ mCorrection->setTreeSiblingPos(outputIndex, siblingPos);
+
if (needsToTraverseChildrenNodes) {
// Goes to child node
- ++depth;
- mStackChildCount[depth] = childCount;
- mStackSiblingPos[depth] = firstChildPos;
-
- mCorrection->getProcessState(&mStackMatchedCount[depth],
- &mStackInputIndex[depth], &mStackOutputIndex[depth],
- &mStackTraverseAll[depth], &mStackDiffs[depth]);
+ outputIndex = mCorrection->goDownTree(outputIndex, childCount, firstChildPos);
}
} else {
// Goes to parent sibling node
- --depth;
+ outputIndex = mCorrection->getTreeParentIndex(outputIndex);
}
}
}
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 8bcd7cea5..cfe63ff79 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -19,6 +19,7 @@
#include <stdint.h>
#include "correction.h"
+#include "correction_state.h"
#include "defines.h"
#include "proximity_info.h"
@@ -134,13 +135,9 @@ private:
// MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
- int mStackMatchedCount[MAX_WORD_LENGTH_INTERNAL];
- int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];
- bool mStackTraverseAll[MAX_WORD_LENGTH_INTERNAL];
- int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];
- int mStackDiffs[MAX_WORD_LENGTH_INTERNAL];
- int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];
- int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL];
+ int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
+ int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
+ int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
};
} // namespace latinime