diff options
Diffstat (limited to 'native')
-rw-r--r-- | native/src/correction.cpp | 274 | ||||
-rw-r--r-- | native/src/correction.h | 69 | ||||
-rw-r--r-- | native/src/correction_state.h | 57 | ||||
-rw-r--r-- | native/src/defines.h | 17 | ||||
-rw-r--r-- | native/src/proximity_info.cpp | 4 | ||||
-rw-r--r-- | native/src/proximity_info.h | 5 | ||||
-rw-r--r-- | native/src/unigram_dictionary.cpp | 47 | ||||
-rw-r--r-- | native/src/unigram_dictionary.h | 11 |
8 files changed, 359 insertions, 125 deletions
diff --git a/native/src/correction.cpp b/native/src/correction.cpp index 6d682c0c9..a4090a966 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -21,6 +21,7 @@ #define LOG_TAG "LatinIME: correction.cpp" #include "correction.h" +#include "dictionary.h" #include "proximity_info.h" namespace latinime { @@ -49,12 +50,21 @@ void Correction::initCorrection(const ProximityInfo *pi, const int inputLength, mInputLength = inputLength; mMaxDepth = maxDepth; mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; - mSkippedOutputIndex = -1; +} + +void Correction::initCorrectionState( + const int rootPos, const int childCount, const bool traverseAll) { + latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll); + // TODO: remove + mCorrectionStates[0].mSkipPos = mSkipPos; } void Correction::setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, const int spaceProximityPos, const int missingSpacePos) { + // TODO: remove mSkipPos = skipPos; + // TODO: remove + mCorrectionStates[0].mSkipPos = skipPos; mExcessivePos = excessivePos; mTransposedPos = transposedPos; mSpaceProximityPos = spaceProximityPos; @@ -83,33 +93,37 @@ int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLen if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) { return -1; } + *word = mWord; const bool sameLength = (mExcessivePos == mInputLength - 1) ? (mInputLength == inputIndex + 2) : (mInputLength == inputIndex + 1); return Correction::RankingAlgorithm::calculateFinalFreq( - inputIndex, outputIndex, mMatchedCharCount, freq, sameLength, this); + inputIndex, outputIndex, freq, sameLength, mEditDistanceTable, this); } -void Correction::initProcessState(const int matchCount, const int inputIndex, - const int outputIndex, const bool traverseAllNodes, const int diffs) { - mMatchedCharCount = matchCount; - mInputIndex = inputIndex; +bool Correction::initProcessState(const int outputIndex) { + if (mCorrectionStates[outputIndex].mChildCount <= 0) { + return false; + } mOutputIndex = outputIndex; - mTraverseAllNodes = traverseAllNodes; - mDiffs = diffs; -} - -void Correction::getProcessState(int *matchedCount, int *inputIndex, int *outputIndex, - bool *traverseAllNodes, int *diffs) { - *matchedCount = mMatchedCharCount; - *inputIndex = mInputIndex; - *outputIndex = mOutputIndex; - *traverseAllNodes = mTraverseAllNodes; - *diffs = mDiffs; + --(mCorrectionStates[outputIndex].mChildCount); + mInputIndex = mCorrectionStates[outputIndex].mInputIndex; + mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes; + mProximityCount = mCorrectionStates[outputIndex].mProximityCount; + mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount; + mSkipPos = mCorrectionStates[outputIndex].mSkipPos; + mSkipping = false; + mProximityMatching = false; + mMatching = false; + return true; } -void Correction::charMatched() { - ++mMatchedCharCount; +int Correction::goDownTree( + const int parentIndex, const int childCount, const int firstChildPos) { + mCorrectionStates[mOutputIndex].mParentIndex = parentIndex; + mCorrectionStates[mOutputIndex].mChildCount = childCount; + mCorrectionStates[mOutputIndex].mSiblingPos = firstChildPos; + return mOutputIndex; } // TODO: remove @@ -123,8 +137,8 @@ int Correction::getInputIndex() { } // TODO: remove -bool Correction::needsToTraverseAll() { - return mTraverseAllNodes; +bool Correction::needsToTraverseAllNodes() { + return mNeedsToTraverseAllNodes; } void Correction::incrementInputIndex() { @@ -133,21 +147,32 @@ void Correction::incrementInputIndex() { void Correction::incrementOutputIndex() { ++mOutputIndex; + mCorrectionStates[mOutputIndex].mParentIndex = mCorrectionStates[mOutputIndex - 1].mParentIndex; + mCorrectionStates[mOutputIndex].mChildCount = mCorrectionStates[mOutputIndex - 1].mChildCount; + mCorrectionStates[mOutputIndex].mSiblingPos = mCorrectionStates[mOutputIndex - 1].mSiblingPos; + mCorrectionStates[mOutputIndex].mInputIndex = mInputIndex; + mCorrectionStates[mOutputIndex].mNeedsToTraverseAllNodes = mNeedsToTraverseAllNodes; + mCorrectionStates[mOutputIndex].mProximityCount = mProximityCount; + mCorrectionStates[mOutputIndex].mSkippedCount = mSkippedCount; + mCorrectionStates[mOutputIndex].mSkipping = mSkipping; + mCorrectionStates[mOutputIndex].mSkipPos = mSkipPos; + mCorrectionStates[mOutputIndex].mMatching = mMatching; + mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching; } -void Correction::startTraverseAll() { - mTraverseAllNodes = true; +void Correction::startToTraverseAllNodes() { + mNeedsToTraverseAllNodes = true; } bool Correction::needsToPrune() const { return (mOutputIndex - 1 >= (mTransposedPos >= 0 ? mInputLength - 1 : mMaxDepth) - || mDiffs > mMaxEditDistance); + || mProximityCount > mMaxEditDistance); } Correction::CorrectionType Correction::processSkipChar( const int32_t c, const bool isTerminal) { mWord[mOutputIndex] = c; - if (needsToTraverseAll() && isTerminal) { + if (needsToTraverseAllNodes() && isTerminal) { mTerminalInputIndex = mInputIndex; mTerminalOutputIndex = mOutputIndex; incrementOutputIndex(); @@ -169,10 +194,31 @@ Correction::CorrectionType Correction::processCharAndCalcState( bool skip = false; if (mSkipPos >= 0) { + if (mSkippedCount == 0 && mSkipPos < mOutputIndex) { + if (DEBUG_DICT) { + assert(mSkipPos == mOutputIndex - 1); + } + ++mSkipPos; + } skip = mSkipPos == mOutputIndex; + mSkipping = true; } - if (mTraverseAllNodes || isQuote(c)) { + if (mNeedsToTraverseAllNodes || isQuote(c)) { + const bool checkProximityChars = + !(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0); + // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of + // proximity chars of "s", but it should rather be handled as a skipped char. + if (checkProximityChars + && mInputIndex > 0 + && mCorrectionStates[mOutputIndex].mProximityMatching + && mCorrectionStates[mOutputIndex].mSkipping + && mProximityInfo->getMatchedProximityId( + mInputIndex - 1, c, false) + == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) { + ++mSkippedCount; + --mProximityCount; + } return processSkipChar(c, isTerminal); } else { int inputIndexForProximity = mInputIndex; @@ -186,40 +232,40 @@ Correction::CorrectionType Correction::processCharAndCalcState( } } + // TODO: sum counters const bool checkProximityChars = - !(mSkipPos >= 0 || mExcessivePos >= 0 || mTransposedPos >= 0); + !(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0); int matchedProximityCharId = mProximityInfo->getMatchedProximityId( inputIndexForProximity, c, checkProximityChars); - const bool unrelated = ProximityInfo::UNRELATED_CHAR == matchedProximityCharId; - if (unrelated) { - if (skip) { + if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { + if (skip && mProximityCount == 0) { // Skip this letter and continue deeper - mSkippedOutputIndex = mOutputIndex; + ++mSkippedCount; + return processSkipChar(c, isTerminal); + } else if (checkProximityChars + && inputIndexForProximity > 0 + && mCorrectionStates[mOutputIndex].mProximityMatching + && mCorrectionStates[mOutputIndex].mSkipping + && mProximityInfo->getMatchedProximityId( + inputIndexForProximity - 1, c, false) + == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) { + ++mSkippedCount; + --mProximityCount; return processSkipChar(c, isTerminal); } else { return UNRELATED; } - } - - // No need to skip. Finish traversing and increment skipPos. - // TODO: Remove this? - if (skip) { - mWord[mOutputIndex] = c; - incrementOutputIndex(); - return TRAVERSE_ALL_NOT_ON_TERMINAL; + } else if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { + // If inputIndex is greater than mInputLength, that means there is no + // proximity chars. So, we don't need to check proximity. + mMatching = true; + } else if (ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) { + mProximityMatching = true; + incrementProximityCount(); } mWord[mOutputIndex] = c; - // If inputIndex is greater than mInputLength, that means there is no - // proximity chars. So, we don't need to check proximity. - if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { - charMatched(); - } - - if (ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) { - incrementDiffs(); - } const bool isSameAsUserTypedLength = mInputLength == getInputIndex() + 1 @@ -232,7 +278,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( } // Start traversing all nodes after the index exceeds the user typed length if (isSameAsUserTypedLength) { - startTraverseAll(); + startToTraverseAllNodes(); } // Finally, we are ready to go to the next character, the next "virtual node". @@ -298,26 +344,117 @@ inline static void multiplyRate(const int rate, int *freq) { } } +/* static */ +inline static int editDistance( + int* editDistanceTable, const unsigned short* input, + const int inputLength, const unsigned short* output, const int outputLength) { + // dp[li][lo] dp[a][b] = dp[ a * lo + b] + int* dp = editDistanceTable; + const int li = inputLength + 1; + const int lo = outputLength + 1; + for (int i = 0; i < li; ++i) { + dp[lo * i] = i; + } + for (int i = 0; i < lo; ++i) { + dp[i] = i; + } + + for (int i = 0; i < li - 1; ++i) { + for (int j = 0; j < lo - 1; ++j) { + const uint32_t ci = Dictionary::toBaseLowerCase(input[i]); + const uint32_t co = Dictionary::toBaseLowerCase(output[j]); + const uint16_t cost = (ci == co) ? 0 : 1; + dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, + min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); + if (li > 0 && lo > 0 + && ci == Dictionary::toBaseLowerCase(output[j - 1]) + && co == Dictionary::toBaseLowerCase(input[i - 1])) { + dp[(i + 1) * lo + (j + 1)] = min( + dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); + } + } + } + + if (DEBUG_EDIT_DISTANCE) { + LOGI("IN = %d, OUT = %d", inputLength, outputLength); + for (int i = 0; i < li; ++i) { + for (int j = 0; j < lo; ++j) { + LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); + } + } + } + return dp[li * lo - 1]; +} + ////////////////////// // RankingAlgorithm // ////////////////////// -int Correction::RankingAlgorithm::calculateFinalFreq( - const int inputIndex, const int outputIndex, - const int matchCount, const int freq, const bool sameLength, +/* static */ +int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex, + const int freq, const bool sameLength, int* editDistanceTable, const Correction* correction) { - const int skipPos = correction->getSkipPos(); const int excessivePos = correction->getExcessivePos(); const int transposedPos = correction->getTransposedPos(); const int inputLength = correction->mInputLength; const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER; const ProximityInfo *proximityInfo = correction->mProximityInfo; - const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); + const int skipCount = correction->mSkippedCount; + const int proximityMatchedCount = correction->mProximityCount; + + // TODO: use mExcessiveCount + int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0); + + const unsigned short* word = correction->mWord; + const bool skipped = skipCount > 0; + + // ----- TODO: use edit distance here as follows? ---------------------- / + //if (!skipped && excessivePos < 0 && transposedPos < 0) { + // const int ed = editDistance(dp, proximityInfo->getInputWord(), + // inputLength, word, outputIndex + 1); + // matchCount = outputIndex + 1 - ed; + // if (ed == 1 && !sameLength) ++matchCount; + //} + // const int ed = editDistance(dp, proximityInfo->getInputWord(), + // inputLength, word, outputIndex + 1); + // if (ed == 1 && !sameLength) ++matchCount; ------------------------ / + int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); // TODO: Demote by edit distance int finalFreq = freq * matchWeight; - if (skipPos >= 0) { + // +1 +11/-12 + /*if (inputLength == outputIndex && !skipped && excessivePos < 0 && transposedPos < 0) { + const int ed = editDistance(dp, proximityInfo->getInputWord(), + inputLength, word, outputIndex + 1); + if (ed == 1) { + multiplyRate(160, &finalFreq); + } + }*/ + if (inputLength == outputIndex && excessivePos < 0 && transposedPos < 0 + && (proximityMatchedCount > 0 || skipped)) { + const int ed = editDistance(editDistanceTable, proximityInfo->getPrimaryInputWord(), + inputLength, word, outputIndex + 1); + if (ed == 1) { + multiplyRate(160, &finalFreq); + } + } + + // TODO: Promote properly? + //if (skipCount == 1 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex + // && !sameLength) { + // multiplyRate(150, &finalFreq); + //} + //if (skipCount == 0 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex + // && !sameLength) { + // multiplyRate(150, &finalFreq); + //} + //if (skipCount == 0 && excessivePos < 0 && transposedPos < 0 + // && inputLength == outputIndex + 1) { + // multiplyRate(150, &finalFreq); + //} + + if (skipped) { if (inputLength >= 2) { const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) @@ -351,10 +488,10 @@ int Correction::RankingAlgorithm::calculateFinalFreq( } multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); } - if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) { + if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) { finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); } - } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 + } else if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0 && outputIndex > 0) { // A word with proximity corrections if (DEBUG_DICT) { @@ -363,13 +500,34 @@ int Correction::RankingAlgorithm::calculateFinalFreq( multiplyIntCapped(typedLetterMultiplier, &finalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); } - if (DEBUG_DICT) { + if (DEBUG_DICT_FULL) { LOGI("calc: %d, %d", outputIndex, sameLength); } if (sameLength) multiplyIntCapped(fullWordMultiplier, &finalFreq); + + // TODO: check excessive count and transposed count + /* + If the last character of the user input word is the same as the next character + of the output word, and also all of characters of the user input are matched + to the output word, we'll promote that word a bit because + that word can be considered the combination of skipped and matched characters. + This means that the 'sm' pattern wins over the 'ma' pattern. + e.g.) + shel -> shell [mmmma] or [mmmsm] + hel -> hello [mmmaa] or [mmsma] + m ... matching + s ... skipping + a ... traversing all + */ + if (matchCount == inputLength && matchCount >= 2 && !skipped + && word[matchCount] == word[matchCount - 1]) { + multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq); + } + return finalFreq; } +/* static */ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( const int firstFreq, const int secondFreq, const Correction* correction) { const int spaceProximityPos = correction->mSpaceProximityPos; diff --git a/native/src/correction.h b/native/src/correction.h index ae6c7a421..9d385a44e 100644 --- a/native/src/correction.h +++ b/native/src/correction.h @@ -18,6 +18,7 @@ #define LATINIME_CORRECTION_H #include <stdint.h> +#include "correction_state.h" #include "defines.h" @@ -39,16 +40,16 @@ public: Correction(const int typedLetterMultiplier, const int fullWordMultiplier); void initCorrection( const ProximityInfo *pi, const int inputLength, const int maxWordLength); + void initCorrectionState(const int rootPos, const int childCount, const bool traverseAll); + + // TODO: remove void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, const int spaceProximityPos, const int missingSpacePos); void checkState(); - void initProcessState(const int matchCount, const int inputIndex, const int outputIndex, - const bool traverseAllNodes, const int diffs); - void getProcessState(int *matchedCount, int *inputIndex, int *outputIndex, - bool *traverseAllNodes, int *diffs); + bool initProcessState(const int index); + int getOutputIndex(); int getInputIndex(); - bool needsToTraverseAll(); virtual ~Correction(); int getSpaceProximityPos() const { @@ -77,52 +78,68 @@ public: CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal); - int getDiffs() const { - return mDiffs; + ///////////////////////// + // Tree helper methods + int goDownTree(const int parentIndex, const int childCount, const int firstChildPos); + + inline int getTreeSiblingPos(const int index) const { + return mCorrectionStates[index].mSiblingPos; + } + + inline void setTreeSiblingPos(const int index, const int pos) { + mCorrectionStates[index].mSiblingPos = pos; + } + + inline int getTreeParentIndex(const int index) const { + return mCorrectionStates[index].mParentIndex; } private: - void charMatched(); - void incrementInputIndex(); - void incrementOutputIndex(); - void startTraverseAll(); + inline void incrementInputIndex(); + inline void incrementOutputIndex(); + inline bool needsToTraverseAllNodes(); + inline void startToTraverseAllNodes(); + inline bool isQuote(const unsigned short c); + inline CorrectionType processSkipChar(const int32_t c, const bool isTerminal); // TODO: remove - - void incrementDiffs() { - ++mDiffs; + inline void incrementProximityCount() { + ++mProximityCount; } const int TYPED_LETTER_MULTIPLIER; const int FULL_WORD_MULTIPLIER; - const ProximityInfo *mProximityInfo; int mMaxEditDistance; int mMaxDepth; int mInputLength; - int mSkipPos; - int mSkippedOutputIndex; int mExcessivePos; int mTransposedPos; int mSpaceProximityPos; int mMissingSpacePos; - - int mMatchedCharCount; - int mInputIndex; - int mOutputIndex; int mTerminalInputIndex; int mTerminalOutputIndex; - int mDiffs; - bool mTraverseAllNodes; unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; + // Caveat: Do not create multiple tables per thread as this table eats up RAM a lot. + int mEditDistanceTable[MAX_WORD_LENGTH_INTERNAL * MAX_WORD_LENGTH_INTERNAL]; - inline bool isQuote(const unsigned short c); - inline CorrectionType processSkipChar(const int32_t c, const bool isTerminal); + CorrectionState mCorrectionStates[MAX_WORD_LENGTH_INTERNAL]; + + // The following member variables are being used as cache values of the correction state. + int mOutputIndex; + int mInputIndex; + int mProximityCount; + int mSkippedCount; + int mSkipPos; + bool mNeedsToTraverseAllNodes; + bool mMatching; + bool mSkipping; + bool mProximityMatching; class RankingAlgorithm { public: static int calculateFinalFreq(const int inputIndex, const int depth, - const int matchCount, const int freq, const bool sameLength, + const int freq, const bool sameLength, int *editDistanceTable, const Correction* correction); static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, const Correction* correction); diff --git a/native/src/correction_state.h b/native/src/correction_state.h new file mode 100644 index 000000000..267deda9b --- /dev/null +++ b/native/src/correction_state.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_CORRECTION_STATE_H +#define LATINIME_CORRECTION_STATE_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +struct CorrectionState { + int mParentIndex; + int mSiblingPos; + uint16_t mChildCount; + uint8_t mInputIndex; + uint8_t mProximityCount; + uint8_t mSkippedCount; + int8_t mSkipPos; // should be signed + bool mMatching; + bool mSkipping; + bool mProximityMatching; + bool mNeedsToTraverseAllNodes; + +}; + +inline static void initCorrectionState(CorrectionState *state, const int rootPos, + const uint16_t childCount, const bool traverseAll) { + state->mParentIndex = -1; + state->mChildCount = childCount; + state->mInputIndex = 0; + state->mProximityCount = 0; + state->mSiblingPos = rootPos; + state->mSkippedCount = 0; + state->mMatching = false; + state->mSkipping = false; + state->mProximityMatching = false; + state->mNeedsToTraverseAllNodes = traverseAll; + state->mSkipPos = -1; +} + +} // namespace latinime +#endif // LATINIME_CORRECTION_STATE_H diff --git a/native/src/defines.h b/native/src/defines.h index 5a5d3ee0c..c1d08e695 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -94,20 +94,36 @@ static void prof_out(void) { #endif #define DEBUG_DICT true #define DEBUG_DICT_FULL false +#define DEBUG_EDIT_DISTANCE false #define DEBUG_SHOW_FOUND_WORD DEBUG_DICT_FULL #define DEBUG_NODE DEBUG_DICT_FULL #define DEBUG_TRACE DEBUG_DICT_FULL #define DEBUG_PROXIMITY_INFO true +#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0) + +static char charBuf[50]; + +static void dumpWord(const unsigned short* word, const int length) { + for (int i = 0; i < length; ++i) { + charBuf[i] = word[i]; + } + charBuf[length] = 0; + LOGI("[ %s ]", charBuf); +} + #else // FLAG_DBG #define DEBUG_DICT false #define DEBUG_DICT_FULL false +#define DEBUG_EDIT_DISTANCE false #define DEBUG_SHOW_FOUND_WORD false #define DEBUG_NODE false #define DEBUG_TRACE false #define DEBUG_PROXIMITY_INFO false +#define DUMP_WORD(word, length) + #endif // FLAG_DBG #ifndef U_SHORT_MAX @@ -160,6 +176,7 @@ static void prof_out(void) { #define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120 #define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90 +#define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java // This is only used for the size of array. Not to be used in c functions. diff --git a/native/src/proximity_info.cpp b/native/src/proximity_info.cpp index d437e251a..361bdacbf 100644 --- a/native/src/proximity_info.cpp +++ b/native/src/proximity_info.cpp @@ -68,6 +68,10 @@ bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { void ProximityInfo::setInputParams(const int* inputCodes, const int inputLength) { mInputCodes = inputCodes; mInputLength = inputLength; + for (int i = 0; i < inputLength; ++i) { + mPrimaryInputWord[i] = getPrimaryCharAt(i); + } + mPrimaryInputWord[inputLength] = 0; } inline const int* ProximityInfo::getProximityCharsAt(const int index) const { diff --git a/native/src/proximity_info.h b/native/src/proximity_info.h index 5034c3b89..75fc8fb63 100644 --- a/native/src/proximity_info.h +++ b/native/src/proximity_info.h @@ -46,6 +46,10 @@ public: ProximityType getMatchedProximityId( const int index, const unsigned short c, const bool checkProximityChars) const; bool sameAsTyped(const unsigned short *word, int length) const; + const unsigned short* getPrimaryInputWord() const { + return mPrimaryInputWord; + } + private: int getStartIndexFromCoordinates(const int x, const int y) const; const int MAX_PROXIMITY_CHARS_SIZE; @@ -58,6 +62,7 @@ private: const int *mInputCodes; uint32_t *mProximityCharsArray; int mInputLength; + unsigned short mPrimaryInputWord[MAX_WORD_LENGTH_INTERNAL]; }; } // namespace latinime diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index df1a2e273..6bc350505 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -187,20 +187,15 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, mCorrection->initCorrection(mProximityInfo, mInputLength, maxDepth); PROF_END(0); + // TODO: remove PROF_START(1); - getSuggestionCandidates(-1, -1, -1); + // Note: This line is intentionally left blank PROF_END(1); PROF_START(2); // Suggestion with missing character - if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) { - for (int i = 0; i < codesSize; ++i) { - if (DEBUG_DICT) { - LOGI("--- Suggest missing characters %d", i); - } - getSuggestionCandidates(i, -1, -1); - } - } + LOGI("--- Suggest missing characters"); + getSuggestionCandidates(0, -1, -1); PROF_END(2); PROF_START(3); @@ -352,44 +347,28 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, int rootPosition = ROOT_POS; // Get the number of children of root, then increment the position int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); - int depth = 0; + int outputIndex = 0; - mStackChildCount[0] = childCount; - mStackTraverseAll[0] = (mInputLength <= 0); - mStackInputIndex[0] = 0; - mStackDiffs[0] = 0; - mStackSiblingPos[0] = rootPosition; - mStackOutputIndex[0] = 0; - mStackMatchedCount[0] = 0; + mCorrection->initCorrectionState(rootPosition, childCount, (mInputLength <= 0)); // Depth first search - while (depth >= 0) { - if (mStackChildCount[depth] > 0) { - --mStackChildCount[depth]; - int siblingPos = mStackSiblingPos[depth]; + while (outputIndex >= 0) { + if (mCorrection->initProcessState(outputIndex)) { + int siblingPos = mCorrection->getTreeSiblingPos(outputIndex); int firstChildPos; - mCorrection->initProcessState( - mStackMatchedCount[depth], mStackInputIndex[depth], mStackOutputIndex[depth], - mStackTraverseAll[depth], mStackDiffs[depth]); - // needsToTraverseChildrenNodes should be false const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, mCorrection, &childCount, &firstChildPos, &siblingPos); // Update next sibling pos - mStackSiblingPos[depth] = siblingPos; + mCorrection->setTreeSiblingPos(outputIndex, siblingPos); + if (needsToTraverseChildrenNodes) { // Goes to child node - ++depth; - mStackChildCount[depth] = childCount; - mStackSiblingPos[depth] = firstChildPos; - - mCorrection->getProcessState(&mStackMatchedCount[depth], - &mStackInputIndex[depth], &mStackOutputIndex[depth], - &mStackTraverseAll[depth], &mStackDiffs[depth]); + outputIndex = mCorrection->goDownTree(outputIndex, childCount, firstChildPos); } } else { // Goes to parent sibling node - --depth; + outputIndex = mCorrection->getTreeParentIndex(outputIndex); } } } diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index 8bcd7cea5..cfe63ff79 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -19,6 +19,7 @@ #include <stdint.h> #include "correction.h" +#include "correction_state.h" #include "defines.h" #include "proximity_info.h" @@ -134,13 +135,9 @@ private: // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; - int mStackMatchedCount[MAX_WORD_LENGTH_INTERNAL]; - int mStackChildCount[MAX_WORD_LENGTH_INTERNAL]; - bool mStackTraverseAll[MAX_WORD_LENGTH_INTERNAL]; - int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL]; - int mStackDiffs[MAX_WORD_LENGTH_INTERNAL]; - int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL]; - int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL]; + int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove + int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove + int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove }; } // namespace latinime |