diff options
Diffstat (limited to 'native/src/correction.cpp')
-rw-r--r-- | native/src/correction.cpp | 650 |
1 files changed, 426 insertions, 224 deletions
diff --git a/native/src/correction.cpp b/native/src/correction.cpp index 27dc40745..087219ed4 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -16,12 +16,15 @@ #include <assert.h> #include <ctype.h> +#include <math.h> #include <stdio.h> #include <string.h> #define LOG_TAG "LatinIME: correction.cpp" +#include "char_utils.h" #include "correction.h" +#include "defines.h" #include "dictionary.h" #include "proximity_info.h" @@ -31,81 +34,60 @@ namespace latinime { // edit distance funcitons // ///////////////////////////// -#if 0 /* no longer used */ -inline static int editDistance( - int* editDistanceTable, const unsigned short* input, - const int inputLength, const unsigned short* output, const int outputLength) { - // dp[li][lo] dp[a][b] = dp[ a * lo + b] - int* dp = editDistanceTable; - const int li = inputLength + 1; - const int lo = outputLength + 1; - for (int i = 0; i < li; ++i) { - dp[lo * i] = i; - } - for (int i = 0; i < lo; ++i) { - dp[i] = i; - } - - for (int i = 0; i < li - 1; ++i) { - for (int j = 0; j < lo - 1; ++j) { - const uint32_t ci = Dictionary::toBaseLowerCase(input[i]); - const uint32_t co = Dictionary::toBaseLowerCase(output[j]); - const uint16_t cost = (ci == co) ? 0 : 1; - dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, - min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); - if (i > 0 && j > 0 && ci == Dictionary::toBaseLowerCase(output[j - 1]) - && co == Dictionary::toBaseLowerCase(input[i - 1])) { - dp[(i + 1) * lo + (j + 1)] = min( - dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); - } - } +inline static void initEditDistance(int *editDistanceTable) { + for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) { + editDistanceTable[i] = i; } +} - if (DEBUG_EDIT_DISTANCE) { - LOGI("IN = %d, OUT = %d", inputLength, outputLength); - for (int i = 0; i < li; ++i) { - for (int j = 0; j < lo; ++j) { - LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); +inline static void dumpEditDistance10ForDebug(int *editDistanceTable, + const int editDistanceTableWidth, const int outputLength) { + if (DEBUG_DICT) { + AKLOGI("EditDistanceTable"); + for (int i = 0; i <= 10; ++i) { + int c[11]; + for (int j = 0; j <= 10; ++j) { + if (j < editDistanceTableWidth + 1 && i < outputLength + 1) { + c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j]; + } else { + c[j] = -1; + } } + AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]", + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]); } } - return dp[li * lo - 1]; -} -#endif - -inline static void initEditDistance(int *editDistanceTable) { - for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) { - editDistanceTable[i] = i; - } } inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigned short *input, const int inputLength, const unsigned short *output, const int outputLength) { + // TODO: Make sure that editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] is not touched. // Let dp[i][j] be editDistanceTable[i * (inputLength + 1) + j]. // Assuming that dp[0][0] ... dp[outputLength - 1][inputLength] are already calculated, // and calculate dp[ouputLength][0] ... dp[outputLength][inputLength]. int *const current = editDistanceTable + outputLength * (inputLength + 1); const int *const prev = editDistanceTable + (outputLength - 1) * (inputLength + 1); const int *const prevprev = - outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : NULL; + outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0; current[0] = outputLength; - const uint32_t co = Dictionary::toBaseLowerCase(output[outputLength - 1]); - const uint32_t prevCO = - outputLength >= 2 ? Dictionary::toBaseLowerCase(output[outputLength - 2]) : 0; + const uint32_t co = toBaseLowerCase(output[outputLength - 1]); + const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0; for (int i = 1; i <= inputLength; ++i) { - const uint32_t ci = Dictionary::toBaseLowerCase(input[i - 1]); + const uint32_t ci = toBaseLowerCase(input[i - 1]); const uint16_t cost = (ci == co) ? 0 : 1; current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost)); - if (i >= 2 && prevprev && ci == prevCO - && co == Dictionary::toBaseLowerCase(input[i - 2])) { + if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) { current[i] = min(current[i], prevprev[i - 2] + 1); } } } -inline static int getCurrentEditDistance( - int *editDistanceTable, const int inputLength, const int outputLength) { - return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1]; +inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth, + const int outputLength, const int inputLength) { + if (DEBUG_EDIT_DISTANCE) { + AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); + } + return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputLength]; } ////////////////////// @@ -133,6 +115,9 @@ void Correction::initCorrection(const ProximityInfo *pi, const int inputLength, mInputLength = inputLength; mMaxDepth = maxDepth; mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; + // TODO: This is not supposed to be required. Check what's going wrong with + // editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] + initEditDistance(mEditDistanceTable); } void Correction::initCorrectionState( @@ -146,7 +131,7 @@ void Correction::initCorrectionState( void Correction::setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, const int spaceProximityPos, const int missingSpacePos, - const bool useFullEditDistance) { + const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) { // TODO: remove mTransposedPos = transposedPos; mExcessivePos = excessivePos; @@ -159,6 +144,8 @@ void Correction::setCorrectionParams(const int skipPos, const int excessivePos, mSpaceProximityPos = spaceProximityPos; mMissingSpacePos = missingSpacePos; mUseFullEditDistance = useFullEditDistance; + mDoAutoCompletion = doAutoCompletion; + mMaxErrors = maxErrors; } void Correction::checkState() { @@ -172,23 +159,34 @@ void Correction::checkState() { } } -int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq, - const unsigned short *word) { - return Correction::RankingAlgorithm::calcFreqForSplitTwoWords( - firstFreq, secondFreq, this, word); +int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray, + const int wordCount, const bool isSpaceProximity, const unsigned short *word) { + return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray, + wordCount, this, isSpaceProximity, word); } int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) { + return getFinalFreqInternal(freq, word, wordLength, mInputLength); +} + +int Correction::getFinalFreqForSubQueue(const int freq, unsigned short **word, int *wordLength, + const int inputLength) { + return getFinalFreqInternal(freq, word, wordLength, inputLength); +} + +int Correction::getFinalFreqInternal(const int freq, unsigned short **word, int *wordLength, + const int inputLength) { const int outputIndex = mTerminalOutputIndex; const int inputIndex = mTerminalInputIndex; *wordLength = outputIndex + 1; - if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) { - return -1; + if (outputIndex < MIN_SUGGEST_DEPTH) { + return NOT_A_FREQUENCY; } *word = mWord; - return Correction::RankingAlgorithm::calculateFinalFreq( - inputIndex, outputIndex, freq, mEditDistanceTable, this); + int finalFreq = Correction::RankingAlgorithm::calculateFinalFreq( + inputIndex, outputIndex, freq, mEditDistanceTable, this, inputLength); + return finalFreq; } bool Correction::initProcessState(const int outputIndex) { @@ -213,6 +211,7 @@ bool Correction::initProcessState(const int outputIndex) { mMatching = false; mProximityMatching = false; + mAdditionalProximityMatching = false; mTransposing = false; mExceeding = false; mSkipping = false; @@ -229,20 +228,10 @@ int Correction::goDownTree( } // TODO: remove -int Correction::getOutputIndex() { - return mOutputIndex; -} - -// TODO: remove int Correction::getInputIndex() { return mInputIndex; } -// TODO: remove -bool Correction::needsToTraverseAllNodes() { - return mNeedsToTraverseAllNodes; -} - void Correction::incrementInputIndex() { ++mInputIndex; } @@ -269,6 +258,7 @@ void Correction::incrementOutputIndex() { mCorrectionStates[mOutputIndex].mMatching = mMatching; mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching; + mCorrectionStates[mOutputIndex].mAdditionalProximityMatching = mAdditionalProximityMatching; mCorrectionStates[mOutputIndex].mTransposing = mTransposing; mCorrectionStates[mOutputIndex].mExceeding = mExceeding; mCorrectionStates[mOutputIndex].mSkipping = mSkipping; @@ -280,7 +270,9 @@ void Correction::startToTraverseAllNodes() { bool Correction::needsToPrune() const { // TODO: use edit distance here - return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance; + return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance + // Allow one char longer word for missing character + || (!mDoAutoCompletion && (mOutputIndex > mInputLength)); } void Correction::addCharToCurrentWord(const int32_t c) { @@ -290,13 +282,12 @@ void Correction::addCharToCurrentWord(const int32_t c) { mWord, mOutputIndex + 1); } -// TODO: inline? Correction::CorrectionType Correction::processSkipChar( const int32_t c, const bool isTerminal, const bool inputIndexIncremented) { addCharToCurrentWord(c); - if (needsToTraverseAllNodes() && isTerminal) { - mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); - mTerminalOutputIndex = mOutputIndex; + mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); + mTerminalOutputIndex = mOutputIndex; + if (mNeedsToTraverseAllNodes && isTerminal) { incrementOutputIndex(); return TRAVERSE_ALL_ON_TERMINAL; } else { @@ -305,19 +296,36 @@ Correction::CorrectionType Correction::processSkipChar( } } +Correction::CorrectionType Correction::processUnrelatedCorrectionType() { + // Needs to set mTerminalInputIndex and mTerminalOutputIndex before returning any CorrectionType + mTerminalInputIndex = mInputIndex; + mTerminalOutputIndex = mOutputIndex; + return UNRELATED; +} + inline bool isEquivalentChar(ProximityInfo::ProximityType type) { return type == ProximityInfo::EQUIVALENT_CHAR; } +inline bool isProximityCharOrEquivalentChar(ProximityInfo::ProximityType type) { + return type == ProximityInfo::EQUIVALENT_CHAR + || type == ProximityInfo::NEAR_PROXIMITY_CHAR; +} + Correction::CorrectionType Correction::processCharAndCalcState( const int32_t c, const bool isTerminal) { const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); + if (correctionCount > mMaxErrors) { + return processUnrelatedCorrectionType(); + } + // TODO: Change the limit if we'll allow two or more corrections const bool noCorrectionsHappenedSoFar = correctionCount == 0; const bool canTryCorrection = noCorrectionsHappenedSoFar; int proximityIndex = 0; mDistances[mOutputIndex] = NOT_A_DISTANCE; + // Skip checking this node if (mNeedsToTraverseAllNodes || isQuote(c)) { bool incremented = false; if (mLastCharExceeded && mInputIndex == mInputLength - 1) { @@ -342,6 +350,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( return processSkipChar(c, isTerminal, incremented); } + // Check possible corrections. if (mExcessivePos >= 0) { if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) { mExcessivePos = mOutputIndex; @@ -382,30 +391,42 @@ Correction::CorrectionType Correction::processCharAndCalcState( incrementInputIndex(); } else { --mTransposedCount; - if (DEBUG_CORRECTION) { + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); - LOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } - return UNRELATED; + return processUnrelatedCorrectionType(); } } // TODO: Change the limit if we'll allow two or more proximity chars with corrections - const bool checkProximityChars = noCorrectionsHappenedSoFar || mProximityCount == 0; + // Work around: When the mMaxErrors is 1, we only allow just one error + // including proximity correction. + const bool checkProximityChars = (mMaxErrors > 1) + ? (noCorrectionsHappenedSoFar || mProximityCount == 0) + : (noCorrectionsHappenedSoFar && mProximityCount == 0); + ProximityInfo::ProximityType matchedProximityCharId = secondTransposing ? ProximityInfo::EQUIVALENT_CHAR : mProximityInfo->getMatchedProximityId( mInputIndex, c, checkProximityChars, &proximityIndex); - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { + if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId + || ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { if (canTryCorrection && mOutputIndex > 0 && mCorrectionStates[mOutputIndex].mProximityMatching && mCorrectionStates[mOutputIndex].mExceeding && isEquivalentChar(mProximityInfo->getMatchedProximityId( mInputIndex, mWord[mOutputIndex - 1], false))) { - if (DEBUG_CORRECTION) { - LOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); } // Conversion p->e // Example: @@ -423,7 +444,11 @@ Correction::CorrectionType Correction::processCharAndCalcState( } } - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { + if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId + || ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + mAdditionalProximityMatching = true; + } // TODO: Optimize // As the current char turned out to be an unrelated char, // we will try other correction-types. Please note that mCorrectionStates[mOutputIndex] @@ -465,6 +490,18 @@ Correction::CorrectionType Correction::processCharAndCalcState( ++mSkippedCount; --mProximityCount; return processSkipChar(c, isTerminal, false); + } else if (mInputIndex - 1 < mInputLength + && mSkippedCount > 0 + && mCorrectionStates[mOutputIndex].mSkipping + && mCorrectionStates[mOutputIndex].mAdditionalProximityMatching + && isProximityCharOrEquivalentChar( + mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { + // Conversion s->a + incrementInputIndex(); + --mSkippedCount; + mProximityMatching = true; + ++mProximityCount; + mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputLength && isEquivalentChar( mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { @@ -475,17 +512,52 @@ Correction::CorrectionType Correction::processCharAndCalcState( ++mExcessiveCount; incrementInputIndex(); } + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + DUMP_WORD(mWord, mOutputIndex); + if (mTransposing) { + AKLOGI("TRANSPOSE: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } else { + AKLOGI("EXCEED: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } + } } else if (mSkipping) { // 3. Skip correction ++mSkippedCount; + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + AKLOGI("SKIP: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } return processSkipChar(c, isTerminal, false); + } else if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + // As a last resort, use additional proximity characters + mProximityMatching = true; + ++mProximityCount; + mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + AKLOGI("ADDITIONALPROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } } else { - if (DEBUG_CORRECTION) { + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); - LOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } - return UNRELATED; + return processUnrelatedCorrectionType(); } } else if (secondTransposing) { // If inputIndex is greater than mInputLength, that means there is no @@ -500,6 +572,13 @@ Correction::CorrectionType Correction::processCharAndCalcState( ++mProximityCount; mDistances[mOutputIndex] = mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex); + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 + || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { + AKLOGI("PROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + mTransposedCount, mExcessiveCount, c); + } } addCharToCurrentWord(c); @@ -533,13 +612,17 @@ Correction::CorrectionType Correction::processCharAndCalcState( || isSameAsUserTypedLength) && isTerminal) { mTerminalInputIndex = mInputIndex - 1; mTerminalOutputIndex = mOutputIndex - 1; - if (DEBUG_CORRECTION) { + if (DEBUG_CORRECTION + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); - LOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } return ON_TERMINAL; } else { + mTerminalInputIndex = mInputIndex - 1; + mTerminalOutputIndex = mOutputIndex - 1; return NOT_ON_TERMINAL; } } @@ -547,55 +630,6 @@ Correction::CorrectionType Correction::processCharAndCalcState( Correction::~Correction() { } -///////////////////////// -// static inline utils // -///////////////////////// - -static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; -static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { - return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); -} - -static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; -inline static void multiplyIntCapped(const int multiplier, int *base) { - const int temp = *base; - if (temp != S_INT_MAX) { - // Branch if multiplier == 2 for the optimization - if (multiplier == 2) { - *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; - } else { - // TODO: This overflow check gives a wrong answer when, for example, - // temp = 2^16 + 1 and multiplier = 2^17 + 1. - // Fix this behavior. - const int tempRetval = temp * multiplier; - *base = tempRetval >= temp ? tempRetval : S_INT_MAX; - } - } -} - -inline static int powerIntCapped(const int base, const int n) { - if (n <= 0) return 1; - if (base == 2) { - return n < 31 ? 1 << n : S_INT_MAX; - } else { - int ret = base; - for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); - return ret; - } -} - -inline static void multiplyRate(const int rate, int *freq) { - if (*freq != S_INT_MAX) { - if (*freq > 1000000) { - *freq /= 100; - multiplyIntCapped(rate, freq); - } else { - multiplyIntCapped(rate, freq); - *freq /= 100; - } - } -} - inline static int getQuoteCount(const unsigned short* word, const int length) { int quoteCount = 0; for (int i = 0; i < length; ++i) { @@ -607,13 +641,7 @@ inline static int getQuoteCount(const unsigned short* word, const int length) { } inline static bool isUpperCase(unsigned short c) { - if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { - c = BASE_CHARS[c]; - } - if (isupper(c)) { - return true; - } - return false; + return isAsciiUpper(toBaseChar(c)); } ////////////////////// @@ -622,9 +650,9 @@ inline static bool isUpperCase(unsigned short c) { /* static */ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex, - const int freq, int* editDistanceTable, const Correction* correction) { + const int freq, int* editDistanceTable, const Correction* correction, + const int inputLength) { const int excessivePos = correction->getExcessivePos(); - const int inputLength = correction->mInputLength; const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER; const ProximityInfo *proximityInfo = correction->mProximityInfo; @@ -649,45 +677,55 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const const unsigned short* word = correction->mWord; const bool skipped = skippedCount > 0; - const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1) + const int quoteDiffCount = max(0, getQuoteCount(word, outputLength) - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength)); // TODO: Calculate edit distance for transposed and excessive int ed = 0; + if (DEBUG_DICT_FULL) { + dumpEditDistance10ForDebug(editDistanceTable, correction->mInputLength, outputLength); + } int adjustedProximityMatchedCount = proximityMatchedCount; int finalFreq = freq; + if (DEBUG_CORRECTION_FREQ + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) { + AKLOGI("FinalFreq0: %d", finalFreq); + } // TODO: Optimize this. - // TODO: Ignoring edit distance for transposed char, for now - if (transposedCount == 0 && (proximityMatchedCount > 0 || skipped || excessiveCount > 0)) { - ed = getCurrentEditDistance(editDistanceTable, inputLength, outputIndex + 1); + if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) { + ed = getCurrentEditDistance(editDistanceTable, correction->mInputLength, outputLength, + inputLength) - transposedCount; + const int matchWeight = powerIntCapped(typedLetterMultiplier, - max(inputLength, outputIndex + 1) - ed); + max(inputLength, outputLength) - ed); multiplyIntCapped(matchWeight, &finalFreq); // TODO: Demote further if there are two or more excessive chars with longer user input? - if (inputLength > outputIndex + 1) { + if (inputLength > outputLength) { multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq); } ed = max(0, ed - quoteDiffCount); - - if (ed == 1 && (inputLength == outputIndex || inputLength == outputIndex + 2)) { - // Promote a word with just one skipped or excessive char - if (sameLength) { - multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq); - } else { + adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputLength)), + proximityMatchedCount); + if (transposedCount < 1) { + if (ed == 1 && (inputLength == outputLength - 1 || inputLength == outputLength + 1)) { + // Promote a word with just one skipped or excessive char + if (sameLength) { + multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE + + WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER * outputLength, + &finalFreq); + } else { + multiplyIntCapped(typedLetterMultiplier, &finalFreq); + } + } else if (ed == 0) { multiplyIntCapped(typedLetterMultiplier, &finalFreq); + sameLength = true; } - } else if (ed == 0) { - multiplyIntCapped(typedLetterMultiplier, &finalFreq); - sameLength = true; } - adjustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)), - proximityMatchedCount); } else { - // TODO: Calculate the edit distance for transposed char const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); multiplyIntCapped(matchWeight, &finalFreq); } @@ -707,7 +745,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const / (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); if (DEBUG_DICT_FULL) { - LOGI("Demotion rate for missing character is %d.", demotionRate); + AKLOGI("Demotion rate for missing character is %d.", demotionRate); } multiplyRate(demotionRate, &finalFreq); } @@ -720,8 +758,8 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const if (excessiveCount > 0) { multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); if (!lastCharExceeded && !proximityInfo->existsAdjacentProximityChars(excessivePos)) { - if (DEBUG_CORRECTION_FREQ) { - LOGI("Double excessive demotion"); + if (DEBUG_DICT_FULL) { + AKLOGI("Double excessive demotion"); } // If an excessive character is not adjacent to the left char or the right char, // we will demote this word. @@ -729,11 +767,14 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const } } + const bool performTouchPositionCorrection = + CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled() + && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0; // Score calibration by touch coordinates is being done only for pure-fat finger typing error // cases. + int additionalProximityCount = 0; // TODO: Remove this constraint. - if (CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled() - && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0) { + if (performTouchPositionCorrection) { for (int i = 0; i < outputLength; ++i) { const int squaredDistance = correction->mDistances[i]; if (i < adjustedProximityMatchedCount) { @@ -744,40 +785,60 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const static const float A = ZERO_DISTANCE_PROMOTION_RATE / 100.0f; static const float B = 1.0f; static const float C = 0.5f; + static const float MIN = 0.3f; static const float R1 = NEUTRAL_SCORE_SQUARED_RADIUS; static const float R2 = HALF_SCORE_SQUARED_RADIUS; const float x = (float)squaredDistance / ProximityInfo::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR; - const float factor = (x < R1) + const float factor = max((x < R1) ? (A * (R1 - x) + B * x) / R1 - : (B * (R2 - x) + C * (x - R1)) / (R2 - R1); + : (B * (R2 - x) + C * (x - R1)) / (R2 - R1), MIN); // factor is piecewise linear function like: // A -_ . // ^-_ . // B \ . - // \ . - // C \ . - // 0 R1 R2 - if (factor <= 0) { - return -1; - } + // \_ . + // C ------------. + // . + // 0 R1 R2 . multiplyRate((int)(factor * 100), &finalFreq); } else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) { multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); + } else if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) { + ++additionalProximityCount; + multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); } } } else { + // Demote additional proximity characters + for (int i = 0; i < outputLength; ++i) { + const int squaredDistance = correction->mDistances[i]; + if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) { + ++additionalProximityCount; + } + } // Promotion for a word with proximity characters for (int i = 0; i < adjustedProximityMatchedCount; ++i) { // A word with proximity corrections if (DEBUG_DICT_FULL) { - LOGI("Found a proximity correction."); + AKLOGI("Found a proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &finalFreq); - multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); + if (i < additionalProximityCount) { + multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); + } else { + multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); + } } } + // If the user types too many(three or more) proximity characters with additional proximity + // character,do not treat as the same length word. + if (sameLength && additionalProximityCount > 0 && (adjustedProximityMatchedCount >= 3 + || transposedCount > 0 || skipped || excessiveCount > 0)) { + sameLength = false; + } + const int errorCount = adjustedProximityMatchedCount > 0 ? adjustedProximityMatchedCount : (proximityMatchedCount + transposedCount); @@ -787,13 +848,15 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const // Promotion for an exactly matched word if (ed == 0) { // Full exact match - if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0) { + if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0 + && quoteDiffCount == 0 && additionalProximityCount == 0) { finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); } } // Promote a word with no correction - if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0) { + if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0 + && additionalProximityCount == 0) { multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); } @@ -832,71 +895,101 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const } if (DEBUG_DICT_FULL) { - LOGI("calc: %d, %d", outputIndex, sameLength); + AKLOGI("calc: %d, %d", outputLength, sameLength); } - if (DEBUG_CORRECTION_FREQ) { - DUMP_WORD(correction->mWord, outputIndex + 1); - LOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount, - skippedCount, transposedCount, excessiveCount, lastCharExceeded, sameLength, - quoteDiffCount, ed, finalFreq); + if (DEBUG_CORRECTION_FREQ + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) { + DUMP_WORD(proximityInfo->getPrimaryInputWord(), inputLength); + DUMP_WORD(correction->mWord, outputLength); + AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d, A%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount, + skippedCount, transposedCount, excessiveCount, additionalProximityCount, + outputLength, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq); } return finalFreq; } /* static */ -int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( - const int firstFreq, const int secondFreq, const Correction* correction, - const unsigned short *word) { - const int spaceProximityPos = correction->mSpaceProximityPos; - const int missingSpacePos = correction->mMissingSpacePos; - if (DEBUG_DICT) { - int inputCount = 0; - if (spaceProximityPos >= 0) ++inputCount; - if (missingSpacePos >= 0) ++inputCount; - assert(inputCount <= 1); - } - const bool isSpaceProximity = spaceProximityPos >= 0; - const int inputLength = correction->mInputLength; - const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; - const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) - : (inputLength - missingSpacePos); +int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords( + const int *freqArray, const int *wordLengthArray, const int wordCount, + const Correction* correction, const bool isSpaceProximity, const unsigned short *word) { const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; bool firstCapitalizedWordDemotion = false; - if (firstWordLength >= 2) { - firstCapitalizedWordDemotion = isUpperCase(word[0]); - } - bool secondCapitalizedWordDemotion = false; - if (secondWordLength >= 2) { - secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); + + { + // TODO: Handle multiple capitalized word demotion properly + const int firstWordLength = wordLengthArray[0]; + const int secondWordLength = wordLengthArray[1]; + if (firstWordLength >= 2) { + firstCapitalizedWordDemotion = isUpperCase(word[0]); + } + + if (secondWordLength >= 2) { + // FIXME: word[firstWordLength + 1] is incorrect. + secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); + } } + const bool capitalizedWordDemotion = firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; - if (DEBUG_DICT_FULL) { - LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion); + int totalLength = 0; + int totalFreq = 0; + for (int i = 0; i < wordCount; ++i){ + const int wordLength = wordLengthArray[i]; + if (wordLength <= 0) { + return 0; + } + totalLength += wordLength; + const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1); + int tempFirstFreq = freqArray[i]; + multiplyRate(demotionRate, &tempFirstFreq); + totalFreq += tempFirstFreq; } - if (firstWordLength == 0 || secondWordLength == 0) { + if (totalLength <= 0 || totalFreq <= 0) { return 0; } - const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); - int tempFirstFreq = firstFreq; - multiplyRate(firstDemotionRate, &tempFirstFreq); - - const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); - int tempSecondFreq = secondFreq; - multiplyRate(secondDemotionRate, &tempSecondFreq); - - const int totalLength = firstWordLength + secondWordLength; + // TODO: Currently totalFreq is adjusted to two word metrix. // Promote pairFreq with multiplying by 2, because the word length is the same as the typed // length. - int totalFreq = tempFirstFreq + tempSecondFreq; + totalFreq = totalFreq * 2 / wordCount; + if (wordCount > 2) { + // Safety net for 3+ words -- Caveats: many heuristics and workarounds here. + int oneLengthCounter = 0; + int twoLengthCounter = 0; + for (int i = 0; i < wordCount; ++i) { + const int wordLength = wordLengthArray[i]; + // TODO: Use bigram instead of this safety net + if (i < wordCount - 1) { + const int nextWordLength = wordLengthArray[i + 1]; + if (wordLength == 1 && nextWordLength == 2) { + // Safety net to filter 1 length and 2 length sequential words + return 0; + } + } + const int freq = freqArray[i]; + // Demote too short weak words + if (wordLength <= 4 && freq <= MAX_FREQ * 2 / 3 /* heuristic... */) { + multiplyRate(100 * freq / MAX_FREQ, &totalFreq); + } + if (wordLength == 1) { + ++oneLengthCounter; + } else if (wordLength == 2) { + ++twoLengthCounter; + } + if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) { + // Safety net to filter too many short words + return 0; + } + } + multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq); + } // This is a workaround to try offsetting the not-enough-demotion which will be done in // calcNormalizedScore in Utils.java. @@ -923,19 +1016,128 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( if (isSpaceProximity) { // A word pair with one space proximity correction if (DEBUG_DICT) { - LOGI("Found a word pair with space proximity correction."); + AKLOGI("Found a word pair with space proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &totalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); } - multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + if (isSpaceProximity) { + multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq); + } else { + multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + } if (capitalizedWordDemotion) { multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); } + if (DEBUG_CORRECTION_FREQ) { + AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1], + wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq); + DUMP_WORD(word, wordLengthArray[0]); + } + return totalFreq; } +/* Damerau-Levenshtein distance */ +inline static int editDistanceInternal( + int* editDistanceTable, const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength) { + // dp[li][lo] dp[a][b] = dp[ a * lo + b] + int* dp = editDistanceTable; + const int li = beforeLength + 1; + const int lo = afterLength + 1; + for (int i = 0; i < li; ++i) { + dp[lo * i] = i; + } + for (int i = 0; i < lo; ++i) { + dp[i] = i; + } + + for (int i = 0; i < li - 1; ++i) { + for (int j = 0; j < lo - 1; ++j) { + const uint32_t ci = toBaseLowerCase(before[i]); + const uint32_t co = toBaseLowerCase(after[j]); + const uint16_t cost = (ci == co) ? 0 : 1; + dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, + min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); + if (i > 0 && j > 0 && ci == toBaseLowerCase(after[j - 1]) + && co == toBaseLowerCase(before[i - 1])) { + dp[(i + 1) * lo + (j + 1)] = min( + dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); + } + } + } + + if (DEBUG_EDIT_DISTANCE) { + AKLOGI("IN = %d, OUT = %d", beforeLength, afterLength); + for (int i = 0; i < li; ++i) { + for (int j = 0; j < lo; ++j) { + AKLOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); + } + } + } + return dp[li * lo - 1]; +} + +int Correction::RankingAlgorithm::editDistance(const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength) { + int table[(beforeLength + 1) * (afterLength + 1)]; + return editDistanceInternal(table, before, beforeLength, after, afterLength); +} + + +// In dictionary.cpp, getSuggestion() method, +// suggestion scores are computed using the below formula. +// original score +// := pow(mTypedLetterMultiplier (this is defined 2), +// (the number of matched characters between typed word and suggested word)) +// * (individual word's score which defined in the unigram dictionary, +// and this score is defined in range [0, 255].) +// Then, the following processing is applied. +// - If the dictionary word is matched up to the point of the user entry +// (full match up to min(before.length(), after.length()) +// => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2) +// - If the word is a true full match except for differences in accents or +// capitalization, then treat it as if the score was 255. +// - If before.length() == after.length() +// => multiply by mFullWordMultiplier (this is defined 2)) +// So, maximum original score is pow(2, min(before.length(), after.length())) * 255 * 2 * 1.2 +// For historical reasons we ignore the 1.2 modifier (because the measure for a good +// autocorrection threshold was done at a time when it didn't exist). This doesn't change +// the result. +// So, we can normalize original score by dividing pow(2, min(b.l(),a.l())) * 255 * 2. + +/* static */ +double Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength, + const int score) { + if (0 == beforeLength || 0 == afterLength) { + return 0; + } + const int distance = editDistance(before, beforeLength, after, afterLength); + int spaceCount = 0; + for (int i = 0; i < afterLength; ++i) { + if (after[i] == CODE_SPACE) { + ++spaceCount; + } + } + + if (spaceCount == afterLength) { + return 0; + } + + const double maxScore = score >= S_INT_MAX ? S_INT_MAX : MAX_INITIAL_SCORE + * pow((double)TYPED_LETTER_MULTIPLIER, + (double)min(beforeLength, afterLength - spaceCount)) * FULL_WORD_MULTIPLIER; + + // add a weight based on edit distance. + // distance <= max(afterLength, beforeLength) == afterLength, + // so, 0 <= distance / afterLength <= 1 + const double weight = 1.0 - (double) distance / afterLength; + return (score / maxScore) * weight; +} + } // namespace latinime |