diff options
Diffstat (limited to 'native/src/correction.cpp')
-rw-r--r-- | native/src/correction.cpp | 191 |
1 files changed, 143 insertions, 48 deletions
diff --git a/native/src/correction.cpp b/native/src/correction.cpp index dc31bfae7..6a129d4e3 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -42,7 +42,7 @@ inline static void initEditDistance(int *editDistanceTable) { inline static void dumpEditDistance10ForDebug(int *editDistanceTable, const int inputLength, const int outputLength) { if (DEBUG_DICT) { - LOGI("EditDistanceTable"); + AKLOGI("EditDistanceTable"); for (int i = 0; i <= 10; ++i) { int c[11]; for (int j = 0; j <= 10; ++j) { @@ -52,7 +52,7 @@ inline static void dumpEditDistance10ForDebug(int *editDistanceTable, const int c[j] = -1; } } - LOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]", + AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]", c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]); } } @@ -83,8 +83,8 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne inline static int getCurrentEditDistance( int *editDistanceTable, const int inputLength, const int outputLength) { - if (DEBUG_DICT) { - LOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); + if (DEBUG_EDIT_DISTANCE) { + AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); } return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1]; } @@ -168,8 +168,8 @@ int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLen const int outputIndex = mTerminalOutputIndex; const int inputIndex = mTerminalInputIndex; *wordLength = outputIndex + 1; - if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) { - return -1; + if (outputIndex < MIN_SUGGEST_DEPTH) { + return NOT_A_FREQUENCY; } *word = mWord; @@ -215,20 +215,10 @@ int Correction::goDownTree( } // TODO: remove -int Correction::getOutputIndex() { - return mOutputIndex; -} - -// TODO: remove int Correction::getInputIndex() { return mInputIndex; } -// TODO: remove -bool Correction::needsToTraverseAllNodes() { - return mNeedsToTraverseAllNodes; -} - void Correction::incrementInputIndex() { ++mInputIndex; } @@ -278,13 +268,12 @@ void Correction::addCharToCurrentWord(const int32_t c) { mWord, mOutputIndex + 1); } -// TODO: inline? Correction::CorrectionType Correction::processSkipChar( const int32_t c, const bool isTerminal, const bool inputIndexIncremented) { addCharToCurrentWord(c); - if (needsToTraverseAllNodes() && isTerminal) { - mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); - mTerminalOutputIndex = mOutputIndex; + mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); + mTerminalOutputIndex = mOutputIndex; + if (mNeedsToTraverseAllNodes && isTerminal) { incrementOutputIndex(); return TRAVERSE_ALL_ON_TERMINAL; } else { @@ -293,6 +282,13 @@ Correction::CorrectionType Correction::processSkipChar( } } +Correction::CorrectionType Correction::processUnrelatedCorrectionType() { + // Needs to set mTerminalInputIndex and mTerminalOutputIndex before returning any CorrectionType + mTerminalInputIndex = mInputIndex; + mTerminalOutputIndex = mOutputIndex; + return UNRELATED; +} + inline bool isEquivalentChar(ProximityInfo::ProximityType type) { return type == ProximityInfo::EQUIVALENT_CHAR; } @@ -301,7 +297,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( const int32_t c, const bool isTerminal) { const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); if (correctionCount > mMaxErrors) { - return UNRELATED; + return processUnrelatedCorrectionType(); } // TODO: Change the limit if we'll allow two or more corrections @@ -378,10 +374,10 @@ Correction::CorrectionType Correction::processCharAndCalcState( --mTransposedCount; if (DEBUG_CORRECTION) { DUMP_WORD(mWord, mOutputIndex); - LOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } - return UNRELATED; + return processUnrelatedCorrectionType(); } } @@ -404,7 +400,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( && isEquivalentChar(mProximityInfo->getMatchedProximityId( mInputIndex, mWord[mOutputIndex - 1], false))) { if (DEBUG_CORRECTION) { - LOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); + AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); } // Conversion p->e // Example: @@ -481,10 +477,10 @@ Correction::CorrectionType Correction::processCharAndCalcState( } else { if (DEBUG_CORRECTION) { DUMP_WORD(mWord, mOutputIndex); - LOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } - return UNRELATED; + return processUnrelatedCorrectionType(); } } else if (secondTransposing) { // If inputIndex is greater than mInputLength, that means there is no @@ -534,11 +530,13 @@ Correction::CorrectionType Correction::processCharAndCalcState( mTerminalOutputIndex = mOutputIndex - 1; if (DEBUG_CORRECTION) { DUMP_WORD(mWord, mOutputIndex); - LOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } return ON_TERMINAL; } else { + mTerminalInputIndex = mInputIndex - 1; + mTerminalOutputIndex = mOutputIndex - 1; return NOT_ON_TERMINAL; } } @@ -655,9 +653,10 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int finalFreq = freq; // TODO: Optimize this. - // TODO: Ignoring edit distance for transposed char, for now - if (transposedCount == 0 && (proximityMatchedCount > 0 || skipped || excessiveCount > 0)) { - ed = getCurrentEditDistance(editDistanceTable, inputLength, outputIndex + 1); + if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) { + ed = getCurrentEditDistance(editDistanceTable, inputLength, outputIndex + 1) + - transposedCount; + const int matchWeight = powerIntCapped(typedLetterMultiplier, max(inputLength, outputIndex + 1) - ed); multiplyIntCapped(matchWeight, &finalFreq); @@ -669,21 +668,22 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const ed = max(0, ed - quoteDiffCount); - if (ed == 1 && (inputLength == outputIndex || inputLength == outputIndex + 2)) { - // Promote a word with just one skipped or excessive char - if (sameLength) { - multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq); - } else { + if (transposedCount < 1) { + if (ed == 1 && (inputLength == outputIndex || inputLength == outputIndex + 2)) { + // Promote a word with just one skipped or excessive char + if (sameLength) { + multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq); + } else { + multiplyIntCapped(typedLetterMultiplier, &finalFreq); + } + } else if (ed == 0) { multiplyIntCapped(typedLetterMultiplier, &finalFreq); + sameLength = true; } - } else if (ed == 0) { - multiplyIntCapped(typedLetterMultiplier, &finalFreq); - sameLength = true; } adjustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)), proximityMatchedCount); } else { - // TODO: Calculate the edit distance for transposed char const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); multiplyIntCapped(matchWeight, &finalFreq); } @@ -703,7 +703,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const / (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); if (DEBUG_DICT_FULL) { - LOGI("Demotion rate for missing character is %d.", demotionRate); + AKLOGI("Demotion rate for missing character is %d.", demotionRate); } multiplyRate(demotionRate, &finalFreq); } @@ -717,7 +717,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); if (!lastCharExceeded && !proximityInfo->existsAdjacentProximityChars(excessivePos)) { if (DEBUG_CORRECTION_FREQ) { - LOGI("Double excessive demotion"); + AKLOGI("Double excessive demotion"); } // If an excessive character is not adjacent to the left char or the right char, // we will demote this word. @@ -767,7 +767,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const for (int i = 0; i < adjustedProximityMatchedCount; ++i) { // A word with proximity corrections if (DEBUG_DICT_FULL) { - LOGI("Found a proximity correction."); + AKLOGI("Found a proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &finalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); @@ -828,12 +828,12 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const } if (DEBUG_DICT_FULL) { - LOGI("calc: %d, %d", outputIndex, sameLength); + AKLOGI("calc: %d, %d", outputIndex, sameLength); } if (DEBUG_CORRECTION_FREQ) { DUMP_WORD(correction->mWord, outputIndex + 1); - LOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount, + AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount, skippedCount, transposedCount, excessiveCount, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq); } @@ -874,7 +874,102 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; if (DEBUG_DICT_FULL) { - LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion); + AKLOGI("Two words: %c, %c, %d", + word[0], word[firstWordLength + 1], capitalizedWordDemotion); + } + + if (firstWordLength == 0 || secondWordLength == 0) { + return 0; + } + const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); + int tempFirstFreq = firstFreq; + multiplyRate(firstDemotionRate, &tempFirstFreq); + + const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); + int tempSecondFreq = secondFreq; + multiplyRate(secondDemotionRate, &tempSecondFreq); + + const int totalLength = firstWordLength + secondWordLength; + + // Promote pairFreq with multiplying by 2, because the word length is the same as the typed + // length. + int totalFreq = tempFirstFreq + tempSecondFreq; + + // This is a workaround to try offsetting the not-enough-demotion which will be done in + // calcNormalizedScore in Utils.java. + // In calcNormalizedScore the score will be demoted by (1 - 1 / length) + // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by + // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) + const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); + multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); + + // At this moment, totalFreq is calculated by the following formula: + // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) + // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) + + multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); + + // This is another workaround to offset the demotion which will be done in + // calcNormalizedScore in Utils.java. + // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote + // the same amount because we already have adjusted the synthetic freq of this "missing or + // mistyped space" suggestion candidate above in this method. + const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); + multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); + + if (isSpaceProximity) { + // A word pair with one space proximity correction + if (DEBUG_DICT) { + AKLOGI("Found a word pair with space proximity correction."); + } + multiplyIntCapped(typedLetterMultiplier, &totalFreq); + multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); + } + + multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + + if (capitalizedWordDemotion) { + multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); + } + + return totalFreq; +} + +/* static */ +int Correction::RankingAlgorithm::calcFreqForSplitTwoWordsOld( + const int firstFreq, const int secondFreq, const Correction* correction, + const unsigned short *word) { + const int spaceProximityPos = correction->mSpaceProximityPos; + const int missingSpacePos = correction->mMissingSpacePos; + if (DEBUG_DICT) { + int inputCount = 0; + if (spaceProximityPos >= 0) ++inputCount; + if (missingSpacePos >= 0) ++inputCount; + assert(inputCount <= 1); + } + const bool isSpaceProximity = spaceProximityPos >= 0; + const int inputLength = correction->mInputLength; + const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; + const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) + : (inputLength - missingSpacePos); + const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; + + bool firstCapitalizedWordDemotion = false; + if (firstWordLength >= 2) { + firstCapitalizedWordDemotion = isUpperCase(word[0]); + } + + bool secondCapitalizedWordDemotion = false; + if (secondWordLength >= 2) { + secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); + } + + const bool capitalizedWordDemotion = + firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; + + if (DEBUG_DICT_FULL) { + AKLOGI("Two words: %c, %c, %d", + word[0], word[firstWordLength + 1], capitalizedWordDemotion); } if (firstWordLength == 0 || secondWordLength == 0) { @@ -919,7 +1014,7 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( if (isSpaceProximity) { // A word pair with one space proximity correction if (DEBUG_DICT) { - LOGI("Found a word pair with space proximity correction."); + AKLOGI("Found a word pair with space proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &totalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); @@ -965,10 +1060,10 @@ inline static int editDistanceInternal( } if (DEBUG_EDIT_DISTANCE) { - LOGI("IN = %d, OUT = %d", beforeLength, afterLength); + AKLOGI("IN = %d, OUT = %d", beforeLength, afterLength); for (int i = 0; i < li; ++i) { for (int j = 0; j < lo; ++j) { - LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); + AKLOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); } } } |