diff options
Diffstat (limited to 'native/src')
-rw-r--r-- | native/src/bigram_dictionary.cpp | 10 | ||||
-rw-r--r-- | native/src/binary_format.h | 4 | ||||
-rw-r--r-- | native/src/correction.cpp | 191 | ||||
-rw-r--r-- | native/src/correction.h | 5 | ||||
-rw-r--r-- | native/src/debug.h | 6 | ||||
-rw-r--r-- | native/src/defines.h | 57 | ||||
-rw-r--r-- | native/src/dictionary.cpp | 4 | ||||
-rw-r--r-- | native/src/dictionary.h | 75 | ||||
-rw-r--r-- | native/src/proximity_info.cpp | 8 | ||||
-rw-r--r-- | native/src/unigram_dictionary.cpp | 188 | ||||
-rw-r--r-- | native/src/unigram_dictionary.h | 13 | ||||
-rw-r--r-- | native/src/words_priority_queue.h | 21 | ||||
-rw-r--r-- | native/src/words_priority_queue_pool.h | 56 |
13 files changed, 409 insertions, 229 deletions
diff --git a/native/src/bigram_dictionary.cpp b/native/src/bigram_dictionary.cpp index c340c6c1a..db7734bc7 100644 --- a/native/src/bigram_dictionary.cpp +++ b/native/src/bigram_dictionary.cpp @@ -32,8 +32,8 @@ BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion), HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) { if (DEBUG_DICT) { - LOGI("BigramDictionary - constructor"); - LOGI("Has Bigram : %d", hasBigram); + AKLOGI("BigramDictionary - constructor"); + AKLOGI("Has Bigram : %d", hasBigram); } } @@ -46,7 +46,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ #ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = word[i]; - LOGI("Bigram: Found word = %s, freq = %d :", s, frequency); + AKLOGI("Bigram: Found word = %s, freq = %d :", s, frequency); #endif } @@ -60,7 +60,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ insertAt++; } if (DEBUG_DICT) { - LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); + AKLOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); } if (insertAt < mMaxBigrams) { memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]), @@ -76,7 +76,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ } *dest = 0; // NULL terminate if (DEBUG_DICT) { - LOGI("Bigram: Added word at %d", insertAt); + AKLOGI("Bigram: Added word at %d", insertAt); } return true; } diff --git a/native/src/binary_format.h b/native/src/binary_format.h index 9944fa2bd..1d74998f6 100644 --- a/native/src/binary_format.h +++ b/native/src/binary_format.h @@ -61,7 +61,9 @@ inline int BinaryFormat::detectFormat(const uint8_t* const dict) { } inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { - return dict[(*pos)++]; + const int msb = dict[(*pos)++]; + if (msb < 0x80) return msb; + return ((msb & 0x7F) << 8) | dict[(*pos)++]; } inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) { diff --git a/native/src/correction.cpp b/native/src/correction.cpp index dc31bfae7..6a129d4e3 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -42,7 +42,7 @@ inline static void initEditDistance(int *editDistanceTable) { inline static void dumpEditDistance10ForDebug(int *editDistanceTable, const int inputLength, const int outputLength) { if (DEBUG_DICT) { - LOGI("EditDistanceTable"); + AKLOGI("EditDistanceTable"); for (int i = 0; i <= 10; ++i) { int c[11]; for (int j = 0; j <= 10; ++j) { @@ -52,7 +52,7 @@ inline static void dumpEditDistance10ForDebug(int *editDistanceTable, const int c[j] = -1; } } - LOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]", + AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]", c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]); } } @@ -83,8 +83,8 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne inline static int getCurrentEditDistance( int *editDistanceTable, const int inputLength, const int outputLength) { - if (DEBUG_DICT) { - LOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); + if (DEBUG_EDIT_DISTANCE) { + AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); } return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1]; } @@ -168,8 +168,8 @@ int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLen const int outputIndex = mTerminalOutputIndex; const int inputIndex = mTerminalInputIndex; *wordLength = outputIndex + 1; - if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) { - return -1; + if (outputIndex < MIN_SUGGEST_DEPTH) { + return NOT_A_FREQUENCY; } *word = mWord; @@ -215,20 +215,10 @@ int Correction::goDownTree( } // TODO: remove -int Correction::getOutputIndex() { - return mOutputIndex; -} - -// TODO: remove int Correction::getInputIndex() { return mInputIndex; } -// TODO: remove -bool Correction::needsToTraverseAllNodes() { - return mNeedsToTraverseAllNodes; -} - void Correction::incrementInputIndex() { ++mInputIndex; } @@ -278,13 +268,12 @@ void Correction::addCharToCurrentWord(const int32_t c) { mWord, mOutputIndex + 1); } -// TODO: inline? Correction::CorrectionType Correction::processSkipChar( const int32_t c, const bool isTerminal, const bool inputIndexIncremented) { addCharToCurrentWord(c); - if (needsToTraverseAllNodes() && isTerminal) { - mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); - mTerminalOutputIndex = mOutputIndex; + mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); + mTerminalOutputIndex = mOutputIndex; + if (mNeedsToTraverseAllNodes && isTerminal) { incrementOutputIndex(); return TRAVERSE_ALL_ON_TERMINAL; } else { @@ -293,6 +282,13 @@ Correction::CorrectionType Correction::processSkipChar( } } +Correction::CorrectionType Correction::processUnrelatedCorrectionType() { + // Needs to set mTerminalInputIndex and mTerminalOutputIndex before returning any CorrectionType + mTerminalInputIndex = mInputIndex; + mTerminalOutputIndex = mOutputIndex; + return UNRELATED; +} + inline bool isEquivalentChar(ProximityInfo::ProximityType type) { return type == ProximityInfo::EQUIVALENT_CHAR; } @@ -301,7 +297,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( const int32_t c, const bool isTerminal) { const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); if (correctionCount > mMaxErrors) { - return UNRELATED; + return processUnrelatedCorrectionType(); } // TODO: Change the limit if we'll allow two or more corrections @@ -378,10 +374,10 @@ Correction::CorrectionType Correction::processCharAndCalcState( --mTransposedCount; if (DEBUG_CORRECTION) { DUMP_WORD(mWord, mOutputIndex); - LOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } - return UNRELATED; + return processUnrelatedCorrectionType(); } } @@ -404,7 +400,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( && isEquivalentChar(mProximityInfo->getMatchedProximityId( mInputIndex, mWord[mOutputIndex - 1], false))) { if (DEBUG_CORRECTION) { - LOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); + AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); } // Conversion p->e // Example: @@ -481,10 +477,10 @@ Correction::CorrectionType Correction::processCharAndCalcState( } else { if (DEBUG_CORRECTION) { DUMP_WORD(mWord, mOutputIndex); - LOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } - return UNRELATED; + return processUnrelatedCorrectionType(); } } else if (secondTransposing) { // If inputIndex is greater than mInputLength, that means there is no @@ -534,11 +530,13 @@ Correction::CorrectionType Correction::processCharAndCalcState( mTerminalOutputIndex = mOutputIndex - 1; if (DEBUG_CORRECTION) { DUMP_WORD(mWord, mOutputIndex); - LOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, + AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } return ON_TERMINAL; } else { + mTerminalInputIndex = mInputIndex - 1; + mTerminalOutputIndex = mOutputIndex - 1; return NOT_ON_TERMINAL; } } @@ -655,9 +653,10 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int finalFreq = freq; // TODO: Optimize this. - // TODO: Ignoring edit distance for transposed char, for now - if (transposedCount == 0 && (proximityMatchedCount > 0 || skipped || excessiveCount > 0)) { - ed = getCurrentEditDistance(editDistanceTable, inputLength, outputIndex + 1); + if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) { + ed = getCurrentEditDistance(editDistanceTable, inputLength, outputIndex + 1) + - transposedCount; + const int matchWeight = powerIntCapped(typedLetterMultiplier, max(inputLength, outputIndex + 1) - ed); multiplyIntCapped(matchWeight, &finalFreq); @@ -669,21 +668,22 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const ed = max(0, ed - quoteDiffCount); - if (ed == 1 && (inputLength == outputIndex || inputLength == outputIndex + 2)) { - // Promote a word with just one skipped or excessive char - if (sameLength) { - multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq); - } else { + if (transposedCount < 1) { + if (ed == 1 && (inputLength == outputIndex || inputLength == outputIndex + 2)) { + // Promote a word with just one skipped or excessive char + if (sameLength) { + multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq); + } else { + multiplyIntCapped(typedLetterMultiplier, &finalFreq); + } + } else if (ed == 0) { multiplyIntCapped(typedLetterMultiplier, &finalFreq); + sameLength = true; } - } else if (ed == 0) { - multiplyIntCapped(typedLetterMultiplier, &finalFreq); - sameLength = true; } adjustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)), proximityMatchedCount); } else { - // TODO: Calculate the edit distance for transposed char const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); multiplyIntCapped(matchWeight, &finalFreq); } @@ -703,7 +703,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const / (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); if (DEBUG_DICT_FULL) { - LOGI("Demotion rate for missing character is %d.", demotionRate); + AKLOGI("Demotion rate for missing character is %d.", demotionRate); } multiplyRate(demotionRate, &finalFreq); } @@ -717,7 +717,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); if (!lastCharExceeded && !proximityInfo->existsAdjacentProximityChars(excessivePos)) { if (DEBUG_CORRECTION_FREQ) { - LOGI("Double excessive demotion"); + AKLOGI("Double excessive demotion"); } // If an excessive character is not adjacent to the left char or the right char, // we will demote this word. @@ -767,7 +767,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const for (int i = 0; i < adjustedProximityMatchedCount; ++i) { // A word with proximity corrections if (DEBUG_DICT_FULL) { - LOGI("Found a proximity correction."); + AKLOGI("Found a proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &finalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); @@ -828,12 +828,12 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const } if (DEBUG_DICT_FULL) { - LOGI("calc: %d, %d", outputIndex, sameLength); + AKLOGI("calc: %d, %d", outputIndex, sameLength); } if (DEBUG_CORRECTION_FREQ) { DUMP_WORD(correction->mWord, outputIndex + 1); - LOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount, + AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount, skippedCount, transposedCount, excessiveCount, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq); } @@ -874,7 +874,102 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; if (DEBUG_DICT_FULL) { - LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion); + AKLOGI("Two words: %c, %c, %d", + word[0], word[firstWordLength + 1], capitalizedWordDemotion); + } + + if (firstWordLength == 0 || secondWordLength == 0) { + return 0; + } + const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); + int tempFirstFreq = firstFreq; + multiplyRate(firstDemotionRate, &tempFirstFreq); + + const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); + int tempSecondFreq = secondFreq; + multiplyRate(secondDemotionRate, &tempSecondFreq); + + const int totalLength = firstWordLength + secondWordLength; + + // Promote pairFreq with multiplying by 2, because the word length is the same as the typed + // length. + int totalFreq = tempFirstFreq + tempSecondFreq; + + // This is a workaround to try offsetting the not-enough-demotion which will be done in + // calcNormalizedScore in Utils.java. + // In calcNormalizedScore the score will be demoted by (1 - 1 / length) + // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by + // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) + const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); + multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); + + // At this moment, totalFreq is calculated by the following formula: + // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) + // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) + + multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); + + // This is another workaround to offset the demotion which will be done in + // calcNormalizedScore in Utils.java. + // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote + // the same amount because we already have adjusted the synthetic freq of this "missing or + // mistyped space" suggestion candidate above in this method. + const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); + multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); + + if (isSpaceProximity) { + // A word pair with one space proximity correction + if (DEBUG_DICT) { + AKLOGI("Found a word pair with space proximity correction."); + } + multiplyIntCapped(typedLetterMultiplier, &totalFreq); + multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); + } + + multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + + if (capitalizedWordDemotion) { + multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); + } + + return totalFreq; +} + +/* static */ +int Correction::RankingAlgorithm::calcFreqForSplitTwoWordsOld( + const int firstFreq, const int secondFreq, const Correction* correction, + const unsigned short *word) { + const int spaceProximityPos = correction->mSpaceProximityPos; + const int missingSpacePos = correction->mMissingSpacePos; + if (DEBUG_DICT) { + int inputCount = 0; + if (spaceProximityPos >= 0) ++inputCount; + if (missingSpacePos >= 0) ++inputCount; + assert(inputCount <= 1); + } + const bool isSpaceProximity = spaceProximityPos >= 0; + const int inputLength = correction->mInputLength; + const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; + const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) + : (inputLength - missingSpacePos); + const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; + + bool firstCapitalizedWordDemotion = false; + if (firstWordLength >= 2) { + firstCapitalizedWordDemotion = isUpperCase(word[0]); + } + + bool secondCapitalizedWordDemotion = false; + if (secondWordLength >= 2) { + secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); + } + + const bool capitalizedWordDemotion = + firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; + + if (DEBUG_DICT_FULL) { + AKLOGI("Two words: %c, %c, %d", + word[0], word[firstWordLength + 1], capitalizedWordDemotion); } if (firstWordLength == 0 || secondWordLength == 0) { @@ -919,7 +1014,7 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( if (isSpaceProximity) { // A word pair with one space proximity correction if (DEBUG_DICT) { - LOGI("Found a word pair with space proximity correction."); + AKLOGI("Found a word pair with space proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &totalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); @@ -965,10 +1060,10 @@ inline static int editDistanceInternal( } if (DEBUG_EDIT_DISTANCE) { - LOGI("IN = %d, OUT = %d", beforeLength, afterLength); + AKLOGI("IN = %d, OUT = %d", beforeLength, afterLength); for (int i = 0; i < li; ++i) { for (int j = 0; j < lo; ++j) { - LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); + AKLOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); } } } diff --git a/native/src/correction.h b/native/src/correction.h index 4012e7e82..22a424f5c 100644 --- a/native/src/correction.h +++ b/native/src/correction.h @@ -48,7 +48,6 @@ class Correction { void checkState(); bool initProcessState(const int index); - int getOutputIndex(); int getInputIndex(); virtual ~Correction(); @@ -101,6 +100,8 @@ class Correction { const int freq, int *editDistanceTable, const Correction* correction); static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, const Correction* correction, const unsigned short *word); + static int calcFreqForSplitTwoWordsOld(const int firstFreq, const int secondFreq, + const Correction* correction, const unsigned short *word); static double calcNormalizedScore(const unsigned short* before, const int beforeLength, const unsigned short* after, const int afterLength, const int score); static int editDistance(const unsigned short* before, @@ -115,11 +116,11 @@ class Correction { private: inline void incrementInputIndex(); inline void incrementOutputIndex(); - inline bool needsToTraverseAllNodes(); inline void startToTraverseAllNodes(); inline bool isQuote(const unsigned short c); inline CorrectionType processSkipChar( const int32_t c, const bool isTerminal, const bool inputIndexIncremented); + inline CorrectionType processUnrelatedCorrectionType(); inline void addCharToCurrentWord(const int32_t c); const int TYPED_LETTER_MULTIPLIER; diff --git a/native/src/debug.h b/native/src/debug.h index 38b2f107a..b13052c95 100644 --- a/native/src/debug.h +++ b/native/src/debug.h @@ -42,7 +42,7 @@ static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned s static inline void LOGI_S16(unsigned short* string, const unsigned int length) { unsigned char tmp_buffer[length]; convertToUnibyteString(string, tmp_buffer, length); - LOGI(">> %s", tmp_buffer); + AKLOGI(">> %s", tmp_buffer); // The log facility is throwing out log that comes too fast. The following // is a dirty way of slowing down processing so that we can see all log. // TODO : refactor this in a blocking log or something. @@ -53,7 +53,7 @@ static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int leng unsigned char c) { unsigned char tmp_buffer[length+1]; convertToUnibyteStringAndReplaceLastChar(string, tmp_buffer, length, c); - LOGI(">> %s", tmp_buffer); + AKLOGI(">> %s", tmp_buffer); // Likewise // usleep(10); } @@ -64,7 +64,7 @@ static inline void printDebug(const char* tag, int* codes, int codesSize, int MA buf[codesSize] = 0; while (--codesSize >= 0) buf[codesSize] = (unsigned char)codes[codesSize * MAX_PROXIMITY_CHARS]; - LOGI("%s, WORD = %s", tag, buf); + AKLOGI("%s, WORD = %s", tag, buf); free(buf); } diff --git a/native/src/defines.h b/native/src/defines.h index d636e5197..d739043a4 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -20,15 +20,32 @@ #if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG) #include <cutils/log.h> +#define AKLOGE ALOGE +#define AKLOGI ALOGI + +#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0) + +static char charBuf[50]; + +static void dumpWord(const unsigned short* word, const int length) { + for (int i = 0; i < length; ++i) { + charBuf[i] = word[i]; + } + charBuf[length] = 0; + AKLOGI("[ %s ]", charBuf); +} + #else -#define LOGE(fmt, ...) -#define LOGI(fmt, ...) +#define AKLOGE(fmt, ...) +#define AKLOGI(fmt, ...) +#define DUMP_WORD(word, length) #endif #ifdef FLAG_DO_PROFILE // Profiler #include <cutils/log.h> #include <time.h> + #define PROF_BUF_SIZE 100 static double profile_buf[PROF_BUF_SIZE]; static double profile_old[PROF_BUF_SIZE]; @@ -42,8 +59,8 @@ static unsigned int profile_counter[PROF_BUF_SIZE]; #define PROF_CLOSE do { PROF_END(PROF_BUF_SIZE - 1); PROF_OUTALL; } while(0) #define PROF_END(prof_buf_id) profile_buf[prof_buf_id] += ((clock()) - profile_old[prof_buf_id]) #define PROF_CLOCKOUT(prof_buf_id) \ - LOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id])) -#define PROF_OUTALL do { LOGI("--- %s ---", __FUNCTION__); prof_out(); } while(0) + AKLOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id])) +#define PROF_OUTALL do { AKLOGI("--- %s ---", __FUNCTION__); prof_out(); } while(0) static void prof_reset(void) { for (int i = 0; i < PROF_BUF_SIZE; ++i) { @@ -55,9 +72,9 @@ static void prof_reset(void) { static void prof_out(void) { if (profile_counter[PROF_BUF_SIZE - 1] != 1) { - LOGI("Error: You must call PROF_OPEN before PROF_CLOSE."); + AKLOGI("Error: You must call PROF_OPEN before PROF_CLOSE."); } - LOGI("Total time is %6.3f ms.", + AKLOGI("Total time is %6.3f ms.", profile_buf[PROF_BUF_SIZE - 1] * 1000 / (double)CLOCKS_PER_SEC); double all = 0; for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) { @@ -66,7 +83,7 @@ static void prof_out(void) { if (all == 0) all = 1; for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) { if (profile_buf[i] != 0) { - LOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.", + AKLOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.", i, (profile_buf[i] * 100 / all), profile_buf[i] * 1000 / (double)CLOCKS_PER_SEC, profile_counter[i]); } @@ -100,20 +117,8 @@ static void prof_out(void) { #define DEBUG_TRACE DEBUG_DICT_FULL #define DEBUG_PROXIMITY_INFO false #define DEBUG_CORRECTION false -#define DEBUG_CORRECTION_FREQ true -#define DEBUG_WORDS_PRIORITY_QUEUE true - -#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0) - -static char charBuf[50]; - -static void dumpWord(const unsigned short* word, const int length) { - for (int i = 0; i < length; ++i) { - charBuf[i] = word[i]; - } - charBuf[length] = 0; - LOGI("[ %s ]", charBuf); -} +#define DEBUG_CORRECTION_FREQ false +#define DEBUG_WORDS_PRIORITY_QUEUE false #else // FLAG_DBG @@ -128,7 +133,6 @@ static void dumpWord(const unsigned short* word, const int length) { #define DEBUG_CORRECTION_FREQ false #define DEBUG_WORDS_PRIORITY_QUEUE false -#define DUMP_WORD(word, length) #endif // FLAG_DBG @@ -168,6 +172,7 @@ static void dumpWord(const unsigned short* word, const int length) { #define EQUIVALENT_CHAR_WITHOUT_DISTANCE_INFO -2 #define PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO -3 #define NOT_A_INDEX -1 +#define NOT_A_FREQUENCY -1 #define KEYCODE_SPACE ' ' @@ -185,7 +190,7 @@ static void dumpWord(const unsigned short* word, const int length) { #define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 67 #define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75 #define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75 -#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60 +#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 70 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120 #define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90 #define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105 @@ -204,7 +209,11 @@ static void dumpWord(const unsigned short* word, const int length) { // Word limit for sub queues used in WordsPriorityQueuePool. Sub queues are temporary queues used // for better performance. -#define SUB_QUEUE_MAX_WORDS 5 +// Holds up to 1 candidate for each word +#define SUB_QUEUE_MAX_WORDS 1 +#define SUB_QUEUE_MAX_COUNT 10 + +#define TWO_WORDS_CORRECTION_THRESHOLD 0.22f #define MAX_DEPTH_MULTIPLIER 3 diff --git a/native/src/dictionary.cpp b/native/src/dictionary.cpp index e3673d425..822c2151d 100644 --- a/native/src/dictionary.cpp +++ b/native/src/dictionary.cpp @@ -33,9 +33,9 @@ Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, IS_LATEST_DICT_VERSION((((unsigned char*) dict)[0] & 0xFF) >= DICTIONARY_VERSION_MIN) { if (DEBUG_DICT) { if (MAX_WORD_LENGTH_INTERNAL < maxWordLength) { - LOGI("Max word length (%d) is greater than %d", + AKLOGI("Max word length (%d) is greater than %d", maxWordLength, MAX_WORD_LENGTH_INTERNAL); - LOGI("IN NATIVE SUGGEST Version: %d", (mDict[0] & 0xFF)); + AKLOGI("IN NATIVE SUGGEST Version: %d", (mDict[0] & 0xFF)); } } mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier); diff --git a/native/src/dictionary.h b/native/src/dictionary.h index 79d377a4f..90d7148d5 100644 --- a/native/src/dictionary.h +++ b/native/src/dictionary.h @@ -56,16 +56,7 @@ class Dictionary { // public static utility methods // static inline methods should be defined in the header file - static unsigned short getChar(const unsigned char *dict, int *pos); - static int getCount(const unsigned char *dict, int *pos); - static bool getTerminal(const unsigned char *dict, int *pos); - static int getAddress(const unsigned char *dict, int *pos); - static int getFreq(const unsigned char *dict, const bool isLatestDictVersion, int *pos); static int wideStrLen(unsigned short *str); - // returns next sibling's position - static int setDictionaryValues(const unsigned char *dict, const bool isLatestDictVersion, - const int pos, unsigned short *c, int *childrenPosition, - bool *terminal, int *freq); private: bool hasBigram(); @@ -87,56 +78,6 @@ class Dictionary { // public static utility methods // static inline methods should be defined in the header file -inline unsigned short Dictionary::getChar(const unsigned char *dict, int *pos) { - unsigned short ch = (unsigned short) (dict[(*pos)++] & 0xFF); - // If the code is 255, then actual 16 bit code follows (in big endian) - if (ch == 0xFF) { - ch = ((dict[*pos] & 0xFF) << 8) | (dict[*pos + 1] & 0xFF); - (*pos) += 2; - } - return ch; -} - -inline int Dictionary::getCount(const unsigned char *dict, int *pos) { - return dict[(*pos)++] & 0xFF; -} - -inline bool Dictionary::getTerminal(const unsigned char *dict, int *pos) { - return (dict[*pos] & FLAG_TERMINAL_MASK) > 0; -} - -inline int Dictionary::getAddress(const unsigned char *dict, int *pos) { - int address = 0; - if ((dict[*pos] & FLAG_ADDRESS_MASK) == 0) { - *pos += 1; - } else { - address += (dict[*pos] & (ADDRESS_MASK >> 16)) << 16; - address += (dict[*pos + 1] & 0xFF) << 8; - address += (dict[*pos + 2] & 0xFF); - *pos += 3; - } - return address; -} - -inline int Dictionary::getFreq(const unsigned char *dict, - const bool isLatestDictVersion, int *pos) { - int freq = dict[(*pos)++] & 0xFF; - if (isLatestDictVersion) { - // skipping bigram - int bigramExist = (dict[*pos] & FLAG_BIGRAM_READ); - if (bigramExist > 0) { - int nextBigramExist = 1; - while (nextBigramExist > 0) { - (*pos) += 3; - nextBigramExist = (dict[(*pos)++] & FLAG_BIGRAM_CONTINUED); - } - } else { - (*pos)++; - } - } - return freq; -} - inline int Dictionary::wideStrLen(unsigned short *str) { if (!str) return 0; unsigned short *end = str; @@ -144,22 +85,6 @@ inline int Dictionary::wideStrLen(unsigned short *str) { end++; return end - str; } - -inline int Dictionary::setDictionaryValues(const unsigned char *dict, - const bool isLatestDictVersion, const int pos, unsigned short *c,int *childrenPosition, - bool *terminal, int *freq) { - int position = pos; - // -- at char - *c = Dictionary::getChar(dict, &position); - // -- at flag/add - *terminal = Dictionary::getTerminal(dict, &position); - *childrenPosition = Dictionary::getAddress(dict, &position); - // -- after address or flag - *freq = (*terminal) ? Dictionary::getFreq(dict, isLatestDictVersion, &position) : 1; - // returns next sibling's position - return position; -} - } // namespace latinime #endif // LATINIME_DICTIONARY_H diff --git a/native/src/proximity_info.cpp b/native/src/proximity_info.cpp index 95e35263b..b91957c77 100644 --- a/native/src/proximity_info.cpp +++ b/native/src/proximity_info.cpp @@ -52,7 +52,7 @@ ProximityInfo::ProximityInfo(const int maxProximityCharsSize, const int keyboard const int proximityGridLength = GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE; mProximityCharsArray = new uint32_t[proximityGridLength]; if (DEBUG_PROXIMITY_INFO) { - LOGI("Create proximity info array %d", proximityGridLength); + AKLOGI("Create proximity info array %d", proximityGridLength); } memcpy(mProximityCharsArray, proximityCharsArray, proximityGridLength * sizeof(mProximityCharsArray[0])); @@ -102,7 +102,7 @@ inline int ProximityInfo::getStartIndexFromCoordinates(const int x, const int y) bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { if (x < 0 || y < 0) { if (DEBUG_DICT) { - LOGI("HasSpaceProximity: Illegal coordinates (%d, %d)", x, y); + AKLOGI("HasSpaceProximity: Illegal coordinates (%d, %d)", x, y); assert(false); } return false; @@ -110,11 +110,11 @@ bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { const int startIndex = getStartIndexFromCoordinates(x, y); if (DEBUG_PROXIMITY_INFO) { - LOGI("hasSpaceProximity: index %d, %d, %d", startIndex, x, y); + AKLOGI("hasSpaceProximity: index %d, %d, %d", startIndex, x, y); } for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { if (DEBUG_PROXIMITY_INFO) { - LOGI("Index: %d", mProximityCharsArray[startIndex + i]); + AKLOGI("Index: %d", mProximityCharsArray[startIndex + i]); } if (mProximityCharsArray[startIndex + i] == KEYCODE_SPACE) { return true; diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index e95e03ce5..8be95bc40 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -47,7 +47,7 @@ UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typed BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)), MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) { if (DEBUG_DICT) { - LOGI("UnigramDictionary - constructor"); + AKLOGI("UnigramDictionary - constructor"); } } @@ -163,14 +163,14 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, queuePool->getMasterQueue()->outputSuggestions(frequencies, outWords); if (DEBUG_DICT) { - LOGI("Returning %d words", suggestedWordsCount); + AKLOGI("Returning %d words", suggestedWordsCount); /// Print the returned words for (int j = 0; j < suggestedWordsCount; ++j) { #ifdef FLAG_DBG short unsigned int* w = outWords + j * MAX_WORD_LENGTH; char s[MAX_WORD_LENGTH]; for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; - LOGI("%s %i", s, frequencies[j]); + AKLOGI("%s %i", s, frequencies[j]); #endif } } @@ -186,7 +186,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_OPEN; PROF_START(0); - // Note: This line is intentionally left blank + queuePool->clearAll(); PROF_END(0); PROF_START(1); @@ -213,7 +213,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) { for (int i = 1; i < inputLength; ++i) { if (DEBUG_DICT) { - LOGI("--- Suggest missing space characters %d", i); + AKLOGI("--- Suggest missing space characters %d", i); } getMissingSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, i, correction, queuePool); @@ -226,12 +226,12 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, // The first and last "mistyped spaces" are taken care of by excessive character handling for (int i = 1; i < inputLength - 1; ++i) { if (DEBUG_DICT) { - LOGI("--- Suggest words with proximity space %d", i); + AKLOGI("--- Suggest words with proximity space %d", i); } const int x = xcoordinates[i]; const int y = ycoordinates[i]; if (DEBUG_PROXIMITY_INFO) { - LOGI("Input[%d] x = %d, y = %d, has space proximity = %d", + AKLOGI("Input[%d] x = %d, y = %d, has space proximity = %d", i, x, y, proximityInfo->hasSpaceProximity(x, y)); } if (proximityInfo->hasSpaceProximity(x, y)) { @@ -241,18 +241,33 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, } } PROF_END(6); + if (DEBUG_DICT) { + queuePool->dumpSubQueue1TopSuggestions(); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + WordsPriorityQueue* queue = queuePool->getSubQueue1(i); + if (queue->size() > 0) { + WordsPriorityQueue::SuggestedWord* sw = queue->top(); + const int score = sw->mScore; + const unsigned short* word = sw->mWord; + const int wordLength = sw->mWordLength; + double ns = Correction::RankingAlgorithm::calcNormalizedScore( + proximityInfo->getPrimaryInputWord(), i, word, wordLength, score); + ns += 0; + AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns, + (ns > TWO_WORDS_CORRECTION_THRESHOLD)); + DUMP_WORD(proximityInfo->getPrimaryInputWord(), i); + DUMP_WORD(word, wordLength); + } + } + } } void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates, - const int *yCoordinates, const int *codes, const int inputLength, - WordsPriorityQueue *queue, Correction *correction) { + const int *yCoordinates, const int *codes, const int inputLength, Correction *correction) { if (DEBUG_DICT) { - LOGI("initSuggest"); + AKLOGI("initSuggest"); } proximityInfo->setInputParams(codes, inputLength, xCoordinates, yCoordinates); - if (queue) { - queue->clear(); - } const int maxDepth = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); correction->initCorrection(proximityInfo, inputLength, maxDepth); } @@ -264,15 +279,13 @@ void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool) { - WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); - initSuggestions( - proximityInfo, xcoordinates, ycoordinates, codes, inputLength, masterQueue, correction); - getSuggestionCandidates(useFullEditDistance, inputLength, correction, masterQueue, + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); + getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS); } void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, - const int inputLength, Correction *correction, WordsPriorityQueue *queue, + const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool, const bool doAutoCompletion, const int maxErrors) { // TODO: Remove setCorrectionParams correction->setCorrectionParams(0, 0, 0, @@ -280,7 +293,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, doAutoCompletion, maxErrors); int rootPosition = ROOT_POS; // Get the number of children of root, then increment the position - int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); + int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition); int outputIndex = 0; correction->initCorrectionState(rootPosition, childCount, (inputLength <= 0)); @@ -292,7 +305,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, int firstChildPos; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, - correction, &childCount, &firstChildPos, &siblingPos, queue); + correction, &childCount, &firstChildPos, &siblingPos, queuePool); // Update next sibling pos correction->setTreeSiblingPos(outputIndex, siblingPos); @@ -327,14 +340,34 @@ void UnigramDictionary::getMistypedSpaceWords(ProximityInfo *proximityInfo, cons inline void UnigramDictionary::onTerminal(const int freq, const TerminalAttributes& terminalAttributes, Correction *correction, - WordsPriorityQueue *queue) { + WordsPriorityQueuePool *queuePool, const bool addToMasterQueue) { + const int inputIndex = correction->getInputIndex(); + const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT; + if (!addToMasterQueue && !addToSubQueue) { + return; + } + WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); + WordsPriorityQueue *subQueue = queuePool->getSubQueue1(inputIndex); int wordLength; unsigned short* wordPointer; const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); - if (finalFreq >= 0) { + if (finalFreq != NOT_A_FREQUENCY) { if (!terminalAttributes.isShortcutOnly()) { - addWord(wordPointer, wordLength, finalFreq, queue); + if (addToMasterQueue) { + addWord(wordPointer, wordLength, finalFreq, masterQueue); + } + // TODO: Check the validity of "inputIndex == wordLength" + //if (addToSubQueue && inputIndex == wordLength) { + if (addToSubQueue) { + addWord(wordPointer, wordLength, finalFreq, subQueue); + } + } + // Please note that the shortcut candidates will be added to the master queue only. + if (!addToMasterQueue) { + return; } + + // From here, below is the code to add shortcut candidates. TerminalAttributes::ShortcutIterator iterator = terminalAttributes.getShortcutIterator(); while (iterator.hasNextShortcutTarget()) { // TODO: addWord only supports weak ordering, meaning we have no means to control the @@ -345,7 +378,7 @@ inline void UnigramDictionary::onTerminal(const int freq, uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; const int shortcutTargetStringLength = iterator.getNextShortcutTarget( MAX_WORD_LENGTH_INTERNAL, shortcutTarget); - addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, queue); + addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue); } } } @@ -390,7 +423,81 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo const int firstFreq = getMostFrequentWordLike( firstWordStartPos, firstWordLength, proximityInfo, mWord); if (DEBUG_DICT) { - LOGI("First freq: %d", firstFreq); + AKLOGI("First freq: %d", firstFreq); + } + if (firstFreq <= 0) return; + + for (int i = 0; i < firstWordLength; ++i) { + word[i] = mWord[i]; + } + + const int secondFreq = getMostFrequentWordLike( + secondWordStartPos, secondWordLength, proximityInfo, mWord); + if (DEBUG_DICT) { + AKLOGI("Second freq: %d", secondFreq); + } + if (secondFreq <= 0) return; + + word[firstWordLength] = SPACE; + for (int i = (firstWordLength + 1); i < newWordLength; ++i) { + word[i] = mWord[i - firstWordLength - 1]; + } + + // TODO: Remove initSuggestions and correction->setCorrectionParams + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); + + correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, + -1 /* transposedPos */, spaceProximityPos, missingSpacePos, + useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS); + const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word); + if (DEBUG_DICT) { + AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); + } + addWord(word, newWordLength, pairFreq, masterQueue); + return; +} + +void UnigramDictionary::getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, const int missingSpacePos, + const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) { + WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); + + if (DEBUG_DICT) { + int inputCount = 0; + if (spaceProximityPos >= 0) ++inputCount; + if (missingSpacePos >= 0) ++inputCount; + assert(inputCount <= 1); + } + const bool isSpaceProximity = spaceProximityPos >= 0; + const int firstWordStartPos = 0; + const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos; + const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; + const int secondWordLength = isSpaceProximity + ? (inputLength - spaceProximityPos - 1) + : (inputLength - missingSpacePos); + + if (inputLength >= MAX_WORD_LENGTH) return; + if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos + || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) + return; + + const int newWordLength = firstWordLength + secondWordLength + 1; + + + // Space proximity preparation + //WordsPriorityQueue *subQueue = queuePool->getSubQueue1(); + //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue, + //correction); + //getSuggestionCandidates(useFullEditDistance, firstWordLength, correction, subQueue, false, + //MAX_ERRORS_FOR_TWO_WORDS); + + // Allocating variable length array on stack + unsigned short word[newWordLength]; + const int firstFreq = getMostFrequentWordLike( + firstWordStartPos, firstWordLength, proximityInfo, mWord); + if (DEBUG_DICT) { + AKLOGI("First freq: %d", firstFreq); } if (firstFreq <= 0) return; @@ -401,7 +508,7 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo const int secondFreq = getMostFrequentWordLike( secondWordStartPos, secondWordLength, proximityInfo, mWord); if (DEBUG_DICT) { - LOGI("Second freq: %d", secondFreq); + AKLOGI("Second freq: %d", secondFreq); } if (secondFreq <= 0) return; @@ -411,15 +518,14 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo } // TODO: Remove initSuggestions and correction->setCorrectionParams - initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, - 0 /* do not clear queue */, correction); + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, -1 /* transposedPos */, spaceProximityPos, missingSpacePos, useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS); const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word); if (DEBUG_DICT) { - LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); + AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); } addWord(word, newWordLength, pairFreq, masterQueue); return; @@ -507,9 +613,10 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor int maxFreq = -1; const uint8_t* const root = DICT_ROOT; - mStackChildCount[0] = root[0]; + int startPos = 0; + mStackChildCount[0] = BinaryFormat::getGroupCountAndForwardPointer(root, &startPos); mStackInputIndex[0] = 0; - mStackSiblingPos[0] = 1; + mStackSiblingPos[0] = startPos; while (depth >= 0) { const int charGroupCount = mStackChildCount[depth]; int pos = mStackSiblingPos[depth]; @@ -583,7 +690,7 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // given level, as output into newCount when traversing this level's parent. inline bool UnigramDictionary::processCurrentNode(const int initialPos, Correction *correction, int *newCount, - int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueue *queue) { + int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool) { if (DEBUG_DICT) { correction->checkState(); } @@ -658,14 +765,13 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, } while (NOT_A_CHARACTER != c); if (isTerminalNode) { - if (needsToInvokeOnTerminal) { - // The frequency should be here, because we come here only if this is actually - // a terminal node, and we are on its last char. - const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); - TerminalAttributes terminalAttributes(DICT_ROOT, flags, - BinaryFormat::skipFrequency(flags, pos)); - onTerminal(freq, terminalAttributes, correction, queue); - } + // The frequency should be here, because we come here only if this is actually + // a terminal node, and we are on its last char. + const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); + const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); + const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); + TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); + onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal); // If there are more chars in this node, then this virtual node has children. // If we are on the last char, this virtual node has children if this node has. @@ -690,7 +796,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); if (DEBUG_DICT_FULL) { - LOGI("Traversing was pruned."); + AKLOGI("Traversing was pruned."); } return false; } diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index 23581425a..b950971bb 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -93,18 +93,21 @@ class UnigramDictionary { const int codesRemain, const int currentDepth, int* codesDest, Correction *correction, WordsPriorityQueuePool* queuePool); void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const int codesSize, - WordsPriorityQueue *queue, Correction *correction); + const int *ycoordinates, const int *codes, const int codesSize, Correction *correction); void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool); void getSuggestionCandidates( const bool useFullEditDistance, const int inputLength, Correction *correction, - WordsPriorityQueue* queue, const bool doAutoCompletion, const int maxErrors); + WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors); void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); + void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, + const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, + const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, Correction *correction, @@ -114,12 +117,12 @@ class UnigramDictionary { const int inputLength, const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool); void onTerminal(const int freq, const TerminalAttributes& terminalAttributes, - Correction *correction, WordsPriorityQueue *queue); + Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue); bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); // Process a node by considering proximity, missing and excessive character bool processCurrentNode(const int initialPos, Correction *correction, int *newCount, - int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueue *queue); + int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool); int getMostFrequentWordLike(const int startInputIndex, const int inputLength, ProximityInfo *proximityInfo, unsigned short *word); int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, diff --git a/native/src/words_priority_queue.h b/native/src/words_priority_queue.h index 84f2523c2..6262439b5 100644 --- a/native/src/words_priority_queue.h +++ b/native/src/words_priority_queue.h @@ -48,6 +48,7 @@ class WordsPriorityQueue { mSuggestedWords[i].mUsed = false; } } + ~WordsPriorityQueue() { delete[] mSuggestedWords; } @@ -70,20 +71,19 @@ class WordsPriorityQueue { sw->setParams(score, word, wordLength); } if (sw == 0) { - LOGE("SuggestedWord is accidentally null."); + AKLOGE("SuggestedWord is accidentally null."); return; } if (DEBUG_WORDS_PRIORITY_QUEUE) { - LOGI("Push word. %d, %d", score, wordLength); + AKLOGI("Push word. %d, %d", score, wordLength); DUMP_WORD(word, wordLength); } mSuggestions.push(sw); } - SuggestedWord* topAndPop() { + SuggestedWord* top() { if (mSuggestions.empty()) return 0; SuggestedWord* sw = mSuggestions.top(); - mSuggestions.pop(); return sw; } @@ -93,7 +93,7 @@ class WordsPriorityQueue { while (!mSuggestions.empty() && index >= 0) { SuggestedWord* sw = mSuggestions.top(); if (DEBUG_WORDS_PRIORITY_QUEUE) { - LOGI("dump word. %d", sw->mScore); + AKLOGI("dump word. %d", sw->mScore); DUMP_WORD(sw->mWord, sw->mWordLength); } const unsigned int wordLength = sw->mWordLength; @@ -111,7 +111,7 @@ class WordsPriorityQueue { return size; } - int size() { + int size() const { return mSuggestions.size(); } @@ -119,7 +119,7 @@ class WordsPriorityQueue { while (!mSuggestions.empty()) { SuggestedWord* sw = mSuggestions.top(); if (DEBUG_WORDS_PRIORITY_QUEUE) { - LOGI("Clear word. %d", sw->mScore); + AKLOGI("Clear word. %d", sw->mScore); DUMP_WORD(sw->mWord, sw->mWordLength); } sw->mUsed = false; @@ -127,6 +127,13 @@ class WordsPriorityQueue { } } + void dumpTopWord() { + if (size() <= 0) { + return; + } + DUMP_WORD(mSuggestions.top()->mWord, mSuggestions.top()->mWordLength); + } + private: struct wordComparator { bool operator ()(SuggestedWord * left, SuggestedWord * right) { diff --git a/native/src/words_priority_queue_pool.h b/native/src/words_priority_queue_pool.h index 386297650..5fa254852 100644 --- a/native/src/words_priority_queue_pool.h +++ b/native/src/words_priority_queue_pool.h @@ -17,6 +17,8 @@ #ifndef LATINIME_WORDS_PRIORITY_QUEUE_POOL_H #define LATINIME_WORDS_PRIORITY_QUEUE_POOL_H +#include <assert.h> +#include <new> #include "words_priority_queue.h" namespace latinime { @@ -24,30 +26,60 @@ namespace latinime { class WordsPriorityQueuePool { public: WordsPriorityQueuePool(int mainQueueMaxWords, int subQueueMaxWords, int maxWordLength) { - mMasterQueue = new WordsPriorityQueue(mainQueueMaxWords, maxWordLength); - mSubQueue1 = new WordsPriorityQueue(subQueueMaxWords, maxWordLength); - mSubQueue2 = new WordsPriorityQueue(subQueueMaxWords, maxWordLength); + mMasterQueue = new(mMasterQueueBuf) WordsPriorityQueue(mainQueueMaxWords, maxWordLength); + for (int i = 0, subQueueBufOffset = 0; i < SUB_QUEUE_MAX_COUNT; + ++i, subQueueBufOffset += sizeof(WordsPriorityQueue)) { + mSubQueues1[i] = new(mSubQueueBuf1 + subQueueBufOffset) + WordsPriorityQueue(subQueueMaxWords, maxWordLength); + mSubQueues2[i] = new(mSubQueueBuf2 + subQueueBufOffset) + WordsPriorityQueue(subQueueMaxWords, maxWordLength); + } } - ~WordsPriorityQueuePool() { - delete mMasterQueue; + virtual ~WordsPriorityQueuePool() { } WordsPriorityQueue* getMasterQueue() { return mMasterQueue; } + // TODO: Come up with more generic pool - WordsPriorityQueue* getSubQueue1() { - return mSubQueue1; + WordsPriorityQueue* getSubQueue1(const int id) { + if (DEBUG_WORDS_PRIORITY_QUEUE) { + assert(id >= 0 && id < SUB_QUEUE_MAX_COUNT); + } + return mSubQueues1[id]; + } + + WordsPriorityQueue* getSubQueue2(const int id) { + if (DEBUG_WORDS_PRIORITY_QUEUE) { + assert(id >= 0 && id < SUB_QUEUE_MAX_COUNT); + } + return mSubQueues2[id]; } - WordsPriorityQueue* getSubQueue2() { - return mSubQueue2; + + inline void clearAll() { + mMasterQueue->clear(); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + mSubQueues1[i]->clear(); + mSubQueues2[i]->clear(); + } + } + + void dumpSubQueue1TopSuggestions() { + AKLOGI("DUMP SUBQUEUE1 TOP SUGGESTIONS"); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + mSubQueues1[i]->dumpTopWord(); + } } private: - WordsPriorityQueue *mMasterQueue; - WordsPriorityQueue *mSubQueue1; - WordsPriorityQueue *mSubQueue2; + WordsPriorityQueue* mMasterQueue; + WordsPriorityQueue* mSubQueues1[SUB_QUEUE_MAX_COUNT]; + WordsPriorityQueue* mSubQueues2[SUB_QUEUE_MAX_COUNT]; + char mMasterQueueBuf[sizeof(WordsPriorityQueue)]; + char mSubQueueBuf1[SUB_QUEUE_MAX_COUNT * sizeof(WordsPriorityQueue)]; + char mSubQueueBuf2[SUB_QUEUE_MAX_COUNT * sizeof(WordsPriorityQueue)]; }; } |