diff options
Diffstat (limited to 'native/src')
-rw-r--r-- | native/src/basechars.h | 20 | ||||
-rw-r--r-- | native/src/bigram_dictionary.cpp | 2 | ||||
-rw-r--r-- | native/src/defines.h | 11 | ||||
-rw-r--r-- | native/src/dictionary.h | 17 | ||||
-rw-r--r-- | native/src/proximity_info.cpp | 98 | ||||
-rw-r--r-- | native/src/proximity_info.h | 17 | ||||
-rw-r--r-- | native/src/unigram_dictionary.cpp | 176 | ||||
-rw-r--r-- | native/src/unigram_dictionary.h | 27 |
8 files changed, 213 insertions, 155 deletions
diff --git a/native/src/basechars.h b/native/src/basechars.h index 5a4406606..3843e11c5 100644 --- a/native/src/basechars.h +++ b/native/src/basechars.h @@ -1,3 +1,22 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BASECHARS_H +#define LATINIME_BASECHARS_H + /** * Table mapping most combined Latin, Greek, and Cyrillic characters * to their base characters. If c is in range, BASE_CHARS[c] == c @@ -170,3 +189,4 @@ static unsigned short BASE_CHARS[] = { // generated with: // cat UnicodeData.txt | perl -e 'while (<>) { @foo = split(/;/); $foo[5] =~ s/<.*> //; $base[hex($foo[0])] = hex($foo[5]);} for ($i = 0; $i < 0x500; $i += 8) { for ($j = $i; $j < $i + 8; $j++) { printf("0x%04x, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }' +#endif // LATINIME_BASECHARS_H diff --git a/native/src/bigram_dictionary.cpp b/native/src/bigram_dictionary.cpp index d11aee28e..6ed4d0982 100644 --- a/native/src/bigram_dictionary.cpp +++ b/native/src/bigram_dictionary.cpp @@ -45,8 +45,8 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ #ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = word[i]; -#endif LOGI("Bigram: Found word = %s, freq = %d :", s, frequency); +#endif } // Find the right insertion point diff --git a/native/src/defines.h b/native/src/defines.h index a516190af..bea83b2c5 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -18,8 +18,16 @@ #ifndef LATINIME_DEFINES_H #define LATINIME_DEFINES_H +#if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG) +#include <cutils/log.h> +#else +#define LOGE(fmt, ...) +#define LOGI(fmt, ...) +#endif + #ifdef FLAG_DO_PROFILE // Profiler +#include <cutils/log.h> #include <time.h> #define PROF_BUF_SIZE 100 static double profile_buf[PROF_BUF_SIZE]; @@ -92,8 +100,7 @@ static void prof_out(void) { #define DEBUG_PROXIMITY_INFO true #else // FLAG_DBG -#define LOGE(fmt, ...) -#define LOGI(fmt, ...) + #define DEBUG_DICT false #define DEBUG_DICT_FULL false #define DEBUG_SHOW_FOUND_WORD false diff --git a/native/src/dictionary.h b/native/src/dictionary.h index 3dc577a56..73e03d8fd 100644 --- a/native/src/dictionary.h +++ b/native/src/dictionary.h @@ -17,7 +17,9 @@ #ifndef LATINIME_DICTIONARY_H #define LATINIME_DICTIONARY_H +#include "basechars.h" #include "bigram_dictionary.h" +#include "char_utils.h" #include "defines.h" #include "proximity_info.h" #include "unigram_dictionary.h" @@ -61,7 +63,7 @@ public: static int setDictionaryValues(const unsigned char *dict, const bool isLatestDictVersion, const int pos, unsigned short *c, int *childrenPosition, bool *terminal, int *freq); - + static inline unsigned short toBaseLowerCase(unsigned short c); // TODO: delete this int getBigramPosition(unsigned short *word, int length); @@ -156,6 +158,19 @@ inline int Dictionary::setDictionaryValues(const unsigned char *dict, return position; } + +inline unsigned short Dictionary::toBaseLowerCase(unsigned short c) { + if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { + c = BASE_CHARS[c]; + } + if (c >='A' && c <= 'Z') { + c |= 32; + } else if (c > 127) { + c = latin_tolower(c); + } + return c; +} + } // namespace latinime #endif // LATINIME_DICTIONARY_H diff --git a/native/src/proximity_info.cpp b/native/src/proximity_info.cpp index 209c31e6e..c45393f18 100644 --- a/native/src/proximity_info.cpp +++ b/native/src/proximity_info.cpp @@ -19,6 +19,7 @@ #define LOG_TAG "LatinIME: proximity_info.cpp" +#include "dictionary.h" #include "proximity_info.h" namespace latinime { @@ -63,4 +64,101 @@ bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { return false; } +// TODO: Calculate nearby codes here. +void ProximityInfo::setInputParams(const int* inputCodes, const int inputLength) { + mInputCodes = inputCodes; + mInputLength = inputLength; +} + +inline const int* ProximityInfo::getProximityCharsAt(const int index) const { + return mInputCodes + (index * MAX_PROXIMITY_CHARS_SIZE); +} + +unsigned short ProximityInfo::getPrimaryCharAt(const int index) const { + return getProximityCharsAt(index)[0]; +} + +bool ProximityInfo::existsCharInProximityAt(const int index, const int c) const { + const int *chars = getProximityCharsAt(index); + int i = 0; + while (chars[i] > 0 && i < MAX_PROXIMITY_CHARS_SIZE) { + if (chars[i++] == c) { + return true; + } + } + return false; +} + +bool ProximityInfo::existsAdjacentProximityChars(const int index) const { + if (index < 0 || index >= mInputLength) return false; + const int currentChar = getPrimaryCharAt(index); + const int leftIndex = index - 1; + if (leftIndex >= 0 && existsCharInProximityAt(leftIndex, currentChar)) { + return true; + } + const int rightIndex = index + 1; + if (rightIndex < mInputLength && existsCharInProximityAt(rightIndex, currentChar)) { + return true; + } + return false; +} + +// In the following function, c is the current character of the dictionary word +// currently examined. +// currentChars is an array containing the keys close to the character the +// user actually typed at the same position. We want to see if c is in it: if so, +// then the word contains at that position a character close to what the user +// typed. +// What the user typed is actually the first character of the array. +// Notice : accented characters do not have a proximity list, so they are alone +// in their list. The non-accented version of the character should be considered +// "close", but not the other keys close to the non-accented version. +ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId( + const int index, const unsigned short c, const int skipPos, + const int excessivePos, const int transposedPos) const { + const int *currentChars = getProximityCharsAt(index); + const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); + + // The first char in the array is what user typed. If it matches right away, + // that means the user typed that same char for this pos. + if (currentChars[0] == baseLowerC || currentChars[0] == c) + return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR; + + // If one of those is true, we should not check for close characters at all. + if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) + return UNRELATED_CHAR; + + // If the non-accented, lowercased version of that first character matches c, + // then we have a non-accented version of the accented character the user + // typed. Treat it as a close char. + if (Dictionary::toBaseLowerCase(currentChars[0]) == baseLowerC) + return NEAR_PROXIMITY_CHAR; + + // Not an exact nor an accent-alike match: search the list of close keys + int j = 1; + while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS_SIZE) { + const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c); + if (matched) return NEAR_PROXIMITY_CHAR; + ++j; + } + + // Was not included, signal this as an unrelated character. + return UNRELATED_CHAR; +} + +bool ProximityInfo::sameAsTyped(const unsigned short *word, int length) const { + if (length != mInputLength) { + return false; + } + const int *inputCodes = mInputCodes; + while (length--) { + if ((unsigned int) *inputCodes != (unsigned int) *word) { + return false; + } + inputCodes += MAX_PROXIMITY_CHARS_SIZE; + word++; + } + return true; +} + } // namespace latinime diff --git a/native/src/proximity_info.h b/native/src/proximity_info.h index 327cd0940..435a60151 100644 --- a/native/src/proximity_info.h +++ b/native/src/proximity_info.h @@ -25,11 +25,26 @@ namespace latinime { class ProximityInfo { public: + typedef enum { // Used as a return value for character comparison + SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR, // Same char, possibly with different case or accent + NEAR_PROXIMITY_CHAR, // It is a char located nearby on the keyboard + UNRELATED_CHAR // It is an unrelated char + } ProximityType; + ProximityInfo(const int maxProximityCharsSize, const int keyboardWidth, const int keybaordHeight, const int gridWidth, const int gridHeight, const uint32_t *proximityCharsArray); ~ProximityInfo(); bool hasSpaceProximity(const int x, const int y) const; + void setInputParams(const int* inputCodes, const int inputLength); + const int* getProximityCharsAt(const int index) const; + unsigned short getPrimaryCharAt(const int index) const; + bool existsCharInProximityAt(const int index, const int c) const; + bool existsAdjacentProximityChars(const int index) const; + ProximityType getMatchedProximityId( + const int index, const unsigned short c, const int skipPos, + const int excessivePos, const int transposedPos) const; + bool sameAsTyped(const unsigned short *word, int length) const; private: int getStartIndexFromCoordinates(const int x, const int y) const; const int MAX_PROXIMITY_CHARS_SIZE; @@ -39,7 +54,9 @@ private: const int GRID_HEIGHT; const int CELL_WIDTH; const int CELL_HEIGHT; + const int *mInputCodes; uint32_t *mProximityCharsArray; + int mInputLength; }; } // namespace latinime diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 698584e54..afa8bc545 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -20,7 +20,6 @@ #define LOG_TAG "LatinIME: unigram_dictionary.cpp" -#include "basechars.h" #include "char_utils.h" #include "dictionary.h" #include "unigram_dictionary.h" @@ -54,7 +53,7 @@ UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typed // TODO : remove this variable. ROOT_POS(0), #endif // NEW_DICTIONARY_FORMAT - BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)), + BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)), MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) { if (DEBUG_DICT) { LOGI("UnigramDictionary - constructor"); @@ -93,7 +92,7 @@ bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codes // codesDest is the current point in the work buffer. // codesSrc is the current point in the user-input, original, content-unmodified buffer. // codesRemain is the remaining size in codesSrc. -void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo, +void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain, const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) { @@ -143,7 +142,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *pr (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies); } -int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates, +int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, const int flags, unsigned short *outWords, int *frequencies) { @@ -172,8 +171,8 @@ int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH; char s[MAX_WORD_LENGTH]; for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; -#endif LOGI("%s %i", s, mFrequencies[j]); +#endif } LOGI("Next letters: "); for (int k = 0; k < NEXT_LETTERS_SIZE; k++) { @@ -187,13 +186,14 @@ int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const return suggestedWordsCount; } -void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, +void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, unsigned short *outWords, int *frequencies) { PROF_OPEN; PROF_START(0); - initSuggestions(codes, codesSize, outWords, frequencies); + initSuggestions( + proximityInfo, xcoordinates, ycoordinates, codes, codesSize, outWords, frequencies); if (DEBUG_DICT) assert(codesSize == mInputLength); const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); @@ -275,16 +275,18 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, PROF_END(6); } -void UnigramDictionary::initSuggestions(const int *codes, const int codesSize, +void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, + const int *ycoordinates, const int *codes, const int codesSize, unsigned short *outWords, int *frequencies) { if (DEBUG_DICT) { LOGI("initSuggest"); } mFrequencies = frequencies; mOutputChars = outWords; - mInputCodes = codes; mInputLength = codesSize; mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; + proximityInfo->setInputParams(codes, codesSize); + mProximityInfo = proximityInfo; } static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) { @@ -301,8 +303,8 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) #ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = word[i]; -#endif LOGI("Found word = %s, freq = %d", s, frequency); +#endif } if (length > MAX_WORD_LENGTH) { if (DEBUG_DICT) { @@ -325,8 +327,8 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) #ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = word[i]; -#endif LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX); +#endif } memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]), (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]), @@ -348,33 +350,6 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) return false; } -static inline unsigned short toBaseLowerCase(unsigned short c) { - if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { - c = BASE_CHARS[c]; - } - if (c >='A' && c <= 'Z') { - c |= 32; - } else if (c > 127) { - c = latin_tolower(c); - } - return c; -} - -bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const { - if (length != mInputLength) { - return false; - } - const int *inputCodes = mInputCodes; - while (length--) { - if ((unsigned int) *inputCodes != (unsigned int) *word) { - return false; - } - inputCodes += MAX_PROXIMITY_CHARS; - word++; - } - return true; -} - static const char QUOTE = '\''; static const char SPACE = ' '; @@ -568,7 +543,9 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq); if (excessivePos >= 0) { multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); - if (!existsAdjacentProximityChars(inputIndex, mInputLength)) { + if (!mProximityInfo->existsAdjacentProximityChars(inputIndex)) { + // If an excessive character is not adjacent to the left char or the right char, + // we will demote this word. multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq); } } @@ -602,75 +579,11 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth) { - const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0]; + const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(inputIndex); // Skip the ' or other letter and continue deeper return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth; } -inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex, - const int inputLength) const { - if (inputIndex < 0 || inputIndex >= inputLength) return false; - const int currentChar = *getInputCharsAt(inputIndex); - const int leftIndex = inputIndex - 1; - if (leftIndex >= 0) { - const int *leftChars = getInputCharsAt(leftIndex); - int i = 0; - while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) { - if (leftChars[i++] == currentChar) return true; - } - } - const int rightIndex = inputIndex + 1; - if (rightIndex < inputLength) { - const int *rightChars = getInputCharsAt(rightIndex); - int i = 0; - while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) { - if (rightChars[i++] == currentChar) return true; - } - } - return false; -} - -// In the following function, c is the current character of the dictionary word -// currently examined. -// currentChars is an array containing the keys close to the character the -// user actually typed at the same position. We want to see if c is in it: if so, -// then the word contains at that position a character close to what the user -// typed. -// What the user typed is actually the first character of the array. -// Notice : accented characters do not have a proximity list, so they are alone -// in their list. The non-accented version of the character should be considered -// "close", but not the other keys close to the non-accented version. -inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId( - const int *currentChars, const unsigned short c, const int skipPos, - const int excessivePos, const int transposedPos) { - const unsigned short baseLowerC = toBaseLowerCase(c); - - // The first char in the array is what user typed. If it matches right away, - // that means the user typed that same char for this pos. - if (currentChars[0] == baseLowerC || currentChars[0] == c) - return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR; - - // If one of those is true, we should not check for close characters at all. - if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) - return UNRELATED_CHAR; - - // If the non-accented, lowercased version of that first character matches c, - // then we have a non-accented version of the accented character the user - // typed. Treat it as a close char. - if (toBaseLowerCase(currentChars[0]) == baseLowerC) - return NEAR_PROXIMITY_CHAR; - - // Not an exact nor an accent-alike match: search the list of close keys - int j = 1; - while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) { - const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c); - if (matched) return NEAR_PROXIMITY_CHAR; - ++j; - } - - // Was not included, signal this as an unrelated character. - return UNRELATED_CHAR; -} inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth, const uint8_t* const root, const uint8_t flags, const int pos, @@ -678,7 +591,7 @@ inline void UnigramDictionary::onTerminal(unsigned short int* word, const int de const int excessivePos, const int transposedPos, const int freq, const bool sameLength, int* nextLetters, const int nextLettersSize) { - const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false; + const bool isSameAsTyped = sameLength ? mProximityInfo->sameAsTyped(word, depth + 1) : false; if (isSameAsTyped) return; if (depth >= MIN_SUGGEST_DEPTH) { @@ -809,9 +722,9 @@ inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, char s[inputLength + 1]; for (int i = 0; i < inputLength; ++i) s[i] = word[i]; s[inputLength] = 0; -#endif LOGI("New missing space word found: %d > %d (%s), %d, %d", newFreq, maxFreq, s, inputLength, depth); +#endif } maxFreq = newFreq; } @@ -836,15 +749,14 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) { const int inputIndex = startInputIndex + depth; - const int *currentChars = getInputCharsAt(inputIndex); unsigned short c; *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos, &c, newChildPosition, newTerminal, newFreq); - const unsigned int inputC = currentChars[0]; + const unsigned int inputC = mProximityInfo->getPrimaryCharAt(inputIndex); if (DEBUG_DICT) { assert(inputC <= U_SHORT_MAX); } - const unsigned short baseLowerC = toBaseLowerCase(c); + const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); const bool matched = (inputC == baseLowerC || inputC == c); const bool hasChild = *newChildPosition != 0; if (matched) { @@ -962,20 +874,20 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in *newDiffs = diffs; *newInputIndex = inputIndex; } else { - const int *currentChars = getInputCharsAt(inputIndex); + int inputIndexForProximity = inputIndex; if (transposedPos >= 0) { - if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS; - if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS; + if (inputIndex == transposedPos) ++inputIndexForProximity; + if (inputIndex == (transposedPos + 1)) --inputIndexForProximity; } - int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos, - transposedPos); - if (UNRELATED_CHAR == matchedProximityCharId) return false; + ProximityInfo::ProximityType matchedProximityCharId = mProximityInfo->getMatchedProximityId( + inputIndexForProximity, c, skipPos, excessivePos, transposedPos); + if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) return false; mWord[depth] = c; // If inputIndex is greater than mInputLength, that means there is no // proximity chars. So, we don't need to check proximity. - if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { + if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight); } bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 @@ -988,7 +900,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in // Start traversing all nodes after the index exceeds the user typed length *newTraverseAllNodes = isSameAsUserTypedLength; *newMatchRate = matchWeight; - *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); + *newDiffs = diffs + + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); *newInputIndex = inputIndex + 1; } // Optimization: Prune out words that are too long compared to how much was typed. @@ -1017,7 +930,7 @@ inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, uint16_t inWord[inputLength]; for (int i = 0; i < inputLength; ++i) { - inWord[i] = *getInputCharsAt(startInputIndex + i); + inWord[i] = (uint16_t)mProximityInfo->getPrimaryCharAt(startInputIndex + i); } return getMostFrequentWordLikeInner(inWord, inputLength, word); } @@ -1041,8 +954,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags)); int pos = startPos; int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - int32_t baseChar = toBaseLowerCase(character); - const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]); + int32_t baseChar = Dictionary::toBaseLowerCase(character); + const uint16_t wChar = Dictionary::toBaseLowerCase(inWord[startInputIndex]); if (baseChar != wChar) { *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos; @@ -1054,8 +967,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, if (hasMultipleChars) { character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); while (NOT_A_CHARACTER != character) { - baseChar = toBaseLowerCase(character); - if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) { + baseChar = Dictionary::toBaseLowerCase(character); + if (Dictionary::toBaseLowerCase(inWord[++inputIndex]) != baseChar) { *outPos = BinaryFormat::skipOtherCharacters(root, pos); *outInputIndex = startInputIndex; return false; @@ -1300,7 +1213,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags); // This has to be done for each virtual char (this forwards the "inputIndex" which - // is the index in the user-inputted chars, as read by getInputCharsAt. + // is the index in the user-inputted chars, as read by proximity chars. if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex; if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) { mWord[depth] = c; @@ -1324,16 +1237,16 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in return false; } } else { - const int *currentChars = getInputCharsAt(inputIndex); + int inputIndexForProximity = inputIndex; if (transposedPos >= 0) { - if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS; - if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS; + if (inputIndex == transposedPos) ++inputIndexForProximity; + if (inputIndex == (transposedPos + 1)) --inputIndexForProximity; } - const int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, - excessivePos, transposedPos); - if (UNRELATED_CHAR == matchedProximityCharId) { + int matchedProximityCharId = mProximityInfo->getMatchedProximityId( + inputIndexForProximity, c, skipPos, excessivePos, transposedPos); + if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { // We found that this is an unrelated character, so we should give up traversing // this node and its children entirely. // However we may not be on the last virtual node yet so we skip the remaining @@ -1352,7 +1265,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in mWord[depth] = c; // If inputIndex is greater than mInputLength, that means there is no // proximity chars. So, we don't need to check proximity. - if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { + if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight); } const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 @@ -1376,7 +1289,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in } // Start traversing all nodes after the index exceeds the user typed length traverseAllNodes = isSameAsUserTypedLength; - diffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); + diffs = diffs + + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); // Finally, we are ready to go to the next character, the next "virtual node". // We should advance the input index. // We do this in this branch of the 'if traverseAllNodes' because we are still matching diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index dcc8f2a9a..f6045c6ef 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -29,12 +29,6 @@ namespace latinime { class UnigramDictionary { - typedef enum { // Used as a return value for character comparison - SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR, // Same char, possibly with different case or accent - NEAR_PROXIMITY_CHAR, // It is a char located nearby on the keyboard - UNRELATED_CHAR // It is an unrelated char - } ProximityType; - public: #ifdef NEW_DICTIONARY_FORMAT @@ -82,26 +76,26 @@ public: int maxAlternatives); #endif // NEW_DICTIONARY_FORMAT int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; - int getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates, + int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, const int flags, unsigned short *outWords, int *frequencies); ~UnigramDictionary(); private: - void getWordSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates, + void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, unsigned short *outWords, int *frequencies); bool isDigraph(const int* codes, const int i, const int codesSize) const; - void getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo, + void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain, const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies); - void initSuggestions(const int *codes, const int codesSize, unsigned short *outWords, - int *frequencies); + void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, + const int *ycoordinates, const int *codes, const int codesSize, + unsigned short *outWords, int *frequencies); void getSuggestionCandidates(const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, const int nextLettersSize, const int maxDepth); - bool sameAsTyped(const unsigned short *word, int length) const; bool addWord(unsigned short *word, int length, int frequency); bool getSplitTwoWordsSuggestion(const int inputLength, const int firstWordStartPos, const int firstWordLength, @@ -118,8 +112,6 @@ private: int *nextLetters, const int nextLettersSize); bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); - ProximityType getMatchedProximityId(const int *currentChars, const unsigned short c, - const int skipPos, const int excessivePos, const int transposedPos); // Process a node by considering proximity, missing and excessive character bool processCurrentNode(const int initialPos, const int initialDepth, const int maxDepth, const bool initialTraverseAllNodes, const int snr, int inputIndex, @@ -127,10 +119,6 @@ private: const int transposedPos, int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition, int *nextOutputIndex); - bool existsAdjacentProximityChars(const int inputIndex, const int inputLength) const; - inline const int* getInputCharsAt(const int index) const { - return mInputCodes + (index * MAX_PROXIMITY_CHARS); - } int getMostFrequentWordLike(const int startInputIndex, const int inputLength, unsigned short *word); #ifndef NEW_DICTIONARY_FORMAT @@ -174,7 +162,7 @@ private: int *mFrequencies; unsigned short *mOutputChars; - const int *mInputCodes; + const ProximityInfo *mProximityInfo; int mInputLength; // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; @@ -189,7 +177,6 @@ private: int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL]; int mNextLettersFrequency[NEXT_LETTERS_SIZE]; }; - } // namespace latinime #endif // LATINIME_UNIGRAM_DICTIONARY_H |