aboutsummaryrefslogtreecommitdiffstats
path: root/native/src/unigram_dictionary.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'native/src/unigram_dictionary.cpp')
-rw-r--r--native/src/unigram_dictionary.cpp176
1 files changed, 45 insertions, 131 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 698584e54..afa8bc545 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -20,7 +20,6 @@
#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
-#include "basechars.h"
#include "char_utils.h"
#include "dictionary.h"
#include "unigram_dictionary.h"
@@ -54,7 +53,7 @@ UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typed
// TODO : remove this variable.
ROOT_POS(0),
#endif // NEW_DICTIONARY_FORMAT
- BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),
+ BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)),
MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
if (DEBUG_DICT) {
LOGI("UnigramDictionary - constructor");
@@ -93,7 +92,7 @@ bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codes
// codesDest is the current point in the work buffer.
// codesSrc is the current point in the user-input, original, content-unmodified buffer.
// codesRemain is the remaining size in codesSrc.
-void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
+void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
@@ -143,7 +142,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *pr
(codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
}
-int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
+int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
const int *ycoordinates, const int *codes, const int codesSize, const int flags,
unsigned short *outWords, int *frequencies) {
@@ -172,8 +171,8 @@ int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const
short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH;
char s[MAX_WORD_LENGTH];
for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
-#endif
LOGI("%s %i", s, mFrequencies[j]);
+#endif
}
LOGI("Next letters: ");
for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
@@ -187,13 +186,14 @@ int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const
return suggestedWordsCount;
}
-void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,
+void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
unsigned short *outWords, int *frequencies) {
PROF_OPEN;
PROF_START(0);
- initSuggestions(codes, codesSize, outWords, frequencies);
+ initSuggestions(
+ proximityInfo, xcoordinates, ycoordinates, codes, codesSize, outWords, frequencies);
if (DEBUG_DICT) assert(codesSize == mInputLength);
const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
@@ -275,16 +275,18 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,
PROF_END(6);
}
-void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,
+void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
+ const int *ycoordinates, const int *codes, const int codesSize,
unsigned short *outWords, int *frequencies) {
if (DEBUG_DICT) {
LOGI("initSuggest");
}
mFrequencies = frequencies;
mOutputChars = outWords;
- mInputCodes = codes;
mInputLength = codesSize;
mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
+ proximityInfo->setInputParams(codes, codesSize);
+ mProximityInfo = proximityInfo;
}
static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
@@ -301,8 +303,8 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
#ifdef FLAG_DBG
char s[length + 1];
for (int i = 0; i <= length; i++) s[i] = word[i];
-#endif
LOGI("Found word = %s, freq = %d", s, frequency);
+#endif
}
if (length > MAX_WORD_LENGTH) {
if (DEBUG_DICT) {
@@ -325,8 +327,8 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
#ifdef FLAG_DBG
char s[length + 1];
for (int i = 0; i <= length; i++) s[i] = word[i];
-#endif
LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX);
+#endif
}
memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
@@ -348,33 +350,6 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
return false;
}
-static inline unsigned short toBaseLowerCase(unsigned short c) {
- if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
- c = BASE_CHARS[c];
- }
- if (c >='A' && c <= 'Z') {
- c |= 32;
- } else if (c > 127) {
- c = latin_tolower(c);
- }
- return c;
-}
-
-bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const {
- if (length != mInputLength) {
- return false;
- }
- const int *inputCodes = mInputCodes;
- while (length--) {
- if ((unsigned int) *inputCodes != (unsigned int) *word) {
- return false;
- }
- inputCodes += MAX_PROXIMITY_CHARS;
- word++;
- }
- return true;
-}
-
static const char QUOTE = '\'';
static const char SPACE = ' ';
@@ -568,7 +543,9 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
if (excessivePos >= 0) {
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
- if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {
+ if (!mProximityInfo->existsAdjacentProximityChars(inputIndex)) {
+ // If an excessive character is not adjacent to the left char or the right char,
+ // we will demote this word.
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
}
}
@@ -602,75 +579,11 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
const int inputIndex, const int skipPos, const int depth) {
- const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];
+ const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(inputIndex);
// Skip the ' or other letter and continue deeper
return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
}
-inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,
- const int inputLength) const {
- if (inputIndex < 0 || inputIndex >= inputLength) return false;
- const int currentChar = *getInputCharsAt(inputIndex);
- const int leftIndex = inputIndex - 1;
- if (leftIndex >= 0) {
- const int *leftChars = getInputCharsAt(leftIndex);
- int i = 0;
- while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
- if (leftChars[i++] == currentChar) return true;
- }
- }
- const int rightIndex = inputIndex + 1;
- if (rightIndex < inputLength) {
- const int *rightChars = getInputCharsAt(rightIndex);
- int i = 0;
- while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
- if (rightChars[i++] == currentChar) return true;
- }
- }
- return false;
-}
-
-// In the following function, c is the current character of the dictionary word
-// currently examined.
-// currentChars is an array containing the keys close to the character the
-// user actually typed at the same position. We want to see if c is in it: if so,
-// then the word contains at that position a character close to what the user
-// typed.
-// What the user typed is actually the first character of the array.
-// Notice : accented characters do not have a proximity list, so they are alone
-// in their list. The non-accented version of the character should be considered
-// "close", but not the other keys close to the non-accented version.
-inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
- const int *currentChars, const unsigned short c, const int skipPos,
- const int excessivePos, const int transposedPos) {
- const unsigned short baseLowerC = toBaseLowerCase(c);
-
- // The first char in the array is what user typed. If it matches right away,
- // that means the user typed that same char for this pos.
- if (currentChars[0] == baseLowerC || currentChars[0] == c)
- return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
-
- // If one of those is true, we should not check for close characters at all.
- if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
- return UNRELATED_CHAR;
-
- // If the non-accented, lowercased version of that first character matches c,
- // then we have a non-accented version of the accented character the user
- // typed. Treat it as a close char.
- if (toBaseLowerCase(currentChars[0]) == baseLowerC)
- return NEAR_PROXIMITY_CHAR;
-
- // Not an exact nor an accent-alike match: search the list of close keys
- int j = 1;
- while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
- const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
- if (matched) return NEAR_PROXIMITY_CHAR;
- ++j;
- }
-
- // Was not included, signal this as an unrelated character.
- return UNRELATED_CHAR;
-}
inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,
const uint8_t* const root, const uint8_t flags, const int pos,
@@ -678,7 +591,7 @@ inline void UnigramDictionary::onTerminal(unsigned short int* word, const int de
const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
int* nextLetters, const int nextLettersSize) {
- const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false;
+ const bool isSameAsTyped = sameLength ? mProximityInfo->sameAsTyped(word, depth + 1) : false;
if (isSameAsTyped) return;
if (depth >= MIN_SUGGEST_DEPTH) {
@@ -809,9 +722,9 @@ inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
char s[inputLength + 1];
for (int i = 0; i < inputLength; ++i) s[i] = word[i];
s[inputLength] = 0;
-#endif
LOGI("New missing space word found: %d > %d (%s), %d, %d",
newFreq, maxFreq, s, inputLength, depth);
+#endif
}
maxFreq = newFreq;
}
@@ -836,15 +749,14 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh
const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
const int inputIndex = startInputIndex + depth;
- const int *currentChars = getInputCharsAt(inputIndex);
unsigned short c;
*siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
&c, newChildPosition, newTerminal, newFreq);
- const unsigned int inputC = currentChars[0];
+ const unsigned int inputC = mProximityInfo->getPrimaryCharAt(inputIndex);
if (DEBUG_DICT) {
assert(inputC <= U_SHORT_MAX);
}
- const unsigned short baseLowerC = toBaseLowerCase(c);
+ const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
const bool matched = (inputC == baseLowerC || inputC == c);
const bool hasChild = *newChildPosition != 0;
if (matched) {
@@ -962,20 +874,20 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
*newDiffs = diffs;
*newInputIndex = inputIndex;
} else {
- const int *currentChars = getInputCharsAt(inputIndex);
+ int inputIndexForProximity = inputIndex;
if (transposedPos >= 0) {
- if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
- if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
+ if (inputIndex == transposedPos) ++inputIndexForProximity;
+ if (inputIndex == (transposedPos + 1)) --inputIndexForProximity;
}
- int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
- transposedPos);
- if (UNRELATED_CHAR == matchedProximityCharId) return false;
+ ProximityInfo::ProximityType matchedProximityCharId = mProximityInfo->getMatchedProximityId(
+ inputIndexForProximity, c, skipPos, excessivePos, transposedPos);
+ if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) return false;
mWord[depth] = c;
// If inputIndex is greater than mInputLength, that means there is no
// proximity chars. So, we don't need to check proximity.
- if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
+ if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
}
bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
@@ -988,7 +900,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
// Start traversing all nodes after the index exceeds the user typed length
*newTraverseAllNodes = isSameAsUserTypedLength;
*newMatchRate = matchWeight;
- *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
+ *newDiffs = diffs
+ + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
*newInputIndex = inputIndex + 1;
}
// Optimization: Prune out words that are too long compared to how much was typed.
@@ -1017,7 +930,7 @@ inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
uint16_t inWord[inputLength];
for (int i = 0; i < inputLength; ++i) {
- inWord[i] = *getInputCharsAt(startInputIndex + i);
+ inWord[i] = (uint16_t)mProximityInfo->getPrimaryCharAt(startInputIndex + i);
}
return getMostFrequentWordLikeInner(inWord, inputLength, word);
}
@@ -1041,8 +954,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags));
int pos = startPos;
int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
- int32_t baseChar = toBaseLowerCase(character);
- const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]);
+ int32_t baseChar = Dictionary::toBaseLowerCase(character);
+ const uint16_t wChar = Dictionary::toBaseLowerCase(inWord[startInputIndex]);
if (baseChar != wChar) {
*outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos;
@@ -1054,8 +967,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags,
if (hasMultipleChars) {
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
while (NOT_A_CHARACTER != character) {
- baseChar = toBaseLowerCase(character);
- if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
+ baseChar = Dictionary::toBaseLowerCase(character);
+ if (Dictionary::toBaseLowerCase(inWord[++inputIndex]) != baseChar) {
*outPos = BinaryFormat::skipOtherCharacters(root, pos);
*outInputIndex = startInputIndex;
return false;
@@ -1300,7 +1213,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags);
// This has to be done for each virtual char (this forwards the "inputIndex" which
- // is the index in the user-inputted chars, as read by getInputCharsAt.
+ // is the index in the user-inputted chars, as read by proximity chars.
if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
mWord[depth] = c;
@@ -1324,16 +1237,16 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
return false;
}
} else {
- const int *currentChars = getInputCharsAt(inputIndex);
+ int inputIndexForProximity = inputIndex;
if (transposedPos >= 0) {
- if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
- if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
+ if (inputIndex == transposedPos) ++inputIndexForProximity;
+ if (inputIndex == (transposedPos + 1)) --inputIndexForProximity;
}
- const int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos,
- excessivePos, transposedPos);
- if (UNRELATED_CHAR == matchedProximityCharId) {
+ int matchedProximityCharId = mProximityInfo->getMatchedProximityId(
+ inputIndexForProximity, c, skipPos, excessivePos, transposedPos);
+ if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {
// We found that this is an unrelated character, so we should give up traversing
// this node and its children entirely.
// However we may not be on the last virtual node yet so we skip the remaining
@@ -1352,7 +1265,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
mWord[depth] = c;
// If inputIndex is greater than mInputLength, that means there is no
// proximity chars. So, we don't need to check proximity.
- if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
+ if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
}
const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
@@ -1376,7 +1289,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
}
// Start traversing all nodes after the index exceeds the user typed length
traverseAllNodes = isSameAsUserTypedLength;
- diffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
+ diffs = diffs
+ + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
// Finally, we are ready to go to the next character, the next "virtual node".
// We should advance the input index.
// We do this in this branch of the 'if traverseAllNodes' because we are still matching