aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorsatok <satok@google.com>2010-12-10 01:38:51 -0800
committerAndroid (Google) Code Review <android-gerrit@google.com>2010-12-10 01:38:51 -0800
commitf6a6429e10817c80a7c00f0f2acae12a8e31cb15 (patch)
tree702540328aedc7eb1f81f3ab2a0f289f69203147
parente26ef1bccddc942fdaeada3409c8e8ff18a35008 (diff)
parenta3d78f606e8e764c22637299a58c27e195b4e1d3 (diff)
downloadlatinime-f6a6429e10817c80a7c00f0f2acae12a8e31cb15.tar.gz
latinime-f6a6429e10817c80a7c00f0f2acae12a8e31cb15.tar.xz
latinime-f6a6429e10817c80a7c00f0f2acae12a8e31cb15.zip
Merge "Suggest words with transposed chars"
-rw-r--r--native/src/defines.h14
-rw-r--r--native/src/unigram_dictionary.cpp120
-rw-r--r--native/src/unigram_dictionary.h22
3 files changed, 110 insertions, 46 deletions
diff --git a/native/src/defines.h b/native/src/defines.h
index 98e93b93d..52191beea 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -24,13 +24,17 @@
#define LOG_TAG "LatinIME: "
#endif
#define DEBUG_DICT true
-#define DEBUG_SHOW_FOUND_WORD false
-#define DEBUG_NODE true
+#define DEBUG_DICT_FULL true
+#define DEBUG_SHOW_FOUND_WORD DEBUG_DICT_FULL
+#define DEBUG_NODE DEBUG_DICT_FULL
+#define DEBUG_TRACE DEBUG_DICT_FULL
#else // FLAG_DBG
#define LOGI
#define DEBUG_DICT false
+#define DEBUG_DICT_FULL false
#define DEBUG_SHOW_FOUND_WORD false
#define DEBUG_NODE false
+#define DEBUG_TRACE false
#endif // FLAG_DBG
#ifndef U_SHORT_MAX
@@ -58,6 +62,12 @@
#define SUGGEST_WORDS_WITH_MISSING_CHARACTER true
#define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true
#define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
+#define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
+
+#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 75
+#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80
+#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
+#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions.
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 46332c74d..7ecf1c9c0 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -36,7 +36,7 @@ UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterM
MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
- LOGI("UnigramDictionary - constructor");
+ if (DEBUG_DICT) LOGI("UnigramDictionary - constructor");
}
UnigramDictionary::~UnigramDictionary() {}
@@ -45,26 +45,36 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
int *frequencies, int *nextLetters, int nextLettersSize)
{
initSuggestions(codes, codesSize, outWords, frequencies);
- getSuggestionCandidates(codesSize, -1, -1, nextLetters, nextLettersSize);
+ const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
+ getSuggestionCandidates(codesSize, -1, -1, -1, nextLetters, nextLettersSize, MAX_DEPTH);
// Suggestion with missing character
if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
for (int i = 0; i < codesSize; ++i) {
if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);
- getSuggestionCandidates(codesSize, i, -1, NULL, 0);
+ getSuggestionCandidates(codesSize, i, -1, -1, NULL, 0, MAX_DEPTH);
}
}
// Suggestion with excessive character
- if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER) {
+ if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
for (int i = 0; i < codesSize; ++i) {
if (existsAdjacentProximityChars(i, codesSize)) {
if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);
- getSuggestionCandidates(codesSize, -1, i, NULL, 0);
+ getSuggestionCandidates(codesSize, -1, i, -1, NULL, 0, MAX_DEPTH);
}
}
}
+ // Suggestion with transposed characters
+ // Only suggest words that length is mInputLength
+ if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
+ for (int i = 0; i < codesSize; ++i) {
+ if (DEBUG_DICT) LOGI("--- Suggest transposed characters %d", i);
+ getSuggestionCandidates(codesSize, -1, -1, i, NULL, 0, mInputLength - 1);
+ }
+ }
+
// Suggestions with missing space
if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
for (int i = 1; i < codesSize; ++i) {
@@ -187,10 +197,13 @@ static const char QUOTE = '\'';
static const char SPACE = ' ';
void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int skipPos,
- const int excessivePos, int *nextLetters, const int nextLettersSize) {
- if (DEBUG_DICT) LOGI("getSuggestionCandidates");
+ const int excessivePos, const int transposedPos, int *nextLetters,
+ const int nextLettersSize, const int maxDepth) {
+ if (DEBUG_DICT) LOGI("getSuggestionCandidates %d", maxDepth);
+ if (DEBUG_DICT) assert(transposedPos + 1 < inputLength);
+ if (DEBUG_DICT) assert(excessivePos < inputLength);
+ if (DEBUG_DICT) assert(missingPos < inputLength);
int rootPosition = ROOT_POS;
- const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
// Get the number of child of root, then increment the position
int childCount = Dictionary::getCount(DICT, &rootPosition);
int depth = 0;
@@ -212,12 +225,12 @@ void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int
int diffs = mStackDiffs[depth];
int siblingPos = mStackSiblingPos[depth];
int firstChildPos;
- // depth will never be greater than MAX_DEPTH because in that case,
+ // depth will never be greater than maxDepth because in that case,
// needsToTraverseChildrenNodes should be false
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,
- MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
- nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes,
- &snr, &inputIndex, &diffs, &siblingPos);
+ maxDepth, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
+ transposedPos, nextLetters, nextLettersSize, &childCount, &firstChildPos,
+ &traverseAllNodes, &snr, &inputIndex, &diffs, &siblingPos);
// Update next sibling pos
mStackSiblingPos[depth] = siblingPos;
if (needsToTraverseChildrenNodes) {
@@ -252,7 +265,7 @@ bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int mi
}
const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);
- if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);
+ if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);
if (secondFreq <= 0) return false;
word[missingSpacePos] = SPACE;
@@ -262,24 +275,27 @@ bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int mi
int pairFreq = ((firstFreq + secondFreq) / 2);
for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
+ pairFreq = pairFreq * WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE / 100;
addWord(word, newWordLength, pairFreq);
return true;
}
// Keep this for comparing spec to new getWords
void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
- const int excessivePos, int *nextLetters, const int nextLettersSize) {
+ const int excessivePos, const int transposedPos,int *nextLetters,
+ const int nextLettersSize) {
int initialPosition = initialPos;
const int count = Dictionary::getCount(DICT, &initialPosition);
getWordsRec(count, initialPosition, 0,
min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
- mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);
+ mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
+ nextLettersSize);
}
void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
- const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
- const int nextLettersSize) {
+ const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
+ int *nextLetters, const int nextLettersSize) {
int siblingPos = pos;
for (int i = 0; i < childrenCount; ++i) {
int newCount;
@@ -291,34 +307,50 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
int newDiffs;
int newSiblingPos;
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
- traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, nextLetters,
- nextLettersSize,
+ traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, transposedPos,
+ nextLetters, nextLettersSize,
&newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,
&newInputIndex, &newDiffs, &newSiblingPos);
siblingPos = newSiblingPos;
if (needsToTraverseChildrenNodes) {
getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
- newSnr, newInputIndex, newDiffs, skipPos, excessivePos, nextLetters,
- nextLettersSize);
+ newSnr, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
+ nextLetters, nextLettersSize);
}
}
}
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
unsigned short *word, const int inputLength, const int depth, const int snr,
- int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {
- if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, freq * snr);
+ int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos,
+ const int transposedPos, const int freq) {
+ int finalFreq = freq * snr;
+ // TODO: Demote by edit distance
+ if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;
+ if (excessivePos >= 0) finalFreq = finalFreq
+ * WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;
+ if (transposedPos >= 0) finalFreq = finalFreq
+ * WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;
+
+ if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
if (depth >= inputLength && skipPos < 0) {
registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
}
}
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
- unsigned short *word, const int depth, const int snr, const int skipPos, const int freq,
- const int addedWeight) {
+ unsigned short *word, const int depth, const int snr, const int skipPos,
+ const int excessivePos, const int transposedPos, const int freq, const int addedWeight) {
if (!sameAsTyped(word, depth + 1)) {
int finalFreq = freq * snr * addedWeight;
+ // TODO: Demote by edit distance
+ if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;
+ if (excessivePos >= 0) finalFreq = finalFreq
+ * WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;
+ if (transposedPos >= 0) finalFreq = finalFreq
+ * WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;
+
// Proximity collection will promote a word of the same length as
// what user typed.
if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
@@ -357,16 +389,18 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
}
inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
- const unsigned short c, const int skipPos) {
+ const unsigned short c, const int skipPos, const int excessivePos,
+ const int transposedPos) {
const unsigned short lowerC = toLowerCase(c);
int j = 0;
while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
// If skipPos is defined, not to search proximity collections.
- // First char is what user typed.
+ // First char is what user typed.
if (matched) {
return j;
- } else if (skipPos >= 0) {
+ } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
+ // Not to check proximity characters
return -1;
}
++j;
@@ -376,10 +410,17 @@ inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
- const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
- const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes,
- int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition) {
- if (DEBUG_DICT) assert(skipPos < 0 || excessivePos < 0);
+ const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
+ int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
+ bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
+ int *nextSiblingPosition) {
+ if (DEBUG_DICT) {
+ int inputCount = 0;
+ if (skipPos >= 0) ++inputCount;
+ if (excessivePos >= 0) ++inputCount;
+ if (transposedPos >= 0) ++inputCount;
+ assert(inputCount <= 1);
+ }
unsigned short c;
int childPosition;
bool terminal;
@@ -397,7 +438,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
mWord[depth] = c;
if (traverseAllNodes && terminal) {
onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,
- snr, nextLetters, nextLettersSize, skipPos, freq);
+ snr, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos, freq);
}
if (!needsToTraverseChildrenNodes) return false;
*newTraverseAllNodes = traverseAllNodes;
@@ -406,7 +447,14 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
*newInputIndex = inputIndex;
} else {
int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
- int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);
+
+ if (transposedPos >= 0) {
+ if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
+ if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
+ }
+
+ int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
+ transposedPos);
if (matchedProximityCharId < 0) return false;
mWord[depth] = c;
// If inputIndex is greater than mInputLength, that means there is no
@@ -415,13 +463,13 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;
if (isSameAsUserTypedLength && terminal) {
onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,
- skipPos, freq, addedWeight);
+ skipPos, excessivePos, transposedPos, freq, addedWeight);
}
if (!needsToTraverseChildrenNodes) return false;
// Start traversing all nodes after the index exceeds the user typed length
*newTraverseAllNodes = isSameAsUserTypedLength;
*newSnr = snr * addedWeight;
- *newDiffs = diffs + (matchedProximityCharId > 0);
+ *newDiffs = diffs + ((matchedProximityCharId > 0) ? 1 : 0);
*newInputIndex = inputIndex + 1;
}
// Optimization: Prune out words that are too long compared to how much was typed.
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index f8af55c92..abfdb8d87 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -32,7 +32,8 @@ public:
private:
void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies);
void getSuggestionCandidates(const int inputLength, const int skipPos, const int excessivePos,
- int *nextLetters, const int nextLettersSize);
+ const int transposedPos, int *nextLetters, const int nextLettersSize,
+ const int maxDepth);
void getVersionNumber();
bool checkIfDictVersionIsLatest();
int getAddress(int *pos);
@@ -43,25 +44,30 @@ private:
unsigned short toLowerCase(unsigned short c);
void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
- const int skipPos, const int excessivePos, int *nextLetters, const int nextLettersSize);
+ const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
+ const int nextLettersSize);
bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
// Keep getWordsOld for comparing performance between getWords and getWordsOld
void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
- const int excessivePos, int *nextLetters, const int nextLettersSize);
+ const int excessivePos, const int transposedPos, int *nextLetters,
+ const int nextLettersSize);
void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word,
const int mInputLength, const int depth, const int snr, int *nextLetters,
- const int nextLettersSize, const int skipPos, const int freq);
+ const int nextLettersSize, const int skipPos, const int excessivePos,
+ const int transposedPos, const int freq);
void onTerminalWhenUserTypedLengthIsSameAsInputLength(unsigned short *word, const int depth,
- const int snr, const int skipPos, const int freq, const int addedWeight);
+ const int snr, const int skipPos, const int excessivePos, const int transposedPos,
+ const int freq, const int addedWeight);
bool needsToSkipCurrentNode(const unsigned short c,
const int inputIndex, const int skipPos, const int depth);
- int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos);
+ int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos,
+ const int excessivePos, const int transposedPos);
// Process a node by considering proximity, missing and excessive character
bool processCurrentNode(const int pos, const int depth,
const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
- const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
- const int nextLettersSize, int *newCount, int *newChildPosition,
+ const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
+ int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
int *nextSiblingPosition);
int getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word);