Merge "Suggest words with transposed chars"

author: satok <satok@google.com> 2010-12-10 01:38:51 -0800
committer: Android (Google) Code Review <android-gerrit@google.com> 2010-12-10 01:38:51 -0800
commit: f6a6429e10817c80a7c00f0f2acae12a8e31cb15 (patch)
tree: 702540328aedc7eb1f81f3ab2a0f289f69203147
parent: e26ef1bccddc942fdaeada3409c8e8ff18a35008 (diff)
parent: a3d78f606e8e764c22637299a58c27e195b4e1d3 (diff)
download: latinime-f6a6429e10817c80a7c00f0f2acae12a8e31cb15.tar.gz
latinime-f6a6429e10817c80a7c00f0f2acae12a8e31cb15.tar.xz
latinime-f6a6429e10817c80a7c00f0f2acae12a8e31cb15.zip
3 files changed, 110 insertions, 46 deletions
diff --git a/native/src/defines.h b/native/src/defines.h
index 98e93b93d..52191beea 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -24,13 +24,17 @@
 #define LOG_TAG "LatinIME: "
 #endif
 #define DEBUG_DICT true
-#define DEBUG_SHOW_FOUND_WORD false
-#define DEBUG_NODE true
+#define DEBUG_DICT_FULL true
+#define DEBUG_SHOW_FOUND_WORD DEBUG_DICT_FULL
+#define DEBUG_NODE DEBUG_DICT_FULL
+#define DEBUG_TRACE DEBUG_DICT_FULL
 #else // FLAG_DBG
 #define LOGI
 #define DEBUG_DICT false
+#define DEBUG_DICT_FULL false
 #define DEBUG_SHOW_FOUND_WORD false
 #define DEBUG_NODE false
+#define DEBUG_TRACE false
 #endif // FLAG_DBG
 
 #ifndef U_SHORT_MAX
@@ -58,6 +62,12 @@
 #define SUGGEST_WORDS_WITH_MISSING_CHARACTER true
 #define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true
 #define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
+#define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
+
+#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 75
+#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80
+#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
+#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
 
 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
 // This is only used for the size of array. Not to be used in c functions.
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 46332c74d..7ecf1c9c0 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -36,7 +36,7 @@ UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterM
     MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
     TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
     ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
-    LOGI("UnigramDictionary - constructor");
+    if (DEBUG_DICT) LOGI("UnigramDictionary - constructor");
 }
 
 UnigramDictionary::~UnigramDictionary() {}
@@ -45,26 +45,36 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
         int *frequencies, int *nextLetters, int nextLettersSize)
 {
     initSuggestions(codes, codesSize, outWords, frequencies);
-    getSuggestionCandidates(codesSize, -1, -1, nextLetters, nextLettersSize);
+    const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
+    getSuggestionCandidates(codesSize, -1, -1, -1, nextLetters, nextLettersSize, MAX_DEPTH);
 
     // Suggestion with missing character
     if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
         for (int i = 0; i < codesSize; ++i) {
             if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);
-            getSuggestionCandidates(codesSize, i, -1, NULL, 0);
+            getSuggestionCandidates(codesSize, i, -1, -1, NULL, 0, MAX_DEPTH);
         }
     }
 
     // Suggestion with excessive character
-    if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER) {
+    if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
         for (int i = 0; i < codesSize; ++i) {
             if (existsAdjacentProximityChars(i, codesSize)) {
                 if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);
-                getSuggestionCandidates(codesSize, -1, i, NULL, 0);
+                getSuggestionCandidates(codesSize, -1, i, -1, NULL, 0, MAX_DEPTH);
             }
         }
     }
 
+    // Suggestion with transposed characters
+    // Only suggest words that length is mInputLength
+    if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
+        for (int i = 0; i < codesSize; ++i) {
+            if (DEBUG_DICT) LOGI("--- Suggest transposed characters %d", i);
+            getSuggestionCandidates(codesSize, -1, -1, i, NULL, 0, mInputLength - 1);
+        }
+    }
+
     // Suggestions with missing space
     if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
         for (int i = 1; i < codesSize; ++i) {
@@ -187,10 +197,13 @@ static const char QUOTE = '\'';
 static const char SPACE = ' ';
 
 void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int skipPos,
-        const int excessivePos, int *nextLetters, const int nextLettersSize) {
-    if (DEBUG_DICT) LOGI("getSuggestionCandidates");
+        const int excessivePos, const int transposedPos, int *nextLetters,
+        const int nextLettersSize, const int maxDepth) {
+    if (DEBUG_DICT) LOGI("getSuggestionCandidates %d", maxDepth);
+    if (DEBUG_DICT) assert(transposedPos + 1 < inputLength);
+    if (DEBUG_DICT) assert(excessivePos < inputLength);
+    if (DEBUG_DICT) assert(missingPos < inputLength);
     int rootPosition = ROOT_POS;
-    const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
     // Get the number of child of root, then increment the position
     int childCount = Dictionary::getCount(DICT, &rootPosition);
     int depth = 0;
@@ -212,12 +225,12 @@ void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int
             int diffs = mStackDiffs[depth];
             int siblingPos = mStackSiblingPos[depth];
             int firstChildPos;
-            // depth will never be greater than MAX_DEPTH because in that case,
+            // depth will never be greater than maxDepth because in that case,
             // needsToTraverseChildrenNodes should be false
             const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,
-                    MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
-                    nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes,
-                    &snr, &inputIndex, &diffs, &siblingPos);
+                    maxDepth, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
+                    transposedPos, nextLetters, nextLettersSize, &childCount, &firstChildPos,
+                    &traverseAllNodes, &snr, &inputIndex, &diffs, &siblingPos);
             // Update next sibling pos
             mStackSiblingPos[depth] = siblingPos;
             if (needsToTraverseChildrenNodes) {
@@ -252,7 +265,7 @@ bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int mi
     }
 
     const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);
-    if (DEBUG_DICT) LOGI("Second freq:  %d", secondFreq);
+    if (DEBUG_DICT) LOGI("Second  freq:  %d", secondFreq);
     if (secondFreq <= 0) return false;
 
     word[missingSpacePos] = SPACE;
@@ -262,24 +275,27 @@ bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int mi
 
     int pairFreq = ((firstFreq + secondFreq) / 2);
     for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
+    pairFreq = pairFreq * WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE / 100;
     addWord(word, newWordLength, pairFreq);
     return true;
 }
 
 // Keep this for comparing spec to new getWords
 void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
-        const int excessivePos, int *nextLetters, const int nextLettersSize) {
+        const int excessivePos, const int transposedPos,int *nextLetters,
+        const int nextLettersSize) {
     int initialPosition = initialPos;
     const int count = Dictionary::getCount(DICT, &initialPosition);
     getWordsRec(count, initialPosition, 0,
             min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
-            mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);
+            mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
+            nextLettersSize);
 }
 
 void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
         const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
-        const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
-        const int nextLettersSize) {
+        const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
+        int *nextLetters, const int nextLettersSize) {
     int siblingPos = pos;
     for (int i = 0; i < childrenCount; ++i) {
         int newCount;
@@ -291,34 +307,50 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
         int newDiffs;
         int newSiblingPos;
         const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
-                traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, nextLetters,
-                nextLettersSize,
+                traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, transposedPos,
+                nextLetters, nextLettersSize,
                 &newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,
                 &newInputIndex, &newDiffs, &newSiblingPos);
         siblingPos = newSiblingPos;
 
         if (needsToTraverseChildrenNodes) {
             getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
-                    newSnr, newInputIndex, newDiffs, skipPos, excessivePos, nextLetters,
-                    nextLettersSize);
+                    newSnr, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
+                    nextLetters, nextLettersSize);
         }
     }
 }
 
 inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
         unsigned short *word, const int inputLength, const int depth, const int snr,
-        int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {
-    if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, freq * snr);
+        int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos,
+        const int transposedPos, const int freq) {
+    int finalFreq = freq * snr;
+    // TODO: Demote by edit distance
+    if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;
+    if (excessivePos >= 0) finalFreq = finalFreq
+            * WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;
+    if (transposedPos >= 0) finalFreq = finalFreq
+            * WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;
+
+    if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
     if (depth >= inputLength && skipPos < 0) {
         registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
     }
 }
 
 inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
-        unsigned short *word, const int depth, const int snr, const int skipPos, const int freq,
-        const int addedWeight) {
+        unsigned short *word, const int depth, const int snr, const int skipPos,
+        const int excessivePos, const int transposedPos, const int freq, const int addedWeight) {
     if (!sameAsTyped(word, depth + 1)) {
         int finalFreq = freq * snr * addedWeight;
+        // TODO: Demote by edit distance
+        if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;
+        if (excessivePos >= 0) finalFreq = finalFreq
+                * WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;
+        if (transposedPos >= 0) finalFreq = finalFreq
+                * WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;
+
         // Proximity collection will promote a word of the same length as
         // what user typed.
         if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
@@ -357,16 +389,18 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
 }
 
 inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
-        const unsigned short c, const int skipPos) {
+        const unsigned short c, const int skipPos, const int excessivePos,
+        const int transposedPos) {
     const unsigned short lowerC = toLowerCase(c);
     int j = 0;
     while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
         const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
         // If skipPos is defined, not to search proximity collections.
-        // First char is what user typed.
+        // First char is what user  typed.
         if (matched) {
             return j;
-        } else if (skipPos >= 0) {
+        } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
+            // Not to check proximity characters
             return -1;
         }
         ++j;
@@ -376,10 +410,17 @@ inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
 
 inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
         const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
-        const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
-        const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes,
-        int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition) {
-    if (DEBUG_DICT) assert(skipPos < 0 || excessivePos < 0);
+        const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
+        int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
+        bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
+        int *nextSiblingPosition) {
+    if (DEBUG_DICT) {
+        int inputCount = 0;
+        if (skipPos >= 0) ++inputCount;
+        if (excessivePos >= 0) ++inputCount;
+        if (transposedPos >= 0) ++inputCount;
+        assert(inputCount <= 1);
+    }
     unsigned short c;
     int childPosition;
     bool terminal;
@@ -397,7 +438,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
         mWord[depth] = c;
         if (traverseAllNodes && terminal) {
             onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,
-                    snr, nextLetters, nextLettersSize, skipPos, freq);
+                    snr, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos, freq);
         }
         if (!needsToTraverseChildrenNodes) return false;
         *newTraverseAllNodes = traverseAllNodes;
@@ -406,7 +447,14 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
         *newInputIndex = inputIndex;
     } else {
         int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
-        int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);
+
+        if (transposedPos >= 0) {
+            if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
+            if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
+        }
+
+        int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
+                transposedPos);
         if (matchedProximityCharId < 0) return false;
         mWord[depth] = c;
         // If inputIndex is greater than mInputLength, that means there is no
@@ -415,13 +463,13 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
         const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;
         if (isSameAsUserTypedLength && terminal) {
             onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,
-                    skipPos, freq, addedWeight);
+                    skipPos, excessivePos, transposedPos, freq, addedWeight);
         }
         if (!needsToTraverseChildrenNodes) return false;
         // Start traversing all nodes after the index exceeds the user typed length
         *newTraverseAllNodes = isSameAsUserTypedLength;
         *newSnr = snr * addedWeight;
-        *newDiffs = diffs + (matchedProximityCharId > 0);
+        *newDiffs = diffs + ((matchedProximityCharId > 0) ? 1 : 0);
         *newInputIndex = inputIndex + 1;
     }
     // Optimization: Prune out words that are too long compared to how much was typed.
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index f8af55c92..abfdb8d87 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -32,7 +32,8 @@ public:
 private:
     void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies);
     void getSuggestionCandidates(const int inputLength, const int skipPos, const int excessivePos,
-            int *nextLetters, const int nextLettersSize);
+            const int transposedPos, int *nextLetters, const int nextLettersSize,
+            const int maxDepth);
     void getVersionNumber();
     bool checkIfDictVersionIsLatest();
     int getAddress(int *pos);
@@ -43,25 +44,30 @@ private:
     unsigned short toLowerCase(unsigned short c);
     void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
             const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
-            const int skipPos, const int excessivePos, int *nextLetters, const int nextLettersSize);
+            const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
+            const int nextLettersSize);
     bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
     // Keep getWordsOld for comparing performance between getWords and getWordsOld
     void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
-            const int excessivePos, int *nextLetters, const int nextLettersSize);
+            const int excessivePos, const int transposedPos, int *nextLetters,
+            const int nextLettersSize);
     void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
     void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word,
             const int mInputLength, const int depth, const int snr, int *nextLetters,
-            const int nextLettersSize, const int skipPos, const int freq);
+            const int nextLettersSize, const int skipPos, const int excessivePos,
+            const int transposedPos, const int freq);
     void onTerminalWhenUserTypedLengthIsSameAsInputLength(unsigned short *word, const int depth,
-            const int snr, const int skipPos, const int freq, const int addedWeight);
+            const int snr, const int skipPos, const int excessivePos, const int transposedPos,
+            const int freq, const int addedWeight);
     bool needsToSkipCurrentNode(const unsigned short c,
             const int inputIndex, const int skipPos, const int depth);
-    int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos);
+    int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos,
+            const int excessivePos, const int transposedPos);
     // Process a node by considering proximity, missing and excessive character
     bool processCurrentNode(const int pos, const int depth,
             const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
-            const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
-            const int nextLettersSize, int *newCount, int *newChildPosition,
+            const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
+            int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
             bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
             int *nextSiblingPosition);
     int getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word);
author	satok <satok@google.com>	2010-12-10 01:38:51 -0800
committer	Android (Google) Code Review <android-gerrit@google.com>	2010-12-10 01:38:51 -0800
commit	f6a6429e10817c80a7c00f0f2acae12a8e31cb15 (patch)
tree	702540328aedc7eb1f81f3ab2a0f289f69203147
parent	e26ef1bccddc942fdaeada3409c8e8ff18a35008 (diff)
parent	a3d78f606e8e764c22637299a58c27e195b4e1d3 (diff)
download	latinime-f6a6429e10817c80a7c00f0f2acae12a8e31cb15.tar.gz latinime-f6a6429e10817c80a7c00f0f2acae12a8e31cb15.tar.xz latinime-f6a6429e10817c80a7c00f0f2acae12a8e31cb15.zip