New dict format, step 6

Copy the modified functions to be able to see the diff Bug: 4392433 Change-Id: Ic9b83b4b4b7b89cc922eed1825507d7d516aff24
author: Jean Chalard <jchalard@google.com> 2011-06-21 22:23:21 +0900
committer: Jean Chalard <jchalard@google.com> 2011-06-21 22:24:54 +0900
commit: 85a1d1ea749a70211fb25c43f3398461d7375da5 (patch)
tree: d1b24bcb6cf8aa37925472b1dbdf0079fb217643
parent: 839fb35f2bf2dfceaeff64524d51877c30023c4b (diff)
download: latinime-85a1d1ea749a70211fb25c43f3398461d7375da5.tar.gz
latinime-85a1d1ea749a70211fb25c43f3398461d7375da5.tar.xz
latinime-85a1d1ea749a70211fb25c43f3398461d7375da5.zip
1 files changed, 130 insertions, 0 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 36233b741..e3296f12a 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -992,6 +992,136 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
 }
 
 #else // NEW_DICTIONARY_FORMAT
+
+bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
+        const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,
+        const int secondWordLength, const bool isSpaceProximity) {
+    if (inputLength >= MAX_WORD_LENGTH) return false;
+    if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
+            || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
+        return false;
+    const int newWordLength = firstWordLength + secondWordLength + 1;
+    // Allocating variable length array on stack
+    unsigned short word[newWordLength];
+    const int firstFreq = getBestWordFreq(firstWordStartPos, firstWordLength, mWord);
+    if (DEBUG_DICT) {
+        LOGI("First freq: %d", firstFreq);
+    }
+    if (firstFreq <= 0) return false;
+
+    for (int i = 0; i < firstWordLength; ++i) {
+        word[i] = mWord[i];
+    }
+
+    const int secondFreq = getBestWordFreq(secondWordStartPos, secondWordLength, mWord);
+    if (DEBUG_DICT) {
+        LOGI("Second  freq:  %d", secondFreq);
+    }
+    if (secondFreq <= 0) return false;
+
+    word[firstWordLength] = SPACE;
+    for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
+        word[i] = mWord[i - firstWordLength - 1];
+    }
+
+    int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength,
+            secondWordLength, firstFreq, secondFreq, isSpaceProximity);
+    if (DEBUG_DICT) {
+        LOGI("Split two words:  %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength,
+                TYPED_LETTER_MULTIPLIER);
+    }
+    addWord(word, newWordLength, pairFreq);
+    return true;
+}
+
+inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
+        const int maxDepth, const bool traverseAllNodes, int matchWeight, int inputIndex,
+        const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
+        int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
+        bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
+        int *nextSiblingPosition, int *nextOutputIndex) {
+    if (DEBUG_DICT) {
+        int inputCount = 0;
+        if (skipPos >= 0) ++inputCount;
+        if (excessivePos >= 0) ++inputCount;
+        if (transposedPos >= 0) ++inputCount;
+        assert(inputCount <= 1);
+    }
+    unsigned short c;
+    int childPosition;
+    bool terminal;
+    int freq;
+    bool isSameAsUserTypedLength = false;
+
+    const uint8_t flags = 0; // No flags for now
+
+    if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
+
+    *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
+            &c, &childPosition, &terminal, &freq);
+    *nextOutputIndex = depth + 1;
+
+    const bool needsToTraverseChildrenNodes = childPosition != 0;
+
+    // If we are only doing traverseAllNodes, no need to look at the typed characters.
+    if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
+        mWord[depth] = c;
+        if (traverseAllNodes && terminal) {
+            onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
+                       excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
+        }
+        if (!needsToTraverseChildrenNodes) return false;
+        *newTraverseAllNodes = traverseAllNodes;
+        *newMatchRate = matchWeight;
+        *newDiffs = diffs;
+        *newInputIndex = inputIndex;
+    } else {
+        const int *currentChars = getInputCharsAt(inputIndex);
+
+        if (transposedPos >= 0) {
+            if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
+            if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
+        }
+
+        int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
+                transposedPos);
+        if (UNRELATED_CHAR == matchedProximityCharId) return false;
+        mWord[depth] = c;
+        // If inputIndex is greater than mInputLength, that means there is no
+        // proximity chars. So, we don't need to check proximity.
+        if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
+            multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
+        }
+        bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
+                || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
+        if (isSameAsUserTypedLength && terminal) {
+            onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
+                    excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
+        }
+        if (!needsToTraverseChildrenNodes) return false;
+        // Start traversing all nodes after the index exceeds the user typed length
+        *newTraverseAllNodes = isSameAsUserTypedLength;
+        *newMatchRate = matchWeight;
+        *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
+        *newInputIndex = inputIndex + 1;
+    }
+    // Optimization: Prune out words that are too long compared to how much was typed.
+    if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
+        return false;
+    }
+
+    // If inputIndex is greater than mInputLength, that means there are no proximity chars.
+    // TODO: Check if this can be isSameAsUserTypedLength only.
+    if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
+        *newTraverseAllNodes = true;
+    }
+    // get the count of nodes and increment childAddress.
+    *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
+    *newChildPosition = childPosition;
+    if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
+    return needsToTraverseChildrenNodes;
+}
+
 #endif // NEW_DICTIONARY_FORMAT
 
 } // namespace latinime
author	Jean Chalard <jchalard@google.com>	2011-06-21 22:23:21 +0900
committer	Jean Chalard <jchalard@google.com>	2011-06-21 22:24:54 +0900
commit	85a1d1ea749a70211fb25c43f3398461d7375da5 (patch)
tree	d1b24bcb6cf8aa37925472b1dbdf0079fb217643
parent	839fb35f2bf2dfceaeff64524d51877c30023c4b (diff)
download	latinime-85a1d1ea749a70211fb25c43f3398461d7375da5.tar.gz latinime-85a1d1ea749a70211fb25c43f3398461d7375da5.tar.xz latinime-85a1d1ea749a70211fb25c43f3398461d7375da5.zip