1 files changed, 30 insertions, 310 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index bccd37a61..f0bb384fb 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -24,9 +24,7 @@
 #include "dictionary.h"
 #include "unigram_dictionary.h"
 
-#ifdef NEW_DICTIONARY_FORMAT
 #include "binary_format.h"
-#endif // NEW_DICTIONARY_FORMAT
 
 namespace latinime {
 
@@ -39,28 +37,23 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
 UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
         int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
         const bool isLatestDictVersion)
-#ifndef NEW_DICTIONARY_FORMAT
-    : DICT_ROOT(streamStart),
-#else // NEW_DICTIONARY_FORMAT
     : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
-#endif // NEW_DICTIONARY_FORMAT
     MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
     MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
     TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
-#ifndef NEW_DICTIONARY_FORMAT
-    ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
-#else // NEW_DICTIONARY_FORMAT
       // TODO : remove this variable.
     ROOT_POS(0),
-#endif // NEW_DICTIONARY_FORMAT
     BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)),
     MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
     if (DEBUG_DICT) {
         LOGI("UnigramDictionary - constructor");
     }
+    mCorrectionState = new CorrectionState();
 }
 
-UnigramDictionary::~UnigramDictionary() {}
+UnigramDictionary::~UnigramDictionary() {
+    delete mCorrectionState;
+}
 
 static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
         const int MAX_PROXIMITY_CHARS) {
@@ -362,6 +355,8 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
         assert(excessivePos < mInputLength);
         assert(missingPos < mInputLength);
     }
+    mCorrectionState->setCorrectionParams(mProximityInfo, mInputLength, skipPos, excessivePos,
+            transposedPos);
     int rootPosition = ROOT_POS;
     // Get the number of children of root, then increment the position
     int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
@@ -389,8 +384,8 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
             // depth will never be greater than maxDepth because in that case,
             // needsToTraverseChildrenNodes should be false
             const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,
-                    maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
-                    excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
+                    maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs,
+                    nextLetters, nextLettersSize, mCorrectionState, &childCount,
                     &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
                     &siblingPos, &outputIndex);
             // Update next sibling pos
@@ -521,8 +516,12 @@ bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int s
 }
 
 inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
-        const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
-        const int freq, const bool sameLength) const {
+        const int matchWeight, const int freq, const bool sameLength,
+        CorrectionState *correctionState) const {
+    const int skipPos = correctionState->getSkipPos();
+    const int excessivePos = correctionState->getExcessivePos();
+    const int transposedPos = correctionState->getTransposedPos();
+
     // TODO: Demote by edit distance
     int finalFreq = freq * matchWeight;
     if (skipPos >= 0) {
@@ -587,16 +586,16 @@ inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
 
 inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth,
         const uint8_t* const root, const uint8_t flags, const int pos,
-        const int inputIndex, const int matchWeight, const int skipPos,
-        const int excessivePos, const int transposedPos, const int freq, const bool sameLength,
-        int* nextLetters, const int nextLettersSize) {
+        const int inputIndex, const int matchWeight, const int freq, const bool sameLength,
+        int* nextLetters, const int nextLettersSize, CorrectionState *correctionState) {
+    const int skipPos = correctionState->getSkipPos();
 
     const bool isSameAsTyped = sameLength ? mProximityInfo->sameAsTyped(word, depth + 1) : false;
     if (isSameAsTyped) return;
 
     if (depth >= MIN_SUGGEST_DEPTH) {
-        const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,
-                excessivePos, transposedPos, freq, sameLength);
+        const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight,
+                freq, sameLength, correctionState);
         if (!isSameAsTyped)
             addWord(word, depth + 1, finalFreq);
     }
@@ -647,282 +646,6 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
     return true;
 }
 
-#ifndef NEW_DICTIONARY_FORMAT
-// The following functions will be entirely replaced with new implementations.
-void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
-        const int excessivePos, const int transposedPos,int *nextLetters,
-        const int nextLettersSize) {
-    int initialPosition = initialPos;
-    const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);
-    getWordsRec(count, initialPosition, 0,
-            min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
-            mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
-            nextLettersSize);
-}
-
-void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
-        const int maxDepth, const bool traverseAllNodes, const int matchWeight,
-        const int inputIndex, const int diffs, const int skipPos, const int excessivePos,
-        const int transposedPos, int *nextLetters, const int nextLettersSize) {
-    int siblingPos = pos;
-    for (int i = 0; i < childrenCount; ++i) {
-        int newCount;
-        int newChildPosition;
-        bool newTraverseAllNodes;
-        int newMatchRate;
-        int newInputIndex;
-        int newDiffs;
-        int newSiblingPos;
-        int newOutputIndex;
-        const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
-                traverseAllNodes, matchWeight, inputIndex, diffs,
-                skipPos, excessivePos, transposedPos,
-                nextLetters, nextLettersSize,
-                &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
-                &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);
-        siblingPos = newSiblingPos;
-
-        if (needsToTraverseChildrenNodes) {
-            getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,
-                    newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
-                    nextLetters, nextLettersSize);
-        }
-    }
-}
-
-inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
-        const int inputLength, unsigned short *word) {
-    int pos = ROOT_POS;
-    int count = Dictionary::getCount(DICT_ROOT, &pos);
-    int maxFreq = 0;
-    int depth = 0;
-    unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
-    bool terminal = false;
-
-    mStackChildCount[0] = count;
-    mStackSiblingPos[0] = pos;
-
-    while (depth >= 0) {
-        if (mStackChildCount[depth] > 0) {
-            --mStackChildCount[depth];
-            int firstChildPos;
-            int newFreq;
-            int siblingPos = mStackSiblingPos[depth];
-            const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
-                    startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
-                    &siblingPos);
-            mStackSiblingPos[depth] = siblingPos;
-            if (depth == (inputLength - 1)) {
-                // Traverse sibling node
-                if (terminal) {
-                    if (newFreq > maxFreq) {
-                        for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
-                        if (DEBUG_DICT && DEBUG_NODE) {
-#ifdef FLAG_DBG
-                            char s[inputLength + 1];
-                            for (int i = 0; i < inputLength; ++i) s[i] = word[i];
-                            s[inputLength] = 0;
-                            LOGI("New missing space word found: %d > %d (%s), %d, %d",
-                                    newFreq, maxFreq, s, inputLength, depth);
-#endif
-                        }
-                        maxFreq = newFreq;
-                    }
-                }
-            } else if (needsToTraverseChildrenNodes) {
-                // Traverse children nodes
-                ++depth;
-                mStackChildCount[depth] = count;
-                mStackSiblingPos[depth] = firstChildPos;
-            }
-        } else {
-            // Traverse parent node
-            --depth;
-        }
-    }
-
-    word[inputLength] = 0;
-    return maxFreq;
-}
-
-inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
-        const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
-        int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
-    const int inputIndex = startInputIndex + depth;
-    unsigned short c;
-    *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
-            &c, newChildPosition, newTerminal, newFreq);
-    const unsigned int inputC = mProximityInfo->getPrimaryCharAt(inputIndex);
-    if (DEBUG_DICT) {
-        assert(inputC <= U_SHORT_MAX);
-    }
-    const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
-    const bool matched = (inputC == baseLowerC || inputC == c);
-    const bool hasChild = *newChildPosition != 0;
-    if (matched) {
-        word[depth] = c;
-        if (DEBUG_DICT && DEBUG_NODE) {
-            LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
-            if (*newTerminal) {
-                LOGI("Terminal %d", *newFreq);
-            }
-        }
-        if (hasChild) {
-            *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
-            return true;
-        } else {
-            return false;
-        }
-    } else {
-        // If this node is not user typed character, this method treats this word as unmatched.
-        // Thus newTerminal shouldn't be true.
-        *newTerminal = false;
-        return false;
-    }
-}
-
-// TODO: use uint32_t instead of unsigned short
-bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
-    if (IS_LATEST_DICT_VERSION) {
-        return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
-    } else {
-        return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
-    }
-}
-
-
-// Require strict exact match.
-int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
-        int length) const {
-    // returns address of bigram data of that word
-    // return -99 if not found
-
-    int count = Dictionary::getCount(DICT_ROOT, &pos);
-    unsigned short currentChar = (unsigned short) word[offset];
-    for (int j = 0; j < count; j++) {
-        unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
-        int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
-        int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
-        if (c == currentChar) {
-            if (offset == length - 1) {
-                if (terminal) {
-                    return (pos+1);
-                }
-            } else {
-                if (childPos != 0) {
-                    int t = getBigramPosition(childPos, word, offset + 1, length);
-                    if (t > 0) {
-                        return t;
-                    }
-                }
-            }
-        }
-        if (terminal) {
-            Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
-        }
-        // There could be two instances of each alphabet - upper and lower case. So continue
-        // looking ...
-    }
-    return NOT_VALID_WORD;
-}
-
-// The following functions will be modified.
-inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
-        const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
-        const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
-        int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
-        bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
-        int *nextSiblingPosition, int *nextOutputIndex) {
-    if (DEBUG_DICT) {
-        int inputCount = 0;
-        if (skipPos >= 0) ++inputCount;
-        if (excessivePos >= 0) ++inputCount;
-        if (transposedPos >= 0) ++inputCount;
-        assert(inputCount <= 1);
-    }
-    unsigned short c;
-    int childPosition;
-    bool terminal;
-    int freq;
-    bool isSameAsUserTypedLength = false;
-
-    const int pos = initialPos;
-    const int depth = initialDepth;
-    const int traverseAllNodes = initialTraverseAllNodes;
-    const int diffs = initialDiffs;
-
-    const uint8_t flags = 0; // No flags for now
-
-    if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
-
-    *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
-            &c, &childPosition, &terminal, &freq);
-    *nextOutputIndex = depth + 1;
-
-    const bool needsToTraverseChildrenNodes = childPosition != 0;
-
-    // If we are only doing traverseAllNodes, no need to look at the typed characters.
-    if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
-        mWord[depth] = c;
-        if (traverseAllNodes && terminal) {
-            onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
-                       excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
-        }
-        if (!needsToTraverseChildrenNodes) return false;
-        *newTraverseAllNodes = traverseAllNodes;
-        *newMatchRate = matchWeight;
-        *newDiffs = diffs;
-        *newInputIndex = inputIndex;
-    } else {
-        int inputIndexForProximity = inputIndex;
-
-        if (transposedPos >= 0) {
-            if (inputIndex == transposedPos) ++inputIndexForProximity;
-            if (inputIndex == (transposedPos + 1)) --inputIndexForProximity;
-        }
-
-        ProximityInfo::ProximityType matchedProximityCharId = mProximityInfo->getMatchedProximityId(
-                inputIndexForProximity, c, skipPos, excessivePos, transposedPos);
-        if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) return false;
-        mWord[depth] = c;
-        // If inputIndex is greater than mInputLength, that means there is no
-        // proximity chars. So, we don't need to check proximity.
-        if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
-            multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
-        }
-        bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
-                || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
-        if (isSameAsUserTypedLength && terminal) {
-            onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
-                    excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
-        }
-        if (!needsToTraverseChildrenNodes) return false;
-        // Start traversing all nodes after the index exceeds the user typed length
-        *newTraverseAllNodes = isSameAsUserTypedLength;
-        *newMatchRate = matchWeight;
-        *newDiffs = diffs
-                + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
-        *newInputIndex = inputIndex + 1;
-    }
-    // Optimization: Prune out words that are too long compared to how much was typed.
-    if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
-        return false;
-    }
-
-    // If inputIndex is greater than mInputLength, that means there are no proximity chars.
-    // TODO: Check if this can be isSameAsUserTypedLength only.
-    if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
-        *newTraverseAllNodes = true;
-    }
-    // get the count of nodes and increment childAddress.
-    *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
-    *newChildPosition = childPosition;
-    if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
-    return needsToTraverseChildrenNodes;
-}
-
-#else // NEW_DICTIONARY_FORMAT
-
 // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
 // interface.
 inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
@@ -1081,16 +804,15 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs
 // given level, as output into newCount when traversing this level's parent.
 inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
         const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
-        const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos,
-        int *nextLetters, const int nextLettersSize, int *newCount, int *newChildrenPosition,
+        const int initialDiffs, int *nextLetters, const int nextLettersSize,
+        CorrectionState *correctionState, int *newCount, int *newChildrenPosition,
         bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
         int *nextSiblingPosition, int *newOutputIndex) {
+    const int skipPos = correctionState->getSkipPos();
+    const int excessivePos = correctionState->getExcessivePos();
+    const int transposedPos = correctionState->getTransposedPos();
     if (DEBUG_DICT) {
-        int inputCount = 0;
-        if (skipPos >= 0) ++inputCount;
-        if (excessivePos >= 0) ++inputCount;
-        if (transposedPos >= 0) ++inputCount;
-        assert(inputCount <= 1);
+        correctionState->checkState();
     }
     int pos = initialPos;
     int depth = initialDepth;
@@ -1146,8 +868,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
                 // The frequency should be here, because we come here only if this is actually
                 // a terminal node, and we are on its last char.
                 const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
-                onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
-                           excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize);
+                onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight,
+                        freq, false, nextLetters, nextLettersSize, mCorrectionState);
             }
             if (!hasChildren) {
                 // If we don't have children here, that means we finished processing all
@@ -1170,7 +892,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
             }
 
             int matchedProximityCharId = mProximityInfo->getMatchedProximityId(
-                    inputIndexForProximity, c, skipPos, excessivePos, transposedPos);
+                    inputIndexForProximity, c, mCorrectionState);
             if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {
                 // We found that this is an unrelated character, so we should give up traversing
                 // this node and its children entirely.
@@ -1197,8 +919,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
                     || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
             if (isSameAsUserTypedLength && isTerminal) {
                 const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos);
-                onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos,
-                        excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize);
+                onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight,
+                        freq, true, nextLetters, nextLettersSize, mCorrectionState);
             }
             // This character matched the typed character (enough to traverse the node at least)
             // so we just evaluated it. Now we should evaluate this virtual node's children - that
@@ -1276,6 +998,4 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
     return true;
 }
 
-#endif // NEW_DICTIONARY_FORMAT
-
 } // namespace latinime