diff options
-rw-r--r-- | native/Android.mk | 4 | ||||
-rw-r--r-- | native/src/correction_state.cpp | 52 | ||||
-rw-r--r-- | native/src/correction_state.h | 52 | ||||
-rw-r--r-- | native/src/proximity_info.cpp | 8 | ||||
-rw-r--r-- | native/src/proximity_info.h | 5 | ||||
-rw-r--r-- | native/src/unigram_dictionary.cpp | 340 | ||||
-rw-r--r-- | native/src/unigram_dictionary.h | 47 |
7 files changed, 156 insertions, 352 deletions
diff --git a/native/Android.mk b/native/Android.mk index bc246a990..04819e456 100644 --- a/native/Android.mk +++ b/native/Android.mk @@ -8,15 +8,13 @@ LOCAL_CFLAGS += -Werror -Wall # To suppress compiler warnings for unused variables/functions used for debug features etc. LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function -# Use the new dictionary format -LOCAL_CFLAGS += -DNEW_DICTIONARY_FORMAT - LOCAL_SRC_FILES := \ jni/com_android_inputmethod_keyboard_ProximityInfo.cpp \ jni/com_android_inputmethod_latin_BinaryDictionary.cpp \ jni/jni_common.cpp \ src/bigram_dictionary.cpp \ src/char_utils.cpp \ + src/correction_state.cpp \ src/dictionary.cpp \ src/proximity_info.cpp \ src/unigram_dictionary.cpp diff --git a/native/src/correction_state.cpp b/native/src/correction_state.cpp new file mode 100644 index 000000000..aa5efce40 --- /dev/null +++ b/native/src/correction_state.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <assert.h> +#include <stdio.h> +#include <string.h> + +#define LOG_TAG "LatinIME: correction_state.cpp" + +#include "correction_state.h" + +namespace latinime { + +CorrectionState::CorrectionState() { +} + +void CorrectionState::setCorrectionParams(const ProximityInfo *pi, const int inputLength, + const int skipPos, const int excessivePos, const int transposedPos) { + mProximityInfo = pi; + mSkipPos = skipPos; + mExcessivePos = excessivePos; + mTransposedPos = transposedPos; +} + +void CorrectionState::checkState() { + if (DEBUG_DICT) { + int inputCount = 0; + if (mSkipPos >= 0) ++inputCount; + if (mExcessivePos >= 0) ++inputCount; + if (mTransposedPos >= 0) ++inputCount; + // TODO: remove this assert + assert(inputCount <= 1); + } +} + +CorrectionState::~CorrectionState() { +} + +} // namespace latinime diff --git a/native/src/correction_state.h b/native/src/correction_state.h new file mode 100644 index 000000000..5b7392590 --- /dev/null +++ b/native/src/correction_state.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_CORRECTION_STATE_H +#define LATINIME_CORRECTION_STATE_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class ProximityInfo; + +class CorrectionState { +public: + CorrectionState(); + void setCorrectionParams(const ProximityInfo *pi, const int inputLength, const int skipPos, + const int excessivePos, const int transposedPos); + void checkState(); + virtual ~CorrectionState(); + int getSkipPos() const { + return mSkipPos; + } + int getExcessivePos() const { + return mExcessivePos; + } + int getTransposedPos() const { + return mTransposedPos; + } +private: + const ProximityInfo *mProximityInfo; + int mInputLength; + int mSkipPos; + int mExcessivePos; + int mTransposedPos; +}; +} // namespace latinime +#endif // LATINIME_CORRECTION_STATE_H diff --git a/native/src/proximity_info.cpp b/native/src/proximity_info.cpp index c45393f18..bed92cf9e 100644 --- a/native/src/proximity_info.cpp +++ b/native/src/proximity_info.cpp @@ -78,7 +78,7 @@ unsigned short ProximityInfo::getPrimaryCharAt(const int index) const { return getProximityCharsAt(index)[0]; } -bool ProximityInfo::existsCharInProximityAt(const int index, const int c) const { +inline bool ProximityInfo::existsCharInProximityAt(const int index, const int c) const { const int *chars = getProximityCharsAt(index); int i = 0; while (chars[i] > 0 && i < MAX_PROXIMITY_CHARS_SIZE) { @@ -114,8 +114,10 @@ bool ProximityInfo::existsAdjacentProximityChars(const int index) const { // in their list. The non-accented version of the character should be considered // "close", but not the other keys close to the non-accented version. 
ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId( - const int index, const unsigned short c, const int skipPos, - const int excessivePos, const int transposedPos) const { + const int index, const unsigned short c, CorrectionState *correctionState) const { + const int skipPos = correctionState->getSkipPos(); + const int excessivePos = correctionState->getExcessivePos(); + const int transposedPos = correctionState->getTransposedPos(); const int *currentChars = getProximityCharsAt(index); const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); diff --git a/native/src/proximity_info.h b/native/src/proximity_info.h index 435a60151..b28191d01 100644 --- a/native/src/proximity_info.h +++ b/native/src/proximity_info.h @@ -23,6 +23,8 @@ namespace latinime { +class CorrectionState; + class ProximityInfo { public: typedef enum { // Used as a return value for character comparison @@ -42,8 +44,7 @@ public: bool existsCharInProximityAt(const int index, const int c) const; bool existsAdjacentProximityChars(const int index) const; ProximityType getMatchedProximityId( - const int index, const unsigned short c, const int skipPos, - const int excessivePos, const int transposedPos) const; + const int index, const unsigned short c, CorrectionState *correctionState) const; bool sameAsTyped(const unsigned short *word, int length) const; private: int getStartIndexFromCoordinates(const int x, const int y) const; diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index bccd37a61..f0bb384fb 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -24,9 +24,7 @@ #include "dictionary.h" #include "unigram_dictionary.h" -#ifdef NEW_DICTIONARY_FORMAT #include "binary_format.h" -#endif // NEW_DICTIONARY_FORMAT namespace latinime { @@ -39,28 +37,23 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int 
typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion) -#ifndef NEW_DICTIONARY_FORMAT - : DICT_ROOT(streamStart), -#else // NEW_DICTIONARY_FORMAT : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE), -#endif // NEW_DICTIONARY_FORMAT MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion), TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), -#ifndef NEW_DICTIONARY_FORMAT - ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0), -#else // NEW_DICTIONARY_FORMAT // TODO : remove this variable. ROOT_POS(0), -#endif // NEW_DICTIONARY_FORMAT BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)), MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) { if (DEBUG_DICT) { LOGI("UnigramDictionary - constructor"); } + mCorrectionState = new CorrectionState(); } -UnigramDictionary::~UnigramDictionary() {} +UnigramDictionary::~UnigramDictionary() { + delete mCorrectionState; +} static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize, const int MAX_PROXIMITY_CHARS) { @@ -362,6 +355,8 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, assert(excessivePos < mInputLength); assert(missingPos < mInputLength); } + mCorrectionState->setCorrectionParams(mProximityInfo, mInputLength, skipPos, excessivePos, + transposedPos); int rootPosition = ROOT_POS; // Get the number of children of root, then increment the position int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); @@ -389,8 +384,8 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, // depth will never be greater than maxDepth because in that case, // needsToTraverseChildrenNodes should be false const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex, - maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos, - 
excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount, + maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, + nextLetters, nextLettersSize, mCorrectionState, &childCount, &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs, &siblingPos, &outputIndex); // Update next sibling pos @@ -521,8 +516,12 @@ bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int s } inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth, - const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos, - const int freq, const bool sameLength) const { + const int matchWeight, const int freq, const bool sameLength, + CorrectionState *correctionState) const { + const int skipPos = correctionState->getSkipPos(); + const int excessivePos = correctionState->getExcessivePos(); + const int transposedPos = correctionState->getTransposedPos(); + // TODO: Demote by edit distance int finalFreq = freq * matchWeight; if (skipPos >= 0) { @@ -587,16 +586,16 @@ inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c, inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth, const uint8_t* const root, const uint8_t flags, const int pos, - const int inputIndex, const int matchWeight, const int skipPos, - const int excessivePos, const int transposedPos, const int freq, const bool sameLength, - int* nextLetters, const int nextLettersSize) { + const int inputIndex, const int matchWeight, const int freq, const bool sameLength, + int* nextLetters, const int nextLettersSize, CorrectionState *correctionState) { + const int skipPos = correctionState->getSkipPos(); const bool isSameAsTyped = sameLength ? 
mProximityInfo->sameAsTyped(word, depth + 1) : false; if (isSameAsTyped) return; if (depth >= MIN_SUGGEST_DEPTH) { - const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos, - excessivePos, transposedPos, freq, sameLength); + const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, + freq, sameLength, correctionState); if (!isSameAsTyped) addWord(word, depth + 1, finalFreq); } @@ -647,282 +646,6 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, return true; } -#ifndef NEW_DICTIONARY_FORMAT -// The following functions will be entirely replaced with new implementations. -void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos, - const int excessivePos, const int transposedPos,int *nextLetters, - const int nextLettersSize) { - int initialPosition = initialPos; - const int count = Dictionary::getCount(DICT_ROOT, &initialPosition); - getWordsRec(count, initialPosition, 0, - min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH), - mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters, - nextLettersSize); -} - -void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth, - const int maxDepth, const bool traverseAllNodes, const int matchWeight, - const int inputIndex, const int diffs, const int skipPos, const int excessivePos, - const int transposedPos, int *nextLetters, const int nextLettersSize) { - int siblingPos = pos; - for (int i = 0; i < childrenCount; ++i) { - int newCount; - int newChildPosition; - bool newTraverseAllNodes; - int newMatchRate; - int newInputIndex; - int newDiffs; - int newSiblingPos; - int newOutputIndex; - const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth, - traverseAllNodes, matchWeight, inputIndex, diffs, - skipPos, excessivePos, transposedPos, - nextLetters, nextLettersSize, - &newCount, &newChildPosition, &newTraverseAllNodes, 
&newMatchRate, - &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex); - siblingPos = newSiblingPos; - - if (needsToTraverseChildrenNodes) { - getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes, - newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos, - nextLetters, nextLettersSize); - } - } -} - -inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, - const int inputLength, unsigned short *word) { - int pos = ROOT_POS; - int count = Dictionary::getCount(DICT_ROOT, &pos); - int maxFreq = 0; - int depth = 0; - unsigned short newWord[MAX_WORD_LENGTH_INTERNAL]; - bool terminal = false; - - mStackChildCount[0] = count; - mStackSiblingPos[0] = pos; - - while (depth >= 0) { - if (mStackChildCount[depth] > 0) { - --mStackChildCount[depth]; - int firstChildPos; - int newFreq; - int siblingPos = mStackSiblingPos[depth]; - const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos, - startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq, - &siblingPos); - mStackSiblingPos[depth] = siblingPos; - if (depth == (inputLength - 1)) { - // Traverse sibling node - if (terminal) { - if (newFreq > maxFreq) { - for (int i = 0; i < inputLength; ++i) word[i] = newWord[i]; - if (DEBUG_DICT && DEBUG_NODE) { -#ifdef FLAG_DBG - char s[inputLength + 1]; - for (int i = 0; i < inputLength; ++i) s[i] = word[i]; - s[inputLength] = 0; - LOGI("New missing space word found: %d > %d (%s), %d, %d", - newFreq, maxFreq, s, inputLength, depth); -#endif - } - maxFreq = newFreq; - } - } - } else if (needsToTraverseChildrenNodes) { - // Traverse children nodes - ++depth; - mStackChildCount[depth] = count; - mStackSiblingPos[depth] = firstChildPos; - } - } else { - // Traverse parent node - --depth; - } - } - - word[inputLength] = 0; - return maxFreq; -} - -inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos, - const int 
startInputIndex, const int depth, unsigned short *word, int *newChildPosition, - int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) { - const int inputIndex = startInputIndex + depth; - unsigned short c; - *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos, - &c, newChildPosition, newTerminal, newFreq); - const unsigned int inputC = mProximityInfo->getPrimaryCharAt(inputIndex); - if (DEBUG_DICT) { - assert(inputC <= U_SHORT_MAX); - } - const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); - const bool matched = (inputC == baseLowerC || inputC == c); - const bool hasChild = *newChildPosition != 0; - if (matched) { - word[depth] = c; - if (DEBUG_DICT && DEBUG_NODE) { - LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq); - if (*newTerminal) { - LOGI("Terminal %d", *newFreq); - } - } - if (hasChild) { - *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition); - return true; - } else { - return false; - } - } else { - // If this node is not user typed character, this method treats this word as unmatched. - // Thus newTerminal shouldn't be true. - *newTerminal = false; - return false; - } -} - -// TODO: use uint32_t instead of unsigned short -bool UnigramDictionary::isValidWord(unsigned short *word, int length) { - if (IS_LATEST_DICT_VERSION) { - return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD); - } else { - return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD); - } -} - - -// Require strict exact match. 
-int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset, - int length) const { - // returns address of bigram data of that word - // return -99 if not found - - int count = Dictionary::getCount(DICT_ROOT, &pos); - unsigned short currentChar = (unsigned short) word[offset]; - for (int j = 0; j < count; j++) { - unsigned short c = Dictionary::getChar(DICT_ROOT, &pos); - int terminal = Dictionary::getTerminal(DICT_ROOT, &pos); - int childPos = Dictionary::getAddress(DICT_ROOT, &pos); - if (c == currentChar) { - if (offset == length - 1) { - if (terminal) { - return (pos+1); - } - } else { - if (childPos != 0) { - int t = getBigramPosition(childPos, word, offset + 1, length); - if (t > 0) { - return t; - } - } - } - } - if (terminal) { - Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos); - } - // There could be two instances of each alphabet - upper and lower case. So continue - // looking ... - } - return NOT_VALID_WORD; -} - -// The following functions will be modified. 
-inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth, - const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex, - const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos, - int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition, - bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs, - int *nextSiblingPosition, int *nextOutputIndex) { - if (DEBUG_DICT) { - int inputCount = 0; - if (skipPos >= 0) ++inputCount; - if (excessivePos >= 0) ++inputCount; - if (transposedPos >= 0) ++inputCount; - assert(inputCount <= 1); - } - unsigned short c; - int childPosition; - bool terminal; - int freq; - bool isSameAsUserTypedLength = false; - - const int pos = initialPos; - const int depth = initialDepth; - const int traverseAllNodes = initialTraverseAllNodes; - const int diffs = initialDiffs; - - const uint8_t flags = 0; // No flags for now - - if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex; - - *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos, - &c, &childPosition, &terminal, &freq); - *nextOutputIndex = depth + 1; - - const bool needsToTraverseChildrenNodes = childPosition != 0; - - // If we are only doing traverseAllNodes, no need to look at the typed characters. 
- if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) { - mWord[depth] = c; - if (traverseAllNodes && terminal) { - onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, - excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize); - } - if (!needsToTraverseChildrenNodes) return false; - *newTraverseAllNodes = traverseAllNodes; - *newMatchRate = matchWeight; - *newDiffs = diffs; - *newInputIndex = inputIndex; - } else { - int inputIndexForProximity = inputIndex; - - if (transposedPos >= 0) { - if (inputIndex == transposedPos) ++inputIndexForProximity; - if (inputIndex == (transposedPos + 1)) --inputIndexForProximity; - } - - ProximityInfo::ProximityType matchedProximityCharId = mProximityInfo->getMatchedProximityId( - inputIndexForProximity, c, skipPos, excessivePos, transposedPos); - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) return false; - mWord[depth] = c; - // If inputIndex is greater than mInputLength, that means there is no - // proximity chars. So, we don't need to check proximity. - if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { - multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight); - } - bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 - || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2); - if (isSameAsUserTypedLength && terminal) { - onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, - excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize); - } - if (!needsToTraverseChildrenNodes) return false; - // Start traversing all nodes after the index exceeds the user typed length - *newTraverseAllNodes = isSameAsUserTypedLength; - *newMatchRate = matchWeight; - *newDiffs = diffs - + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 
1 : 0); - *newInputIndex = inputIndex + 1; - } - // Optimization: Prune out words that are too long compared to how much was typed. - if (depth >= maxDepth || *newDiffs > mMaxEditDistance) { - return false; - } - - // If inputIndex is greater than mInputLength, that means there are no proximity chars. - // TODO: Check if this can be isSameAsUserTypedLength only. - if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) { - *newTraverseAllNodes = true; - } - // get the count of nodes and increment childAddress. - *newCount = Dictionary::getCount(DICT_ROOT, &childPosition); - *newChildPosition = childPosition; - if (DEBUG_DICT) assert(needsToTraverseChildrenNodes); - return needsToTraverseChildrenNodes; -} - -#else // NEW_DICTIONARY_FORMAT - // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous // interface. inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, @@ -1081,16 +804,15 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // given level, as output into newCount when traversing this level's parent. 
inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth, const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex, - const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos, - int *nextLetters, const int nextLettersSize, int *newCount, int *newChildrenPosition, + const int initialDiffs, int *nextLetters, const int nextLettersSize, + CorrectionState *correctionState, int *newCount, int *newChildrenPosition, bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs, int *nextSiblingPosition, int *newOutputIndex) { + const int skipPos = correctionState->getSkipPos(); + const int excessivePos = correctionState->getExcessivePos(); + const int transposedPos = correctionState->getTransposedPos(); if (DEBUG_DICT) { - int inputCount = 0; - if (skipPos >= 0) ++inputCount; - if (excessivePos >= 0) ++inputCount; - if (transposedPos >= 0) ++inputCount; - assert(inputCount <= 1); + correctionState->checkState(); } int pos = initialPos; int depth = initialDepth; @@ -1146,8 +868,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in // The frequency should be here, because we come here only if this is actually // a terminal node, and we are on its last char. 
const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); - onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, - excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize); + onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, + freq, false, nextLetters, nextLettersSize, mCorrectionState); } if (!hasChildren) { // If we don't have children here, that means we finished processing all @@ -1170,7 +892,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in } int matchedProximityCharId = mProximityInfo->getMatchedProximityId( - inputIndexForProximity, c, skipPos, excessivePos, transposedPos); + inputIndexForProximity, c, mCorrectionState); if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { // We found that this is an unrelated character, so we should give up traversing // this node and its children entirely. @@ -1197,8 +919,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2); if (isSameAsUserTypedLength && isTerminal) { const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); - onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, - excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize); + onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, + freq, true, nextLetters, nextLettersSize, mCorrectionState); } // This character matched the typed character (enough to traverse the node at least) // so we just evaluated it. 
Now we should evaluate this virtual node's children - that @@ -1276,6 +998,4 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in return true; } -#endif // NEW_DICTIONARY_FORMAT - } // namespace latinime diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index 97198ef13..41e381860 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -18,6 +18,7 @@ #define LATINIME_UNIGRAM_DICTIONARY_H #include <stdint.h> +#include "correction_state.h" #include "defines.h" #include "proximity_info.h" @@ -30,7 +31,6 @@ namespace latinime { class UnigramDictionary { public: -#ifdef NEW_DICTIONARY_FORMAT // Mask and flags for children address type selection. static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; @@ -62,21 +62,16 @@ public: static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; -#endif // NEW_DICTIONARY_FORMAT UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion); -#ifndef NEW_DICTIONARY_FORMAT - bool isValidWord(unsigned short *word, int length); -#else // NEW_DICTIONARY_FORMAT bool isValidWord(const uint16_t* const inWord, const int length) const; -#endif // NEW_DICTIONARY_FORMAT int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, const int flags, unsigned short *outWords, int *frequencies); - ~UnigramDictionary(); + virtual ~UnigramDictionary(); private: void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, @@ -99,42 +94,25 @@ private: const int secondWordStartPos, const int secondWordLength, const bool isSpaceProximity); bool 
getMissingSpaceWords(const int inputLength, const int missingSpacePos); bool getMistypedSpaceWords(const int inputLength, const int spaceProximityPos); - int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos, - const int excessivePos, const int transposedPos, const int freq, - const bool sameLength) const; + int calculateFinalFreq(const int inputIndex, const int depth, const int snr, + const int freq, const bool sameLength, CorrectionState *correctionState) const; void onTerminal(unsigned short int* word, const int depth, const uint8_t* const root, const uint8_t flags, const int pos, - const int inputIndex, const int matchWeight, const int skipPos, - const int excessivePos, const int transposedPos, const int freq, const bool sameLength, - int *nextLetters, const int nextLettersSize); + const int inputIndex, const int matchWeight, const int freq, const bool sameLength, + int* nextLetters, const int nextLettersSize, CorrectionState *correctionState); bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); // Process a node by considering proximity, missing and excessive character bool processCurrentNode(const int initialPos, const int initialDepth, - const int maxDepth, const bool initialTraverseAllNodes, const int snr, int inputIndex, - const int initialDiffs, const int skipPos, const int excessivePos, - const int transposedPos, int *nextLetters, const int nextLettersSize, int *newCount, - int *newChildPosition, bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, - int *newDiffs, int *nextSiblingPosition, int *nextOutputIndex); + const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex, + const int initialDiffs, int *nextLetters, const int nextLettersSize, + CorrectionState *correctionState, int *newCount, int *newChildPosition, + bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs, + int 
*nextSiblingPosition, int *nextOutputIndex); int getMostFrequentWordLike(const int startInputIndex, const int inputLength, unsigned short *word); -#ifndef NEW_DICTIONARY_FORMAT - void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth, - const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs, - const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, - const int nextLettersSize); - // Keep getWordsOld for comparing performance between getWords and getWordsOld - void getWordsOld(const int initialPos, const int inputLength, const int skipPos, - const int excessivePos, const int transposedPos, int *nextLetters, - const int nextLettersSize); - // Process a node by considering missing space - bool processCurrentNodeForExactMatch(const int firstChildPos, - const int startInputIndex, const int depth, unsigned short *word, - int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos); -#else // NEW_DICTIONARY_FORMAT int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, short unsigned int* outWord); -#endif // NEW_DICTIONARY_FORMAT const uint8_t* const DICT_ROOT; const int MAX_WORD_LENGTH; @@ -158,7 +136,8 @@ private: int *mFrequencies; unsigned short *mOutputChars; - const ProximityInfo *mProximityInfo; + ProximityInfo *mProximityInfo; + CorrectionState *mCorrectionState; int mInputLength; // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; |