diff options
Diffstat (limited to 'native/jni/src/unigram_dictionary.cpp')
-rw-r--r-- | native/jni/src/unigram_dictionary.cpp | 83 |
1 files changed, 41 insertions, 42 deletions
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index d4c51df63..b7e245a6c 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -58,12 +58,12 @@ UnigramDictionary::~UnigramDictionary() { } static inline unsigned int getCodesBufferSize(const int *codes, const int codesSize) { - return sizeof(*codes) * codesSize; + return static_cast<unsigned int>(sizeof(*codes)) * codesSize; } // TODO: This needs to take a const unsigned short* and not tinker with its contents -static inline void addWord( - unsigned short *word, int length, int frequency, WordsPriorityQueue *queue, int type) { +static inline void addWord(unsigned short *word, int length, int frequency, + WordsPriorityQueue *queue, int type) { queue->push(frequency, word, length, type); } @@ -106,7 +106,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit WordsPriorityQueuePool *queuePool, const digraph_t *const digraphs, const unsigned int digraphsSize) const { - const int startIndex = codesDest - codesBuffer; + const int startIndex = static_cast<int>(codesDest - codesBuffer); if (currentDepth < MAX_DIGRAPH_SEARCH_DEPTH) { for (int i = 0; i < codesRemain; ++i) { xCoordinatesBuffer[startIndex + i] = xcoordinates[codesBufferSize - codesRemain + i]; @@ -170,8 +170,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit // bigramMap contains the association <bigram address> -> <bigram frequency> // bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter // in bigram_dictionary.cpp -int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, - const int *xcoordinates, +int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const bool useFullEditDistance, unsigned short *outWords, int *frequencies, @@ -597,11 +596,10 @@ int UnigramDictionary::getSubStringSuggestion( void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const bool useFullEditDistance, const int inputSize, - Correction *correction, WordsPriorityQueuePool *queuePool, - const bool hasAutoCorrectionCandidate, const int startInputPos, const int startWordIndex, - const int outputWordLength, int *freqArray, int *wordLengthArray, - unsigned short *outputWord) const { + const bool useFullEditDistance, const int inputSize, Correction *correction, + WordsPriorityQueuePool *queuePool, const bool hasAutoCorrectionCandidate, + const int startInputPos, const int startWordIndex, const int outputWordLength, + int *freqArray, int *wordLengthArray, unsigned short *outputWord) const { if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) { // Return if the last word index return; @@ -641,7 +639,7 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, // Missing space inputWordStartPos = i; inputWordLength = inputSize - i; - if(getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, + if (getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, correction, queuePool, inputSize, hasAutoCorrectionCandidate, startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength, false /* missing space */, freqArray, wordLengthArray, outputWord, 0) @@ -724,13 +722,13 @@ inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, // In and out parameters may point to the same location. This function takes care // not to use any input parameters after it wrote into its outputs. static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, - const uint8_t *const root, const int startPos, - const uint16_t *const inWord, const int startInputIndex, - int32_t *outNewWord, int *outInputIndex, int *outPos) { + const uint8_t *const root, const int startPos, const uint16_t *const inWord, + const int startInputIndex, const int inputSize, int32_t *outNewWord, int *outInputIndex, + int *outPos) { const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); int pos = startPos; - int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - int32_t baseChar = toBaseLowerCase(character); + int32_t codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); + int32_t baseChar = toBaseLowerCase(codePoint); const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]); if (baseChar != wChar) { @@ -739,18 +737,18 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, return false; } int inputIndex = startInputIndex; - outNewWord[inputIndex] = character; + outNewWord[inputIndex] = codePoint; if (hasMultipleChars) { - character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - while (NOT_A_CHARACTER != character) { - baseChar = toBaseLowerCase(character); - if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) { + codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); + while (NOT_A_CODE_POINT != codePoint) { + baseChar = toBaseLowerCase(codePoint); + if (inputIndex + 1 >= inputSize || toBaseLowerCase(inWord[++inputIndex]) != baseChar) { *outPos = BinaryFormat::skipOtherCharacters(root, pos); *outInputIndex = startInputIndex; return false; } - outNewWord[inputIndex] = character; - character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + outNewWord[inputIndex] = codePoint; + codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); } } *outInputIndex = inputIndex + 1; @@ -765,8 +763,9 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, static inline void onTerminalWordLike(const int freq, int32_t *newWord, const int length, short unsigned int *outWord, int *maxFreq) { if (freq > *maxFreq) { - for (int q = 0; q < length; ++q) + for (int q = 0; q < length; ++q) { outWord[q] = newWord[q]; + } outWord[length] = 0; *maxFreq = freq; } @@ -775,7 +774,7 @@ static inline void onTerminalWordLike(const int freq, int32_t *newWord, const in // Will find the highest frequency of the words like the one passed as an argument, // that is, everything that only differs by case/accents. int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t *const inWord, - const int length, short unsigned int *outWord) const { + const int inputSize, short unsigned int *outWord) const { int32_t newWord[MAX_WORD_LENGTH_INTERNAL]; int depth = 0; int maxFreq = -1; @@ -795,12 +794,12 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t *const inWord int inputIndex = stackInputIndex[depth]; const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); // Test whether all chars in this group match with the word we are searching for. If so, - // we want to traverse its children (or if the length match, evaluate its frequency). + // we want to traverse its children (or if the inputSize match, evaluate its frequency). // Note that this function will output the position regardless, but will only write // into inputIndex if there is a match. const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord, - inputIndex, newWord, &inputIndex, &pos); - if (isAlike && (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == length)) { + inputIndex, inputSize, newWord, &inputIndex, &pos); + if (isAlike && (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == inputSize)) { const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq); } @@ -809,8 +808,8 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t *const inWord const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos); // If we had a match and the word has children, we want to traverse them. We don't have // to traverse words longer than the one we are searching for, since they will not match - // anyway, so don't traverse unless inputIndex < length. - if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) { + // anyway, so don't traverse unless inputIndex < inputSize. + if (isAlike && (-1 != childrenNodePos) && (inputIndex < inputSize)) { // Save position for this depth, to get back to this once children are done stackChildCount[depth] = charGroupIndex; stackSiblingPos[depth] = siblingPos; @@ -853,7 +852,7 @@ int UnigramDictionary::getFrequency(const int32_t *const inWord, const int lengt if (hasMultipleChars) { pos = BinaryFormat::skipOtherCharacters(root, pos); } else { - BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); + BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); } const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); return unigramFreq; @@ -907,23 +906,23 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, // else if FLAG_IS_TERMINAL: the frequency // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address // Note that you can't have a node that both is not a terminal and has no children. - int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); - assert(NOT_A_CHARACTER != c); + int32_t c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); + assert(NOT_A_CODE_POINT != c); // We are going to loop through each character and make it look like it's a different // node each time. To do that, we will process characters in this node in order until - // we find the character terminator. This is signalled by getCharCode* returning - // NOT_A_CHARACTER. + // we find the character terminator. This is signalled by getCodePoint* returning + // NOT_A_CODE_POINT. // As a special case, if there is only one character in this node, we must not read the - // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags. + // next bytes so we will simulate the NOT_A_CODE_POINT return by testing the flags. // This way, each loop run will look like a "virtual node". do { // We prefetch the next char. If 'c' is the last char of this node, we will have - // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node + // NOT_A_CODE_POINT in the next char. From this we can decide whether this virtual node // should behave as a terminal or not and whether we have children. const int32_t nextc = hasMultipleChars - ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER; - const bool isLastChar = (NOT_A_CHARACTER == nextc); + ? BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CODE_POINT; + const bool isLastChar = (NOT_A_CODE_POINT == nextc); // If there are more chars in this nodes, then this virtual node is not a terminal. // If we are on the last char, this virtual node is a terminal if this node is. const bool isTerminal = isLastChar && isTerminalNode; @@ -952,9 +951,9 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, // Prepare for the next character. Promote the prefetched char to current char - the loop // will take care of prefetching the next. If we finally found our last char, nextc will - // contain NOT_A_CHARACTER. + // contain NOT_A_CODE_POINT. c = nextc; - } while (NOT_A_CHARACTER != c); + } while (NOT_A_CODE_POINT != c); if (isTerminalNode) { // The frequency should be here, because we come here only if this is actually |