diff options
Diffstat (limited to 'native/jni/src/unigram_dictionary.cpp')
-rw-r--r-- | native/jni/src/unigram_dictionary.cpp | 124 |
1 files changed, 56 insertions, 68 deletions
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index e3649bd4b..def4a5bf8 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -41,14 +41,10 @@ const UnigramDictionary::digraph_t UnigramDictionary::FRENCH_LIGATURES_DIGRAPHS[ { 'o', 'e', 0x0153 } }; // U+0153 : LATIN SMALL LIGATURE OE // TODO: check the header -UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, int typedLetterMultiplier, - int fullWordMultiplier, int maxWordLength, int maxWords, const unsigned int flags) - : DICT_ROOT(streamStart), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), - TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), - // TODO : remove this variable. - ROOT_POS(0), - BYTES_IN_ONE_CHAR(sizeof(int)), - MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH), FLAGS(flags) { +UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, int maxWordLength, + int maxWords, const unsigned int flags) + : DICT_ROOT(streamStart), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), + ROOT_POS(0), MAX_DIGRAPH_SEARCH_DEPTH(DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH), FLAGS(flags) { if (DEBUG_DICT) { AKLOGI("UnigramDictionary - constructor"); } @@ -57,13 +53,12 @@ UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, int typed UnigramDictionary::~UnigramDictionary() { } -static inline unsigned int getCodesBufferSize(const int *codes, const int codesSize) { - return static_cast<unsigned int>(sizeof(*codes)) * codesSize; +static inline int getCodesBufferSize(const int *codes, const int codesSize) { + return sizeof(*codes) * codesSize; } -// TODO: This needs to take a const unsigned short* and not tinker with its contents -static inline void addWord(unsigned short *word, int length, int frequency, - WordsPriorityQueue *queue, int type) { +// TODO: This needs to take a const int* and not tinker with its contents +static void addWord(int *word, int length, int frequency, WordsPriorityQueue *queue, int type) { queue->push(frequency, word, length, type); } @@ -105,6 +100,9 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit const int codesRemain, const int currentDepth, int *codesDest, Correction *correction, WordsPriorityQueuePool *queuePool, const digraph_t *const digraphs, const unsigned int digraphsSize) const { + assert(sizeof(codesDest[0]) == sizeof(codesSrc[0])); + assert(sizeof(xCoordinatesBuffer[0]) == sizeof(xcoordinates[0])); + assert(sizeof(yCoordinatesBuffer[0]) == sizeof(ycoordinates[0])); const int startIndex = static_cast<int>(codesDest - codesBuffer); if (currentDepth < MAX_DIGRAPH_SEARCH_DEPTH) { @@ -125,9 +123,8 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit // Make i the index of the second char of the digraph for simplicity. Forgetting // to do that results in an infinite recursion so take care! ++i; - memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR); - codesDest[(i - 1) * (BYTES_IN_ONE_CHAR / sizeof(codesDest[0]))] = - replacementCodePoint; + memcpy(codesDest, codesSrc, i * sizeof(codesDest[0])); + codesDest[i - 1] = replacementCodePoint; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, bigramMap, bigramFilter, useFullEditDistance, codesSrc + i + 1, @@ -137,7 +134,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit // Copy the second char of the digraph in place, then continue processing on // the remaining part of the word. // In our example, after "pru" in the buffer copy the "e", and continue on "fen" - memcpy(codesDest + i, codesSrc + i, BYTES_IN_ONE_CHAR); + memcpy(codesDest + i, codesSrc + i, sizeof(codesDest[0])); getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize, bigramMap, bigramFilter, useFullEditDistance, codesSrc + i, codesRemain - i, @@ -153,13 +150,13 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit // If the word contains several digraphs, we'll come it for the product of them. // eg. if the word is "ueberpruefen" we'll test, in order, against // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen". - const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain; + const unsigned int remainingBytes = sizeof(codesDest[0]) * codesRemain; if (0 != remainingBytes) { memcpy(codesDest, codesSrc, remainingBytes); memcpy(&xCoordinatesBuffer[startIndex], &xcoordinates[codesBufferSize - codesRemain], - sizeof(int) * codesRemain); + sizeof(xCoordinatesBuffer[0]) * codesRemain); memcpy(&yCoordinatesBuffer[startIndex], &ycoordinates[codesBufferSize - codesRemain], - sizeof(int) * codesRemain); + sizeof(yCoordinatesBuffer[0]) * codesRemain); } getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer, @@ -173,9 +170,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, - const bool useFullEditDistance, unsigned short *outWords, int *frequencies, - int *outputTypes) const { - + const bool useFullEditDistance, int *outWords, int *frequencies, int *outputTypes) const { WordsPriorityQueuePool queuePool(MAX_WORDS, SUB_QUEUE_MAX_WORDS, MAX_WORD_LENGTH); queuePool.clearAll(); Correction masterCorrection; @@ -188,8 +183,7 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *x getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter, useFullEditDistance, codes, codesSize, 0, codesBuffer, &masterCorrection, - &queuePool, GERMAN_UMLAUT_DIGRAPHS, - sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0])); + &queuePool, GERMAN_UMLAUT_DIGRAPHS, NELEMS(GERMAN_UMLAUT_DIGRAPHS)); } else if (BinaryFormat::REQUIRES_FRENCH_LIGATURES_PROCESSING & FLAGS) { int codesBuffer[getCodesBufferSize(codes, codesSize)]; int xCoordinatesBuffer[codesSize]; @@ -197,8 +191,7 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *x getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter, useFullEditDistance, codes, codesSize, 0, codesBuffer, &masterCorrection, - &queuePool, FRENCH_LIGATURES_DIGRAPHS, - sizeof(FRENCH_LIGATURES_DIGRAPHS) / sizeof(FRENCH_LIGATURES_DIGRAPHS[0])); + &queuePool, FRENCH_LIGATURES_DIGRAPHS, NELEMS(FRENCH_LIGATURES_DIGRAPHS)); } else { // Normal processing getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, bigramMap, bigramFilter, useFullEditDistance, &masterCorrection, &queuePool); @@ -222,7 +215,7 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *x AKLOGI("Returning %d words", suggestedWordsCount); /// Print the returned words for (int j = 0; j < suggestedWordsCount; ++j) { - short unsigned int *w = outWords + j * MAX_WORD_LENGTH; + int *w = outWords + j * MAX_WORD_LENGTH; char s[MAX_WORD_LENGTH]; for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; (void)s; // To suppress compiler warning @@ -234,12 +227,11 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *x return suggestedWordsCount; } -void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, - const int *xcoordinates, const int *ycoordinates, const int *codes, - const int inputSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, - const bool useFullEditDistance, Correction *correction, - WordsPriorityQueuePool *queuePool) const { - +void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, + const int *ycoordinates, const int *codes, const int inputSize, + const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool) + const { PROF_OPEN; PROF_START(0); PROF_END(0); @@ -288,7 +280,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, if (queue->size() > 0) { WordsPriorityQueue::SuggestedWord *sw = queue->top(); const int score = sw->mScore; - const unsigned short *word = sw->mWord; + const int *word = sw->mWord; const int wordLength = sw->mWordLength; float ns = Correction::RankingAlgorithm::calcNormalizedScore( correction->getPrimaryInputWord(), i, word, wordLength, score); @@ -307,15 +299,13 @@ void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int Correction *correction) const { if (DEBUG_DICT) { AKLOGI("initSuggest"); - DUMP_WORD_INT(codes, inputSize); + DUMP_WORD(codes, inputSize); } correction->initInputParams(proximityInfo, codes, inputSize, xCoordinates, yCoordinates); const int maxDepth = min(inputSize * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); correction->initCorrection(proximityInfo, inputSize, maxDepth); } -static const char SPACE = ' '; - void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, @@ -374,7 +364,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, } } -inline void UnigramDictionary::onTerminal(const int probability, +void UnigramDictionary::onTerminal(const int probability, const TerminalAttributes& terminalAttributes, Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, const int currentWordIndex) const { @@ -382,7 +372,7 @@ inline void UnigramDictionary::onTerminal(const int probability, const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT; int wordLength; - unsigned short *wordPointer; + int *wordPointer; if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) { WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); @@ -410,7 +400,7 @@ inline void UnigramDictionary::onTerminal(const int probability, // so that the insert order is protected inside the queue for words // with the same score. For the moment we use -1 to make sure the shortcut will // never be in front of the word. - uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; + int shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; int shortcutFrequency; const int shortcutTargetStringLength = iterator.getNextShortcutTarget( MAX_WORD_LENGTH_INTERNAL, shortcutTarget, &shortcutFrequency); @@ -450,7 +440,7 @@ int UnigramDictionary::getSubStringSuggestion( const bool hasAutoCorrectionCandidate, const int currentWordIndex, const int inputWordStartPos, const int inputWordLength, const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, - int *wordLengthArray, unsigned short *outputWord, int *outputWordLength) const { + int *wordLengthArray, int *outputWord, int *outputWordLength) const { if (inputWordLength > MULTIPLE_WORDS_SUGGESTION_MAX_WORD_LENGTH) { return FLAG_MULTIPLE_SUGGEST_ABORT; } @@ -493,13 +483,13 @@ int UnigramDictionary::getSubStringSuggestion( // TODO: Remove the safety net above // ////////////////////////////////////////////// - unsigned short *tempOutputWord = 0; + int *tempOutputWord = 0; int nextWordLength = 0; // TODO: Optimize init suggestion initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputSize, correction); - unsigned short word[MAX_WORD_LENGTH_INTERNAL]; + int word[MAX_WORD_LENGTH_INTERNAL]; int freq = getMostFrequentWordLike( inputWordStartPos, inputWordLength, correction, word); if (freq > 0) { @@ -570,7 +560,7 @@ int UnigramDictionary::getSubStringSuggestion( if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) { return FLAG_MULTIPLE_SUGGEST_SKIP; } - outputWord[tempOutputWordLength] = SPACE; + outputWord[tempOutputWordLength] = KEYCODE_SPACE; if (outputWordLength) { ++*outputWordLength; } @@ -598,7 +588,7 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, const bool useFullEditDistance, const int inputSize, Correction *correction, WordsPriorityQueuePool *queuePool, const bool hasAutoCorrectionCandidate, const int startInputPos, const int startWordIndex, const int outputWordLength, - int *freqArray, int *wordLengthArray, unsigned short *outputWord) const { + int *freqArray, int *wordLengthArray, int *outputWord) const { if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) { // Return if the last word index return; @@ -684,7 +674,7 @@ void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximit } // Allocating fixed length array on stack - unsigned short outputWord[MAX_WORD_LENGTH]; + int outputWord[MAX_WORD_LENGTH]; int freqArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; int wordLengthArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; const int outputWordLength = 0; @@ -698,12 +688,11 @@ void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximit // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous // interface. -inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, - const int inputSize, Correction *correction, unsigned short *word) const { - uint16_t inWord[inputSize]; - +int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, const int inputSize, + Correction *correction, int *word) const { + int inWord[inputSize]; for (int i = 0; i < inputSize; ++i) { - inWord[i] = (uint16_t)correction->getPrimaryCharAt(startInputIndex + i); + inWord[i] = correction->getPrimaryCodePointAt(startInputIndex + i); } return getMostFrequentWordLikeInner(inWord, inputSize, word); } @@ -721,14 +710,14 @@ inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, // In and out parameters may point to the same location. This function takes care // not to use any input parameters after it wrote into its outputs. static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, - const uint8_t *const root, const int startPos, const uint16_t *const inWord, - const int startInputIndex, const int inputSize, int32_t *outNewWord, int *outInputIndex, + const uint8_t *const root, const int startPos, const int *const inWord, + const int startInputIndex, const int inputSize, int *outNewWord, int *outInputIndex, int *outPos) { const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); int pos = startPos; - int32_t codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); - int32_t baseChar = toBaseLowerCase(codePoint); - const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]); + int codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); + int baseChar = toBaseLowerCase(codePoint); + const int wChar = toBaseLowerCase(inWord[startInputIndex]); if (baseChar != wChar) { *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos; @@ -759,8 +748,8 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, // It will compare the frequency to the max frequency, and if greater, will // copy the word into the output buffer. In output value maxFreq, it will // write the new maximum frequency if it changed. -static inline void onTerminalWordLike(const int freq, int32_t *newWord, const int length, - short unsigned int *outWord, int *maxFreq) { +static inline void onTerminalWordLike(const int freq, int *newWord, const int length, int *outWord, + int *maxFreq) { if (freq > *maxFreq) { for (int q = 0; q < length; ++q) { outWord[q] = newWord[q]; @@ -772,9 +761,9 @@ static inline void onTerminalWordLike(const int freq, int32_t *newWord, const in // Will find the highest frequency of the words like the one passed as an argument, // that is, everything that only differs by case/accents. -int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t *const inWord, - const int inputSize, short unsigned int *outWord) const { - int32_t newWord[MAX_WORD_LENGTH_INTERNAL]; +int UnigramDictionary::getMostFrequentWordLikeInner(const int *const inWord, const int inputSize, + int *outWord) const { + int newWord[MAX_WORD_LENGTH_INTERNAL]; int depth = 0; int maxFreq = -1; const uint8_t *const root = DICT_ROOT; @@ -834,7 +823,7 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t *const inWord return maxFreq; } -int UnigramDictionary::getFrequency(const int32_t *const inWord, const int length) const { +int UnigramDictionary::getFrequency(const int *const inWord, const int length) const { const uint8_t *const root = DICT_ROOT; int pos = BinaryFormat::getTerminalPosition(root, inWord, length, false /* forceLowerCaseSearch */); @@ -859,8 +848,7 @@ int UnigramDictionary::getFrequency(const int32_t *const inWord, const int lengt } // TODO: remove this function. -int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset, - int length) const { +int UnigramDictionary::getBigramPosition(int pos, int *word, int offset, int length) const { return -1; } @@ -878,7 +866,7 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // there aren't any more nodes at this level, it merely returns the address of the first byte after // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any // given level, as output into newCount when traversing this level's parent. -inline bool UnigramDictionary::processCurrentNode(const int initialPos, +bool UnigramDictionary::processCurrentNode(const int initialPos, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction, int *newCount, int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, const int currentWordIndex) const { @@ -906,7 +894,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, // else if FLAG_IS_TERMINAL: the frequency // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address // Note that you can't have a node that both is not a terminal and has no children. - int32_t c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); + int c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); assert(NOT_A_CODE_POINT != c); // We are going to loop through each character and make it look like it's a different @@ -920,7 +908,7 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, // We prefetch the next char. If 'c' is the last char of this node, we will have // NOT_A_CODE_POINT in the next char. From this we can decide whether this virtual node // should behave as a terminal or not and whether we have children. - const int32_t nextc = hasMultipleChars + const int nextc = hasMultipleChars ? BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CODE_POINT; const bool isLastChar = (NOT_A_CODE_POINT == nextc); // If there are more chars in this nodes, then this virtual node is not a terminal. |