diff options
Diffstat (limited to 'native/jni/src/bigram_dictionary.cpp')
-rw-r--r-- | native/jni/src/bigram_dictionary.cpp | 83 |
1 files changed, 40 insertions, 43 deletions
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index e62ae6fd9..44dc75e9c 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -26,8 +26,7 @@ namespace latinime { -BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, int maxPredictions) - : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_PREDICTIONS(maxPredictions) { +BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT(streamStart) { if (DEBUG_DICT) { AKLOGI("BigramDictionary - constructor"); } @@ -36,7 +35,7 @@ BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, BigramDictionary::~BigramDictionary() { } -bool BigramDictionary::addWordBigram(int *word, int length, int frequency, int *bigramFreq, +void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *bigramFreq, int *bigramCodePoints, int *outputTypes) const { word[length] = 0; if (DEBUG_DICT) { @@ -49,7 +48,7 @@ bool BigramDictionary::addWordBigram(int *word, int length, int frequency, int * // Find the right insertion point int insertAt = 0; - while (insertAt < MAX_PREDICTIONS) { + while (insertAt < MAX_RESULTS) { if (frequency > bigramFreq[insertAt] || (bigramFreq[insertAt] == frequency && length < Dictionary::wideStrLen( bigramCodePoints + insertAt * MAX_WORD_LENGTH))) { @@ -58,35 +57,34 @@ bool BigramDictionary::addWordBigram(int *word, int length, int frequency, int * insertAt++; } if (DEBUG_DICT) { - AKLOGI("Bigram: InsertAt -> %d MAX_PREDICTIONS: %d", insertAt, MAX_PREDICTIONS); + AKLOGI("Bigram: InsertAt -> %d MAX_RESULTS: %d", insertAt, MAX_RESULTS); } - if (insertAt < MAX_PREDICTIONS) { - memmove(bigramFreq + (insertAt + 1), - bigramFreq + insertAt, - (MAX_PREDICTIONS - insertAt - 1) * sizeof(bigramFreq[0])); - bigramFreq[insertAt] = frequency; - outputTypes[insertAt] = Dictionary::KIND_PREDICTION; - memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH, - bigramCodePoints + insertAt * MAX_WORD_LENGTH, - (MAX_PREDICTIONS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH); - int *dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH; - while (length--) { - *dest++ = *word++; - } - *dest = 0; // NULL terminate - if (DEBUG_DICT) { - AKLOGI("Bigram: Added word at %d", insertAt); - } - return true; + if (insertAt >= MAX_RESULTS) { + return; + } + memmove(bigramFreq + (insertAt + 1), + bigramFreq + insertAt, + (MAX_RESULTS - insertAt - 1) * sizeof(bigramFreq[0])); + bigramFreq[insertAt] = frequency; + outputTypes[insertAt] = Dictionary::KIND_PREDICTION; + memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH, + bigramCodePoints + insertAt * MAX_WORD_LENGTH, + (MAX_RESULTS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH); + int *dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH; + while (length--) { + *dest++ = *word++; + } + *dest = 0; // NULL terminate + if (DEBUG_DICT) { + AKLOGI("Bigram: Added word at %d", insertAt); } - return false; } /* Parameters : * prevWord: the word before, the one for which we need to look up bigrams. * prevWordLength: its length. - * inputCodes: what user typed, in the same format as for UnigramDictionary::getSuggestions. - * codesSize: the size of the codes array. + * inputCodePoints: what user typed, in the same format as for UnigramDictionary::getSuggestions. + * inputSize: the size of the codes array. * bigramCodePoints: an array for output, at the same format as outwords for getSuggestions. * bigramFreq: an array to output frequencies. * outputTypes: an array to output types. @@ -98,12 +96,12 @@ bool BigramDictionary::addWordBigram(int *word, int length, int frequency, int * * and the bigrams are used to boost unigram result scores, it makes little sense to * reduce their scope to the ones that match the first letter. */ -int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodes, - int codesSize, int *bigramCodePoints, int *bigramFreq, int *outputTypes) const { +int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints, + int inputSize, int *bigramCodePoints, int *bigramFreq, int *outputTypes) const { // TODO: remove unused arguments, and refrain from storing stuff in members of this class // TODO: have "in" arguments before "out" ones, and make out args explicit in the name - const uint8_t *const root = DICT; + const uint8_t *const root = DICT_ROOT; int pos = getBigramListPositionForWord(prevWord, prevWordLength, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams @@ -125,8 +123,8 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, bigramBuffer, &unigramFreq); - // codesSize == 0 means we are trying to find bigram predictions. - if (codesSize < 1 || checkFirstCharacter(bigramBuffer, inputCodes)) { + // inputSize == 0 means we are trying to find bigram predictions. + if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) { const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; // Due to space constraints, the frequency for bigrams is approximate - the lower the // unigram frequency, the worse the precision. The theoritical maximum error in @@ -135,13 +133,12 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i // here, but it can't get too bad. const int frequency = BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreqTemp); - if (addWordBigram(bigramBuffer, length, frequency, bigramFreq, bigramCodePoints, - outputTypes)) { - ++bigramCount; - } + addWordBigram(bigramBuffer, length, frequency, bigramFreq, bigramCodePoints, + outputTypes); + ++bigramCount; } } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); - return bigramCount; + return min(bigramCount, MAX_RESULTS); } // Returns a pointer to the start of the bigram list. @@ -149,7 +146,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const { if (0 >= prevWordLength) return 0; - const uint8_t *const root = DICT; + const uint8_t *const root = DICT_ROOT; int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength, forceLowerCaseSearch); @@ -170,7 +167,7 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevWord, const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); - const uint8_t *const root = DICT; + const uint8_t *const root = DICT_ROOT; int pos = getBigramListPositionForWord(prevWord, prevWordLength, false /* forceLowerCaseSearch */); if (0 == pos) { @@ -191,17 +188,17 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevW } while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); } -bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodes) const { +bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) const { // Checks whether this word starts with same character or neighboring characters of // what user typed. int maxAlt = MAX_ALTERNATIVES; - const int firstBaseChar = toBaseLowerCase(*word); + const int firstBaseLowerCodePoint = toBaseLowerCase(*word); while (maxAlt > 0) { - if (toBaseLowerCase(*inputCodes) == firstBaseChar) { + if (toBaseLowerCase(*inputCodePoints) == firstBaseLowerCodePoint) { return true; } - inputCodes++; + inputCodePoints++; maxAlt--; } return false; @@ -209,7 +206,7 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodes) const { bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2, int length2) const { - const uint8_t *const root = DICT; + const uint8_t *const root = DICT_ROOT; int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams if (0 == pos) return false; |