diff options
Diffstat (limited to 'native/jni/src/bigram_dictionary.cpp')
-rw-r--r-- | native/jni/src/bigram_dictionary.cpp | 145 |
1 files changed, 77 insertions, 68 deletions
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index 9ef024dc4..f1d538095 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -1,21 +1,20 @@ /* -** -** Copyright 2010, The Android Open Source Project -** -** Licensed under the Apache License, Version 2.0 (the "License"); -** you may not use this file except in compliance with the License. -** You may obtain a copy of the License at -** -** http://www.apache.org/licenses/LICENSE-2.0 -** -** Unless required by applicable law or agreed to in writing, software -** distributed under the License is distributed on an "AS IS" BASIS, -** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -** See the License for the specific language governing permissions and -** limitations under the License. -*/ - -#include <string.h> + * Copyright (C) 2010, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstring> #define LOG_TAG "LatinIME: bigram_dictionary.cpp" @@ -27,9 +26,8 @@ namespace latinime { -BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, - Dictionary *parentDictionary) - : DICT(dict), MAX_WORD_LENGTH(maxWordLength), mParentDictionary(parentDictionary) { +BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, int maxPredictions) + : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_PREDICTIONS(maxPredictions) { if (DEBUG_DICT) { AKLOGI("BigramDictionary - constructor"); } @@ -38,7 +36,8 @@ BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, BigramDictionary::~BigramDictionary() { } -bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency) { +bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency, + int *bigramFreq, unsigned short *bigramChars, int *outputTypes) const { word[length] = 0; if (DEBUG_DICT) { #ifdef FLAG_DBG @@ -50,25 +49,26 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ // Find the right insertion point int insertAt = 0; - while (insertAt < mMaxBigrams) { - if (frequency > mBigramFreq[insertAt] || (mBigramFreq[insertAt] == frequency - && length < Dictionary::wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) { + while (insertAt < MAX_PREDICTIONS) { + if (frequency > bigramFreq[insertAt] || (bigramFreq[insertAt] == frequency + && length < Dictionary::wideStrLen(bigramChars + insertAt * MAX_WORD_LENGTH))) { break; } insertAt++; } if (DEBUG_DICT) { - AKLOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); + AKLOGI("Bigram: InsertAt -> %d MAX_PREDICTIONS: %d", insertAt, MAX_PREDICTIONS); } - if (insertAt < mMaxBigrams) { - memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]), - (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]), - (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0])); - mBigramFreq[insertAt] = frequency; - memmove((char*) mBigramChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short), - (char*) mBigramChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short), - (mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH); - unsigned short *dest = mBigramChars + (insertAt ) * MAX_WORD_LENGTH; + if (insertAt < MAX_PREDICTIONS) { + memmove(bigramFreq + (insertAt + 1), + bigramFreq + insertAt, + (MAX_PREDICTIONS - insertAt - 1) * sizeof(bigramFreq[0])); + bigramFreq[insertAt] = frequency; + outputTypes[insertAt] = Dictionary::KIND_PREDICTION; + memmove(bigramChars + (insertAt + 1) * MAX_WORD_LENGTH, + bigramChars + insertAt * MAX_WORD_LENGTH, + (MAX_PREDICTIONS - insertAt - 1) * sizeof(bigramChars[0]) * MAX_WORD_LENGTH); + unsigned short *dest = bigramChars + insertAt * MAX_WORD_LENGTH; while (length--) { *dest++ = *word++; } @@ -84,12 +84,11 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ /* Parameters : * prevWord: the word before, the one for which we need to look up bigrams. * prevWordLength: its length. - * codes: what user typed, in the same format as for UnigramDictionary::getSuggestions. + * inputCodes: what user typed, in the same format as for UnigramDictionary::getSuggestions. * codesSize: the size of the codes array. * bigramChars: an array for output, at the same format as outwords for getSuggestions. * bigramFreq: an array to output frequencies. - * maxWordLength: the maximum size of a word. - * maxBigrams: the maximum number of bigrams fitting in the bigramChars array. + * outputTypes: an array to output types. * This method returns the number of bigrams this word has, for backward compatibility. * Note: this is not the number of bigrams output in the array, which is the number of * bigrams this word has WHOSE first letter also matches the letter the user typed. @@ -98,19 +97,21 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ * and the bigrams are used to boost unigram result scores, it makes little sense to * reduce their scope to the ones that match the first letter. */ -int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, int *codes, - int codesSize, unsigned short *bigramChars, int *bigramFreq, int maxWordLength, - int maxBigrams) { +int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, int *inputCodes, + int codesSize, unsigned short *bigramChars, int *bigramFreq, int *outputTypes) const { // TODO: remove unused arguments, and refrain from storing stuff in members of this class // TODO: have "in" arguments before "out" ones, and make out args explicit in the name - mBigramFreq = bigramFreq; - mBigramChars = bigramChars; - mInputCodes = codes; - mMaxBigrams = maxBigrams; - const uint8_t* const root = DICT; - int pos = getBigramListPositionForWord(prevWord, prevWordLength); + const uint8_t *const root = DICT; + int pos = getBigramListPositionForWord(prevWord, prevWordLength, + false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams + if (0 == pos) { + // If no bigrams for this exact word, search again in lower case. + pos = getBigramListPositionForWord(prevWord, prevWordLength, + true /* forceLowerCaseSearch */); + } + // If still no bigrams, we really don't have them! if (0 == pos) return 0; int bigramFlags; int bigramCount = 0; @@ -124,35 +125,37 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in bigramBuffer, &unigramFreq); // codesSize == 0 means we are trying to find bigram predictions. - if (codesSize < 1 || checkFirstCharacter(bigramBuffer)) { - const int bigramFreq = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + if (codesSize < 1 || checkFirstCharacter(bigramBuffer, inputCodes)) { + const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; // Due to space constraints, the frequency for bigrams is approximate - the lower the // unigram frequency, the worse the precision. The theoritical maximum error in // resulting frequency is 8 - although in the practice it's never bigger than 3 or 4 // in very bad cases. This means that sometimes, we'll see some bigrams interverted // here, but it can't get too bad. const int frequency = - BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreq); - if (addWordBigram(bigramBuffer, length, frequency)) { + BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreqTemp); + if (addWordBigram(bigramBuffer, length, frequency, bigramFreq, bigramChars, + outputTypes)) { ++bigramCount; } } - } while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); + } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); return bigramCount; } // Returns a pointer to the start of the bigram list. // If the word is not found or has no bigrams, this function returns 0. int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord, - const int prevWordLength) { + const int prevWordLength, const bool forceLowerCaseSearch) const { if (0 >= prevWordLength) return 0; - const uint8_t* const root = DICT; - int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength); + const uint8_t *const root = DICT; + int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength, + forceLowerCaseSearch); if (NOT_VALID_WORD == pos) return 0; const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - if (0 == (flags & UnigramDictionary::FLAG_HAS_BIGRAMS)) return 0; - if (0 == (flags & UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS)) { + if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0; + if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) { BinaryFormat::getCharCodeAndForwardPointer(root, &pos); } else { pos = BinaryFormat::skipOtherCharacters(root, pos); @@ -164,28 +167,33 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord, } void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, - const int prevWordLength, std::map<int, int> *map, uint8_t *filter) { + const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); - const uint8_t* const root = DICT; - int pos = getBigramListPositionForWord(prevWord, prevWordLength); + const uint8_t *const root = DICT; + int pos = getBigramListPositionForWord(prevWord, prevWordLength, + false /* forceLowerCaseSearch */); + if (0 == pos) { + // If no bigrams for this exact string, search again in lower case. + pos = getBigramListPositionForWord(prevWord, prevWordLength, + true /* forceLowerCaseSearch */); + } if (0 == pos) return; int bigramFlags; do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); (*map)[bigramPos] = frequency; setInFilter(filter, bigramPos); - } while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); + } while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); } -bool BigramDictionary::checkFirstCharacter(unsigned short *word) { +bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes) const { // Checks whether this word starts with same character or neighboring characters of // what user typed. - int *inputCodes = mInputCodes; int maxAlt = MAX_ALTERNATIVES; const unsigned short firstBaseChar = toBaseLowerCase(*word); while (maxAlt > 0) { @@ -199,12 +207,13 @@ bool BigramDictionary::checkFirstCharacter(unsigned short *word) { } bool BigramDictionary::isValidBigram(const int32_t *word1, int length1, const int32_t *word2, - int length2) { - const uint8_t* const root = DICT; - int pos = getBigramListPositionForWord(word1, length1); + int length2) const { + const uint8_t *const root = DICT; + int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams if (0 == pos) return false; - int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2); + int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2, + false /* forceLowerCaseSearch */); if (NOT_VALID_WORD == nextWordPos) return false; int bigramFlags; do { @@ -214,7 +223,7 @@ bool BigramDictionary::isValidBigram(const int32_t *word1, int length1, const in if (bigramPos == nextWordPos) { return true; } - } while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); + } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); return false; } |