diff options
Diffstat (limited to 'native/jni/src/bigram_dictionary.cpp')
-rw-r--r-- | native/jni/src/bigram_dictionary.cpp | 232 |
1 files changed, 0 insertions, 232 deletions
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp deleted file mode 100644 index ebe27994f..000000000 --- a/native/jni/src/bigram_dictionary.cpp +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (C) 2010, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <cstring> - -#define LOG_TAG "LatinIME: bigram_dictionary.cpp" - -#include "bigram_dictionary.h" - -#include "defines.h" -#include "suggest/core/dictionary/binary_format.h" -#include "suggest/core/dictionary/bloom_filter.h" -#include "suggest/core/dictionary/char_utils.h" -#include "suggest/core/dictionary/dictionary.h" - -namespace latinime { - -BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT(streamStart) { - if (DEBUG_DICT) { - AKLOGI("BigramDictionary - constructor"); - } -} - -BigramDictionary::~BigramDictionary() { -} - -void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability, - int *bigramCodePoints, int *outputTypes) const { - word[length] = 0; - if (DEBUG_DICT_FULL) { -#ifdef FLAG_DBG - char s[length + 1]; - for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]); - AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability); -#endif - } - - // Find the right insertion point - int insertAt = 0; - while (insertAt < MAX_RESULTS) { - if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability - && length < CharUtils::getCodePointCount(MAX_WORD_LENGTH, - bigramCodePoints + insertAt * MAX_WORD_LENGTH))) { - break; - } - insertAt++; - } - if (DEBUG_DICT_FULL) { - AKLOGI("Bigram: InsertAt -> %d MAX_RESULTS: %d", insertAt, MAX_RESULTS); - } - if (insertAt >= MAX_RESULTS) { - return; - } - memmove(bigramProbability + (insertAt + 1), - bigramProbability + insertAt, - (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0])); - bigramProbability[insertAt] = probability; - outputTypes[insertAt] = Dictionary::KIND_PREDICTION; - memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH, - bigramCodePoints + insertAt * MAX_WORD_LENGTH, - (MAX_RESULTS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH); - int *dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH; - while (length--) { - *dest++ = *word++; - } - *dest = 0; // NULL terminate - if (DEBUG_DICT_FULL) { - AKLOGI("Bigram: Added word at %d", insertAt); - } -} - -/* Parameters : - * prevWord: the word before, the one for which we need to look up bigrams. - * prevWordLength: its length. - * inputCodePoints: what user typed, in the same format as for UnigramDictionary::getSuggestions. - * inputSize: the size of the codes array. - * bigramCodePoints: an array for output, at the same format as outwords for getSuggestions. - * bigramProbability: an array to output frequencies. - * outputTypes: an array to output types. - * This method returns the number of bigrams this word has, for backward compatibility. - * Note: this is not the number of bigrams output in the array, which is the number of - * bigrams this word has WHOSE first letter also matches the letter the user typed. - * TODO: this may not be a sensible thing to do. It makes sense when the bigrams are - * used to match the first letter of the second word, but once the user has typed more - * and the bigrams are used to boost unigram result scores, it makes little sense to - * reduce their scope to the ones that match the first letter. - */ -int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints, - int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const { - // TODO: remove unused arguments, and refrain from storing stuff in members of this class - // TODO: have "in" arguments before "out" ones, and make out args explicit in the name - - const uint8_t *const root = DICT_ROOT; - int pos = getBigramListPositionForWord(prevWord, prevWordLength, - false /* forceLowerCaseSearch */); - // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams - if (0 == pos) { - // If no bigrams for this exact word, search again in lower case. - pos = getBigramListPositionForWord(prevWord, prevWordLength, - true /* forceLowerCaseSearch */); - } - // If still no bigrams, we really don't have them! - if (0 == pos) return 0; - uint8_t bigramFlags; - int bigramCount = 0; - do { - bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - int bigramBuffer[MAX_WORD_LENGTH]; - int unigramProbability = 0; - const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, - &pos); - const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, - bigramBuffer, &unigramProbability); - - // inputSize == 0 means we are trying to find bigram predictions. - if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) { - const int bigramProbabilityTemp = - BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; - // Due to space constraints, the probability for bigrams is approximate - the lower the - // unigram probability, the worse the precision. The theoritical maximum error in - // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 - // in very bad cases. This means that sometimes, we'll see some bigrams interverted - // here, but it can't get too bad. - const int probability = BinaryFormat::computeProbabilityForBigram( - unigramProbability, bigramProbabilityTemp); - addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints, - outputTypes); - ++bigramCount; - } - } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); - return min(bigramCount, MAX_RESULTS); -} - -// Returns a pointer to the start of the bigram list. -// If the word is not found or has no bigrams, this function returns 0. -int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength, - const bool forceLowerCaseSearch) const { - if (0 >= prevWordLength) return 0; - const uint8_t *const root = DICT_ROOT; - int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength, - forceLowerCaseSearch); - - if (NOT_VALID_WORD == pos) return 0; - const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0; - if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) { - BinaryFormat::getCodePointAndForwardPointer(root, &pos); - } else { - pos = BinaryFormat::skipOtherCharacters(root, pos); - } - pos = BinaryFormat::skipProbability(flags, pos); - pos = BinaryFormat::skipChildrenPosition(flags, pos); - pos = BinaryFormat::skipShortcuts(root, flags, pos); - return pos; -} - -void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, - const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const { - memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); - const uint8_t *const root = DICT_ROOT; - int pos = getBigramListPositionForWord(prevWord, prevWordLength, - false /* forceLowerCaseSearch */); - if (0 == pos) { - // If no bigrams for this exact string, search again in lower case. - pos = getBigramListPositionForWord(prevWord, prevWordLength, - true /* forceLowerCaseSearch */); - } - if (0 == pos) return; - - uint8_t bigramFlags; - do { - bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - const int probability = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; - const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, - &pos); - (*map)[bigramPos] = probability; - setInFilter(filter, bigramPos); - } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); -} - -bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) const { - // Checks whether this word starts with same character or neighboring characters of - // what user typed. - - int maxAlt = MAX_ALTERNATIVES; - const int firstBaseLowerCodePoint = CharUtils::toBaseLowerCase(*word); - while (maxAlt > 0) { - if (CharUtils::toBaseLowerCase(*inputCodePoints) == firstBaseLowerCodePoint) { - return true; - } - inputCodePoints++; - maxAlt--; - } - return false; -} - -bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2, - int length2) const { - const uint8_t *const root = DICT_ROOT; - int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */); - // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams - if (0 == pos) return false; - int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2, - false /* forceLowerCaseSearch */); - if (NOT_VALID_WORD == nextWordPos) return false; - uint8_t bigramFlags; - do { - bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, - &pos); - if (bigramPos == nextWordPos) { - return true; - } - } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); - return false; -} - -// TODO: Move functions related to bigram to here -} // namespace latinime |