diff options
Diffstat (limited to 'native/jni/src')
34 files changed, 2693 insertions, 1381 deletions
diff --git a/native/jni/src/additional_proximity_chars.cpp b/native/jni/src/additional_proximity_chars.cpp index 224f020f2..f59492741 100644 --- a/native/jni/src/additional_proximity_chars.cpp +++ b/native/jni/src/additional_proximity_chars.cpp @@ -17,7 +17,9 @@ #include "additional_proximity_chars.h" namespace latinime { -const std::string AdditionalProximityChars::LOCALE_EN_US("en"); +// TODO: Stop using hardcoded additional proximity characters. +// TODO: Have proximity character informations in each language's binary dictionary. +const char *AdditionalProximityChars::LOCALE_EN_US = "en"; const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_A[EN_US_ADDITIONAL_A_SIZE] = { 'e', 'i', 'o', 'u' @@ -38,4 +40,4 @@ const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_O[EN_US_ADDITIONAL_O_SI const int32_t AdditionalProximityChars::EN_US_ADDITIONAL_U[EN_US_ADDITIONAL_U_SIZE] = { 'a', 'e', 'i', 'o' }; -} +} // namespace latinime diff --git a/native/jni/src/additional_proximity_chars.h b/native/jni/src/additional_proximity_chars.h index e0ecc0e1d..d420c4664 100644 --- a/native/jni/src/additional_proximity_chars.h +++ b/native/jni/src/additional_proximity_chars.h @@ -17,8 +17,8 @@ #ifndef LATINIME_ADDITIONAL_PROXIMITY_CHARS_H #define LATINIME_ADDITIONAL_PROXIMITY_CHARS_H +#include <cstring> #include <stdint.h> -#include <string> #include "defines.h" @@ -26,7 +26,8 @@ namespace latinime { class AdditionalProximityChars { private: - static const std::string LOCALE_EN_US; + DISALLOW_IMPLICIT_CONSTRUCTORS(AdditionalProximityChars); + static const char *LOCALE_EN_US; static const int EN_US_ADDITIONAL_A_SIZE = 4; static const int32_t EN_US_ADDITIONAL_A[]; static const int EN_US_ADDITIONAL_E_SIZE = 4; @@ -38,17 +39,18 @@ class AdditionalProximityChars { static const int EN_US_ADDITIONAL_U_SIZE = 4; static const int32_t EN_US_ADDITIONAL_U[]; - static bool isEnLocale(const std::string *locale_str) { - return locale_str && locale_str->size() >= LOCALE_EN_US.size() - && LOCALE_EN_US.compare(0, LOCALE_EN_US.size(), *locale_str); + static bool isEnLocale(const char *localeStr) { + const size_t LOCALE_EN_US_SIZE = strlen(LOCALE_EN_US); + return localeStr && strlen(localeStr) >= LOCALE_EN_US_SIZE + && strncmp(localeStr, LOCALE_EN_US, LOCALE_EN_US_SIZE) == 0; } public: - static int getAdditionalCharsSize(const std::string* locale_str, const int32_t c) { - if (!isEnLocale(locale_str)) { + static int getAdditionalCharsSize(const char *localeStr, const int32_t c) { + if (!isEnLocale(localeStr)) { return 0; } - switch(c) { + switch (c) { case 'a': return EN_US_ADDITIONAL_A_SIZE; case 'e': @@ -64,11 +66,11 @@ class AdditionalProximityChars { } } - static const int32_t* getAdditionalChars(const std::string *locale_str, const int32_t c) { - if (!isEnLocale(locale_str)) { + static const int32_t *getAdditionalChars(const char *localeStr, const int32_t c) { + if (!isEnLocale(localeStr)) { return 0; } - switch(c) { + switch (c) { case 'a': return EN_US_ADDITIONAL_A; case 'e': @@ -83,12 +85,6 @@ class AdditionalProximityChars { return 0; } } - - static bool hasAdditionalChars(const std::string *locale_str, const int32_t c) { - return getAdditionalCharsSize(locale_str, c) > 0; - } }; - -} - +} // namespace latinime #endif // LATINIME_ADDITIONAL_PROXIMITY_CHARS_H diff --git a/native/jni/src/basechars.cpp b/native/jni/src/basechars.cpp index 31f1e18a8..379cb6226 100644 --- a/native/jni/src/basechars.cpp +++ b/native/jni/src/basechars.cpp @@ -14,17 +14,19 @@ * limitations under the License. */ +#include <stdint.h> + #include "char_utils.h" namespace latinime { -/** +/* * Table mapping most combined Latin, Greek, and Cyrillic characters * to their base characters. If c is in range, BASE_CHARS[c] == c * if c is not a combined character, or the base character if it * is combined. */ -const unsigned short BASE_CHARS[BASE_CHARS_SIZE] = { +const uint16_t BASE_CHARS[BASE_CHARS_SIZE] = { 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, @@ -187,8 +189,6 @@ const unsigned short BASE_CHARS[BASE_CHARS_SIZE] = { 0x0423, 0x0443, 0x0423, 0x0443, 0x0427, 0x0447, 0x04f6, 0x04f7, 0x042b, 0x044b, 0x04fa, 0x04fb, 0x04fc, 0x04fd, 0x04fe, 0x04ff, }; - // generated with: // cat UnicodeData.txt | perl -e 'while (<>) { @foo = split(/;/); $foo[5] =~ s/<.*> //; $base[hex($foo[0])] = hex($foo[5]);} for ($i = 0; $i < 0x500; $i += 8) { for ($j = $i; $j < $i + 8; $j++) { printf("0x%04x, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }' - } // namespace latinime diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index 9ef024dc4..dade4f16b 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -1,21 +1,20 @@ /* -** -** Copyright 2010, The Android Open Source Project -** -** Licensed under the Apache License, Version 2.0 (the "License"); -** you may not use this file except in compliance with the License. -** You may obtain a copy of the License at -** -** http://www.apache.org/licenses/LICENSE-2.0 -** -** Unless required by applicable law or agreed to in writing, software -** distributed under the License is distributed on an "AS IS" BASIS, -** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -** See the License for the specific language governing permissions and -** limitations under the License. -*/ - -#include <string.h> + * Copyright (C) 2010, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstring> #define LOG_TAG "LatinIME: bigram_dictionary.cpp" @@ -27,9 +26,8 @@ namespace latinime { -BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, - Dictionary *parentDictionary) - : DICT(dict), MAX_WORD_LENGTH(maxWordLength), mParentDictionary(parentDictionary) { +BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, int maxPredictions) + : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_PREDICTIONS(maxPredictions) { if (DEBUG_DICT) { AKLOGI("BigramDictionary - constructor"); } @@ -38,7 +36,8 @@ BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, BigramDictionary::~BigramDictionary() { } -bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency) { +bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency, + int *bigramFreq, unsigned short *bigramChars, int *outputTypes) const { word[length] = 0; if (DEBUG_DICT) { #ifdef FLAG_DBG @@ -50,25 +49,26 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ // Find the right insertion point int insertAt = 0; - while (insertAt < mMaxBigrams) { - if (frequency > mBigramFreq[insertAt] || (mBigramFreq[insertAt] == frequency - && length < Dictionary::wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) { + while (insertAt < MAX_PREDICTIONS) { + if (frequency > bigramFreq[insertAt] || (bigramFreq[insertAt] == frequency + && length < Dictionary::wideStrLen(bigramChars + insertAt * MAX_WORD_LENGTH))) { break; } insertAt++; } if (DEBUG_DICT) { - AKLOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); + AKLOGI("Bigram: InsertAt -> %d MAX_PREDICTIONS: %d", insertAt, MAX_PREDICTIONS); } - if (insertAt < mMaxBigrams) { - memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]), - (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]), - (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0])); - mBigramFreq[insertAt] = frequency; - memmove((char*) mBigramChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short), - (char*) mBigramChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short), - (mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH); - unsigned short *dest = mBigramChars + (insertAt ) * MAX_WORD_LENGTH; + if (insertAt < MAX_PREDICTIONS) { + memmove(bigramFreq + (insertAt + 1), + bigramFreq + insertAt, + (MAX_PREDICTIONS - insertAt - 1) * sizeof(bigramFreq[0])); + bigramFreq[insertAt] = frequency; + outputTypes[insertAt] = Dictionary::KIND_PREDICTION; + memmove(bigramChars + (insertAt + 1) * MAX_WORD_LENGTH, + bigramChars + insertAt * MAX_WORD_LENGTH, + (MAX_PREDICTIONS - insertAt - 1) * sizeof(bigramChars[0]) * MAX_WORD_LENGTH); + unsigned short *dest = bigramChars + insertAt * MAX_WORD_LENGTH; while (length--) { *dest++ = *word++; } @@ -84,12 +84,11 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ /* Parameters : * prevWord: the word before, the one for which we need to look up bigrams. * prevWordLength: its length. - * codes: what user typed, in the same format as for UnigramDictionary::getSuggestions. + * inputCodes: what user typed, in the same format as for UnigramDictionary::getSuggestions. * codesSize: the size of the codes array. * bigramChars: an array for output, at the same format as outwords for getSuggestions. * bigramFreq: an array to output frequencies. - * maxWordLength: the maximum size of a word. - * maxBigrams: the maximum number of bigrams fitting in the bigramChars array. + * outputTypes: an array to output types. * This method returns the number of bigrams this word has, for backward compatibility. * Note: this is not the number of bigrams output in the array, which is the number of * bigrams this word has WHOSE first letter also matches the letter the user typed. @@ -98,21 +97,23 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ * and the bigrams are used to boost unigram result scores, it makes little sense to * reduce their scope to the ones that match the first letter. */ -int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, int *codes, - int codesSize, unsigned short *bigramChars, int *bigramFreq, int maxWordLength, - int maxBigrams) { +int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, int *inputCodes, + int codesSize, unsigned short *bigramChars, int *bigramFreq, int *outputTypes) const { // TODO: remove unused arguments, and refrain from storing stuff in members of this class // TODO: have "in" arguments before "out" ones, and make out args explicit in the name - mBigramFreq = bigramFreq; - mBigramChars = bigramChars; - mInputCodes = codes; - mMaxBigrams = maxBigrams; - const uint8_t* const root = DICT; - int pos = getBigramListPositionForWord(prevWord, prevWordLength); + const uint8_t *const root = DICT; + int pos = getBigramListPositionForWord(prevWord, prevWordLength, + false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams + if (0 == pos) { + // If no bigrams for this exact word, search again in lower case. + pos = getBigramListPositionForWord(prevWord, prevWordLength, + true /* forceLowerCaseSearch */); + } + // If still no bigrams, we really don't have them! if (0 == pos) return 0; - int bigramFlags; + uint8_t bigramFlags; int bigramCount = 0; do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); @@ -124,36 +125,38 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in bigramBuffer, &unigramFreq); // codesSize == 0 means we are trying to find bigram predictions. - if (codesSize < 1 || checkFirstCharacter(bigramBuffer)) { - const int bigramFreq = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + if (codesSize < 1 || checkFirstCharacter(bigramBuffer, inputCodes)) { + const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; // Due to space constraints, the frequency for bigrams is approximate - the lower the // unigram frequency, the worse the precision. The theoritical maximum error in // resulting frequency is 8 - although in the practice it's never bigger than 3 or 4 // in very bad cases. This means that sometimes, we'll see some bigrams interverted // here, but it can't get too bad. const int frequency = - BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreq); - if (addWordBigram(bigramBuffer, length, frequency)) { + BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreqTemp); + if (addWordBigram(bigramBuffer, length, frequency, bigramFreq, bigramChars, + outputTypes)) { ++bigramCount; } } - } while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); + } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); return bigramCount; } // Returns a pointer to the start of the bigram list. // If the word is not found or has no bigrams, this function returns 0. int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord, - const int prevWordLength) { + const int prevWordLength, const bool forceLowerCaseSearch) const { if (0 >= prevWordLength) return 0; - const uint8_t* const root = DICT; - int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength); + const uint8_t *const root = DICT; + int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength, + forceLowerCaseSearch); if (NOT_VALID_WORD == pos) return 0; - const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - if (0 == (flags & UnigramDictionary::FLAG_HAS_BIGRAMS)) return 0; - if (0 == (flags & UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS)) { - BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); + if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0; + if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) { + BinaryFormat::getCodePointAndForwardPointer(root, &pos); } else { pos = BinaryFormat::skipOtherCharacters(root, pos); } @@ -164,28 +167,33 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord, } void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, - const int prevWordLength, std::map<int, int> *map, uint8_t *filter) { + const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); - const uint8_t* const root = DICT; - int pos = getBigramListPositionForWord(prevWord, prevWordLength); + const uint8_t *const root = DICT; + int pos = getBigramListPositionForWord(prevWord, prevWordLength, + false /* forceLowerCaseSearch */); + if (0 == pos) { + // If no bigrams for this exact string, search again in lower case. + pos = getBigramListPositionForWord(prevWord, prevWordLength, + true /* forceLowerCaseSearch */); + } if (0 == pos) return; - int bigramFlags; + uint8_t bigramFlags; do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); (*map)[bigramPos] = frequency; setInFilter(filter, bigramPos); - } while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); + } while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); } -bool BigramDictionary::checkFirstCharacter(unsigned short *word) { +bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes) const { // Checks whether this word starts with same character or neighboring characters of // what user typed. - int *inputCodes = mInputCodes; int maxAlt = MAX_ALTERNATIVES; const unsigned short firstBaseChar = toBaseLowerCase(*word); while (maxAlt > 0) { @@ -199,14 +207,15 @@ bool BigramDictionary::checkFirstCharacter(unsigned short *word) { } bool BigramDictionary::isValidBigram(const int32_t *word1, int length1, const int32_t *word2, - int length2) { - const uint8_t* const root = DICT; - int pos = getBigramListPositionForWord(word1, length1); + int length2) const { + const uint8_t *const root = DICT; + int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams if (0 == pos) return false; - int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2); + int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2, + false /* forceLowerCaseSearch */); if (NOT_VALID_WORD == nextWordPos) return false; - int bigramFlags; + uint8_t bigramFlags; do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, @@ -214,7 +223,7 @@ bool BigramDictionary::isValidBigram(const int32_t *word1, int length1, const in if (bigramPos == nextWordPos) { return true; } - } while (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); + } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); return false; } diff --git a/native/jni/src/bigram_dictionary.h b/native/jni/src/bigram_dictionary.h index b8763a515..5f11ae822 100644 --- a/native/jni/src/bigram_dictionary.h +++ b/native/jni/src/bigram_dictionary.h @@ -24,39 +24,33 @@ namespace latinime { -class Dictionary; class BigramDictionary { public: - BigramDictionary(const unsigned char *dict, int maxWordLength, Dictionary *parentDictionary); - int getBigrams(const int32_t *word, int length, int *codes, int codesSize, - unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams); - int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength); + BigramDictionary(const unsigned char *dict, int maxWordLength, int maxPredictions); + int getBigrams(const int32_t *word, int length, int *inputCodes, int codesSize, + unsigned short *outWords, int *frequencies, int *outputTypes) const; void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength, - std::map<int, int> *map, uint8_t *filter); - bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2); + std::map<int, int> *map, uint8_t *filter) const; + bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2) const; ~BigramDictionary(); private: - bool addWordBigram(unsigned short *word, int length, int frequency); + DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary); + bool addWordBigram(unsigned short *word, int length, int frequency, + int *bigramFreq, unsigned short *bigramChars, int *outputTypes) const; int getBigramAddress(int *pos, bool advance); int getBigramFreq(int *pos); void searchForTerminalNode(int addressLookingFor, int frequency); bool getFirstBitOfByte(int *pos) { return (DICT[*pos] & 0x80) > 0; } bool getSecondBitOfByte(int *pos) { return (DICT[*pos] & 0x40) > 0; } - bool checkFirstCharacter(unsigned short *word); + bool checkFirstCharacter(unsigned short *word, int *inputCodes) const; + int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength, + const bool forceLowerCaseSearch) const; const unsigned char *DICT; const int MAX_WORD_LENGTH; + const int MAX_PREDICTIONS; // TODO: Re-implement proximity correction for bigram correction static const int MAX_ALTERNATIVES = 1; - - Dictionary *mParentDictionary; - int *mBigramFreq; - int mMaxBigrams; - unsigned short *mBigramChars; - int *mInputCodes; - int mInputLength; }; - } // namespace latinime - #endif // LATINIME_BIGRAM_DICTIONARY_H diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index 51bf8ebbc..eec52e323 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -18,18 +18,53 @@ #define LATINIME_BINARY_FORMAT_H #include <limits> +#include <map> #include "bloom_filter.h" -#include "unigram_dictionary.h" +#include "char_utils.h" namespace latinime { class BinaryFormat { - private: - const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; - const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; - const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; - public: + // Mask and flags for children address type selection. + static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; + static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; + static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; + static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; + static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; + + // Flag for single/multiple char group + static const int FLAG_HAS_MULTIPLE_CHARS = 0x20; + + // Flag for terminal groups + static const int FLAG_IS_TERMINAL = 0x10; + + // Flag for shortcut targets presence + static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08; + // Flag for bigram presence + static const int FLAG_HAS_BIGRAMS = 0x04; + // Flag for non-words (typically, shortcut only entries) + static const int FLAG_IS_NOT_A_WORD = 0x02; + // Flag for blacklist + static const int FLAG_IS_BLACKLISTED = 0x01; + + // Attribute (bigram/shortcut) related flags: + // Flag for presence of more attributes + static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; + // Flag for sign of offset. If this flag is set, the offset value must be negated. + static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; + + // Mask for attribute frequency, stored on 4 bits inside the flags byte. + static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F; + // The numeric value of the shortcut frequency that means 'whitelist'. + static const int WHITELIST_SHORTCUT_FREQUENCY = 15; + + // Mask and flags for attribute address type selection. + static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; + const static int UNKNOWN_FORMAT = -1; // Originally, format version 1 had a 16-bit magic number, then the version number `01' // then options that must be 0. Hence the first 32-bits of the format are always as follow @@ -44,29 +79,29 @@ class BinaryFormat { const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1; const static int SHORTCUT_LIST_SIZE_SIZE = 2; - static int detectFormat(const uint8_t* const dict); - static unsigned int getHeaderSize(const uint8_t* const dict); - static unsigned int getFlags(const uint8_t* const dict); - static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); - static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); - static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); - static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos); - static int skipOtherCharacters(const uint8_t* const dict, const int pos); + static int detectFormat(const uint8_t *const dict); + static unsigned int getHeaderSize(const uint8_t *const dict); + static unsigned int getFlags(const uint8_t *const dict); + static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos); + static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos); + static int32_t getCodePointAndForwardPointer(const uint8_t *const dict, int *pos); + static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos); + static int skipOtherCharacters(const uint8_t *const dict, const int pos); static int skipChildrenPosition(const uint8_t flags, const int pos); static int skipFrequency(const uint8_t flags, const int pos); - static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos); - static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos); - static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos); - static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags, + static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos); + static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos); + static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags, const int pos); - static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos); + static int readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos); static bool hasChildrenInFlags(const uint8_t flags); - static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags, + static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags, int *pos); - static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord, - const int length); - static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, - uint16_t* outWord, int* outUnigramFrequency); + static int getAttributeFrequencyFromFlags(const int flags); + static int getTerminalPosition(const uint8_t *const root, const int32_t *const inWord, + const int length, const bool forceLowerCaseSearch); + static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, + uint16_t *outWord, int *outUnigramFrequency); static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); static int getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq); @@ -79,9 +114,16 @@ class BinaryFormat { REQUIRES_FRENCH_LIGATURES_PROCESSING = 0x4 }; const static unsigned int NO_FLAGS = 0; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat); + const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; + const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; + const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; + static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos); }; -inline int BinaryFormat::detectFormat(const uint8_t* const dict) { +inline int BinaryFormat::detectFormat(const uint8_t *const dict) { // The magic number is stored big-endian. const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3]; switch (magicNumber) { @@ -103,7 +145,7 @@ inline int BinaryFormat::detectFormat(const uint8_t* const dict) { } } -inline unsigned int BinaryFormat::getFlags(const uint8_t* const dict) { +inline unsigned int BinaryFormat::getFlags(const uint8_t *const dict) { switch (detectFormat(dict)) { case 1: return NO_FLAGS; @@ -112,7 +154,7 @@ inline unsigned int BinaryFormat::getFlags(const uint8_t* const dict) { } } -inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) { +inline unsigned int BinaryFormat::getHeaderSize(const uint8_t *const dict) { switch (detectFormat(dict)) { case 1: return FORMAT_VERSION_1_HEADER_SIZE; @@ -124,41 +166,41 @@ inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) { } } -inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { +inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos) { const int msb = dict[(*pos)++]; if (msb < 0x80) return msb; return ((msb & 0x7F) << 8) | dict[(*pos)++]; } -inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) { +inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t *const dict, int *pos) { return dict[(*pos)++]; } -inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) { +inline int32_t BinaryFormat::getCodePointAndForwardPointer(const uint8_t *const dict, int *pos) { const int origin = *pos; - const int32_t character = dict[origin]; - if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { - if (character == CHARACTER_ARRAY_TERMINATOR) { + const int32_t codePoint = dict[origin]; + if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { + if (codePoint == CHARACTER_ARRAY_TERMINATOR) { *pos = origin + 1; - return NOT_A_CHARACTER; + return NOT_A_CODE_POINT; } else { *pos = origin + 3; - const int32_t char_1 = character << 16; + const int32_t char_1 = codePoint << 16; const int32_t char_2 = char_1 + (dict[origin + 1] << 8); return char_2 + dict[origin + 2]; } } else { *pos = origin + 1; - return character; + return codePoint; } } -inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict, +inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos) { return dict[pos]; } -inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) { +inline int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const int pos) { int currentPos = pos; int32_t character = dict[currentPos++]; while (CHARACTER_ARRAY_TERMINATOR != character) { @@ -172,22 +214,22 @@ inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const in static inline int attributeAddressSize(const uint8_t flags) { static const int ATTRIBUTE_ADDRESS_SHIFT = 4; - return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; + return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; /* Note: this is a value-dependant optimization of what may probably be more readably written this way: - switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) { - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; + switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; default: return 0; } */ } -static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) { +static inline int skipExistingBigrams(const uint8_t *const dict, const int pos) { int currentPos = pos; uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); - while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) { + while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) { currentPos += attributeAddressSize(flags); flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); } @@ -197,11 +239,11 @@ static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) static inline int childrenAddressSize(const uint8_t flags) { static const int CHILDREN_ADDRESS_SHIFT = 6; - return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; + return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; /* See the note in attributeAddressSize. The same applies here */ } -static inline int shortcutByteSize(const uint8_t* const dict, const int pos) { +static inline int shortcutByteSize(const uint8_t *const dict, const int pos) { return ((int)(dict[pos] << 8)) + (dict[pos + 1]); } @@ -210,28 +252,28 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos } inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { - return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos; + return FLAG_IS_TERMINAL & flags ? pos + 1 : pos; } -inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags, +inline int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos) { - if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) { + if (FLAG_HAS_SHORTCUT_TARGETS & flags) { return pos + shortcutByteSize(dict, pos); } else { return pos; } } -inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags, +inline int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos) { - if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { + if (FLAG_HAS_BIGRAMS & flags) { return skipExistingBigrams(dict, pos); } else { return pos; } } -inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags, +inline int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos) { // This function skips all attributes: shortcuts and bigrams. int newPos = pos; @@ -240,7 +282,7 @@ inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint return newPos; } -inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict, +inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags, const int pos) { int currentPos = pos; currentPos = skipChildrenPosition(flags, currentPos); @@ -248,18 +290,18 @@ inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict, return currentPos; } -inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags, +inline int BinaryFormat::readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos) { int offset = 0; - switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) { - case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: + switch (MASK_GROUP_ADDRESS_TYPE & flags) { + case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: offset = dict[pos]; break; - case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: + case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: offset = dict[pos] << 8; offset += dict[pos + 1]; break; - case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: + case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: offset = dict[pos] << 16; offset += dict[pos + 1] << 8; offset += dict[pos + 2]; @@ -273,74 +315,77 @@ inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const u } inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { - return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS - != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); + return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags)); } -inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict, +inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags, int *pos) { int offset = 0; const int origin = *pos; - switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: offset = dict[origin]; *pos = origin + 1; break; - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: offset = dict[origin] << 8; offset += dict[origin + 1]; *pos = origin + 2; break; - case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: offset = dict[origin] << 16; offset += dict[origin + 1] << 8; offset += dict[origin + 2]; *pos = origin + 3; break; } - if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { + if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { return origin - offset; } else { return origin + offset; } } +inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) { + return flags & MASK_ATTRIBUTE_FREQUENCY; +} + // This function gets the byte position of the last chargroup of the exact matching word in the // dictionary. If no match is found, it returns NOT_VALID_WORD. -inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, - const int32_t* const inWord, const int length) { +inline int BinaryFormat::getTerminalPosition(const uint8_t *const root, + const int32_t *const inWord, const int length, const bool forceLowerCaseSearch) { int pos = 0; int wordPos = 0; while (true) { // If we already traversed the tree further than the word is long, there means // there was no match (or we would have found it). - if (wordPos > length) return NOT_VALID_WORD; + if (wordPos >= length) return NOT_VALID_WORD; int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); - const int32_t wChar = inWord[wordPos]; + const int32_t wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos]; while (true) { // If there are no more character groups in this node, it means we could not // find a matching character for this depth, therefore there is no match. if (0 >= charGroupCount) return NOT_VALID_WORD; const int charGroupPos = pos; const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + int32_t character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); if (character == wChar) { // This is the correct node. Only one character group may start with the same // char within a node, so either we found our match in this node, or there is // no match and we can return NOT_VALID_WORD. So we will check all the characters // in this character group indeed does match. - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { - character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - while (NOT_A_CHARACTER != character) { + if (FLAG_HAS_MULTIPLE_CHARS & flags) { + character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); + while (NOT_A_CODE_POINT != character) { ++wordPos; // If we shoot the length of the word we search for, or if we find a single // character that does not match, as explained above, it means the word is // not in the dictionary (by virtue of this chargroup being the only one to // match the word on the first character, but not matching the whole word). - if (wordPos > length) return NOT_VALID_WORD; + if (wordPos >= length) return NOT_VALID_WORD; if (inWord[wordPos] != character) return NOT_VALID_WORD; - character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); } } // If we come here we know that so far, we do match. Either we are on a terminal @@ -348,14 +393,13 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, // If we don't match the length AND don't have children, then a word in the // dictionary fully matches a prefix of the searched word but not the full word. ++wordPos; - if (UnigramDictionary::FLAG_IS_TERMINAL & flags) { + if (FLAG_IS_TERMINAL & flags) { if (wordPos == length) { return charGroupPos; } - pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos); + pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos); } - if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS - == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) { + if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) { return NOT_VALID_WORD; } // We have children and we are still shorter than the word we are searching for, so @@ -365,7 +409,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, break; } else { // This chargroup does not match, so skip the remaining part and go to the next. - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { + if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = BinaryFormat::skipOtherCharacters(root, pos); } pos = BinaryFormat::skipFrequency(flags, pos); @@ -394,8 +438,8 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, * outUnigramFrequency: a pointer to an int to write the frequency into. * Return value : the length of the word, of 0 if the word was not found. */ -inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address, - const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) { +inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address, + const int maxDepth, uint16_t *outWord, int *outUnigramFrequency) { int pos = 0; int wordPos = 0; @@ -413,19 +457,19 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a --charGroupCount) { const int startPos = pos; const uint8_t flags = getFlagsAndForwardPointer(root, &pos); - const int32_t character = getCharCodeAndForwardPointer(root, &pos); + const int32_t character = getCodePointAndForwardPointer(root, &pos); if (address == startPos) { // We found the address. Copy the rest of the word in the buffer and return // the length. outWord[wordPos] = character; - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { - int32_t nextChar = getCharCodeAndForwardPointer(root, &pos); + if (FLAG_HAS_MULTIPLE_CHARS & flags) { + int32_t nextChar = getCodePointAndForwardPointer(root, &pos); // We count chars in order to avoid infinite loops if the file is broken or // if there is some other bug int charCount = maxDepth; - while (NOT_A_CHARACTER != nextChar && --charCount > 0) { + while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { outWord[++wordPos] = nextChar; - nextChar = getCharCodeAndForwardPointer(root, &pos); + nextChar = getCodePointAndForwardPointer(root, &pos); } } *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos); @@ -433,7 +477,7 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a } // We need to skip past this char group, so skip any remaining chars after the // first and possibly the frequency. - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { + if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = skipOtherCharacters(root, pos); } pos = skipFrequency(flags, pos); @@ -441,8 +485,8 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a // The fact that this group has children is very important. Since we already know // that this group does not match, if it has no children we know it is irrelevant // to what we are searching for. - const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != - (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); + const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != + (MASK_GROUP_ADDRESS_TYPE & flags)); // We will write in `found' whether we have passed the children address we are // searching for. For example if we search for "beer", the children of b are less // than the address we are searching for and the children of c are greater. When we @@ -479,16 +523,16 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a const uint8_t lastFlags = getFlagsAndForwardPointer(root, &lastCandidateGroupPos); const int32_t lastChar = - getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); + getCodePointAndForwardPointer(root, &lastCandidateGroupPos); // We copy all the characters in this group to the buffer outWord[wordPos] = lastChar; - if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) { + if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) { int32_t nextChar = - getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); + getCodePointAndForwardPointer(root, &lastCandidateGroupPos); int charCount = maxDepth; while (-1 != nextChar && --charCount > 0) { outWord[++wordPos] = nextChar; - nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); + nextChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos); } } ++wordPos; @@ -538,8 +582,8 @@ inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const // 0 for the bigram frequency represents the middle of the 16th step from the top, // while a value of 15 represents the middle of the top step. // See makedict.BinaryDictInputOutput for details. - const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); - return (int)(unigramFreq + (bigramFreq + 1) * stepSize); + const float stepSize = static_cast<float>(MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); + return unigramFreq + static_cast<int>(static_cast<float>(bigramFreq + 1) * stepSize); } // This returns a probability in log space. @@ -555,7 +599,5 @@ inline int BinaryFormat::getProbability(const int position, const std::map<int, return backoff(unigramFreq); } } - } // namespace latinime - #endif // LATINIME_BINARY_FORMAT_H diff --git a/native/jni/src/bloom_filter.h b/native/jni/src/bloom_filter.h index 7ae6a1fa4..bcce1f7ea 100644 --- a/native/jni/src/bloom_filter.h +++ b/native/jni/src/bloom_filter.h @@ -23,16 +23,16 @@ namespace latinime { -static inline void setInFilter(uint8_t *filter, const int position) { - const unsigned int bucket = position % BIGRAM_FILTER_MODULO; - filter[bucket >> 3] |= (1 << (bucket & 0x7)); +// TODO: uint32_t position +static inline void setInFilter(uint8_t *filter, const int32_t position) { + const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO); + filter[bucket >> 3] |= static_cast<uint8_t>(1 << (bucket & 0x7)); } -static inline bool isInFilter(const uint8_t *filter, const int position) { - const unsigned int bucket = position % BIGRAM_FILTER_MODULO; - return filter[bucket >> 3] & (1 << (bucket & 0x7)); +// TODO: uint32_t position +static inline bool isInFilter(const uint8_t *filter, const int32_t position) { + const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO); + return filter[bucket >> 3] & static_cast<uint8_t>(1 << (bucket & 0x7)); } - } // namespace latinime - #endif // LATINIME_BLOOM_FILTER_H diff --git a/native/jni/src/char_utils.cpp b/native/jni/src/char_utils.cpp index a31a0632c..9d886da31 100644 --- a/native/jni/src/char_utils.cpp +++ b/native/jni/src/char_utils.cpp @@ -14,7 +14,9 @@ * limitations under the License. */ -#include <stdlib.h> +#include <cstdlib> + +#include "char_utils.h" namespace latinime { @@ -883,17 +885,16 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { }; static int compare_pair_capital(const void *a, const void *b) { - return (int)(*(unsigned short *)a) - - (int)((struct LatinCapitalSmallPair*)b)->capital; + return static_cast<int>(*static_cast<const unsigned short *>(a)) + - static_cast<int>((static_cast<const struct LatinCapitalSmallPair *>(b))->capital); } -unsigned short latin_tolower(unsigned short c) { +unsigned short latin_tolower(const unsigned short c) { struct LatinCapitalSmallPair *p = - (struct LatinCapitalSmallPair *)bsearch(&c, SORTED_CHAR_MAP, + static_cast<struct LatinCapitalSmallPair *>(bsearch(&c, SORTED_CHAR_MAP, sizeof(SORTED_CHAR_MAP) / sizeof(SORTED_CHAR_MAP[0]), sizeof(SORTED_CHAR_MAP[0]), - compare_pair_capital); + compare_pair_capital)); return p ? p->small : c; } - } // namespace latinime diff --git a/native/jni/src/char_utils.h b/native/jni/src/char_utils.h index 607dc5195..b17f262ec 100644 --- a/native/jni/src/char_utils.h +++ b/native/jni/src/char_utils.h @@ -17,21 +17,24 @@ #ifndef LATINIME_CHAR_UTILS_H #define LATINIME_CHAR_UTILS_H +#include <cctype> +#include <stdint.h> + namespace latinime { -inline static int isAsciiUpper(unsigned short c) { - return c >= 'A' && c <= 'Z'; +inline static bool isAsciiUpper(unsigned short c) { + return isupper(static_cast<int>(c)) != 0; } inline static unsigned short toAsciiLower(unsigned short c) { return c - 'A' + 'a'; } -inline static int isAscii(unsigned short c) { - return c <= 127; +inline static bool isAscii(unsigned short c) { + return isascii(static_cast<int>(c)) != 0; } -unsigned short latin_tolower(unsigned short c); +unsigned short latin_tolower(const unsigned short c); /** * Table mapping most combined Latin, Greek, and Cyrillic characters @@ -41,7 +44,7 @@ unsigned short latin_tolower(unsigned short c); */ static const int BASE_CHARS_SIZE = 0x0500; -extern const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; +extern const uint16_t BASE_CHARS[BASE_CHARS_SIZE]; inline static unsigned short toBaseChar(unsigned short c) { if (c < BASE_CHARS_SIZE) { @@ -50,8 +53,7 @@ inline static unsigned short toBaseChar(unsigned short c) { return c; } -inline static unsigned short toBaseLowerCase(unsigned short c) { - c = toBaseChar(c); +inline static unsigned short toLowerCase(const unsigned short c) { if (isAsciiUpper(c)) { return toAsciiLower(c); } else if (isAscii(c)) { @@ -60,6 +62,8 @@ inline static unsigned short toBaseLowerCase(unsigned short c) { return latin_tolower(c); } +inline static unsigned short toBaseLowerCase(const unsigned short c) { + return toLowerCase(toBaseChar(c)); +} } // namespace latinime - #endif // LATINIME_CHAR_UTILS_H diff --git a/native/jni/src/correction.cpp b/native/jni/src/correction.cpp index 99f5b92c1..49e3e3c8c 100644 --- a/native/jni/src/correction.cpp +++ b/native/jni/src/correction.cpp @@ -14,22 +14,22 @@ * limitations under the License. */ -#include <assert.h> -#include <ctype.h> -#include <math.h> -#include <stdio.h> -#include <string.h> +#include <cassert> +#include <cctype> +#include <cmath> +#include <cstring> #define LOG_TAG "LatinIME: correction.cpp" #include "char_utils.h" #include "correction.h" #include "defines.h" -#include "dictionary.h" -#include "proximity_info.h" +#include "proximity_info_state.h" namespace latinime { +class ProximityInfo; + ///////////////////////////// // edit distance funcitons // ///////////////////////////// @@ -55,25 +55,25 @@ inline static void dumpEditDistance10ForDebug(int *editDistanceTable, } AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]", c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]); - (void)c; + (void)c; // To suppress compiler warning } } } inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigned short *input, - const int inputLength, const unsigned short *output, const int outputLength) { + const int inputSize, const unsigned short *output, const int outputLength) { // TODO: Make sure that editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] is not touched. - // Let dp[i][j] be editDistanceTable[i * (inputLength + 1) + j]. - // Assuming that dp[0][0] ... dp[outputLength - 1][inputLength] are already calculated, - // and calculate dp[ouputLength][0] ... dp[outputLength][inputLength]. - int *const current = editDistanceTable + outputLength * (inputLength + 1); - const int *const prev = editDistanceTable + (outputLength - 1) * (inputLength + 1); + // Let dp[i][j] be editDistanceTable[i * (inputSize + 1) + j]. + // Assuming that dp[0][0] ... dp[outputLength - 1][inputSize] are already calculated, + // and calculate dp[ouputLength][0] ... dp[outputLength][inputSize]. + int *const current = editDistanceTable + outputLength * (inputSize + 1); + const int *const prev = editDistanceTable + (outputLength - 1) * (inputSize + 1); const int *const prevprev = - outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0; + outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputSize + 1) : 0; current[0] = outputLength; const uint32_t co = toBaseLowerCase(output[outputLength - 1]); const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0; - for (int i = 1; i <= inputLength; ++i) { + for (int i = 1; i <= inputSize; ++i) { const uint32_t ci = toBaseLowerCase(input[i - 1]); const uint16_t cost = (ci == co) ? 0 : 1; current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost)); @@ -84,42 +84,37 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne } inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth, - const int outputLength, const int inputLength) { + const int outputLength, const int inputSize) { if (DEBUG_EDIT_DISTANCE) { - AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); + AKLOGI("getCurrentEditDistance %d, %d", inputSize, outputLength); } - return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputLength]; + return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputSize]; } ////////////////////// // inline functions // ////////////////////// -static const char QUOTE = '\''; +static const char SINGLE_QUOTE = '\''; -inline bool Correction::isQuote(const unsigned short c) { - const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(mInputIndex); - return (c == QUOTE && userTypedChar != QUOTE); +inline bool Correction::isSingleQuote(const unsigned short c) { + const unsigned short userTypedChar = mProximityInfoState.getPrimaryCharAt(mInputIndex); + return (c == SINGLE_QUOTE && userTypedChar != SINGLE_QUOTE); } //////////////// // Correction // //////////////// -Correction::Correction(const int typedLetterMultiplier, const int fullWordMultiplier) - : TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier) { - initEditDistance(mEditDistanceTable); -} - void Correction::resetCorrection() { mTotalTraverseCount = 0; } -void Correction::initCorrection(const ProximityInfo *pi, const int inputLength, +void Correction::initCorrection(const ProximityInfo *pi, const int inputSize, const int maxDepth) { mProximityInfo = pi; - mInputLength = inputLength; + mInputSize = inputSize; mMaxDepth = maxDepth; - mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; + mMaxEditDistance = mInputSize < 5 ? 2 : mInputSize / 2; // TODO: This is not supposed to be required. Check what's going wrong with // editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] initEditDistance(mEditDistanceTable); @@ -159,11 +154,13 @@ void Correction::checkState() { if (mSkipPos >= 0) ++inputCount; if (mExcessivePos >= 0) ++inputCount; if (mTransposedPos >= 0) ++inputCount; - // TODO: remove this assert - assert(inputCount <= 1); } } +bool Correction::sameAsTyped() { + return mProximityInfoState.sameAsTyped(mWord, mOutputIndex); +} + int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray, const int wordCount, const bool isSpaceProximity, const unsigned short *word) { return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray, @@ -171,26 +168,22 @@ int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wo } int Correction::getFinalProbability(const int probability, unsigned short **word, int *wordLength) { - return getFinalProbabilityInternal(probability, word, wordLength, mInputLength); + return getFinalProbabilityInternal(probability, word, wordLength, mInputSize); } int Correction::getFinalProbabilityForSubQueue(const int probability, unsigned short **word, - int *wordLength, const int inputLength) { - return getFinalProbabilityInternal(probability, word, wordLength, inputLength); + int *wordLength, const int inputSize) { + return getFinalProbabilityInternal(probability, word, wordLength, inputSize); } int Correction::getFinalProbabilityInternal(const int probability, unsigned short **word, - int *wordLength, const int inputLength) { + int *wordLength, const int inputSize) { const int outputIndex = mTerminalOutputIndex; const int inputIndex = mTerminalInputIndex; *wordLength = outputIndex + 1; - if (outputIndex < MIN_SUGGEST_DEPTH) { - return NOT_A_PROBABILITY; - } - *word = mWord; int finalProbability= Correction::RankingAlgorithm::calculateFinalProbability( - inputIndex, outputIndex, probability, mEditDistanceTable, this, inputLength); + inputIndex, outputIndex, probability, mEditDistanceTable, this, inputSize); return finalProbability; } @@ -233,7 +226,7 @@ int Correction::goDownTree( } // TODO: remove -int Correction::getInputIndex() { +int Correction::getInputIndex() const { return mInputIndex; } @@ -277,13 +270,13 @@ bool Correction::needsToPrune() const { // TODO: use edit distance here return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance // Allow one char longer word for missing character - || (!mDoAutoCompletion && (mOutputIndex > mInputLength)); + || (!mDoAutoCompletion && (mOutputIndex > mInputSize)); } void Correction::addCharToCurrentWord(const int32_t c) { mWord[mOutputIndex] = c; - const unsigned short *primaryInputWord = mProximityInfo->getPrimaryInputWord(); - calcEditDistanceOneStep(mEditDistanceTable, primaryInputWord, mInputLength, + const unsigned short *primaryInputWord = mProximityInfoState.getPrimaryInputWord(); + calcEditDistanceOneStep(mEditDistanceTable, primaryInputWord, mInputSize, mWord, mOutputIndex + 1); } @@ -308,13 +301,12 @@ Correction::CorrectionType Correction::processUnrelatedCorrectionType() { return UNRELATED; } -inline bool isEquivalentChar(ProximityInfo::ProximityType type) { - return type == ProximityInfo::EQUIVALENT_CHAR; +inline bool isEquivalentChar(ProximityType type) { + return type == EQUIVALENT_CHAR; } -inline bool isProximityCharOrEquivalentChar(ProximityInfo::ProximityType type) { - return type == ProximityInfo::EQUIVALENT_CHAR - || type == ProximityInfo::NEAR_PROXIMITY_CHAR; +inline bool isProximityCharOrEquivalentChar(ProximityType type) { + return type == EQUIVALENT_CHAR || type == NEAR_PROXIMITY_CHAR; } Correction::CorrectionType Correction::processCharAndCalcState( @@ -331,25 +323,25 @@ Correction::CorrectionType Correction::processCharAndCalcState( mDistances[mOutputIndex] = NOT_A_DISTANCE; // Skip checking this node - if (mNeedsToTraverseAllNodes || isQuote(c)) { + if (mNeedsToTraverseAllNodes || isSingleQuote(c)) { bool incremented = false; - if (mLastCharExceeded && mInputIndex == mInputLength - 1) { + if (mLastCharExceeded && mInputIndex == mInputSize - 1) { // TODO: Do not check the proximity if EditDistance exceeds the threshold - const ProximityInfo::ProximityType matchId = - mProximityInfo->getMatchedProximityId(mInputIndex, c, true, &proximityIndex); + const ProximityType matchId = mProximityInfoState.getMatchedProximityId( + mInputIndex, c, true, &proximityIndex); if (isEquivalentChar(matchId)) { mLastCharExceeded = false; --mExcessiveCount; mDistances[mOutputIndex] = - mProximityInfo->getNormalizedSquaredDistance(mInputIndex, 0); - } else if (matchId == ProximityInfo::NEAR_PROXIMITY_CHAR) { + mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, 0); + } else if (matchId == NEAR_PROXIMITY_CHAR) { mLastCharExceeded = false; --mExcessiveCount; ++mProximityCount; - mDistances[mOutputIndex] = - mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex); + mDistances[mOutputIndex] = mProximityInfoState.getNormalizedSquaredDistance( + mInputIndex, proximityIndex); } - if (!isQuote(c)) { + if (!isSingleQuote(c)) { incrementInputIndex(); incremented = true; } @@ -362,7 +354,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) { mExcessivePos = mOutputIndex; } - if (mExcessivePos < mInputLength - 1) { + if (mExcessivePos < mInputSize - 1) { mExceeding = mExcessivePos == mInputIndex && canTryCorrection; } } @@ -370,7 +362,8 @@ Correction::CorrectionType Correction::processCharAndCalcState( if (mSkipPos >= 0) { if (mSkippedCount == 0 && mSkipPos < mOutputIndex) { if (DEBUG_DICT) { - assert(mSkipPos == mOutputIndex - 1); + // TODO: Enable this assertion. + //assert(mSkipPos == mOutputIndex - 1); } mSkipPos = mOutputIndex; } @@ -381,14 +374,15 @@ Correction::CorrectionType Correction::processCharAndCalcState( if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) { mTransposedPos = mOutputIndex; } - if (mTransposedPos < mInputLength - 1) { + if (mTransposedPos < mInputSize - 1) { mTransposing = mInputIndex == mTransposedPos && canTryCorrection; } } bool secondTransposing = false; if (mTransposedCount % 2 == 1) { - if (isEquivalentChar(mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) { + if (isEquivalentChar(mProximityInfoState.getMatchedProximityId( + mInputIndex - 1, c, false))) { ++mTransposedCount; secondTransposing = true; } else if (mCorrectionStates[mOutputIndex].mExceeding) { @@ -399,7 +393,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( } else { --mTransposedCount; if (DEBUG_CORRECTION - && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); @@ -417,20 +411,20 @@ Correction::CorrectionType Correction::processCharAndCalcState( ? (noCorrectionsHappenedSoFar || mProximityCount == 0) : (noCorrectionsHappenedSoFar && mProximityCount == 0); - ProximityInfo::ProximityType matchedProximityCharId = secondTransposing - ? ProximityInfo::EQUIVALENT_CHAR - : mProximityInfo->getMatchedProximityId( + ProximityType matchedProximityCharId = secondTransposing + ? EQUIVALENT_CHAR + : mProximityInfoState.getMatchedProximityId( mInputIndex, c, checkProximityChars, &proximityIndex); - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId - || ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + if (UNRELATED_CHAR == matchedProximityCharId + || ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { if (canTryCorrection && mOutputIndex > 0 && mCorrectionStates[mOutputIndex].mProximityMatching && mCorrectionStates[mOutputIndex].mExceeding - && isEquivalentChar(mProximityInfo->getMatchedProximityId( + && isEquivalentChar(mProximityInfoState.getMatchedProximityId( mInputIndex, mWord[mOutputIndex - 1], false))) { if (DEBUG_CORRECTION - && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]); @@ -446,27 +440,27 @@ Correction::CorrectionType Correction::processCharAndCalcState( // Here, we are doing something equivalent to matchedProximityCharId, // but we already know that "excessive char correction" just happened // so that we just need to check "mProximityCount == 0". - matchedProximityCharId = mProximityInfo->getMatchedProximityId( + matchedProximityCharId = mProximityInfoState.getMatchedProximityId( mInputIndex, c, mProximityCount == 0, &proximityIndex); } } - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId - || ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { - if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + if (UNRELATED_CHAR == matchedProximityCharId + || ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + if (ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { mAdditionalProximityMatching = true; } // TODO: Optimize // As the current char turned out to be an unrelated char, // we will try other correction-types. Please note that mCorrectionStates[mOutputIndex] // here refers to the previous state. - if (mInputIndex < mInputLength - 1 && mOutputIndex > 0 && mTransposedCount > 0 + if (mInputIndex < mInputSize - 1 && mOutputIndex > 0 && mTransposedCount > 0 && !mCorrectionStates[mOutputIndex].mTransposing && mCorrectionStates[mOutputIndex - 1].mTransposing - && isEquivalentChar(mProximityInfo->getMatchedProximityId( + && isEquivalentChar(mProximityInfoState.getMatchedProximityId( mInputIndex, mWord[mOutputIndex - 1], false)) && isEquivalentChar( - mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { + mProximityInfoState.getMatchedProximityId(mInputIndex + 1, c, false))) { // Conversion t->e // Example: // occaisional -> occa sional @@ -478,7 +472,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( && !mCorrectionStates[mOutputIndex].mTransposing && mCorrectionStates[mOutputIndex - 1].mTransposing && isEquivalentChar( - mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) { + mProximityInfoState.getMatchedProximityId(mInputIndex - 1, c, false))) { // Conversion t->s // Example: // chcolate -> chocolate @@ -490,28 +484,28 @@ Correction::CorrectionType Correction::processCharAndCalcState( && mCorrectionStates[mOutputIndex].mProximityMatching && mCorrectionStates[mOutputIndex].mSkipping && isEquivalentChar( - mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) { + mProximityInfoState.getMatchedProximityId(mInputIndex - 1, c, false))) { // Conversion p->s // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of // proximity chars of "s", but it should rather be handled as a skipped char. ++mSkippedCount; --mProximityCount; return processSkipChar(c, isTerminal, false); - } else if (mInputIndex - 1 < mInputLength + } else if (mInputIndex - 1 < mInputSize && mSkippedCount > 0 && mCorrectionStates[mOutputIndex].mSkipping && mCorrectionStates[mOutputIndex].mAdditionalProximityMatching && isProximityCharOrEquivalentChar( - mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { + mProximityInfoState.getMatchedProximityId(mInputIndex + 1, c, false))) { // Conversion s->a incrementInputIndex(); --mSkippedCount; mProximityMatching = true; ++mProximityCount; mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; - } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputLength + } else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputSize && isEquivalentChar( - mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) { + mProximityInfoState.getMatchedProximityId(mInputIndex + 1, c, false))) { // 1.2. Excessive or transpose correction if (mTransposing) { ++mTransposedCount; @@ -520,7 +514,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( incrementInputIndex(); } if (DEBUG_CORRECTION - && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); @@ -536,20 +530,20 @@ Correction::CorrectionType Correction::processCharAndCalcState( // 3. Skip correction ++mSkippedCount; if (DEBUG_CORRECTION - && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { AKLOGI("SKIP: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } return processSkipChar(c, isTerminal, false); - } else if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { + } else if (ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) { // As a last resort, use additional proximity characters mProximityMatching = true; ++mProximityCount; mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO; if (DEBUG_CORRECTION - && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { AKLOGI("ADDITIONALPROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, @@ -557,7 +551,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( } } else { if (DEBUG_CORRECTION - && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); @@ -567,20 +561,20 @@ Correction::CorrectionType Correction::processCharAndCalcState( return processUnrelatedCorrectionType(); } } else if (secondTransposing) { - // If inputIndex is greater than mInputLength, that means there is no + // If inputIndex is greater than mInputSize, that means there is no // proximity chars. So, we don't need to check proximity. mMatching = true; } else if (isEquivalentChar(matchedProximityCharId)) { mMatching = true; ++mEquivalentCharCount; - mDistances[mOutputIndex] = mProximityInfo->getNormalizedSquaredDistance(mInputIndex, 0); - } else if (ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) { + mDistances[mOutputIndex] = mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, 0); + } else if (NEAR_PROXIMITY_CHAR == matchedProximityCharId) { mProximityMatching = true; ++mProximityCount; mDistances[mOutputIndex] = - mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex); + mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, proximityIndex); if (DEBUG_CORRECTION - && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { AKLOGI("PROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, @@ -592,8 +586,8 @@ Correction::CorrectionType Correction::processCharAndCalcState( // 4. Last char excessive correction mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0 && mTransposedCount == 0 - && mProximityCount == 0 && (mInputIndex == mInputLength - 2); - const bool isSameAsUserTypedLength = (mInputLength == mInputIndex + 1) || mLastCharExceeded; + && mProximityCount == 0 && (mInputIndex == mInputSize - 2); + const bool isSameAsUserTypedLength = (mInputSize == mInputIndex + 1) || mLastCharExceeded; if (mLastCharExceeded) { ++mExcessiveCount; } @@ -604,7 +598,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( } const bool needsToTryOnTerminalForTheLastPossibleExcessiveChar = - mExceeding && mInputIndex == mInputLength - 2; + mExceeding && mInputIndex == mInputSize - 2; // Finally, we are ready to go to the next character, the next "virtual node". // We should advance the input index. @@ -620,7 +614,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( mTerminalInputIndex = mInputIndex - 1; mTerminalOutputIndex = mOutputIndex - 1; if (DEBUG_CORRECTION - && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength) + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputSize) && (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) { DUMP_WORD(mWord, mOutputIndex); AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, @@ -634,13 +628,10 @@ Correction::CorrectionType Correction::processCharAndCalcState( } } -Correction::~Correction() { -} - -inline static int getQuoteCount(const unsigned short* word, const int length) { +inline static int getQuoteCount(const unsigned short *word, const int length) { int quoteCount = 0; for (int i = 0; i < length; ++i) { - if(word[i] == '\'') { + if (word[i] == SINGLE_QUOTE) { ++quoteCount; } } @@ -657,12 +648,12 @@ inline static bool isUpperCase(unsigned short c) { /* static */ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex, - const int outputIndex, const int freq, int* editDistanceTable, const Correction* correction, - const int inputLength) { + const int outputIndex, const int freq, int *editDistanceTable, const Correction *correction, + const int inputSize) { const int excessivePos = correction->getExcessivePos(); const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER; - const ProximityInfo *proximityInfo = correction->mProximityInfo; + const ProximityInfoState *proximityInfoState = &correction->mProximityInfoState; const int skippedCount = correction->mSkippedCount; const int transposedCount = correction->mTransposedCount / 2; const int excessiveCount = correction->mExcessiveCount + correction->mTransposedCount % 2; @@ -670,55 +661,55 @@ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex const bool lastCharExceeded = correction->mLastCharExceeded; const bool useFullEditDistance = correction->mUseFullEditDistance; const int outputLength = outputIndex + 1; - if (skippedCount >= inputLength || inputLength == 0) { + if (skippedCount >= inputSize || inputSize == 0) { return -1; } // TODO: find more robust way - bool sameLength = lastCharExceeded ? (inputLength == inputIndex + 2) - : (inputLength == inputIndex + 1); + bool sameLength = lastCharExceeded ? (inputSize == inputIndex + 2) + : (inputSize == inputIndex + 1); // TODO: use mExcessiveCount - const int matchCount = inputLength - correction->mProximityCount - excessiveCount; + const int matchCount = inputSize - correction->mProximityCount - excessiveCount; - const unsigned short* word = correction->mWord; + const unsigned short *word = correction->mWord; const bool skipped = skippedCount > 0; const int quoteDiffCount = max(0, getQuoteCount(word, outputLength) - - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength)); + - getQuoteCount(proximityInfoState->getPrimaryInputWord(), inputSize)); // TODO: Calculate edit distance for transposed and excessive int ed = 0; if (DEBUG_DICT_FULL) { - dumpEditDistance10ForDebug(editDistanceTable, correction->mInputLength, outputLength); + dumpEditDistance10ForDebug(editDistanceTable, correction->mInputSize, outputLength); } int adjustedProximityMatchedCount = proximityMatchedCount; int finalFreq = freq; if (DEBUG_CORRECTION_FREQ - && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) { + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputSize)) { AKLOGI("FinalFreq0: %d", finalFreq); } // TODO: Optimize this. if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) { - ed = getCurrentEditDistance(editDistanceTable, correction->mInputLength, outputLength, - inputLength) - transposedCount; + ed = getCurrentEditDistance(editDistanceTable, correction->mInputSize, outputLength, + inputSize) - transposedCount; const int matchWeight = powerIntCapped(typedLetterMultiplier, - max(inputLength, outputLength) - ed); + max(inputSize, outputLength) - ed); multiplyIntCapped(matchWeight, &finalFreq); // TODO: Demote further if there are two or more excessive chars with longer user input? - if (inputLength > outputLength) { + if (inputSize > outputLength) { multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq); } ed = max(0, ed - quoteDiffCount); - adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputLength)), + adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputSize)), proximityMatchedCount); if (transposedCount <= 0) { - if (ed == 1 && (inputLength == outputLength - 1 || inputLength == outputLength + 1)) { + if (ed == 1 && (inputSize == outputLength - 1 || inputSize == outputLength + 1)) { // Promote a word with just one skipped or excessive char if (sameLength) { multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE @@ -737,8 +728,7 @@ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex multiplyIntCapped(matchWeight, &finalFreq); } - if (proximityInfo->getMatchedProximityId(0, word[0], true) - == ProximityInfo::UNRELATED_CHAR) { + if (proximityInfoState->getMatchedProximityId(0, word[0], true) == UNRELATED_CHAR) { multiplyRate(FIRST_CHAR_DIFFERENT_DEMOTION_RATE, &finalFreq); } @@ -748,8 +738,8 @@ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex // Demotion for a word with missing character if (skipped) { const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE - * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) - / (10 * inputLength + * (10 * inputSize - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) + / (10 * inputSize - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); if (DEBUG_DICT_FULL) { AKLOGI("Demotion rate for missing character is %d.", demotionRate); @@ -764,7 +754,7 @@ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex // Demotion for a word with excessive character if (excessiveCount > 0) { multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); - if (!lastCharExceeded && !proximityInfo->existsAdjacentProximityChars(excessivePos)) { + if (!lastCharExceeded && !proximityInfoState->existsAdjacentProximityChars(excessivePos)) { if (DEBUG_DICT_FULL) { AKLOGI("Double excessive demotion"); } @@ -775,8 +765,9 @@ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex } const bool performTouchPositionCorrection = - CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled() - && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0; + CALIBRATE_SCORE_BY_TOUCH_COORDINATES + && proximityInfoState->touchPositionCorrectionEnabled() + && skippedCount == 0 && excessiveCount == 0 && transposedCount == 0; // Score calibration by touch coordinates is being done only for pure-fat finger typing error // cases. int additionalProximityCount = 0; @@ -795,8 +786,8 @@ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex static const float MIN = 0.3f; static const float R1 = NEUTRAL_SCORE_SQUARED_RADIUS; static const float R2 = HALF_SCORE_SQUARED_RADIUS; - const float x = (float)squaredDistance - / ProximityInfo::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR; + const float x = static_cast<float>(squaredDistance) + / ProximityInfoState::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR; const float factor = max((x < R1) ? (A * (R1 - x) + B * x) / R1 : (B * (R2 - x) + C * (x - R1)) / (R2 - R1), MIN); @@ -850,7 +841,7 @@ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex ? adjustedProximityMatchedCount : (proximityMatchedCount + transposedCount); multiplyRate( - 100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq); + 100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputSize, &finalFreq); // Promotion for an exactly matched word if (ed == 0) { @@ -885,7 +876,7 @@ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex e ... exceeding p ... proximity matching */ - if (matchCount == inputLength && matchCount >= 2 && !skipped + if (matchCount == inputSize && matchCount >= 2 && !skipped && word[matchCount] == word[matchCount - 1]) { multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq); } @@ -895,8 +886,8 @@ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex multiplyIntCapped(fullWordMultiplier, &finalFreq); } - if (useFullEditDistance && outputLength > inputLength + 1) { - const int diff = outputLength - inputLength - 1; + if (useFullEditDistance && outputLength > inputSize + 1) { + const int diff = outputLength - inputSize - 1; const int divider = diff < 31 ? 1 << diff : S_INT_MAX; finalFreq = divider > finalFreq ? 1 : finalFreq / divider; } @@ -906,8 +897,8 @@ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex } if (DEBUG_CORRECTION_FREQ - && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) { - DUMP_WORD(proximityInfo->getPrimaryInputWord(), inputLength); + && (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputSize)) { + DUMP_WORD(correction->getPrimaryInputWord(), inputSize); DUMP_WORD(correction->mWord, outputLength); AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d, A%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount, skippedCount, transposedCount, excessiveCount, additionalProximityCount, @@ -920,7 +911,7 @@ int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex /* static */ int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords( const int *freqArray, const int *wordLengthArray, const int wordCount, - const Correction* correction, const bool isSpaceProximity, const unsigned short *word) { + const Correction *correction, const bool isSpaceProximity, const unsigned short *word) { const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; bool firstCapitalizedWordDemotion = false; @@ -946,7 +937,7 @@ int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords( int totalLength = 0; int totalFreq = 0; - for (int i = 0; i < wordCount; ++i){ + for (int i = 0; i < wordCount; ++i) { const int wordLength = wordLengthArray[i]; if (wordLength <= 0) { return 0; @@ -1050,10 +1041,10 @@ int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords( /* Damerau-Levenshtein distance */ inline static int editDistanceInternal( - int* editDistanceTable, const unsigned short* before, - const int beforeLength, const unsigned short* after, const int afterLength) { + int *editDistanceTable, const unsigned short *before, + const int beforeLength, const unsigned short *after, const int afterLength) { // dp[li][lo] dp[a][b] = dp[ a * lo + b] - int* dp = editDistanceTable; + int *dp = editDistanceTable; const int li = beforeLength + 1; const int lo = afterLength + 1; for (int i = 0; i < li; ++i) { @@ -1089,8 +1080,8 @@ inline static int editDistanceInternal( return dp[li * lo - 1]; } -int Correction::RankingAlgorithm::editDistance(const unsigned short* before, - const int beforeLength, const unsigned short* after, const int afterLength) { +int Correction::RankingAlgorithm::editDistance(const unsigned short *before, + const int beforeLength, const unsigned short *after, const int afterLength) { int table[(beforeLength + 1) * (afterLength + 1)]; return editDistanceInternal(table, before, beforeLength, after, afterLength); } @@ -1099,7 +1090,7 @@ int Correction::RankingAlgorithm::editDistance(const unsigned short* before, // In dictionary.cpp, getSuggestion() method, // suggestion scores are computed using the below formula. // original score -// := pow(mTypedLetterMultiplier (this is defined 2), +// := powf(mTypedLetterMultiplier (this is defined 2), // (the number of matched characters between typed word and suggested word)) // * (individual word's score which defined in the unigram dictionary, // and this score is defined in range [0, 255].) @@ -1111,15 +1102,15 @@ int Correction::RankingAlgorithm::editDistance(const unsigned short* before, // capitalization, then treat it as if the score was 255. // - If before.length() == after.length() // => multiply by mFullWordMultiplier (this is defined 2)) -// So, maximum original score is pow(2, min(before.length(), after.length())) * 255 * 2 * 1.2 +// So, maximum original score is powf(2, min(before.length(), after.length())) * 255 * 2 * 1.2 // For historical reasons we ignore the 1.2 modifier (because the measure for a good // autocorrection threshold was done at a time when it didn't exist). This doesn't change // the result. -// So, we can normalize original score by dividing pow(2, min(b.l(),a.l())) * 255 * 2. +// So, we can normalize original score by dividing powf(2, min(b.l(),a.l())) * 255 * 2. /* static */ -float Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short* before, - const int beforeLength, const unsigned short* after, const int afterLength, +float Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short *before, + const int beforeLength, const unsigned short *after, const int afterLength, const int score) { if (0 == beforeLength || 0 == afterLength) { return 0; @@ -1136,15 +1127,16 @@ float Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short* be return 0; } - const float maxScore = score >= S_INT_MAX ? S_INT_MAX : MAX_INITIAL_SCORE - * pow((float)TYPED_LETTER_MULTIPLIER, - (float)min(beforeLength, afterLength - spaceCount)) * FULL_WORD_MULTIPLIER; + const float maxScore = score >= S_INT_MAX ? static_cast<float>(S_INT_MAX) + : static_cast<float>(MAX_INITIAL_SCORE) + * powf(static_cast<float>(TYPED_LETTER_MULTIPLIER), + static_cast<float>(min(beforeLength, afterLength - spaceCount))) + * static_cast<float>(FULL_WORD_MULTIPLIER); // add a weight based on edit distance. // distance <= max(afterLength, beforeLength) == afterLength, // so, 0 <= distance / afterLength <= 1 - const float weight = 1.0 - (float) distance / afterLength; - return (score / maxScore) * weight; + const float weight = 1.0f - static_cast<float>(distance) / static_cast<float>(afterLength); + return (static_cast<float>(score) / maxScore) * weight; } - } // namespace latinime diff --git a/native/jni/src/correction.h b/native/jni/src/correction.h index 3300a8491..f016d5453 100644 --- a/native/jni/src/correction.h +++ b/native/jni/src/correction.h @@ -17,11 +17,13 @@ #ifndef LATINIME_CORRECTION_H #define LATINIME_CORRECTION_H -#include <assert.h> +#include <cassert> +#include <cstring> // for memset() #include <stdint.h> -#include "correction_state.h" +#include "correction_state.h" #include "defines.h" +#include "proximity_info_state.h" namespace latinime { @@ -37,10 +39,108 @@ class Correction { NOT_ON_TERMINAL } CorrectionType; + Correction() + : mProximityInfo(0), mUseFullEditDistance(false), mDoAutoCompletion(false), + mMaxEditDistance(0), mMaxDepth(0), mInputSize(0), mSpaceProximityPos(0), + mMissingSpacePos(0), mTerminalInputIndex(0), mTerminalOutputIndex(0), mMaxErrors(0), + mTotalTraverseCount(0), mNeedsToTraverseAllNodes(false), mOutputIndex(0), + mInputIndex(0), mEquivalentCharCount(0), mProximityCount(0), mExcessiveCount(0), + mTransposedCount(0), mSkippedCount(0), mTransposedPos(0), mExcessivePos(0), + mSkipPos(0), mLastCharExceeded(false), mMatching(false), mProximityMatching(false), + mAdditionalProximityMatching(false), mExceeding(false), mTransposing(false), + mSkipping(false), mProximityInfoState() { + memset(mWord, 0, sizeof(mWord)); + memset(mDistances, 0, sizeof(mDistances)); + memset(mEditDistanceTable, 0, sizeof(mEditDistanceTable)); + // NOTE: mCorrectionStates is an array of instances. + // No need to initialize it explicitly here. + } + + virtual ~Correction() {} + void resetCorrection(); + void initCorrection( + const ProximityInfo *pi, const int inputSize, const int maxWordLength); + void initCorrectionState(const int rootPos, const int childCount, const bool traverseAll); + + // TODO: remove + void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, + const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance, + const bool doAutoCompletion, const int maxErrors); + void checkState(); + bool sameAsTyped(); + bool initProcessState(const int index); + + int getInputIndex() const; + + bool needsToPrune() const; + + int pushAndGetTotalTraverseCount() { + return ++mTotalTraverseCount; + } + + int getFreqForSplitMultipleWords( + const int *freqArray, const int *wordLengthArray, const int wordCount, + const bool isSpaceProximity, const unsigned short *word); + int getFinalProbability(const int probability, unsigned short **word, int *wordLength); + int getFinalProbabilityForSubQueue(const int probability, unsigned short **word, + int *wordLength, const int inputSize); + + CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal); + + ///////////////////////// + // Tree helper methods + int goDownTree(const int parentIndex, const int childCount, const int firstChildPos); + + inline int getTreeSiblingPos(const int index) const { + return mCorrectionStates[index].mSiblingPos; + } + + inline void setTreeSiblingPos(const int index, const int pos) { + mCorrectionStates[index].mSiblingPos = pos; + } + + inline int getTreeParentIndex(const int index) const { + return mCorrectionStates[index].mParentIndex; + } + + class RankingAlgorithm { + public: + static int calculateFinalProbability(const int inputIndex, const int depth, + const int probability, int *editDistanceTable, const Correction *correction, + const int inputSize); + static int calcFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray, + const int wordCount, const Correction *correction, const bool isSpaceProximity, + const unsigned short *word); + static float calcNormalizedScore(const unsigned short *before, const int beforeLength, + const unsigned short *after, const int afterLength, const int score); + static int editDistance(const unsigned short *before, + const int beforeLength, const unsigned short *after, const int afterLength); + private: + static const int CODE_SPACE = ' '; + static const int MAX_INITIAL_SCORE = 255; + }; + + // proximity info state + void initInputParams(const ProximityInfo *proximityInfo, const int32_t *inputCodes, + const int inputSize, const int *xCoordinates, const int *yCoordinates) { + mProximityInfoState.initInputParams(0, MAX_POINT_TO_KEY_LENGTH, + proximityInfo, inputCodes, inputSize, xCoordinates, yCoordinates, 0, 0, false); + } + + const unsigned short *getPrimaryInputWord() const { + return mProximityInfoState.getPrimaryInputWord(); + } + + unsigned short getPrimaryCharAt(const int index) const { + return mProximityInfoState.getPrimaryCharAt(index); + } + + private: + DISALLOW_COPY_AND_ASSIGN(Correction); + ///////////////////////// // static inline utils // ///////////////////////// - static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); @@ -93,112 +193,45 @@ class Correction { } } - Correction(const int typedLetterMultiplier, const int fullWordMultiplier); - void resetCorrection(); - void initCorrection( - const ProximityInfo *pi, const int inputLength, const int maxWordLength); - void initCorrectionState(const int rootPos, const int childCount, const bool traverseAll); - - // TODO: remove - void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, - const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance, - const bool doAutoCompletion, const int maxErrors); - void checkState(); - bool initProcessState(const int index); - - int getInputIndex(); - - virtual ~Correction(); - int getSpaceProximityPos() const { + inline int getSpaceProximityPos() const { return mSpaceProximityPos; } - int getMissingSpacePos() const { + inline int getMissingSpacePos() const { return mMissingSpacePos; } - int getSkipPos() const { + inline int getSkipPos() const { return mSkipPos; } - int getExcessivePos() const { + inline int getExcessivePos() const { return mExcessivePos; } - int getTransposedPos() const { + inline int getTransposedPos() const { return mTransposedPos; } - bool needsToPrune() const; - - int pushAndGetTotalTraverseCount() { - return ++mTotalTraverseCount; - } - - int getFreqForSplitMultipleWords( - const int *freqArray, const int *wordLengthArray, const int wordCount, - const bool isSpaceProximity, const unsigned short *word); - int getFinalProbability(const int probability, unsigned short **word, int* wordLength); - int getFinalProbabilityForSubQueue(const int probability, unsigned short **word, - int* wordLength, const int inputLength); - - CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal); - - ///////////////////////// - // Tree helper methods - int goDownTree(const int parentIndex, const int childCount, const int firstChildPos); - - inline int getTreeSiblingPos(const int index) const { - return mCorrectionStates[index].mSiblingPos; - } - - inline void setTreeSiblingPos(const int index, const int pos) { - mCorrectionStates[index].mSiblingPos = pos; - } - - inline int getTreeParentIndex(const int index) const { - return mCorrectionStates[index].mParentIndex; - } - - class RankingAlgorithm { - public: - static int calculateFinalProbability(const int inputIndex, const int depth, - const int probability, int *editDistanceTable, const Correction* correction, - const int inputLength); - static int calcFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray, - const int wordCount, const Correction* correction, const bool isSpaceProximity, - const unsigned short *word); - static float calcNormalizedScore(const unsigned short* before, const int beforeLength, - const unsigned short* after, const int afterLength, const int score); - static int editDistance(const unsigned short* before, - const int beforeLength, const unsigned short* after, const int afterLength); - private: - static const int CODE_SPACE = ' '; - static const int MAX_INITIAL_SCORE = 255; - static const int TYPED_LETTER_MULTIPLIER = 2; - static const int FULL_WORD_MULTIPLIER = 2; - }; - - private: inline void incrementInputIndex(); inline void incrementOutputIndex(); inline void startToTraverseAllNodes(); - inline bool isQuote(const unsigned short c); + inline bool isSingleQuote(const unsigned short c); inline CorrectionType processSkipChar( const int32_t c, const bool isTerminal, const bool inputIndexIncremented); inline CorrectionType processUnrelatedCorrectionType(); inline void addCharToCurrentWord(const int32_t c); inline int getFinalProbabilityInternal(const int probability, unsigned short **word, - int* wordLength, const int inputLength); + int *wordLength, const int inputSize); - const int TYPED_LETTER_MULTIPLIER; - const int FULL_WORD_MULTIPLIER; + static const int TYPED_LETTER_MULTIPLIER = 2; + static const int FULL_WORD_MULTIPLIER = 2; const ProximityInfo *mProximityInfo; bool mUseFullEditDistance; bool mDoAutoCompletion; int mMaxEditDistance; int mMaxDepth; - int mInputLength; + int mInputSize; int mSpaceProximityPos; int mMissingSpacePos; int mTerminalInputIndex; @@ -240,7 +273,7 @@ class Correction { bool mExceeding; bool mTransposing; bool mSkipping; - + ProximityInfoState mProximityInfoState; }; } // namespace latinime #endif // LATINIME_CORRECTION_H diff --git a/native/jni/src/correction_state.h b/native/jni/src/correction_state.h index 5b2cbd3a2..a63d4aa94 100644 --- a/native/jni/src/correction_state.h +++ b/native/jni/src/correction_state.h @@ -79,6 +79,5 @@ inline static void initCorrectionState(CorrectionState *state, const int rootPos state->mSkipping = false; state->mAdditionalProximityMatching = false; } - } // namespace latinime #endif // LATINIME_CORRECTION_STATE_H diff --git a/native/jni/src/debug.h b/native/jni/src/debug.h deleted file mode 100644 index 376ba59d9..000000000 --- a/native/jni/src/debug.h +++ /dev/null @@ -1,72 +0,0 @@ -/* -** -** Copyright 2011, The Android Open Source Project -** -** Licensed under the Apache License, Version 2.0 (the "License"); -** you may not use this file except in compliance with the License. -** You may obtain a copy of the License at -** -** http://www.apache.org/licenses/LICENSE-2.0 -** -** Unless required by applicable law or agreed to in writing, software -** distributed under the License is distributed on an "AS IS" BASIS, -** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -** See the License for the specific language governing permissions and -** limitations under the License. -*/ - -#ifndef LATINIME_DEBUG_H -#define LATINIME_DEBUG_H - -#include "defines.h" - -static inline unsigned char* convertToUnibyteString(unsigned short* input, unsigned char* output, - const unsigned int length) { - unsigned int i = 0; - for (; i <= length && input[i] != 0; ++i) - output[i] = input[i] & 0xFF; - output[i] = 0; - return output; -} - -static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned short* input, - unsigned char* output, const unsigned int length, unsigned char c) { - unsigned int i = 0; - for (; i <= length && input[i] != 0; ++i) - output[i] = input[i] & 0xFF; - if (i > 0) output[i-1] = c; - output[i] = 0; - return output; -} - -static inline void LOGI_S16(unsigned short* string, const unsigned int length) { - unsigned char tmp_buffer[length]; - convertToUnibyteString(string, tmp_buffer, length); - AKLOGI(">> %s", tmp_buffer); - // The log facility is throwing out log that comes too fast. The following - // is a dirty way of slowing down processing so that we can see all log. - // TODO : refactor this in a blocking log or something. - // usleep(10); -} - -static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int length, - unsigned char c) { - unsigned char tmp_buffer[length+1]; - convertToUnibyteStringAndReplaceLastChar(string, tmp_buffer, length, c); - AKLOGI(">> %s", tmp_buffer); - // Likewise - // usleep(10); -} - -static inline void printDebug(const char* tag, int* codes, int codesSize, int MAX_PROXIMITY_CHARS) { - unsigned char *buf = (unsigned char*)malloc((1 + codesSize) * sizeof(*buf)); - - buf[codesSize] = 0; - while (--codesSize >= 0) - buf[codesSize] = (unsigned char)codes[codesSize * MAX_PROXIMITY_CHARS]; - AKLOGI("%s, WORD = %s", tag, buf); - - free(buf); -} - -#endif // LATINIME_DEBUG_H diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index cd2fc634a..ad526fb7f 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -1,42 +1,86 @@ /* -** -** Copyright 2010, The Android Open Source Project -** -** Licensed under the Apache License, Version 2.0 (the "License"); -** you may not use this file except in compliance with the License. -** You may obtain a copy of the License at -** -** http://www.apache.org/licenses/LICENSE-2.0 -** -** Unless required by applicable law or agreed to in writing, software -** distributed under the License is distributed on an "AS IS" BASIS, -** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -** See the License for the specific language governing permissions and -** limitations under the License. -*/ + * Copyright (C) 2010, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef LATINIME_DEFINES_H #define LATINIME_DEFINES_H +#include <stdint.h> + #if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG) -#include <cutils/log.h> -#define AKLOGE ALOGE -#define AKLOGI ALOGI +#include <android/log.h> +#ifndef LOG_TAG +#define LOG_TAG "LatinIME: " +#endif +#define AKLOGE(fmt, ...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, fmt, ##__VA_ARGS__) +#define AKLOGI(fmt, ...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, fmt, ##__VA_ARGS__) + +#define DUMP_RESULT(words, frequencies, maxWordCount, maxWordLength) do { \ + dumpResult(words, frequencies, maxWordCount, maxWordLength); } while (0) +#define DUMP_WORD(word, length) do { dumpWord(word, length); } while (0) +#define DUMP_WORD_INT(word, length) do { dumpWordInt(word, length); } while (0) +// TODO: INTS_TO_CHARS +#define SHORTS_TO_CHARS(input, length, output) do { \ + shortArrayToCharArray(input, length, output); } while (0) + +static inline void dumpWordInfo(const unsigned short *word, const int length, + const int rank, const int frequency) { + static char charBuf[50]; + int i = 0; + for (; i < length; ++i) { + const unsigned short c = word[i]; + if (c == 0) { + break; + } + // static_cast only for debugging + charBuf[i] = static_cast<char>(c); + } + charBuf[i] = 0; + if (i > 1) { + AKLOGI("%2d [ %s ] (%d)", rank, charBuf, frequency); + } +} -#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0) -#define DUMP_WORD_INT(word, length) do { dumpWordInt(word, length); } while(0) +static inline void dumpResult( + const unsigned short *outWords, const int *frequencies, const int maxWordCounts, + const int maxWordLength) { + AKLOGI("--- DUMP RESULT ---------"); + for (int i = 0; i < maxWordCounts; ++i) { + dumpWordInfo(&outWords[i * maxWordLength], maxWordLength, i, frequencies[i]); + } + AKLOGI("-------------------------"); +} -static inline void dumpWord(const unsigned short* word, const int length) { +static inline void dumpWord(const unsigned short *word, const int length) { static char charBuf[50]; - - for (int i = 0; i < length; ++i) { - charBuf[i] = word[i]; + int i = 0; + for (; i < length; ++i) { + const unsigned short c = word[i]; + if (c == 0) { + break; + } + // static_cast only for debugging + charBuf[i] = static_cast<char>(c); + } + charBuf[i] = 0; + if (i > 1) { + AKLOGI("[ %s ]", charBuf); } - charBuf[length] = 0; - AKLOGI("[ %s ]", charBuf); } -static inline void dumpWordInt(const int* word, const int length) { +static inline void dumpWordInt(const int *word, const int length) { static char charBuf[50]; for (int i = 0; i < length; ++i) { @@ -46,11 +90,58 @@ static inline void dumpWordInt(const int* word, const int length) { AKLOGI("i[ %s ]", charBuf); } +// TODO: Change this to intArrayToCharArray +static inline void shortArrayToCharArray( + const unsigned short *input, const int length, char *output) { + int i = 0; + for (;i < length; ++i) { + const unsigned short c = input[i]; + if (c == 0) { + break; + } + // static_cast only for debugging + output[i] = static_cast<char>(c); + } + output[i] = 0; +} + +#ifndef __ANDROID__ +#include <cassert> +#include <execinfo.h> +#include <stdlib.h> + +#define ASSERT(success) do { if (!(success)) { showStackTrace(); assert(success);} } while (0) +#define SHOW_STACK_TRACE do { showStackTrace(); } while (0) + +static inline void showStackTrace() { + void *callstack[128]; + int i, frames = backtrace(callstack, 128); + char **strs = backtrace_symbols(callstack, frames); + for (i = 0; i < frames; ++i) { + if (i == 0) { + AKLOGI("=== Trace ==="); + continue; + } + AKLOGI("%s", strs[i]); + } + free(strs); +} +#else +#include <cassert> +#define ASSERT(success) assert(success) +#define SHOW_STACK_TRACE +#endif + #else #define AKLOGE(fmt, ...) #define AKLOGI(fmt, ...) +#define DUMP_RESULT(words, frequencies, maxWordCount, maxWordLength) #define DUMP_WORD(word, length) #define DUMP_WORD_INT(word, length) +#define ASSERT(success) +#define SHOW_STACK_TRACE +// TODO: INTS_TO_CHARS +#define SHORTS_TO_CHARS(input, length, output) #endif #ifdef FLAG_DO_PROFILE @@ -64,14 +155,14 @@ static unsigned int profile_counter[PROF_BUF_SIZE]; #define PROF_RESET prof_reset() #define PROF_COUNT(prof_buf_id) ++profile_counter[prof_buf_id] -#define PROF_OPEN do { PROF_RESET; PROF_START(PROF_BUF_SIZE - 1); } while(0) +#define PROF_OPEN do { PROF_RESET; PROF_START(PROF_BUF_SIZE - 1); } while (0) #define PROF_START(prof_buf_id) do { \ - PROF_COUNT(prof_buf_id); profile_old[prof_buf_id] = (clock()); } while(0) -#define PROF_CLOSE do { PROF_END(PROF_BUF_SIZE - 1); PROF_OUTALL; } while(0) + PROF_COUNT(prof_buf_id); profile_old[prof_buf_id] = (clock()); } while (0) +#define PROF_CLOSE do { PROF_END(PROF_BUF_SIZE - 1); PROF_OUTALL; } while (0) #define PROF_END(prof_buf_id) profile_buf[prof_buf_id] += ((clock()) - profile_old[prof_buf_id]) #define PROF_CLOCKOUT(prof_buf_id) \ AKLOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id])) -#define PROF_OUTALL do { AKLOGI("--- %s ---", __FUNCTION__); prof_out(); } while(0) +#define PROF_OUTALL do { AKLOGI("--- %s ---", __FUNCTION__); prof_out(); } while (0) static inline void prof_reset(void) { for (int i = 0; i < PROF_BUF_SIZE; ++i) { @@ -86,17 +177,18 @@ static inline void prof_out(void) { AKLOGI("Error: You must call PROF_OPEN before PROF_CLOSE."); } AKLOGI("Total time is %6.3f ms.", - profile_buf[PROF_BUF_SIZE - 1] * 1000 / (float)CLOCKS_PER_SEC); + profile_buf[PROF_BUF_SIZE - 1] * 1000.0f / static_cast<float>(CLOCKS_PER_SEC)); float all = 0; for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) { all += profile_buf[i]; } if (all == 0) all = 1; for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) { - if (profile_buf[i] != 0) { + if (profile_buf[i]) { AKLOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.", i, (profile_buf[i] * 100 / all), - profile_buf[i] * 1000 / (float)CLOCKS_PER_SEC, profile_counter[i]); + profile_buf[i] * 1000.0f / static_cast<float>(CLOCKS_PER_SEC), + profile_counter[i]); } } } @@ -116,10 +208,6 @@ static inline void prof_out(void) { #endif // FLAG_DO_PROFILE #ifdef FLAG_DBG -#include <cutils/log.h> -#ifndef LOG_TAG -#define LOG_TAG "LatinIME: " -#endif #define DEBUG_DICT true #define DEBUG_DICT_FULL false #define DEBUG_EDIT_DISTANCE false @@ -132,6 +220,12 @@ static inline void prof_out(void) { #define DEBUG_CORRECTION_FREQ false #define DEBUG_WORDS_PRIORITY_QUEUE false +#ifdef FLAG_FULL_DBG +#define DEBUG_GEO_FULL true +#else +#define DEBUG_GEO_FULL false +#endif + #else // FLAG_DBG #define DEBUG_DICT false @@ -146,6 +240,7 @@ static inline void prof_out(void) { #define DEBUG_CORRECTION_FREQ false #define DEBUG_WORDS_PRIORITY_QUEUE false +#define DEBUG_GEO_FULL false #endif // FLAG_DBG @@ -176,15 +271,15 @@ static inline void prof_out(void) { #define FLAG_BIGRAM_FREQ 0x7F #define DICTIONARY_VERSION_MIN 200 -#define NOT_VALID_WORD -99 -#define NOT_A_CHARACTER -1 -#define NOT_A_DISTANCE -1 -#define NOT_A_COORDINATE -1 -#define EQUIVALENT_CHAR_WITHOUT_DISTANCE_INFO -2 -#define PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO -3 -#define ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO -4 -#define NOT_AN_INDEX -1 -#define NOT_A_PROBABILITY -1 +#define NOT_VALID_WORD (-99) +#define NOT_A_CODE_POINT (-1) +#define NOT_A_DISTANCE (-1) +#define NOT_A_COORDINATE (-1) +#define EQUIVALENT_CHAR_WITHOUT_DISTANCE_INFO (-2) +#define PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO (-3) +#define ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO (-4) +#define NOT_AN_INDEX (-1) +#define NOT_A_PROBABILITY (-1) #define KEYCODE_SPACE ' ' @@ -225,9 +320,15 @@ static inline void prof_out(void) { // This is only used for the size of array. Not to be used in c functions. #define MAX_WORD_LENGTH_INTERNAL 48 +// This must be the same as ProximityInfo#MAX_PROXIMITY_CHARS_SIZE, currently it's 16. +#define MAX_PROXIMITY_CHARS_SIZE_INTERNAL 16 + // This must be equal to ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE in KeyDetector.java #define ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE 2 +// Assuming locale strings such as en_US, sr-Latn etc. +#define MAX_LOCALE_STRING_LENGTH 10 + // Word limit for sub queues used in WordsPriorityQueuePool. Sub queues are temporary queues used // for better performance. // Holds up to 1 candidate for each word @@ -252,12 +353,18 @@ static inline void prof_out(void) { #define FIRST_WORD_INDEX 0 +#define MAX_SPACES_INTERNAL 16 + +// Max Distance between point to key +#define MAX_POINT_TO_KEY_LENGTH 10000000 + +// The max number of the keys in one keyboard layout +#define MAX_KEY_COUNT_IN_A_KEYBOARD 64 + // TODO: Reduce this constant if possible; check the maximum number of digraphs in the same // word in the dictionary for languages with digraphs, like German and French #define DEFAULT_MAX_DIGRAPH_SEARCH_DEPTH 5 -// Minimum suggest depth for one word for all cases except for missing space suggestions. -#define MIN_SUGGEST_DEPTH 1 #define MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION 3 #define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3 @@ -286,7 +393,26 @@ template<typename T> inline T max(T a, T b) { return a > b ? a : b; } #define NEUTRAL_AREA_RADIUS_RATIO 1.3f // DEBUG -#define INPUTLENGTH_FOR_DEBUG -1 -#define MIN_OUTPUT_INDEX_FOR_DEBUG -1 - +#define INPUTLENGTH_FOR_DEBUG (-1) +#define MIN_OUTPUT_INDEX_FOR_DEBUG (-1) + +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&) + +#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ + TypeName(); \ + DISALLOW_COPY_AND_ASSIGN(TypeName) + +// Used as a return value for character comparison +typedef enum { + // Same char, possibly with different case or accent + EQUIVALENT_CHAR, + // It is a char located nearby on the keyboard + NEAR_PROXIMITY_CHAR, + // It is an unrelated char + UNRELATED_CHAR, + // Additional proximity char which can differ by language. + ADDITIONAL_PROXIMITY_CHAR +} ProximityType; #endif // LATINIME_DEFINES_H diff --git a/native/jni/src/dic_traverse_wrapper.cpp b/native/jni/src/dic_traverse_wrapper.cpp new file mode 100644 index 000000000..88ca9fa0d --- /dev/null +++ b/native/jni/src/dic_traverse_wrapper.cpp @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2012, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "LatinIME: jni: Session" + +#include "dic_traverse_wrapper.h" + +namespace latinime { +void *(*DicTraverseWrapper::sDicTraverseSessionFactoryMethod)(JNIEnv *, jstring) = 0; +void (*DicTraverseWrapper::sDicTraverseSessionReleaseMethod)(void *) = 0; +void (*DicTraverseWrapper::sDicTraverseSessionInitMethod)( + void *, const Dictionary *const, const int *, const int) = 0; +} // namespace latinime diff --git a/native/jni/src/dic_traverse_wrapper.h b/native/jni/src/dic_traverse_wrapper.h new file mode 100644 index 000000000..292382487 --- /dev/null +++ b/native/jni/src/dic_traverse_wrapper.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2012, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_TRAVERSE_WRAPPER_H +#define LATINIME_DIC_TRAVERSE_WRAPPER_H + +#include <stdint.h> + +#include "defines.h" +#include "jni.h" + +namespace latinime { +class Dictionary; +// TODO: Remove +class DicTraverseWrapper { + public: + static void *getDicTraverseSession(JNIEnv *env, jstring locale) { + if (sDicTraverseSessionFactoryMethod) { + return sDicTraverseSessionFactoryMethod(env, locale); + } + return 0; + } + static void initDicTraverseSession(void *traverseSession, + const Dictionary *const dictionary, const int *prevWord, const int prevWordLength) { + if (sDicTraverseSessionInitMethod) { + sDicTraverseSessionInitMethod(traverseSession, dictionary, prevWord, prevWordLength); + } + } + static void releaseDicTraverseSession(void *traverseSession) { + if (sDicTraverseSessionReleaseMethod) { + sDicTraverseSessionReleaseMethod(traverseSession); + } + } + static void setTraverseSessionFactoryMethod( + void *(*factoryMethod)(JNIEnv *, jstring)) { + sDicTraverseSessionFactoryMethod = factoryMethod; + } + static void setTraverseSessionInitMethod( + void (*initMethod)(void *, const Dictionary *const, const int *, const int)) { + sDicTraverseSessionInitMethod = initMethod; + } + static void setTraverseSessionReleaseMethod(void (*releaseMethod)(void *)) { + sDicTraverseSessionReleaseMethod = releaseMethod; + } + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DicTraverseWrapper); + static void *(*sDicTraverseSessionFactoryMethod)(JNIEnv *, jstring); + static void (*sDicTraverseSessionInitMethod)( + void *, const Dictionary *const, const int *, const int); + static void (*sDicTraverseSessionReleaseMethod)(void *); +}; +int register_DicTraverseSession(JNIEnv *env); +} // namespace latinime +#endif // LATINIME_DIC_TRAVERSE_WRAPPER_H diff --git a/native/jni/src/dictionary.cpp b/native/jni/src/dictionary.cpp index 1fb02478b..2fbe83e86 100644 --- a/native/jni/src/dictionary.cpp +++ b/native/jni/src/dictionary.cpp @@ -1,36 +1,44 @@ /* -** -** Copyright 2009, The Android Open Source Project -** -** Licensed under the Apache License, Version 2.0 (the "License"); -** you may not use this file except in compliance with the License. -** You may obtain a copy of the License at -** -** http://www.apache.org/licenses/LICENSE-2.0 -** -** Unless required by applicable law or agreed to in writing, software -** distributed under the License is distributed on an "AS IS" BASIS, -** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -** See the License for the specific language governing permissions and -** limitations under the License. -*/ - -#include <stdio.h> + * Copyright (C) 2009, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #define LOG_TAG "LatinIME: dictionary.cpp" +#include <stdint.h> + +#include "bigram_dictionary.h" #include "binary_format.h" #include "defines.h" #include "dictionary.h" +#include "dic_traverse_wrapper.h" +#include "gesture_decoder_wrapper.h" +#include "unigram_dictionary.h" namespace latinime { // TODO: Change the type of all keyCodes to uint32_t Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, - int typedLetterMultiplier, int fullWordMultiplier, - int maxWordLength, int maxWords) - : mDict((unsigned char*) dict), mDictSize(dictSize), - mMmapFd(mmapFd), mDictBufAdjust(dictBufAdjust) { + int typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords, + int maxPredictions) + : mDict(static_cast<unsigned char *>(dict)), + mOffsetDict((static_cast<unsigned char *>(dict)) + BinaryFormat::getHeaderSize(mDict)), + mDictSize(dictSize), mMmapFd(mmapFd), mDictBufAdjust(dictBufAdjust), + mUnigramDictionary(new UnigramDictionary(mOffsetDict, typedLetterMultiplier, + fullWordMultiplier, maxWordLength, maxWords, BinaryFormat::getFlags(mDict))), + mBigramDictionary(new BigramDictionary(mOffsetDict, maxWordLength, maxPredictions)), + mGestureDecoder(new GestureDecoderWrapper(maxWordLength, maxWords)) { if (DEBUG_DICT) { if (MAX_WORD_LENGTH_INTERNAL < maxWordLength) { AKLOGI("Max word length (%d) is greater than %d", @@ -38,30 +46,56 @@ Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, AKLOGI("IN NATIVE SUGGEST Version: %d", (mDict[0] & 0xFF)); } } - mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier); - mWordsPriorityQueuePool = new WordsPriorityQueuePool( - maxWords, SUB_QUEUE_MAX_WORDS, maxWordLength); - const unsigned int headerSize = BinaryFormat::getHeaderSize(mDict); - const unsigned int options = BinaryFormat::getFlags(mDict); - mUnigramDictionary = new UnigramDictionary(mDict + headerSize, typedLetterMultiplier, - fullWordMultiplier, maxWordLength, maxWords, options); - mBigramDictionary = new BigramDictionary(mDict + headerSize, maxWordLength, this); } Dictionary::~Dictionary() { - delete mCorrection; - delete mWordsPriorityQueuePool; delete mUnigramDictionary; delete mBigramDictionary; + delete mGestureDecoder; } -int Dictionary::getFrequency(const int32_t *word, int length) { +int Dictionary::getSuggestions(ProximityInfo *proximityInfo, void *traverseSession, + int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, + int *codes, int codesSize, int *prevWordChars, + int prevWordLength, int commitPoint, bool isGesture, + bool useFullEditDistance, unsigned short *outWords, + int *frequencies, int *spaceIndices, int *outputTypes) const { + int result = 0; + if (isGesture) { + DicTraverseWrapper::initDicTraverseSession( + traverseSession, this, prevWordChars, prevWordLength); + result = mGestureDecoder->getSuggestions(proximityInfo, traverseSession, + xcoordinates, ycoordinates, times, pointerIds, codes, codesSize, commitPoint, + outWords, frequencies, spaceIndices, outputTypes); + if (DEBUG_DICT) { + DUMP_RESULT(outWords, frequencies, 18 /* MAX_WORDS */, MAX_WORD_LENGTH_INTERNAL); + } + return result; + } else { + std::map<int, int> bigramMap; + uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE]; + mBigramDictionary->fillBigramAddressToFrequencyMapAndFilter(prevWordChars, + prevWordLength, &bigramMap, bigramFilter); + result = mUnigramDictionary->getSuggestions(proximityInfo, xcoordinates, + ycoordinates, codes, codesSize, &bigramMap, bigramFilter, + useFullEditDistance, outWords, frequencies, outputTypes); + return result; + } +} + +int Dictionary::getBigrams(const int32_t *word, int length, int *codes, int codesSize, + unsigned short *outWords, int *frequencies, int *outputTypes) const { + if (length <= 0) return 0; + return mBigramDictionary->getBigrams(word, length, codes, codesSize, outWords, frequencies, + outputTypes); +} + +int Dictionary::getFrequency(const int32_t *word, int length) const { return mUnigramDictionary->getFrequency(word, length); } bool Dictionary::isValidBigram(const int32_t *word1, int length1, const int32_t *word2, - int length2) { + int length2) const { return mBigramDictionary->isValidBigram(word1, length1, word2, length2); } - } // namespace latinime diff --git a/native/jni/src/dictionary.h b/native/jni/src/dictionary.h index 9f2367904..a1358890d 100644 --- a/native/jni/src/dictionary.h +++ b/native/jni/src/dictionary.h @@ -17,55 +17,63 @@ #ifndef LATINIME_DICTIONARY_H #define LATINIME_DICTIONARY_H -#include <map> +#include <stdint.h> -#include "bigram_dictionary.h" -#include "char_utils.h" -#include "correction.h" #include "defines.h" -#include "proximity_info.h" -#include "unigram_dictionary.h" -#include "words_priority_queue_pool.h" namespace latinime { +class BigramDictionary; +class IncrementalDecoderInterface; +class ProximityInfo; +class UnigramDictionary; + class Dictionary { public: + // Taken from SuggestedWords.java + const static int KIND_TYPED = 0; // What user typed + const static int KIND_CORRECTION = 1; // Simple correction/suggestion + const static int KIND_COMPLETION = 2; // Completion (suggestion with appended chars) + const static int KIND_WHITELIST = 3; // Whitelisted word + const static int KIND_BLACKLIST = 4; // Blacklisted word + const static int KIND_HARDCODED = 5; // Hardcoded suggestion, e.g. punctuation + const static int KIND_APP_DEFINED = 6; // Suggested by the application + const static int KIND_SHORTCUT = 7; // A shortcut + const static int KIND_PREDICTION = 8; // A prediction (== a suggestion with no input) + Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler, - int fullWordMultiplier, int maxWordLength, int maxWords); + int fullWordMultiplier, int maxWordLength, int maxWords, int maxPredictions); - int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates, - int *codes, int codesSize, const int32_t* prevWordChars, const int prevWordLength, - bool useFullEditDistance, unsigned short *outWords, int *frequencies) { - std::map<int, int> bigramMap; - uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE]; - mBigramDictionary->fillBigramAddressToFrequencyMapAndFilter(prevWordChars, - prevWordLength, &bigramMap, bigramFilter); - return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool, - mCorrection, xcoordinates, ycoordinates, codes, codesSize, &bigramMap, - bigramFilter, useFullEditDistance, outWords, frequencies); - } + int getSuggestions(ProximityInfo *proximityInfo, void *traverseSession, int *xcoordinates, + int *ycoordinates, int *times, int *pointerIds, int *codes, int codesSize, + int *prevWordChars, int prevWordLength, int commitPoint, bool isGesture, + bool useFullEditDistance, unsigned short *outWords, + int *frequencies, int *spaceIndices, int *outputTypes) const; int getBigrams(const int32_t *word, int length, int *codes, int codesSize, - unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams) { - return mBigramDictionary->getBigrams(word, length, codes, codesSize, outWords, frequencies, - maxWordLength, maxBigrams); - } + unsigned short *outWords, int *frequencies, int *outputTypes) const; - int getFrequency(const int32_t *word, int length); - bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2); - void *getDict() { return (void *)mDict; } - int getDictSize() { return mDictSize; } - int getMmapFd() { return mMmapFd; } - int getDictBufAdjust() { return mDictBufAdjust; } - ~Dictionary(); + int getFrequency(const int32_t *word, int length) const; + bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2) const; + const uint8_t *getDict() const { // required to release dictionary buffer + return mDict; + } + const uint8_t *getOffsetDict() const { + return mOffsetDict; + } + int getDictSize() const { return mDictSize; } + int getMmapFd() const { return mMmapFd; } + int getDictBufAdjust() const { return mDictBufAdjust; } + virtual ~Dictionary(); // public static utility methods // static inline methods should be defined in the header file static int wideStrLen(unsigned short *str); private: - const unsigned char *mDict; + DISALLOW_IMPLICIT_CONSTRUCTORS(Dictionary); + const uint8_t *mDict; + const uint8_t *mOffsetDict; // Used only for the mmap version of dictionary loading, but we use these as dummy variables // also for the malloc version. @@ -73,21 +81,21 @@ class Dictionary { const int mMmapFd; const int mDictBufAdjust; - UnigramDictionary *mUnigramDictionary; - BigramDictionary *mBigramDictionary; - WordsPriorityQueuePool *mWordsPriorityQueuePool; - Correction *mCorrection; + const UnigramDictionary *mUnigramDictionary; + const BigramDictionary *mBigramDictionary; + IncrementalDecoderInterface *mGestureDecoder; }; // public static utility methods // static inline methods should be defined in the header file inline int Dictionary::wideStrLen(unsigned short *str) { if (!str) return 0; - unsigned short *end = str; - while (*end) - end++; - return end - str; + int length = 0; + while (*str) { + str++; + length++; + } + return length; } } // namespace latinime - #endif // LATINIME_DICTIONARY_H diff --git a/native/jni/src/geometry_utils.h b/native/jni/src/geometry_utils.h new file mode 100644 index 000000000..3892b46a0 --- /dev/null +++ b/native/jni/src/geometry_utils.h @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_GEOMETRY_UTILS_H +#define LATINIME_GEOMETRY_UTILS_H + +#include <cmath> + +#define MAX_PATHS 2 + +#define DEBUG_DECODER false + +#define M_PI_F 3.14159265f + +#define ROUND_FLOAT_10000(f) ((f) < 1000.0f && (f) > 0.001f) \ + ? (floorf((f) * 10000.0f) / 10000.0f) : (f) + +#define SQUARE_FLOAT(x) ((x) * (x)) + +namespace latinime { + +static inline float getSquaredDistanceFloat(float x1, float y1, float x2, float y2) { + const float deltaX = x1 - x2; + const float deltaY = y1 - y2; + return SQUARE_FLOAT(deltaX) + SQUARE_FLOAT(deltaY); +} + +static inline float getDistanceFloat(float x1, float y1, float x2, float y2) { + return hypotf(x1 - x2, y1 - y2); +} + +static inline int getDistanceInt(int x1, int y1, int x2, int y2) { + return static_cast<int>(getDistanceFloat(static_cast<float>(x1), static_cast<float>(y1), + static_cast<float>(x2), static_cast<float>(y2))); +} + +static inline float getAngle(int x1, int y1, int x2, int y2) { + const int dx = x1 - x2; + const int dy = y1 - y2; + if (dx == 0 && dy == 0) return 0; + return atan2f(static_cast<float>(dy), static_cast<float>(dx)); +} + +static inline float getAngleDiff(float a1, float a2) { + const float deltaA = fabsf(a1 - a2); + const float diff = ROUND_FLOAT_10000(deltaA); + if (diff > M_PI_F) { + const float normalizedDiff = 2.0f * M_PI_F - diff; + return ROUND_FLOAT_10000(normalizedDiff); + } + return diff; +} + +static inline float pointToLineSegSquaredDistanceFloat( + float x, float y, float x1, float y1, float x2, float y2, bool extend) { + const float ray1x = x - x1; + const float ray1y = y - y1; + const float ray2x = x2 - x1; + const float ray2y = y2 - y1; + + const float dotProduct = ray1x * ray2x + ray1y * ray2y; + const float lineLengthSqr = SQUARE_FLOAT(ray2x) + SQUARE_FLOAT(ray2y); + const float projectionLengthSqr = dotProduct / lineLengthSqr; + + float projectionX; + float projectionY; + if (!extend && projectionLengthSqr < 0.0f) { + projectionX = x1; + projectionY = y1; + } else if (!extend && projectionLengthSqr > 1.0f) { + projectionX = x2; + projectionY = y2; + } else { + projectionX = x1 + projectionLengthSqr * ray2x; + projectionY = y1 + projectionLengthSqr * ray2y; + } + return getSquaredDistanceFloat(x, y, projectionX, projectionY); +} +} // namespace latinime +#endif // LATINIME_GEOMETRY_UTILS_H diff --git a/native/jni/src/gesture/gesture_decoder_wrapper.cpp b/native/jni/src/gesture/gesture_decoder_wrapper.cpp new file mode 100644 index 000000000..afbe0c5c3 --- /dev/null +++ b/native/jni/src/gesture/gesture_decoder_wrapper.cpp @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gesture_decoder_wrapper.h" + +namespace latinime { + IncrementalDecoderInterface * + (*GestureDecoderWrapper::sGestureDecoderFactoryMethod)(int, int) = 0; +} // namespace latinime diff --git a/native/jni/src/gesture/gesture_decoder_wrapper.h b/native/jni/src/gesture/gesture_decoder_wrapper.h new file mode 100644 index 000000000..92e1ded49 --- /dev/null +++ b/native/jni/src/gesture/gesture_decoder_wrapper.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_GESTURE_DECODER_WRAPPER_H +#define LATINIME_GESTURE_DECODER_WRAPPER_H + +#include <stdint.h> +#include "defines.h" +#include "incremental_decoder_interface.h" + +namespace latinime { + +class UnigramDictionary; +class BigramDictionary; +class ProximityInfo; + +class GestureDecoderWrapper : public IncrementalDecoderInterface { + public: + GestureDecoderWrapper(const int maxWordLength, const int maxWords) + : mIncrementalDecoderInterface(getGestureDecoderInstance(maxWordLength, maxWords)) { + } + + virtual ~GestureDecoderWrapper() { + delete mIncrementalDecoderInterface; + } + + int getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, int *inputYs, + int *times, int *pointerIds, int *codes, int inputSize, int commitPoint, + unsigned short *outWords, int *frequencies, int *outputIndices, + int *outputTypes) const { + if (!mIncrementalDecoderInterface) { + return 0; + } + return mIncrementalDecoderInterface->getSuggestions( + pInfo, traverseSession, inputXs, inputYs, times, pointerIds, codes, + inputSize, commitPoint, outWords, frequencies, outputIndices, outputTypes); + } + + static void setGestureDecoderFactoryMethod( + IncrementalDecoderInterface *(*factoryMethod)(int, int)) { + sGestureDecoderFactoryMethod = factoryMethod; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(GestureDecoderWrapper); + static IncrementalDecoderInterface *getGestureDecoderInstance(int maxWordLength, int maxWords) { + if (sGestureDecoderFactoryMethod) { + return sGestureDecoderFactoryMethod(maxWordLength, maxWords); + } + return 0; + } + + static IncrementalDecoderInterface *(*sGestureDecoderFactoryMethod)(int, int); + IncrementalDecoderInterface *mIncrementalDecoderInterface; +}; +} // namespace latinime +#endif // LATINIME_GESTURE_DECODER_WRAPPER_H diff --git a/native/jni/src/gesture/incremental_decoder_interface.h b/native/jni/src/gesture/incremental_decoder_interface.h new file mode 100644 index 000000000..d1395aab9 --- /dev/null +++ b/native/jni/src/gesture/incremental_decoder_interface.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_INCREMENTAL_DECODER_INTERFACE_H +#define LATINIME_INCREMENTAL_DECODER_INTERFACE_H + +#include <stdint.h> +#include "defines.h" + +namespace latinime { + +class UnigramDictionary; +class BigramDictionary; +class ProximityInfo; + +class IncrementalDecoderInterface { + public: + virtual int getSuggestions(ProximityInfo *pInfo, void *traverseSession, + int *inputXs, int *inputYs, int *times, int *pointerIds, int *codes, + int inputSize, int commitPoint, unsigned short *outWords, int *frequencies, + int *outputIndices, int *outputTypes) const = 0; + IncrementalDecoderInterface() { }; + virtual ~IncrementalDecoderInterface() { }; + private: + DISALLOW_COPY_AND_ASSIGN(IncrementalDecoderInterface); +}; +} // namespace latinime +#endif // LATINIME_INCREMENTAL_DECODER_INTERFACE_H diff --git a/native/jni/src/gesture/incremental_decoder_wrapper.cpp b/native/jni/src/gesture/incremental_decoder_wrapper.cpp new file mode 100644 index 000000000..8fcda6c9e --- /dev/null +++ b/native/jni/src/gesture/incremental_decoder_wrapper.cpp @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "incremental_decoder_wrapper.h" + +namespace latinime { + IncrementalDecoderInterface * + (*IncrementalDecoderWrapper::sIncrementalDecoderFactoryMethod)(int, int) = 0; +} // namespace latinime diff --git a/native/jni/src/gesture/incremental_decoder_wrapper.h b/native/jni/src/gesture/incremental_decoder_wrapper.h new file mode 100644 index 000000000..da7afdb8a --- /dev/null +++ b/native/jni/src/gesture/incremental_decoder_wrapper.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_INCREMENTAL_DECODER_WRAPPER_H +#define LATINIME_INCREMENTAL_DECODER_WRAPPER_H + +#include <stdint.h> +#include "defines.h" +#include "incremental_decoder_interface.h" + +namespace latinime { + +class UnigramDictionary; +class BigramDictionary; +class ProximityInfo; + +class IncrementalDecoderWrapper : public IncrementalDecoderInterface { + public: + IncrementalDecoderWrapper(const int maxWordLength, const int maxWords) + : mIncrementalDecoderInterface(getIncrementalDecoderInstance(maxWordLength, maxWords)) { + } + + virtual ~IncrementalDecoderWrapper() { + delete mIncrementalDecoderInterface; + } + + int getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, int *inputYs, + int *times, int *pointerIds, int *codes, int inputSize, int commitPoint, + unsigned short *outWords, int *frequencies, int *outputIndices, + int *outputTypes) const { + if (!mIncrementalDecoderInterface) { + return 0; + } + return mIncrementalDecoderInterface->getSuggestions( + pInfo, traverseSession, inputXs, inputYs, times, pointerIds, codes, + inputSize, commitPoint, outWords, frequencies, outputIndices, outputTypes); + } + + static void setIncrementalDecoderFactoryMethod( + IncrementalDecoderInterface *(*factoryMethod)(int, int)) { + sIncrementalDecoderFactoryMethod = factoryMethod; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(IncrementalDecoderWrapper); + static IncrementalDecoderInterface *getIncrementalDecoderInstance(int maxWordLength, + int maxWords) { + if (sIncrementalDecoderFactoryMethod) { + return sIncrementalDecoderFactoryMethod(maxWordLength, maxWords); + } + return 0; + } + + static IncrementalDecoderInterface *(*sIncrementalDecoderFactoryMethod)(int, int); + IncrementalDecoderInterface *mIncrementalDecoderInterface; +}; +} // namespace latinime +#endif // LATINIME_INCREMENTAL_DECODER_WRAPPER_H diff --git a/native/jni/src/hash_map_compat.h b/native/jni/src/hash_map_compat.h new file mode 100644 index 000000000..116359a73 --- /dev/null +++ b/native/jni/src/hash_map_compat.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2012, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_HASH_MAP_COMPAT_H +#define LATINIME_HASH_MAP_COMPAT_H + +// TODO: Use std::unordered_map that has been standardized in C++11 + +#ifdef __APPLE__ +#include <ext/hash_map> +#else // __APPLE__ +#include <hash_map> +#endif // __APPLE__ + +#ifdef __SGI_STL_PORT +#define hash_map_compat stlport::hash_map +#else // __SGI_STL_PORT +#define hash_map_compat __gnu_cxx::hash_map +#endif // __SGI_STL_PORT + +#endif // LATINIME_HASH_MAP_COMPAT_H diff --git a/native/jni/src/proximity_info.cpp b/native/jni/src/proximity_info.cpp index 80e14fbb1..c9f83b62c 100644 --- a/native/jni/src/proximity_info.cpp +++ b/native/jni/src/proximity_info.cpp @@ -14,93 +14,86 @@ * limitations under the License. */ -#include <assert.h> -#include <stdio.h> -#include <string> +#include <cassert> +#include <cmath> +#include <cstring> #define LOG_TAG "LatinIME: proximity_info.cpp" #include "additional_proximity_chars.h" +#include "char_utils.h" #include "defines.h" -#include "dictionary.h" +#include "geometry_utils.h" +#include "jni.h" #include "proximity_info.h" namespace latinime { -inline void copyOrFillZero(void *to, const void *from, size_t size) { - if (from) { - memcpy(to, from, size); - } else { - memset(to, 0, size); +/* static */ const float ProximityInfo::NOT_A_DISTANCE_FLOAT = -1.0f; + +static inline void safeGetOrFillZeroIntArrayRegion(JNIEnv *env, jintArray jArray, jsize len, + jint *buffer) { + if (jArray && buffer) { + env->GetIntArrayRegion(jArray, 0, len, buffer); + } else if (buffer) { + memset(buffer, 0, len * sizeof(jint)); } } -const float ProximityInfo::NOT_A_DISTANCE_FLOAT = -1.0f; +static inline void safeGetOrFillZeroFloatArrayRegion(JNIEnv *env, jfloatArray jArray, jsize len, + jfloat *buffer) { + if (jArray && buffer) { + env->GetFloatArrayRegion(jArray, 0, len, buffer); + } else if (buffer) { + memset(buffer, 0, len * sizeof(jfloat)); + } +} -ProximityInfo::ProximityInfo(const std::string localeStr, const int maxProximityCharsSize, +ProximityInfo::ProximityInfo(JNIEnv *env, const jstring localeJStr, const int maxProximityCharsSize, const int keyboardWidth, const int keyboardHeight, const int gridWidth, - const int gridHeight, const int mostCommonKeyWidth, - const int32_t *proximityCharsArray, const int keyCount, const int32_t *keyXCoordinates, - const int32_t *keyYCoordinates, const int32_t *keyWidths, const int32_t *keyHeights, - const int32_t *keyCharCodes, const float *sweetSpotCenterXs, const float *sweetSpotCenterYs, - const float *sweetSpotRadii) - : MAX_PROXIMITY_CHARS_SIZE(maxProximityCharsSize), - GRID_WIDTH(gridWidth), GRID_HEIGHT(gridHeight), + const int gridHeight, const int mostCommonKeyWidth, const jintArray proximityChars, + const int keyCount, const jintArray keyXCoordinates, const jintArray keyYCoordinates, + const jintArray keyWidths, const jintArray keyHeights, const jintArray keyCharCodes, + const jfloatArray sweetSpotCenterXs, const jfloatArray sweetSpotCenterYs, + const jfloatArray sweetSpotRadii) + : MAX_PROXIMITY_CHARS_SIZE(maxProximityCharsSize), GRID_WIDTH(gridWidth), + GRID_HEIGHT(gridHeight), MOST_COMMON_KEY_WIDTH(mostCommonKeyWidth), MOST_COMMON_KEY_WIDTH_SQUARE(mostCommonKeyWidth * mostCommonKeyWidth), CELL_WIDTH((keyboardWidth + gridWidth - 1) / gridWidth), CELL_HEIGHT((keyboardHeight + gridHeight - 1) / gridHeight), KEY_COUNT(min(keyCount, MAX_KEY_COUNT_IN_A_KEYBOARD)), + KEYBOARD_WIDTH(keyboardWidth), KEYBOARD_HEIGHT(keyboardHeight), HAS_TOUCH_POSITION_CORRECTION_DATA(keyCount > 0 && keyXCoordinates && keyYCoordinates && keyWidths && keyHeights && keyCharCodes && sweetSpotCenterXs && sweetSpotCenterYs && sweetSpotRadii), - mLocaleStr(localeStr), - mInputXCoordinates(0), mInputYCoordinates(0), - mTouchPositionCorrectionEnabled(false) { + mProximityCharsArray(new int32_t[GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE + /* proximityGridLength */]), + mCodeToKeyMap() { const int proximityGridLength = GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE; - mProximityCharsArray = new int32_t[proximityGridLength]; - mInputCodes = new int32_t[MAX_PROXIMITY_CHARS_SIZE * MAX_WORD_LENGTH_INTERNAL]; if (DEBUG_PROXIMITY_INFO) { AKLOGI("Create proximity info array %d", proximityGridLength); } - memcpy(mProximityCharsArray, proximityCharsArray, - proximityGridLength * sizeof(mProximityCharsArray[0])); - const int normalizedSquaredDistancesLength = - MAX_PROXIMITY_CHARS_SIZE * MAX_WORD_LENGTH_INTERNAL; - mNormalizedSquaredDistances = new int[normalizedSquaredDistancesLength]; - for (int i = 0; i < normalizedSquaredDistancesLength; ++i) { - mNormalizedSquaredDistances[i] = NOT_A_DISTANCE; - } - - copyOrFillZero(mKeyXCoordinates, keyXCoordinates, KEY_COUNT * sizeof(mKeyXCoordinates[0])); - copyOrFillZero(mKeyYCoordinates, keyYCoordinates, KEY_COUNT * sizeof(mKeyYCoordinates[0])); - copyOrFillZero(mKeyWidths, keyWidths, KEY_COUNT * sizeof(mKeyWidths[0])); - copyOrFillZero(mKeyHeights, keyHeights, KEY_COUNT * sizeof(mKeyHeights[0])); - copyOrFillZero(mKeyCharCodes, keyCharCodes, KEY_COUNT * sizeof(mKeyCharCodes[0])); - copyOrFillZero(mSweetSpotCenterXs, sweetSpotCenterXs, - KEY_COUNT * sizeof(mSweetSpotCenterXs[0])); - copyOrFillZero(mSweetSpotCenterYs, sweetSpotCenterYs, - KEY_COUNT * sizeof(mSweetSpotCenterYs[0])); - copyOrFillZero(mSweetSpotRadii, sweetSpotRadii, KEY_COUNT * sizeof(mSweetSpotRadii[0])); - - initializeCodeToKeyIndex(); -} - -// Build the reversed look up table from the char code to the index in mKeyXCoordinates, -// mKeyYCoordinates, mKeyWidths, mKeyHeights, mKeyCharCodes. -void ProximityInfo::initializeCodeToKeyIndex() { - memset(mCodeToKeyIndex, -1, (MAX_CHAR_CODE + 1) * sizeof(mCodeToKeyIndex[0])); - for (int i = 0; i < KEY_COUNT; ++i) { - const int code = mKeyCharCodes[i]; - if (0 <= code && code <= MAX_CHAR_CODE) { - mCodeToKeyIndex[code] = i; - } + const jsize localeCStrUtf8Length = env->GetStringUTFLength(localeJStr); + if (localeCStrUtf8Length >= MAX_LOCALE_STRING_LENGTH) { + AKLOGI("Locale string length too long: length=%d", localeCStrUtf8Length); + assert(false); } + memset(mLocaleStr, 0, sizeof(mLocaleStr)); + env->GetStringUTFRegion(localeJStr, 0, env->GetStringLength(localeJStr), mLocaleStr); + safeGetOrFillZeroIntArrayRegion(env, proximityChars, proximityGridLength, mProximityCharsArray); + safeGetOrFillZeroIntArrayRegion(env, keyXCoordinates, KEY_COUNT, mKeyXCoordinates); + safeGetOrFillZeroIntArrayRegion(env, keyYCoordinates, KEY_COUNT, mKeyYCoordinates); + safeGetOrFillZeroIntArrayRegion(env, keyWidths, KEY_COUNT, mKeyWidths); + safeGetOrFillZeroIntArrayRegion(env, keyHeights, KEY_COUNT, mKeyHeights); + safeGetOrFillZeroIntArrayRegion(env, keyCharCodes, KEY_COUNT, mKeyCodePoints); + safeGetOrFillZeroFloatArrayRegion(env, sweetSpotCenterXs, KEY_COUNT, mSweetSpotCenterXs); + safeGetOrFillZeroFloatArrayRegion(env, sweetSpotCenterYs, KEY_COUNT, mSweetSpotCenterYs); + safeGetOrFillZeroFloatArrayRegion(env, sweetSpotRadii, KEY_COUNT, mSweetSpotRadii); + initializeG(); } ProximityInfo::~ProximityInfo() { - delete[] mNormalizedSquaredDistances; delete[] mProximityCharsArray; - delete[] mInputCodes; } inline int ProximityInfo::getStartIndexFromCoordinates(const int x, const int y) const { @@ -112,7 +105,8 @@ bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { if (x < 0 || y < 0) { if (DEBUG_DICT) { AKLOGI("HasSpaceProximity: Illegal coordinates (%d, %d)", x, y); - assert(false); + // TODO: Enable this assertion. + //assert(false); } return false; } @@ -121,24 +115,33 @@ bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { if (DEBUG_PROXIMITY_INFO) { AKLOGI("hasSpaceProximity: index %d, %d, %d", startIndex, x, y); } + int32_t *proximityCharsArray = mProximityCharsArray; for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { if (DEBUG_PROXIMITY_INFO) { AKLOGI("Index: %d", mProximityCharsArray[startIndex + i]); } - if (mProximityCharsArray[startIndex + i] == KEYCODE_SPACE) { + if (proximityCharsArray[startIndex + i] == KEYCODE_SPACE) { return true; } } return false; } -bool ProximityInfo::isOnKey(const int keyId, const int x, const int y) const { - if (keyId < 0) return true; // NOT_A_ID is -1, but return whenever < 0 just in case - const int left = mKeyXCoordinates[keyId]; - const int top = mKeyYCoordinates[keyId]; - const int right = left + mKeyWidths[keyId] + 1; - const int bottom = top + mKeyHeights[keyId]; - return left < right && top < bottom && x >= left && x < right && y >= top && y < bottom; +static inline float getNormalizedSquaredDistanceFloat(float x1, float y1, float x2, float y2, + float scale) { + const float deltaX = x1 - x2; + const float deltaY = y1 - y2; + return (SQUARE_FLOAT(deltaX) + SQUARE_FLOAT(deltaY)) / SQUARE_FLOAT(scale); +} + +float ProximityInfo::getNormalizedSquaredDistanceFromCenterFloat( + const int keyId, const int x, const int y) const { + const float centerX = static_cast<float>(getKeyCenterXOfKeyIdG(keyId)); + const float centerY = static_cast<float>(getKeyCenterYOfKeyIdG(keyId)); + const float touchX = static_cast<float>(x); + const float touchY = static_cast<float>(y); + const float keyWidth = static_cast<float>(getMostCommonKeyWidth()); + return getNormalizedSquaredDistanceFloat(centerX, centerY, touchX, touchY, keyWidth); } int ProximityInfo::squaredDistanceToEdge(const int keyId, const int x, const int y) const { @@ -156,16 +159,17 @@ int ProximityInfo::squaredDistanceToEdge(const int keyId, const int x, const int void ProximityInfo::calculateNearbyKeyCodes( const int x, const int y, const int32_t primaryKey, int *inputCodes) const { + int32_t *proximityCharsArray = mProximityCharsArray; int insertPos = 0; inputCodes[insertPos++] = primaryKey; const int startIndex = getStartIndexFromCoordinates(x, y); if (startIndex >= 0) { for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { - const int32_t c = mProximityCharsArray[startIndex + i]; + const int32_t c = proximityCharsArray[startIndex + i]; if (c < KEYCODE_SPACE || c == primaryKey) { continue; } - const int keyIndex = getKeyIndex(c); + const int keyIndex = getKeyIndexOf(c); const bool onKey = isOnKey(keyIndex, x, y); const int distance = squaredDistanceToEdge(keyIndex, x, y); if (onKey || distance < MOST_COMMON_KEY_WIDTH_SQUARE) { @@ -179,7 +183,7 @@ void ProximityInfo::calculateNearbyKeyCodes( } } const int additionalProximitySize = - AdditionalProximityChars::getAdditionalCharsSize(&mLocaleStr, primaryKey); + AdditionalProximityChars::getAdditionalCharsSize(mLocaleStr, primaryKey); if (additionalProximitySize > 0) { inputCodes[insertPos++] = ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE; if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) { @@ -189,13 +193,13 @@ void ProximityInfo::calculateNearbyKeyCodes( return; } - const int32_t* additionalProximityChars = - AdditionalProximityChars::getAdditionalChars(&mLocaleStr, primaryKey); + const int32_t *additionalProximityChars = + AdditionalProximityChars::getAdditionalChars(mLocaleStr, primaryKey); for (int j = 0; j < additionalProximitySize; ++j) { const int32_t ac = additionalProximityChars[j]; int k = 0; for (; k < insertPos; ++k) { - if ((int)ac == inputCodes[k]) { + if (static_cast<int>(ac) == inputCodes[k]) { break; } } @@ -214,256 +218,78 @@ void ProximityInfo::calculateNearbyKeyCodes( } // Add a delimiter for the proximity characters for (int i = insertPos; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { - inputCodes[i] = NOT_A_CODE; - } -} - -void ProximityInfo::setInputParams(const int32_t* inputCodes, const int inputLength, - const int* xCoordinates, const int* yCoordinates) { - memset(mInputCodes, 0, - MAX_WORD_LENGTH_INTERNAL * MAX_PROXIMITY_CHARS_SIZE * sizeof(mInputCodes[0])); - - for (int i = 0; i < inputLength; ++i) { - const int32_t primaryKey = inputCodes[i]; - const int x = xCoordinates[i]; - const int y = yCoordinates[i]; - int *proximities = &mInputCodes[i * MAX_PROXIMITY_CHARS_SIZE]; - calculateNearbyKeyCodes(x, y, primaryKey, proximities); - } - - if (DEBUG_PROXIMITY_CHARS) { - for (int i = 0; i < inputLength; ++i) { - AKLOGI("---"); - for (int j = 0; j < MAX_PROXIMITY_CHARS_SIZE; ++j) { - int icc = mInputCodes[i * MAX_PROXIMITY_CHARS_SIZE + j]; - int icfjc = inputCodes[i * MAX_PROXIMITY_CHARS_SIZE + j]; - icc+= 0; - icfjc += 0; - AKLOGI("--- (%d)%c,%c", i, icc, icfjc); - AKLOGI("--- A<%d>,B<%d>", icc, icfjc); - } - } - } - //Keep for debug, sorry - //for (int i = 0; i < MAX_WORD_LENGTH_INTERNAL * MAX_PROXIMITY_CHARS_SIZE; ++i) { - //if (i < inputLength * MAX_PROXIMITY_CHARS_SIZE) { - //mInputCodes[i] = mInputCodesFromJava[i]; - //} else { - // mInputCodes[i] = 0; - // } - //} - mInputXCoordinates = xCoordinates; - mInputYCoordinates = yCoordinates; - mTouchPositionCorrectionEnabled = - HAS_TOUCH_POSITION_CORRECTION_DATA && xCoordinates && yCoordinates; - mInputLength = inputLength; - for (int i = 0; i < inputLength; ++i) { - mPrimaryInputWord[i] = getPrimaryCharAt(i); - } - mPrimaryInputWord[inputLength] = 0; - if (DEBUG_PROXIMITY_CHARS) { - AKLOGI("--- setInputParams"); - } - for (int i = 0; i < mInputLength; ++i) { - const int *proximityChars = getProximityCharsAt(i); - const int primaryKey = proximityChars[0]; - const int x = xCoordinates[i]; - const int y = yCoordinates[i]; - if (DEBUG_PROXIMITY_CHARS) { - int a = x + y + primaryKey; - a += 0; - AKLOGI("--- Primary = %c, x = %d, y = %d", primaryKey, x, y); - // Keep debug code just in case - //int proximities[50]; - //for (int m = 0; m < 50; ++m) { - //proximities[m] = 0; - //} - //calculateNearbyKeyCodes(x, y, primaryKey, proximities); - //for (int l = 0; l < 50 && proximities[l] > 0; ++l) { - //if (DEBUG_PROXIMITY_CHARS) { - //AKLOGI("--- native Proximity (%d) = %c", l, proximities[l]); - //} - //} - } - for (int j = 0; j < MAX_PROXIMITY_CHARS_SIZE && proximityChars[j] > 0; ++j) { - const int currentChar = proximityChars[j]; - const float squaredDistance = hasInputCoordinates() - ? calculateNormalizedSquaredDistance(getKeyIndex(currentChar), i) - : NOT_A_DISTANCE_FLOAT; - if (squaredDistance >= 0.0f) { - mNormalizedSquaredDistances[i * MAX_PROXIMITY_CHARS_SIZE + j] = - (int)(squaredDistance * NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR); - } else { - mNormalizedSquaredDistances[i * MAX_PROXIMITY_CHARS_SIZE + j] = (j == 0) - ? EQUIVALENT_CHAR_WITHOUT_DISTANCE_INFO - : PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO; - } - if (DEBUG_PROXIMITY_CHARS) { - AKLOGI("--- Proximity (%d) = %c", j, currentChar); - } - } - } -} - -inline float square(const float x) { return x * x; } - -float ProximityInfo::calculateNormalizedSquaredDistance( - const int keyIndex, const int inputIndex) const { - if (keyIndex == NOT_AN_INDEX) { - return NOT_A_DISTANCE_FLOAT; - } - if (!hasSweetSpotData(keyIndex)) { - return NOT_A_DISTANCE_FLOAT; - } - if (NOT_A_COORDINATE == mInputXCoordinates[inputIndex]) { - return NOT_A_DISTANCE_FLOAT; + inputCodes[i] = NOT_A_CODE_POINT; } - const float squaredDistance = calculateSquaredDistanceFromSweetSpotCenter(keyIndex, inputIndex); - const float squaredRadius = square(mSweetSpotRadii[keyIndex]); - return squaredDistance / squaredRadius; -} - -bool ProximityInfo::hasInputCoordinates() const { - return mInputXCoordinates && mInputYCoordinates; } -int ProximityInfo::getKeyIndex(const int c) const { +int ProximityInfo::getKeyIndexOf(const int c) const { if (KEY_COUNT == 0) { // We do not have the coordinate data return NOT_AN_INDEX; } - const unsigned short baseLowerC = toBaseLowerCase(c); - if (baseLowerC > MAX_CHAR_CODE) { - return NOT_AN_INDEX; + const int baseLowerC = static_cast<int>(toBaseLowerCase(c)); + hash_map_compat<int, int>::const_iterator mapPos = mCodeToKeyMap.find(baseLowerC); + if (mapPos != mCodeToKeyMap.end()) { + return mapPos->second; } - return mCodeToKeyIndex[baseLowerC]; -} - -float ProximityInfo::calculateSquaredDistanceFromSweetSpotCenter( - const int keyIndex, const int inputIndex) const { - const float sweetSpotCenterX = mSweetSpotCenterXs[keyIndex]; - const float sweetSpotCenterY = mSweetSpotCenterYs[keyIndex]; - const float inputX = (float)mInputXCoordinates[inputIndex]; - const float inputY = (float)mInputYCoordinates[inputIndex]; - return square(inputX - sweetSpotCenterX) + square(inputY - sweetSpotCenterY); + return NOT_AN_INDEX; } -inline const int* ProximityInfo::getProximityCharsAt(const int index) const { - return mInputCodes + (index * MAX_PROXIMITY_CHARS_SIZE); -} - -unsigned short ProximityInfo::getPrimaryCharAt(const int index) const { - return getProximityCharsAt(index)[0]; +int ProximityInfo::getCodePointOf(const int keyIndex) const { + if (keyIndex < 0 || keyIndex >= KEY_COUNT) { + return NOT_A_CODE_POINT; + } + return mKeyIndexToCodePointG[keyIndex]; } -inline bool ProximityInfo::existsCharInProximityAt(const int index, const int c) const { - const int *chars = getProximityCharsAt(index); - int i = 0; - while (chars[i] > 0 && i < MAX_PROXIMITY_CHARS_SIZE) { - if (chars[i++] == c) { - return true; +void ProximityInfo::initializeG() { + // TODO: Optimize + for (int i = 0; i < KEY_COUNT; ++i) { + const int code = mKeyCodePoints[i]; + const int lowerCode = toBaseLowerCase(code); + mCenterXsG[i] = mKeyXCoordinates[i] + mKeyWidths[i] / 2; + mCenterYsG[i] = mKeyYCoordinates[i] + mKeyHeights[i] / 2; + mCodeToKeyMap[lowerCode] = i; + mKeyIndexToCodePointG[i] = lowerCode; + } + for (int i = 0; i < KEY_COUNT; i++) { + mKeyKeyDistancesG[i][i] = 0; + for (int j = i + 1; j < KEY_COUNT; j++) { + mKeyKeyDistancesG[i][j] = getDistanceInt( + mCenterXsG[i], mCenterYsG[i], mCenterXsG[j], mCenterYsG[j]); + mKeyKeyDistancesG[j][i] = mKeyKeyDistancesG[i][j]; } } - return false; } -bool ProximityInfo::existsAdjacentProximityChars(const int index) const { - if (index < 0 || index >= mInputLength) return false; - const int currentChar = getPrimaryCharAt(index); - const int leftIndex = index - 1; - if (leftIndex >= 0 && existsCharInProximityAt(leftIndex, currentChar)) { - return true; - } - const int rightIndex = index + 1; - if (rightIndex < mInputLength && existsCharInProximityAt(rightIndex, currentChar)) { - return true; - } - return false; +int ProximityInfo::getKeyCenterXOfCodePointG(int charCode) const { + return getKeyCenterXOfKeyIdG(getKeyIndexOf(charCode)); } -// In the following function, c is the current character of the dictionary word -// currently examined. -// currentChars is an array containing the keys close to the character the -// user actually typed at the same position. We want to see if c is in it: if so, -// then the word contains at that position a character close to what the user -// typed. -// What the user typed is actually the first character of the array. -// proximityIndex is a pointer to the variable where getMatchedProximityId returns -// the index of c in the proximity chars of the input index. -// Notice : accented characters do not have a proximity list, so they are alone -// in their list. The non-accented version of the character should be considered -// "close", but not the other keys close to the non-accented version. -ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(const int index, - const unsigned short c, const bool checkProximityChars, int *proximityIndex) const { - const int *currentChars = getProximityCharsAt(index); - const int firstChar = currentChars[0]; - const unsigned short baseLowerC = toBaseLowerCase(c); +int ProximityInfo::getKeyCenterYOfCodePointG(int charCode) const { + return getKeyCenterYOfKeyIdG(getKeyIndexOf(charCode)); +} - // The first char in the array is what user typed. If it matches right away, - // that means the user typed that same char for this pos. - if (firstChar == baseLowerC || firstChar == c) { - return EQUIVALENT_CHAR; +int ProximityInfo::getKeyCenterXOfKeyIdG(int keyId) const { + if (keyId >= 0) { + return mCenterXsG[keyId]; } + return 0; +} - if (!checkProximityChars) return UNRELATED_CHAR; - - // If the non-accented, lowercased version of that first character matches c, - // then we have a non-accented version of the accented character the user - // typed. Treat it as a close char. - if (toBaseLowerCase(firstChar) == baseLowerC) - return NEAR_PROXIMITY_CHAR; - - // Not an exact nor an accent-alike match: search the list of close keys - int j = 1; - while (j < MAX_PROXIMITY_CHARS_SIZE - && currentChars[j] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) { - const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c); - if (matched) { - if (proximityIndex) { - *proximityIndex = j; - } - return NEAR_PROXIMITY_CHAR; - } - ++j; +int ProximityInfo::getKeyCenterYOfKeyIdG(int keyId) const { + if (keyId >= 0) { + return mCenterYsG[keyId]; } - if (j < MAX_PROXIMITY_CHARS_SIZE - && currentChars[j] == ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) { - ++j; - while (j < MAX_PROXIMITY_CHARS_SIZE - && currentChars[j] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) { - const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c); - if (matched) { - if (proximityIndex) { - *proximityIndex = j; - } - return ADDITIONAL_PROXIMITY_CHAR; - } - ++j; - } - } - - // Was not included, signal this as an unrelated character. - return UNRELATED_CHAR; + return 0; } -bool ProximityInfo::sameAsTyped(const unsigned short *word, int length) const { - if (length != mInputLength) { - return false; +int ProximityInfo::getKeyKeyDistanceG(int key0, int key1) const { + const int keyId0 = getKeyIndexOf(key0); + const int keyId1 = getKeyIndexOf(key1); + if (keyId0 >= 0 && keyId1 >= 0) { + return mKeyKeyDistancesG[keyId0][keyId1]; } - const int *inputCodes = mInputCodes; - while (length--) { - if ((unsigned int) *inputCodes != (unsigned int) *word) { - return false; - } - inputCodes += MAX_PROXIMITY_CHARS_SIZE; - word++; - } - return true; + return MAX_POINT_TO_KEY_LENGTH; } - -const int ProximityInfo::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2; -const int ProximityInfo::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR; -const int ProximityInfo::MAX_KEY_COUNT_IN_A_KEYBOARD; -const int ProximityInfo::MAX_CHAR_CODE; - } // namespace latinime diff --git a/native/jni/src/proximity_info.h b/native/jni/src/proximity_info.h index 3a2511cf1..0d8c6a3ca 100644 --- a/native/jni/src/proximity_info.h +++ b/native/jni/src/proximity_info.h @@ -18,113 +18,152 @@ #define LATINIME_PROXIMITY_INFO_H #include <stdint.h> -#include <string> #include "defines.h" +#include "hash_map_compat.h" +#include "jni.h" namespace latinime { class Correction; +inline bool isSkippableChar(const uint16_t character) { + // TODO: Do not hardcode here + return character == '\'' || character == '-'; +} + class ProximityInfo { public: - static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2 = 10; - static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR = - 1 << NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2; - - // Used as a return value for character comparison - typedef enum { - // Same char, possibly with different case or accent - EQUIVALENT_CHAR, - // It is a char located nearby on the keyboard - NEAR_PROXIMITY_CHAR, - // It is an unrelated char - UNRELATED_CHAR, - // Additional proximity char which can differ by language. - ADDITIONAL_PROXIMITY_CHAR - } ProximityType; - - ProximityInfo(const std::string localeStr, const int maxProximityCharsSize, + ProximityInfo(JNIEnv *env, const jstring localeJStr, const int maxProximityCharsSize, const int keyboardWidth, const int keyboardHeight, const int gridWidth, - const int gridHeight, const int mostCommonkeyWidth, - const int32_t *proximityCharsArray, const int keyCount, const int32_t *keyXCoordinates, - const int32_t *keyYCoordinates, const int32_t *keyWidths, const int32_t *keyHeights, - const int32_t *keyCharCodes, const float *sweetSpotCenterXs, - const float *sweetSpotCenterYs, const float *sweetSpotRadii); + const int gridHeight, const int mostCommonKeyWidth, const jintArray proximityChars, + const int keyCount, const jintArray keyXCoordinates, const jintArray keyYCoordinates, + const jintArray keyWidths, const jintArray keyHeights, const jintArray keyCharCodes, + const jfloatArray sweetSpotCenterXs, const jfloatArray sweetSpotCenterYs, + const jfloatArray sweetSpotRadii); ~ProximityInfo(); bool hasSpaceProximity(const int x, const int y) const; - void setInputParams(const int32_t *inputCodes, const int inputLength, - const int *xCoordinates, const int *yCoordinates); - const int* getProximityCharsAt(const int index) const; - unsigned short getPrimaryCharAt(const int index) const; - bool existsCharInProximityAt(const int index, const int c) const; - bool existsAdjacentProximityChars(const int index) const; - ProximityType getMatchedProximityId(const int index, const unsigned short c, - const bool checkProximityChars, int *proximityIndex = 0) const; - int getNormalizedSquaredDistance(const int inputIndex, const int proximityIndex) const { - return mNormalizedSquaredDistances[inputIndex * MAX_PROXIMITY_CHARS_SIZE + proximityIndex]; - } + int getNormalizedSquaredDistance(const int inputIndex, const int proximityIndex) const; + float getNormalizedSquaredDistanceFromCenterFloat( + const int keyId, const int x, const int y) const; bool sameAsTyped(const unsigned short *word, int length) const; - const unsigned short* getPrimaryInputWord() const { - return mPrimaryInputWord; + int getKeyIndexOf(const int c) const; + int getCodePointOf(const int keyIndex) const; + bool hasSweetSpotData(const int keyIndex) const { + // When there are no calibration data for a key, + // the radius of the key is assigned to zero. + return mSweetSpotRadii[keyIndex] > 0.0f; + } + float getSweetSpotRadiiAt(int keyIndex) const { + return mSweetSpotRadii[keyIndex]; + } + float getSweetSpotCenterXAt(int keyIndex) const { + return mSweetSpotCenterXs[keyIndex]; + } + float getSweetSpotCenterYAt(int keyIndex) const { + return mSweetSpotCenterYs[keyIndex]; + } + void calculateNearbyKeyCodes( + const int x, const int y, const int32_t primaryKey, int *inputCodes) const; + + bool hasTouchPositionCorrectionData() const { + return HAS_TOUCH_POSITION_CORRECTION_DATA; + } + + int getMostCommonKeyWidth() const { + return MOST_COMMON_KEY_WIDTH; + } + + int getMostCommonKeyWidthSquare() const { + return MOST_COMMON_KEY_WIDTH_SQUARE; + } + + const char *getLocaleStr() const { + return mLocaleStr; + } + + int getKeyCount() const { + return KEY_COUNT; + } + + int getCellHeight() const { + return CELL_HEIGHT; + } + + int getCellWidth() const { + return CELL_WIDTH; } - bool touchPositionCorrectionEnabled() const { - return mTouchPositionCorrectionEnabled; + + int getGridWidth() const { + return GRID_WIDTH; + } + + int getGridHeight() const { + return GRID_HEIGHT; } + int getKeyboardWidth() const { + return KEYBOARD_WIDTH; + } + + int getKeyboardHeight() const { + return KEYBOARD_HEIGHT; + } + + int getKeyCenterXOfCodePointG(int charCode) const; + int getKeyCenterYOfCodePointG(int charCode) const; + int getKeyCenterXOfKeyIdG(int keyId) const; + int getKeyCenterYOfKeyIdG(int keyId) const; + int getKeyKeyDistanceG(int key0, int key1) const; + private: - // The max number of the keys in one keyboard layout - static const int MAX_KEY_COUNT_IN_A_KEYBOARD = 64; - // The upper limit of the char code in mCodeToKeyIndex - static const int MAX_CHAR_CODE = 127; + DISALLOW_IMPLICIT_CONSTRUCTORS(ProximityInfo); static const float NOT_A_DISTANCE_FLOAT; - static const int NOT_A_CODE = -1; int getStartIndexFromCoordinates(const int x, const int y) const; - void initializeCodeToKeyIndex(); + void initializeG(); float calculateNormalizedSquaredDistance(const int keyIndex, const int inputIndex) const; float calculateSquaredDistanceFromSweetSpotCenter( const int keyIndex, const int inputIndex) const; bool hasInputCoordinates() const; - int getKeyIndex(const int c) const; - bool hasSweetSpotData(const int keyIndex) const { - // When there are no calibration data for a key, - // the radius of the key is assigned to zero. - return mSweetSpotRadii[keyIndex] > 0.0; - } - bool isOnKey(const int keyId, const int x, const int y) const; int squaredDistanceToEdge(const int keyId, const int x, const int y) const; - void calculateNearbyKeyCodes( - const int x, const int y, const int32_t primaryKey, int *inputCodes) const; + bool isOnKey(const int keyId, const int x, const int y) const { + if (keyId < 0) return true; // NOT_A_ID is -1, but return whenever < 0 just in case + const int left = mKeyXCoordinates[keyId]; + const int top = mKeyYCoordinates[keyId]; + const int right = left + mKeyWidths[keyId] + 1; + const int bottom = top + mKeyHeights[keyId]; + return left < right && top < bottom && x >= left && x < right && y >= top && y < bottom; + } const int MAX_PROXIMITY_CHARS_SIZE; const int GRID_WIDTH; const int GRID_HEIGHT; + const int MOST_COMMON_KEY_WIDTH; const int MOST_COMMON_KEY_WIDTH_SQUARE; const int CELL_WIDTH; const int CELL_HEIGHT; const int KEY_COUNT; + const int KEYBOARD_WIDTH; + const int KEYBOARD_HEIGHT; const bool HAS_TOUCH_POSITION_CORRECTION_DATA; - const std::string mLocaleStr; - int32_t *mInputCodes; - const int *mInputXCoordinates; - const int *mInputYCoordinates; - bool mTouchPositionCorrectionEnabled; + char mLocaleStr[MAX_LOCALE_STRING_LENGTH]; int32_t *mProximityCharsArray; - int *mNormalizedSquaredDistances; int32_t mKeyXCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD]; int32_t mKeyYCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD]; int32_t mKeyWidths[MAX_KEY_COUNT_IN_A_KEYBOARD]; int32_t mKeyHeights[MAX_KEY_COUNT_IN_A_KEYBOARD]; - int32_t mKeyCharCodes[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int32_t mKeyCodePoints[MAX_KEY_COUNT_IN_A_KEYBOARD]; float mSweetSpotCenterXs[MAX_KEY_COUNT_IN_A_KEYBOARD]; float mSweetSpotCenterYs[MAX_KEY_COUNT_IN_A_KEYBOARD]; float mSweetSpotRadii[MAX_KEY_COUNT_IN_A_KEYBOARD]; - int mInputLength; - unsigned short mPrimaryInputWord[MAX_WORD_LENGTH_INTERNAL]; - int mCodeToKeyIndex[MAX_CHAR_CODE + 1]; -}; + hash_map_compat<int, int> mCodeToKeyMap; + int mKeyIndexToCodePointG[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mCenterXsG[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mCenterYsG[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyKeyDistancesG[MAX_KEY_COUNT_IN_A_KEYBOARD][MAX_KEY_COUNT_IN_A_KEYBOARD]; + // TODO: move to correction.h +}; } // namespace latinime - #endif // LATINIME_PROXIMITY_INFO_H diff --git a/native/jni/src/proximity_info_state.cpp b/native/jni/src/proximity_info_state.cpp new file mode 100644 index 000000000..c5f2884c6 --- /dev/null +++ b/native/jni/src/proximity_info_state.cpp @@ -0,0 +1,542 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstring> // for memset() +#include <stdint.h> + +#define LOG_TAG "LatinIME: proximity_info_state.cpp" + +#include "defines.h" +#include "geometry_utils.h" +#include "proximity_info.h" +#include "proximity_info_state.h" + +namespace latinime { + +const int ProximityInfoState::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2 = 10; +const int ProximityInfoState::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR = + 1 << NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2; +const float ProximityInfoState::NOT_A_DISTANCE_FLOAT = -1.0f; +const int ProximityInfoState::NOT_A_CODE = -1; + +void ProximityInfoState::initInputParams(const int pointerId, const float maxPointToKeyLength, + const ProximityInfo *proximityInfo, const int32_t *const inputCodes, const int inputSize, + const int *const xCoordinates, const int *const yCoordinates, const int *const times, + const int *const pointerIds, const bool isGeometric) { + + if (isGeometric) { + mIsContinuationPossible = checkAndReturnIsContinuationPossible( + inputSize, xCoordinates, yCoordinates, times); + } else { + mIsContinuationPossible = false; + } + + mProximityInfo = proximityInfo; + mHasTouchPositionCorrectionData = proximityInfo->hasTouchPositionCorrectionData(); + mMostCommonKeyWidthSquare = proximityInfo->getMostCommonKeyWidthSquare(); + mLocaleStr = proximityInfo->getLocaleStr(); + mKeyCount = proximityInfo->getKeyCount(); + mCellHeight = proximityInfo->getCellHeight(); + mCellWidth = proximityInfo->getCellWidth(); + mGridHeight = proximityInfo->getGridWidth(); + mGridWidth = proximityInfo->getGridHeight(); + + memset(mInputCodes, 0, sizeof(mInputCodes)); + + if (!isGeometric && pointerId == 0) { + // Initialize + // - mInputCodes + // - mNormalizedSquaredDistances + // TODO: Merge + for (int i = 0; i < inputSize; ++i) { + const int32_t primaryKey = inputCodes[i]; + const int x = xCoordinates[i]; + const int y = yCoordinates[i]; + int *proximities = &mInputCodes[i * MAX_PROXIMITY_CHARS_SIZE_INTERNAL]; + mProximityInfo->calculateNearbyKeyCodes(x, y, primaryKey, proximities); + } + + if (DEBUG_PROXIMITY_CHARS) { + for (int i = 0; i < inputSize; ++i) { + AKLOGI("---"); + for (int j = 0; j < MAX_PROXIMITY_CHARS_SIZE_INTERNAL; ++j) { + int icc = mInputCodes[i * MAX_PROXIMITY_CHARS_SIZE_INTERNAL + j]; + int icfjc = inputCodes[i * MAX_PROXIMITY_CHARS_SIZE_INTERNAL + j]; + icc += 0; + icfjc += 0; + AKLOGI("--- (%d)%c,%c", i, icc, icfjc); AKLOGI("--- A<%d>,B<%d>", icc, icfjc); + } + } + } + } + + /////////////////////// + // Setup touch points + int pushTouchPointStartIndex = 0; + int lastSavedInputSize = 0; + mMaxPointToKeyLength = maxPointToKeyLength; + if (mIsContinuationPossible && mInputIndice.size() > 1) { + // Just update difference. + // Two points prior is never skipped. Thus, we pop 2 input point data here. + pushTouchPointStartIndex = mInputIndice[mInputIndice.size() - 2]; + popInputData(); + popInputData(); + lastSavedInputSize = mInputXs.size(); + } else { + // Clear all data. + mInputXs.clear(); + mInputYs.clear(); + mTimes.clear(); + mInputIndice.clear(); + mLengthCache.clear(); + mDistanceCache.clear(); + mNearKeysVector.clear(); + } + if (DEBUG_GEO_FULL) { + AKLOGI("Init ProximityInfoState: reused points = %d, last input size = %d", + pushTouchPointStartIndex, lastSavedInputSize); + } + mInputSize = 0; + + if (xCoordinates && yCoordinates) { + const bool proximityOnly = !isGeometric && (xCoordinates[0] < 0 || yCoordinates[0] < 0); + int lastInputIndex = pushTouchPointStartIndex; + for (int i = lastInputIndex; i < inputSize; ++i) { + const int pid = pointerIds ? pointerIds[i] : 0; + if (pointerId == pid) { + lastInputIndex = i; + } + } + if (DEBUG_GEO_FULL) { + AKLOGI("Init ProximityInfoState: last input index = %d", lastInputIndex); + } + // Working space to save near keys distances for current, prev and prevprev input point. + NearKeysDistanceMap nearKeysDistances[3]; + // These pointers are swapped for each inputs points. + NearKeysDistanceMap *currentNearKeysDistances = &nearKeysDistances[0]; + NearKeysDistanceMap *prevNearKeysDistances = &nearKeysDistances[1]; + NearKeysDistanceMap *prevPrevNearKeysDistances = &nearKeysDistances[2]; + + for (int i = pushTouchPointStartIndex; i <= lastInputIndex; ++i) { + // Assuming pointerId == 0 if pointerIds is null. + const int pid = pointerIds ? pointerIds[i] : 0; + if (DEBUG_GEO_FULL) { + AKLOGI("Init ProximityInfoState: (%d)PID = %d", i, pid); + } + if (pointerId == pid) { + const int c = isGeometric ? NOT_A_COORDINATE : getPrimaryCharAt(i); + const int x = proximityOnly ? NOT_A_COORDINATE : xCoordinates[i]; + const int y = proximityOnly ? NOT_A_COORDINATE : yCoordinates[i]; + const int time = times ? times[i] : -1; + if (pushTouchPoint(i, c, x, y, time, isGeometric /* do sampling */, + i == lastInputIndex, currentNearKeysDistances, prevNearKeysDistances, + prevPrevNearKeysDistances)) { + // Previous point information was popped. + NearKeysDistanceMap *tmp = prevNearKeysDistances; + prevNearKeysDistances = currentNearKeysDistances; + currentNearKeysDistances = tmp; + } else { + NearKeysDistanceMap *tmp = prevPrevNearKeysDistances; + prevPrevNearKeysDistances = prevNearKeysDistances; + prevNearKeysDistances = currentNearKeysDistances; + currentNearKeysDistances = tmp; + } + } + } + mInputSize = mInputXs.size(); + } + + if (mInputSize > 0) { + const int keyCount = mProximityInfo->getKeyCount(); + mNearKeysVector.resize(mInputSize); + mDistanceCache.resize(mInputSize * keyCount); + for (int i = lastSavedInputSize; i < mInputSize; ++i) { + mNearKeysVector[i].reset(); + static const float NEAR_KEY_NORMALIZED_SQUARED_THRESHOLD = 4.0f; + for (int k = 0; k < keyCount; ++k) { + const int index = i * keyCount + k; + const int x = mInputXs[i]; + const int y = mInputYs[i]; + const float normalizedSquaredDistance = + mProximityInfo->getNormalizedSquaredDistanceFromCenterFloat(k, x, y); + mDistanceCache[index] = normalizedSquaredDistance; + if (normalizedSquaredDistance < NEAR_KEY_NORMALIZED_SQUARED_THRESHOLD) { + mNearKeysVector[i].set(k, 1); + } + } + } + + static const float READ_FORWORD_LENGTH_SCALE = 0.95f; + const int readForwordLength = static_cast<int>( + hypotf(mProximityInfo->getKeyboardWidth(), mProximityInfo->getKeyboardHeight()) + * READ_FORWORD_LENGTH_SCALE); + for (int i = 0; i < mInputSize; ++i) { + if (DEBUG_GEO_FULL) { + AKLOGI("Sampled(%d): x = %d, y = %d, time = %d", i, mInputXs[i], mInputYs[i], + mTimes[i]); + } + for (int j = max(i + 1, lastSavedInputSize); j < mInputSize; ++j) { + if (mLengthCache[j] - mLengthCache[i] >= readForwordLength) { + break; + } + mNearKeysVector[i] |= mNearKeysVector[j]; + } + } + } + + // end + /////////////////////// + + memset(mNormalizedSquaredDistances, NOT_A_DISTANCE, sizeof(mNormalizedSquaredDistances)); + memset(mPrimaryInputWord, 0, sizeof(mPrimaryInputWord)); + mTouchPositionCorrectionEnabled = mInputSize > 0 && mHasTouchPositionCorrectionData + && xCoordinates && yCoordinates && !isGeometric; + if (!isGeometric && pointerId == 0) { + for (int i = 0; i < inputSize; ++i) { + mPrimaryInputWord[i] = getPrimaryCharAt(i); + } + + for (int i = 0; i < mInputSize && mTouchPositionCorrectionEnabled; ++i) { + const int *proximityChars = getProximityCharsAt(i); + const int primaryKey = proximityChars[0]; + const int x = xCoordinates[i]; + const int y = yCoordinates[i]; + if (DEBUG_PROXIMITY_CHARS) { + int a = x + y + primaryKey; + a += 0; + AKLOGI("--- Primary = %c, x = %d, y = %d", primaryKey, x, y); + } + for (int j = 0; j < MAX_PROXIMITY_CHARS_SIZE_INTERNAL && proximityChars[j] > 0; ++j) { + const int currentChar = proximityChars[j]; + const float squaredDistance = + hasInputCoordinates() ? calculateNormalizedSquaredDistance( + mProximityInfo->getKeyIndexOf(currentChar), i) : + NOT_A_DISTANCE_FLOAT; + if (squaredDistance >= 0.0f) { + mNormalizedSquaredDistances[i * MAX_PROXIMITY_CHARS_SIZE_INTERNAL + j] = + (int) (squaredDistance * NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR); + } else { + mNormalizedSquaredDistances[i * MAX_PROXIMITY_CHARS_SIZE_INTERNAL + j] = + (j == 0) ? EQUIVALENT_CHAR_WITHOUT_DISTANCE_INFO : + PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO; + } + if (DEBUG_PROXIMITY_CHARS) { + AKLOGI("--- Proximity (%d) = %c", j, currentChar); + } + } + } + } + + if (DEBUG_GEO_FULL) { + AKLOGI("ProximityState init finished: %d points out of %d", mInputSize, inputSize); + } +} + +bool ProximityInfoState::checkAndReturnIsContinuationPossible(const int inputSize, + const int *const xCoordinates, const int *const yCoordinates, const int *const times) { + for (int i = 0; i < mInputSize; ++i) { + const int index = mInputIndice[i]; + if (index > inputSize || xCoordinates[index] != mInputXs[i] || + yCoordinates[index] != mInputYs[i] || times[index] != mTimes[i]) { + return false; + } + } + return true; +} + +// Calculating point to key distance for all near keys and returning the distance between +// the given point and the nearest key position. +float ProximityInfoState::updateNearKeysDistances(const int x, const int y, + NearKeysDistanceMap *const currentNearKeysDistances) { + static const float NEAR_KEY_THRESHOLD = 4.0f; + + currentNearKeysDistances->clear(); + const int keyCount = mProximityInfo->getKeyCount(); + float nearestKeyDistance = mMaxPointToKeyLength; + for (int k = 0; k < keyCount; ++k) { + const float dist = mProximityInfo->getNormalizedSquaredDistanceFromCenterFloat(k, x, y); + if (dist < NEAR_KEY_THRESHOLD) { + currentNearKeysDistances->insert(std::pair<int, float>(k, dist)); + } + if (nearestKeyDistance > dist) { + nearestKeyDistance = dist; + } + } + return nearestKeyDistance; +} + +// Check if previous point is at local minimum position to near keys. +bool ProximityInfoState::isPrevLocalMin(const NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances) const { + static const float MARGIN = 0.01f; + + for (NearKeysDistanceMap::const_iterator it = prevNearKeysDistances->begin(); + it != prevNearKeysDistances->end(); ++it) { + NearKeysDistanceMap::const_iterator itPP = prevPrevNearKeysDistances->find(it->first); + NearKeysDistanceMap::const_iterator itC = currentNearKeysDistances->find(it->first); + if ((itPP == prevPrevNearKeysDistances->end() || itPP->second > it->second + MARGIN) + && (itC == currentNearKeysDistances->end() || itC->second > it->second + MARGIN)) { + return true; + } + } + return false; +} + +// Calculating a point score that indicates usefulness of the point. +float ProximityInfoState::getPointScore( + const int x, const int y, const int time, const bool lastPoint, const float nearest, + const NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances) const { + static const int DISTANCE_BASE_SCALE = 100; + static const int SAVE_DISTANCE_SCALE = 200; + static const int SKIP_DISTANCE_SCALE = 25; + static const int CHECK_LOCALMIN_DISTANCE_THRESHOLD_SCALE = 40; + static const int STRAIGHT_SKIP_DISTANCE_THRESHOLD_SCALE = 50; + static const int CORNER_CHECK_DISTANCE_THRESHOLD_SCALE = 27; + static const float SAVE_DISTANCE_SCORE = 2.0f; + static const float SKIP_DISTANCE_SCORE = -1.0f; + static const float CHECK_LOCALMIN_DISTANCE_SCORE = -1.0f; + static const float STRAIGHT_ANGLE_THRESHOLD = M_PI_F / 36.0f; + static const float STRAIGHT_SKIP_NEAREST_DISTANCE_THRESHOLD = 0.5f; + static const float STRAIGHT_SKIP_SCORE = -1.0f; + static const float CORNER_ANGLE_THRESHOLD = M_PI_F / 2.0f; + static const float CORNER_SCORE = 1.0f; + + const std::size_t size = mInputXs.size(); + if (size <= 1) { + return 0.0f; + } + const int baseSampleRate = mProximityInfo->getMostCommonKeyWidth(); + const int distNext = getDistanceInt(x, y, mInputXs.back(), mInputYs.back()) + * DISTANCE_BASE_SCALE; + const int distPrev = getDistanceInt(mInputXs.back(), mInputYs.back(), + mInputXs[size - 2], mInputYs[size - 2]) * DISTANCE_BASE_SCALE; + float score = 0.0f; + + // Sum of distances + if (distPrev + distNext > baseSampleRate * SAVE_DISTANCE_SCALE) { + score += SAVE_DISTANCE_SCORE; + } + // Distance + if (distPrev < baseSampleRate * SKIP_DISTANCE_SCALE) { + score += SKIP_DISTANCE_SCORE; + } + // Location + if (distPrev < baseSampleRate * CHECK_LOCALMIN_DISTANCE_THRESHOLD_SCALE) { + if (!isPrevLocalMin(currentNearKeysDistances, prevNearKeysDistances, + prevPrevNearKeysDistances)) { + score += CHECK_LOCALMIN_DISTANCE_SCORE; + } + } + // Angle + const float angle1 = getAngle(x, y, mInputXs.back(), mInputYs.back()); + const float angle2 = getAngle(mInputXs.back(), mInputYs.back(), + mInputXs[size - 2], mInputYs[size - 2]); + const float angleDiff = getAngleDiff(angle1, angle2); + // Skip straight + if (nearest > STRAIGHT_SKIP_NEAREST_DISTANCE_THRESHOLD + && distPrev < baseSampleRate * STRAIGHT_SKIP_DISTANCE_THRESHOLD_SCALE + && angleDiff < STRAIGHT_ANGLE_THRESHOLD) { + score += STRAIGHT_SKIP_SCORE; + } + // Save corner + if (distPrev > baseSampleRate * CORNER_CHECK_DISTANCE_THRESHOLD_SCALE + && angleDiff > CORNER_ANGLE_THRESHOLD) { + score += CORNER_SCORE; + } + return score; +} + +// Sampling touch point and pushing information to vectors. +// Returning if previous point is popped or not. +bool ProximityInfoState::pushTouchPoint(const int inputIndex, const int nodeChar, int x, int y, + const int time, const bool sample, const bool isLastPoint, + NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances) { + static const float LAST_POINT_SKIP_DISTANCE_SCALE = 0.25f; + + size_t size = mInputXs.size(); + bool popped = false; + if (nodeChar < 0 && sample) { + const float nearest = updateNearKeysDistances(x, y, currentNearKeysDistances); + const float score = getPointScore(x, y, time, isLastPoint, nearest, + currentNearKeysDistances, prevNearKeysDistances, prevPrevNearKeysDistances); + if (score < 0) { + // Pop previous point because it would be useless. + popInputData(); + size = mInputXs.size(); + popped = true; + } else { + popped = false; + } + // Check if the last point should be skipped. + if (isLastPoint) { + if (size > 0 && getDistanceFloat(x, y, mInputXs.back(), mInputYs.back()) + < mProximityInfo->getMostCommonKeyWidth() * LAST_POINT_SKIP_DISTANCE_SCALE) { + if (DEBUG_GEO_FULL) { + AKLOGI("p0: size = %zd, x = %d, y = %d, lx = %d, ly = %d, dist = %f, " + "width = %f", size, x, y, mInputXs.back(), mInputYs.back(), + getDistanceFloat(x, y, mInputXs.back(), mInputYs.back()), + mProximityInfo->getMostCommonKeyWidth() + * LAST_POINT_SKIP_DISTANCE_SCALE); + } + return popped; + } else if (size > 1) { + int minChar = 0; + float minDist = mMaxPointToKeyLength; + for (NearKeysDistanceMap::const_iterator it = currentNearKeysDistances->begin(); + it != currentNearKeysDistances->end(); ++it) { + if (minDist > it->second) { + minChar = it->first; + minDist = it->second; + } + } + NearKeysDistanceMap::const_iterator itPP = + prevNearKeysDistances->find(minChar); + if (itPP != prevNearKeysDistances->end() && minDist > itPP->second) { + if (DEBUG_GEO_FULL) { + AKLOGI("p1: char = %c, minDist = %f, prevNear key minDist = %f", + minChar, itPP->second, minDist); + } + return popped; + } + } + } + } + + if (nodeChar >= 0 && (x < 0 || y < 0)) { + const int keyId = mProximityInfo->getKeyIndexOf(nodeChar); + if (keyId >= 0) { + x = mProximityInfo->getKeyCenterXOfKeyIdG(keyId); + y = mProximityInfo->getKeyCenterYOfKeyIdG(keyId); + } + } + + // Pushing point information. + if (size > 0) { + mLengthCache.push_back( + mLengthCache.back() + getDistanceInt(x, y, mInputXs.back(), mInputYs.back())); + } else { + mLengthCache.push_back(0); + } + mInputXs.push_back(x); + mInputYs.push_back(y); + mTimes.push_back(time); + mInputIndice.push_back(inputIndex); + if (DEBUG_GEO_FULL) { + AKLOGI("pushTouchPoint: x = %03d, y = %03d, time = %d, index = %d, popped ? %01d", + x, y, time, inputIndex, popped); + } + return popped; +} + +float ProximityInfoState::calculateNormalizedSquaredDistance( + const int keyIndex, const int inputIndex) const { + if (keyIndex == NOT_AN_INDEX) { + return NOT_A_DISTANCE_FLOAT; + } + if (!mProximityInfo->hasSweetSpotData(keyIndex)) { + return NOT_A_DISTANCE_FLOAT; + } + if (NOT_A_COORDINATE == mInputXs[inputIndex]) { + return NOT_A_DISTANCE_FLOAT; + } + const float squaredDistance = calculateSquaredDistanceFromSweetSpotCenter( + keyIndex, inputIndex); + const float squaredRadius = square(mProximityInfo->getSweetSpotRadiiAt(keyIndex)); + return squaredDistance / squaredRadius; +} + +int ProximityInfoState::getDuration(const int index) const { + if (mInputSize > 0 && index >= 0 && index < mInputSize - 1) { + return mTimes[index + 1] - mTimes[index]; + } + return 0; +} + +float ProximityInfoState::getPointToKeyLength(const int inputIndex, const int codePoint, + const float scale) const { + const int keyId = mProximityInfo->getKeyIndexOf(codePoint); + if (keyId != NOT_AN_INDEX) { + const int index = inputIndex * mProximityInfo->getKeyCount() + keyId; + return min(mDistanceCache[index] * scale, mMaxPointToKeyLength); + } + if (isSkippableChar(codePoint)) { + return 0; + } + // If the char is not a key on the keyboard then return the max length. + return MAX_POINT_TO_KEY_LENGTH; +} + +int ProximityInfoState::getSpaceY() const { + const int keyId = mProximityInfo->getKeyIndexOf(' '); + return mProximityInfo->getKeyCenterYOfKeyIdG(keyId); +} + +float ProximityInfoState::calculateSquaredDistanceFromSweetSpotCenter( + const int keyIndex, const int inputIndex) const { + const float sweetSpotCenterX = mProximityInfo->getSweetSpotCenterXAt(keyIndex); + const float sweetSpotCenterY = mProximityInfo->getSweetSpotCenterYAt(keyIndex); + const float inputX = static_cast<float>(mInputXs[inputIndex]); + const float inputY = static_cast<float>(mInputYs[inputIndex]); + return square(inputX - sweetSpotCenterX) + square(inputY - sweetSpotCenterY); +} + +// Puts possible characters into filter and returns new filter size. +int32_t ProximityInfoState::getAllPossibleChars( + const size_t index, int32_t *const filter, const int32_t filterSize) const { + if (index >= mInputXs.size()) { + return filterSize; + } + int newFilterSize = filterSize; + for (int j = 0; j < mProximityInfo->getKeyCount(); ++j) { + if (mNearKeysVector[index].test(j)) { + const int32_t keyCodePoint = mProximityInfo->getCodePointOf(j); + bool insert = true; + // TODO: Avoid linear search + for (int k = 0; k < filterSize; ++k) { + if (filter[k] == keyCodePoint) { + insert = false; + break; + } + } + if (insert) { + filter[newFilterSize++] = keyCodePoint; + } + } + } + return newFilterSize; +} + +float ProximityInfoState::getAveragePointDuration() const { + if (mInputSize == 0) { + return 0.0f; + } + return static_cast<float>(mTimes[mInputSize - 1] - mTimes[0]) / static_cast<float>(mInputSize); +} + +void ProximityInfoState::popInputData() { + mInputXs.pop_back(); + mInputYs.pop_back(); + mTimes.pop_back(); + mLengthCache.pop_back(); + mInputIndice.pop_back(); +} + +} // namespace latinime diff --git a/native/jni/src/proximity_info_state.h b/native/jni/src/proximity_info_state.h new file mode 100644 index 000000000..48862a7c7 --- /dev/null +++ b/native/jni/src/proximity_info_state.h @@ -0,0 +1,294 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROXIMITY_INFO_STATE_H +#define LATINIME_PROXIMITY_INFO_STATE_H + +#include <bitset> +#include <cstring> // for memset() +#include <stdint.h> +#include <string> +#include <vector> + +#include "char_utils.h" +#include "defines.h" +#include "hash_map_compat.h" + +namespace latinime { + +class ProximityInfo; + +class ProximityInfoState { + public: + typedef std::bitset<MAX_KEY_COUNT_IN_A_KEYBOARD> NearKeycodesSet; + static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2; + static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR; + static const float NOT_A_DISTANCE_FLOAT; + static const int NOT_A_CODE; + + ///////////////////////////////////////// + // Defined in proximity_info_state.cpp // + ///////////////////////////////////////// + void initInputParams(const int pointerId, const float maxPointToKeyLength, + const ProximityInfo *proximityInfo, const int32_t *const inputCodes, + const int inputSize, const int *xCoordinates, const int *yCoordinates, + const int *const times, const int *const pointerIds, const bool isGeometric); + + ///////////////////////////////////////// + // Defined here // + ///////////////////////////////////////// + ProximityInfoState() + : mProximityInfo(0), mMaxPointToKeyLength(0), + mHasTouchPositionCorrectionData(false), mMostCommonKeyWidthSquare(0), mLocaleStr(), + mKeyCount(0), mCellHeight(0), mCellWidth(0), mGridHeight(0), mGridWidth(0), + mIsContinuationPossible(false), mInputXs(), mInputYs(), mTimes(), mInputIndice(), + mDistanceCache(), mLengthCache(), mNearKeysVector(), + mTouchPositionCorrectionEnabled(false), mInputSize(0) { + memset(mInputCodes, 0, sizeof(mInputCodes)); + memset(mNormalizedSquaredDistances, 0, sizeof(mNormalizedSquaredDistances)); + memset(mPrimaryInputWord, 0, sizeof(mPrimaryInputWord)); + } + + virtual ~ProximityInfoState() {} + + inline unsigned short getPrimaryCharAt(const int index) const { + return getProximityCharsAt(index)[0]; + } + + inline bool existsCharInProximityAt(const int index, const int c) const { + const int *chars = getProximityCharsAt(index); + int i = 0; + while (chars[i] > 0 && i < MAX_PROXIMITY_CHARS_SIZE_INTERNAL) { + if (chars[i++] == c) { + return true; + } + } + return false; + } + + inline bool existsAdjacentProximityChars(const int index) const { + if (index < 0 || index >= mInputSize) return false; + const int currentChar = getPrimaryCharAt(index); + const int leftIndex = index - 1; + if (leftIndex >= 0 && existsCharInProximityAt(leftIndex, currentChar)) { + return true; + } + const int rightIndex = index + 1; + if (rightIndex < mInputSize && existsCharInProximityAt(rightIndex, currentChar)) { + return true; + } + return false; + } + + // In the following function, c is the current character of the dictionary word + // currently examined. + // currentChars is an array containing the keys close to the character the + // user actually typed at the same position. We want to see if c is in it: if so, + // then the word contains at that position a character close to what the user + // typed. + // What the user typed is actually the first character of the array. + // proximityIndex is a pointer to the variable where getMatchedProximityId returns + // the index of c in the proximity chars of the input index. + // Notice : accented characters do not have a proximity list, so they are alone + // in their list. The non-accented version of the character should be considered + // "close", but not the other keys close to the non-accented version. + inline ProximityType getMatchedProximityId(const int index, + const unsigned short c, const bool checkProximityChars, int *proximityIndex = 0) const { + const int *currentChars = getProximityCharsAt(index); + const int firstChar = currentChars[0]; + const unsigned short baseLowerC = toBaseLowerCase(c); + + // The first char in the array is what user typed. If it matches right away, + // that means the user typed that same char for this pos. + if (firstChar == baseLowerC || firstChar == c) { + return EQUIVALENT_CHAR; + } + + if (!checkProximityChars) return UNRELATED_CHAR; + + // If the non-accented, lowercased version of that first character matches c, + // then we have a non-accented version of the accented character the user + // typed. Treat it as a close char. + if (toBaseLowerCase(firstChar) == baseLowerC) + return NEAR_PROXIMITY_CHAR; + + // Not an exact nor an accent-alike match: search the list of close keys + int j = 1; + while (j < MAX_PROXIMITY_CHARS_SIZE_INTERNAL + && currentChars[j] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) { + const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c); + if (matched) { + if (proximityIndex) { + *proximityIndex = j; + } + return NEAR_PROXIMITY_CHAR; + } + ++j; + } + if (j < MAX_PROXIMITY_CHARS_SIZE_INTERNAL + && currentChars[j] == ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) { + ++j; + while (j < MAX_PROXIMITY_CHARS_SIZE_INTERNAL + && currentChars[j] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) { + const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c); + if (matched) { + if (proximityIndex) { + *proximityIndex = j; + } + return ADDITIONAL_PROXIMITY_CHAR; + } + ++j; + } + } + + // Was not included, signal this as an unrelated character. + return UNRELATED_CHAR; + } + + inline int getNormalizedSquaredDistance( + const int inputIndex, const int proximityIndex) const { + return mNormalizedSquaredDistances[ + inputIndex * MAX_PROXIMITY_CHARS_SIZE_INTERNAL + proximityIndex]; + } + + inline const unsigned short *getPrimaryInputWord() const { + return mPrimaryInputWord; + } + + inline bool touchPositionCorrectionEnabled() const { + return mTouchPositionCorrectionEnabled; + } + + inline bool sameAsTyped(const unsigned short *word, int length) const { + if (length != mInputSize) { + return false; + } + const int *inputCodes = mInputCodes; + while (length--) { + if (static_cast<unsigned int>(*inputCodes) != static_cast<unsigned int>(*word)) { + return false; + } + inputCodes += MAX_PROXIMITY_CHARS_SIZE_INTERNAL; + word++; + } + return true; + } + + int getDuration(const int index) const; + + bool isUsed() const { + return mInputSize > 0; + } + + uint32_t size() const { + return mInputSize; + } + + int getInputX(const int index) const { + return mInputXs[index]; + } + + int getInputY(const int index) const { + return mInputYs[index]; + } + + int getLengthCache(const int index) const { + return mLengthCache[index]; + } + + bool isContinuationPossible() const { + return mIsContinuationPossible; + } + + float getPointToKeyLength(const int inputIndex, const int charCode, const float scale) const; + + int getSpaceY() const; + + int32_t getAllPossibleChars( + const size_t startIndex, int32_t *const filter, const int32_t filterSize) const; + + float getAveragePointDuration() const; + private: + DISALLOW_COPY_AND_ASSIGN(ProximityInfoState); + typedef hash_map_compat<int, float> NearKeysDistanceMap; + ///////////////////////////////////////// + // Defined in proximity_info_state.cpp // + ///////////////////////////////////////// + float calculateNormalizedSquaredDistance(const int keyIndex, const int inputIndex) const; + + float calculateSquaredDistanceFromSweetSpotCenter( + const int keyIndex, const int inputIndex) const; + + bool pushTouchPoint(const int inputIndex, const int nodeChar, int x, int y, const int time, + const bool sample, const bool isLastPoint, + NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances); + ///////////////////////////////////////// + // Defined here // + ///////////////////////////////////////// + inline float square(const float x) const { return x * x; } + + bool hasInputCoordinates() const { + return mInputXs.size() > 0 && mInputYs.size() > 0; + } + + inline const int *getProximityCharsAt(const int index) const { + return mInputCodes + (index * MAX_PROXIMITY_CHARS_SIZE_INTERNAL); + } + + float updateNearKeysDistances(const int x, const int y, + NearKeysDistanceMap *const currentNearKeysDistances); + bool isPrevLocalMin(const NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances) const; + float getPointScore( + const int x, const int y, const int time, const bool last, const float nearest, + const NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances) const; + bool checkAndReturnIsContinuationPossible(const int inputSize, const int *const xCoordinates, + const int *const yCoordinates, const int *const times); + void popInputData(); + + // const + const ProximityInfo *mProximityInfo; + float mMaxPointToKeyLength; + bool mHasTouchPositionCorrectionData; + int mMostCommonKeyWidthSquare; + std::string mLocaleStr; + int mKeyCount; + int mCellHeight; + int mCellWidth; + int mGridHeight; + int mGridWidth; + bool mIsContinuationPossible; + + std::vector<int> mInputXs; + std::vector<int> mInputYs; + std::vector<int> mTimes; + std::vector<int> mInputIndice; + std::vector<float> mDistanceCache; + std::vector<int> mLengthCache; + std::vector<NearKeycodesSet> mNearKeysVector; + bool mTouchPositionCorrectionEnabled; + int32_t mInputCodes[MAX_PROXIMITY_CHARS_SIZE_INTERNAL * MAX_WORD_LENGTH_INTERNAL]; + int mNormalizedSquaredDistances[MAX_PROXIMITY_CHARS_SIZE_INTERNAL * MAX_WORD_LENGTH_INTERNAL]; + int mInputSize; + unsigned short mPrimaryInputWord[MAX_WORD_LENGTH_INTERNAL]; +}; +} // namespace latinime +#endif // LATINIME_PROXIMITY_INFO_STATE_H diff --git a/native/jni/src/terminal_attributes.h b/native/jni/src/terminal_attributes.h index 9a803cca1..53ae385ea 100644 --- a/native/jni/src/terminal_attributes.h +++ b/native/jni/src/terminal_attributes.h @@ -17,7 +17,7 @@ #ifndef LATINIME_TERMINAL_ATTRIBUTES_H #define LATINIME_TERMINAL_ATTRIBUTES_H -#include "unigram_dictionary.h" +#include "binary_format.h" namespace latinime { @@ -29,14 +29,14 @@ namespace latinime { class TerminalAttributes { public: class ShortcutIterator { - const uint8_t* const mDict; - bool mHasNextShortcutTarget; + const uint8_t *const mDict; int mPos; + bool mHasNextShortcutTarget; public: - ShortcutIterator(const uint8_t* dict, const int pos, const uint8_t flags) : mDict(dict), - mPos(pos) { - mHasNextShortcutTarget = (0 != (flags & UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS)); + ShortcutIterator(const uint8_t *dict, const int pos, const uint8_t flags) + : mDict(dict), mPos(pos), + mHasNextShortcutTarget(0 != (flags & BinaryFormat::FLAG_HAS_SHORTCUT_TARGETS)) { } inline bool hasNextShortcutTarget() const { @@ -46,29 +46,24 @@ class TerminalAttributes { // Gets the shortcut target itself as a uint16_t string. For parameters and return value // see BinaryFormat::getWordAtAddress. // TODO: make the output an uint32_t* to handle the whole unicode range. - inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) { + inline int getNextShortcutTarget(const int maxDepth, uint16_t *outWord, int *outFreq) { const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos); mHasNextShortcutTarget = - 0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT); + 0 != (shortcutFlags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT); unsigned int i; for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) { - const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos); - if (NOT_A_CHARACTER == charCode) break; - outWord[i] = (uint16_t)charCode; + const int codePoint = BinaryFormat::getCodePointAndForwardPointer(mDict, &mPos); + if (NOT_A_CODE_POINT == codePoint) break; + outWord[i] = (uint16_t)codePoint; } + *outFreq = BinaryFormat::getAttributeFrequencyFromFlags(shortcutFlags); mPos += BinaryFormat::CHARACTER_ARRAY_TERMINATOR_SIZE; return i; } }; - private: - const uint8_t* const mDict; - const uint8_t mFlags; - const int mStartPos; - - public: - TerminalAttributes(const uint8_t* const dict, const uint8_t flags, const int pos) : - mDict(dict), mFlags(flags), mStartPos(pos) { + TerminalAttributes(const uint8_t *const dict, const uint8_t flags, const int pos) + : mDict(dict), mFlags(flags), mStartPos(pos) { } inline ShortcutIterator getShortcutIterator() const { @@ -76,7 +71,16 @@ class TerminalAttributes { // skipped quickly, so we ignore it. return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags); } + + bool isBlacklistedOrNotAWord() const { + return mFlags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TerminalAttributes); + const uint8_t *const mDict; + const uint8_t mFlags; + const int mStartPos; }; } // namespace latinime - #endif // LATINIME_TERMINAL_ATTRIBUTES_H diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index ea9f11b2c..49d044fbc 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -1,32 +1,33 @@ /* -** -** Copyright 2010, The Android Open Source Project -** -** Licensed under the Apache License, Version 2.0 (the "License"); -** you may not use this file except in compliance with the License. -** You may obtain a copy of the License at -** -** http://www.apache.org/licenses/LICENSE-2.0 -** -** Unless required by applicable law or agreed to in writing, software -** distributed under the License is distributed on an "AS IS" BASIS, -** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -** See the License for the specific language governing permissions and -** limitations under the License. -*/ - -#include <assert.h> -#include <string.h> + * Copyright (C) 2010, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cassert> +#include <cstring> #define LOG_TAG "LatinIME: unigram_dictionary.cpp" +#include "binary_format.h" #include "char_utils.h" #include "defines.h" #include "dictionary.h" -#include "unigram_dictionary.h" - -#include "binary_format.h" +#include "proximity_info.h" #include "terminal_attributes.h" +#include "unigram_dictionary.h" +#include "words_priority_queue.h" +#include "words_priority_queue_pool.h" namespace latinime { @@ -40,7 +41,7 @@ const UnigramDictionary::digraph_t UnigramDictionary::FRENCH_LIGATURES_DIGRAPHS[ { 'o', 'e', 0x0153 } }; // U+0153 : LATIN SMALL LIGATURE OE // TODO: check the header -UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier, +UnigramDictionary::UnigramDictionary(const uint8_t *const streamStart, int typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords, const unsigned int flags) : DICT_ROOT(streamStart), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), @@ -57,18 +58,18 @@ UnigramDictionary::~UnigramDictionary() { } static inline unsigned int getCodesBufferSize(const int *codes, const int codesSize) { - return sizeof(*codes) * codesSize; + return static_cast<unsigned int>(sizeof(*codes)) * codesSize; } // TODO: This needs to take a const unsigned short* and not tinker with its contents -static inline void addWord( - unsigned short *word, int length, int frequency, WordsPriorityQueue *queue) { - queue->push(frequency, word, length); +static inline void addWord(unsigned short *word, int length, int frequency, + WordsPriorityQueue *queue, int type) { + queue->push(frequency, word, length, type); } // Return the replacement code point for a digraph, or 0 if none. int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, const int codesSize, - const digraph_t* const digraphs, const unsigned int digraphsSize) const { + const digraph_t *const digraphs, const unsigned int digraphsSize) const { // There can't be a digraph if we don't have at least 2 characters to examine if (i + 2 > codesSize) return false; @@ -103,9 +104,9 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit const bool useFullEditDistance, const int *codesSrc, const int codesRemain, const int currentDepth, int *codesDest, Correction *correction, WordsPriorityQueuePool *queuePool, - const digraph_t* const digraphs, const unsigned int digraphsSize) { + const digraph_t *const digraphs, const unsigned int digraphsSize) const { - const int startIndex = codesDest - codesBuffer; + const int startIndex = static_cast<int>(codesDest - codesBuffer); if (currentDepth < MAX_DIGRAPH_SEARCH_DEPTH) { for (int i = 0; i < codesRemain; ++i) { xCoordinatesBuffer[startIndex + i] = xcoordinates[codesBufferSize - codesRemain + i]; @@ -169,15 +170,16 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit // bigramMap contains the association <bigram address> -> <bigram frequency> // bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter // in bigram_dictionary.cpp -int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, - WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates, +int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, - const bool useFullEditDistance, unsigned short *outWords, int *frequencies) { + const bool useFullEditDistance, unsigned short *outWords, int *frequencies, + int *outputTypes) const { - queuePool->clearAll(); - Correction* masterCorrection = correction; - correction->resetCorrection(); + WordsPriorityQueuePool queuePool(MAX_WORDS, SUB_QUEUE_MAX_WORDS, MAX_WORD_LENGTH); + queuePool.clearAll(); + Correction masterCorrection; + masterCorrection.resetCorrection(); if (BinaryFormat::REQUIRES_GERMAN_UMLAUT_PROCESSING & FLAGS) { // Incrementally tune the word and try all possibilities int codesBuffer[getCodesBufferSize(codes, codesSize)]; @@ -185,8 +187,8 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int yCoordinatesBuffer[codesSize]; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter, - useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, - queuePool, GERMAN_UMLAUT_DIGRAPHS, + useFullEditDistance, codes, codesSize, 0, codesBuffer, &masterCorrection, + &queuePool, GERMAN_UMLAUT_DIGRAPHS, sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0])); } else if (BinaryFormat::REQUIRES_FRENCH_LIGATURES_PROCESSING & FLAGS) { int codesBuffer[getCodesBufferSize(codes, codesSize)]; @@ -194,36 +196,36 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int yCoordinatesBuffer[codesSize]; getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter, - useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection, - queuePool, FRENCH_LIGATURES_DIGRAPHS, + useFullEditDistance, codes, codesSize, 0, codesBuffer, &masterCorrection, + &queuePool, FRENCH_LIGATURES_DIGRAPHS, sizeof(FRENCH_LIGATURES_DIGRAPHS) / sizeof(FRENCH_LIGATURES_DIGRAPHS[0])); } else { // Normal processing getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, - bigramMap, bigramFilter, useFullEditDistance, masterCorrection, queuePool); + bigramMap, bigramFilter, useFullEditDistance, &masterCorrection, &queuePool); } PROF_START(20); if (DEBUG_DICT) { - float ns = queuePool->getMasterQueue()->getHighestNormalizedScore( - proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); + float ns = queuePool.getMasterQueue()->getHighestNormalizedScore( + masterCorrection.getPrimaryInputWord(), codesSize, 0, 0, 0); ns += 0; AKLOGI("Max normalized score = %f", ns); } const int suggestedWordsCount = - queuePool->getMasterQueue()->outputSuggestions( - proximityInfo->getPrimaryInputWord(), codesSize, frequencies, outWords); + queuePool.getMasterQueue()->outputSuggestions(masterCorrection.getPrimaryInputWord(), + codesSize, frequencies, outWords, outputTypes); if (DEBUG_DICT) { - float ns = queuePool->getMasterQueue()->getHighestNormalizedScore( - proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); + float ns = queuePool.getMasterQueue()->getHighestNormalizedScore( + masterCorrection.getPrimaryInputWord(), codesSize, 0, 0, 0); ns += 0; AKLOGI("Returning %d words", suggestedWordsCount); /// Print the returned words for (int j = 0; j < suggestedWordsCount; ++j) { - short unsigned int* w = outWords + j * MAX_WORD_LENGTH; + short unsigned int *w = outWords + j * MAX_WORD_LENGTH; char s[MAX_WORD_LENGTH]; for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; - (void)s; + (void)s; // To suppress compiler warning AKLOGI("%s %i", s, frequencies[j]); } } @@ -234,8 +236,9 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const int inputLength, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, - const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool) { + const int inputSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, Correction *correction, + WordsPriorityQueuePool *queuePool) const { PROF_OPEN; PROF_START(0); @@ -243,7 +246,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_START(1); getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramMap, bigramFilter, - useFullEditDistance, inputLength, correction, queuePool); + useFullEditDistance, inputSize, correction, queuePool); PROF_END(1); PROF_START(2); @@ -256,10 +259,10 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_START(4); bool hasAutoCorrectionCandidate = false; - WordsPriorityQueue* masterQueue = queuePool->getMasterQueue(); + WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); if (masterQueue->size() > 0) { float nsForMaster = masterQueue->getHighestNormalizedScore( - proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0); + correction->getPrimaryInputWord(), inputSize, 0, 0, 0); hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD); } PROF_END(4); @@ -267,9 +270,9 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_START(5); // Multiple word suggestions if (SUGGEST_MULTIPLE_WORDS - && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) { + && inputSize >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) { getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, - useFullEditDistance, inputLength, correction, queuePool, + useFullEditDistance, inputSize, correction, queuePool, hasAutoCorrectionCandidate); } PROF_END(5); @@ -281,18 +284,18 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, if (DEBUG_DICT) { queuePool->dumpSubQueue1TopSuggestions(); for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { - WordsPriorityQueue* queue = queuePool->getSubQueue(FIRST_WORD_INDEX, i); + WordsPriorityQueue *queue = queuePool->getSubQueue(FIRST_WORD_INDEX, i); if (queue->size() > 0) { - WordsPriorityQueue::SuggestedWord* sw = queue->top(); + WordsPriorityQueue::SuggestedWord *sw = queue->top(); const int score = sw->mScore; - const unsigned short* word = sw->mWord; + const unsigned short *word = sw->mWord; const int wordLength = sw->mWordLength; float ns = Correction::RankingAlgorithm::calcNormalizedScore( - proximityInfo->getPrimaryInputWord(), i, word, wordLength, score); + correction->getPrimaryInputWord(), i, word, wordLength, score); ns += 0; AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns, (ns > TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD)); - DUMP_WORD(proximityInfo->getPrimaryInputWord(), i); + DUMP_WORD(correction->getPrimaryInputWord(), i); DUMP_WORD(word, wordLength); } } @@ -300,33 +303,33 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, } void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates, - const int *yCoordinates, const int *codes, const int inputLength, Correction *correction) { + const int *yCoordinates, const int *codes, const int inputSize, + Correction *correction) const { if (DEBUG_DICT) { AKLOGI("initSuggest"); - DUMP_WORD_INT(codes, inputLength); + DUMP_WORD_INT(codes, inputSize); } - proximityInfo->setInputParams(codes, inputLength, xCoordinates, yCoordinates); - const int maxDepth = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); - correction->initCorrection(proximityInfo, inputLength, maxDepth); + correction->initInputParams(proximityInfo, codes, inputSize, xCoordinates, yCoordinates); + const int maxDepth = min(inputSize * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); + correction->initCorrection(proximityInfo, inputSize, maxDepth); } -static const char QUOTE = '\''; static const char SPACE = ' '; void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, - const bool useFullEditDistance, const int inputLength, - Correction *correction, WordsPriorityQueuePool *queuePool) { - initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); - getSuggestionCandidates(useFullEditDistance, inputLength, bigramMap, bigramFilter, correction, + const bool useFullEditDistance, const int inputSize, + Correction *correction, WordsPriorityQueuePool *queuePool) const { + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputSize, correction); + getSuggestionCandidates(useFullEditDistance, inputSize, bigramMap, bigramFilter, correction, queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); } void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, - const int inputLength, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, + const int inputSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction, WordsPriorityQueuePool *queuePool, - const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) { + const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) const { uint8_t totalTraverseCount = correction->pushAndGetTotalTraverseCount(); if (DEBUG_DICT) { AKLOGI("Traverse count %d", totalTraverseCount); @@ -346,7 +349,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition); int outputIndex = 0; - correction->initCorrectionState(rootPosition, childCount, (inputLength <= 0)); + correction->initCorrectionState(rootPosition, childCount, (inputSize <= 0)); // Depth first search while (outputIndex >= 0) { @@ -374,38 +377,55 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, inline void UnigramDictionary::onTerminal(const int probability, const TerminalAttributes& terminalAttributes, Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, - const int currentWordIndex) { + const int currentWordIndex) const { const int inputIndex = correction->getInputIndex(); const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT; int wordLength; - unsigned short* wordPointer; + unsigned short *wordPointer; if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) { WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); const int finalProbability = correction->getFinalProbability(probability, &wordPointer, &wordLength); - if (finalProbability != NOT_A_PROBABILITY) { - addWord(wordPointer, wordLength, finalProbability, masterQueue); - - const int shortcutProbability = finalProbability > 0 ? finalProbability - 1 : 0; - // Please note that the shortcut candidates will be added to the master queue only. - TerminalAttributes::ShortcutIterator iterator = - terminalAttributes.getShortcutIterator(); - while (iterator.hasNextShortcutTarget()) { - // TODO: addWord only supports weak ordering, meaning we have no means - // to control the order of the shortcuts relative to one another or to the word. - // We need to either modulate the probability of each shortcut according - // to its own shortcut probability or to make the queue - // so that the insert order is protected inside the queue for words - // with the same score. For the moment we use -1 to make sure the shortcut will - // never be in front of the word. - uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; - const int shortcutTargetStringLength = iterator.getNextShortcutTarget( - MAX_WORD_LENGTH_INTERNAL, shortcutTarget); - addWord(shortcutTarget, shortcutTargetStringLength, shortcutProbability, - masterQueue); + + if (0 != finalProbability && !terminalAttributes.isBlacklistedOrNotAWord()) { + // If the probability is 0, we don't want to add this word. However we still + // want to add its shortcuts (including a possible whitelist entry) if any. + // Furthermore, if this is not a word (shortcut only for example) or a blacklisted + // entry then we never want to suggest this. + addWord(wordPointer, wordLength, finalProbability, masterQueue, + Dictionary::KIND_CORRECTION); + } + + const int shortcutProbability = finalProbability > 0 ? finalProbability - 1 : 0; + // Please note that the shortcut candidates will be added to the master queue only. + TerminalAttributes::ShortcutIterator iterator = + terminalAttributes.getShortcutIterator(); + while (iterator.hasNextShortcutTarget()) { + // TODO: addWord only supports weak ordering, meaning we have no means + // to control the order of the shortcuts relative to one another or to the word. + // We need to either modulate the probability of each shortcut according + // to its own shortcut probability or to make the queue + // so that the insert order is protected inside the queue for words + // with the same score. For the moment we use -1 to make sure the shortcut will + // never be in front of the word. + uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; + int shortcutFrequency; + const int shortcutTargetStringLength = iterator.getNextShortcutTarget( + MAX_WORD_LENGTH_INTERNAL, shortcutTarget, &shortcutFrequency); + int shortcutScore; + int kind; + if (shortcutFrequency == BinaryFormat::WHITELIST_SHORTCUT_FREQUENCY + && correction->sameAsTyped()) { + shortcutScore = S_INT_MAX; + kind = Dictionary::KIND_WHITELIST; + } else { + shortcutScore = shortcutProbability; + kind = Dictionary::KIND_CORRECTION; } + addWord(shortcutTarget, shortcutTargetStringLength, shortcutScore, + masterQueue, kind); } } @@ -419,18 +439,18 @@ inline void UnigramDictionary::onTerminal(const int probability, } const int finalProbability = correction->getFinalProbabilityForSubQueue( probability, &wordPointer, &wordLength, inputIndex); - addWord(wordPointer, wordLength, finalProbability, subQueue); + addWord(wordPointer, wordLength, finalProbability, subQueue, Dictionary::KIND_CORRECTION); } } int UnigramDictionary::getSubStringSuggestion( ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, Correction *correction, - WordsPriorityQueuePool* queuePool, const int inputLength, + WordsPriorityQueuePool *queuePool, const int inputSize, const bool hasAutoCorrectionCandidate, const int currentWordIndex, const int inputWordStartPos, const int inputWordLength, const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, - int*wordLengthArray, unsigned short* outputWord, int *outputWordLength) { + int *wordLengthArray, unsigned short *outputWord, int *outputWordLength) const { if (inputWordLength > MULTIPLE_WORDS_SUGGESTION_MAX_WORD_LENGTH) { return FLAG_MULTIPLE_SUGGEST_ABORT; } @@ -473,17 +493,18 @@ int UnigramDictionary::getSubStringSuggestion( // TODO: Remove the safety net above // ////////////////////////////////////////////// - unsigned short* tempOutputWord = 0; + unsigned short *tempOutputWord = 0; int nextWordLength = 0; // TODO: Optimize init suggestion initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, - inputLength, correction); + inputSize, correction); + unsigned short word[MAX_WORD_LENGTH_INTERNAL]; int freq = getMostFrequentWordLike( - inputWordStartPos, inputWordLength, proximityInfo, mWord); + inputWordStartPos, inputWordLength, correction, word); if (freq > 0) { nextWordLength = inputWordLength; - tempOutputWord = mWord; + tempOutputWord = word; } else if (!hasAutoCorrectionCandidate) { if (inputWordStartPos > 0) { const int offset = inputWordStartPos; @@ -503,14 +524,14 @@ int UnigramDictionary::getSubStringSuggestion( } } } - WordsPriorityQueue* queue = queuePool->getSubQueue(currentWordIndex, inputWordLength); + WordsPriorityQueue *queue = queuePool->getSubQueue(currentWordIndex, inputWordLength); // TODO: Return the correct value depending on doAutoCompletion if (!queue || queue->size() <= 0) { return FLAG_MULTIPLE_SUGGEST_ABORT; } int score = 0; const float ns = queue->getHighestNormalizedScore( - proximityInfo->getPrimaryInputWord(), inputWordLength, + correction->getPrimaryInputWord(), inputWordLength, &tempOutputWord, &score, &nextWordLength); if (DEBUG_DICT) { AKLOGI("NS(%d) = %f, Score = %d", currentWordIndex, ns, score); @@ -524,9 +545,9 @@ int UnigramDictionary::getSubStringSuggestion( freq = score >> (nextWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER); } if (DEBUG_DICT) { - AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d (%d)" - , currentWordIndex, freq, nextWordLength, inputWordLength, inputWordStartPos, - wordLengthArray[0]); + AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d (%d)", + currentWordIndex, freq, nextWordLength, inputWordLength, inputWordStartPos, + (currentWordIndex > 0) ? wordLengthArray[0] : 0); } if (freq <= 0 || nextWordLength <= 0 || MAX_WORD_LENGTH <= (outputWordStartPos + nextWordLength)) { @@ -545,7 +566,7 @@ int UnigramDictionary::getSubStringSuggestion( *outputWordLength = tempOutputWordLength; } - if ((inputWordStartPos + inputWordLength) < inputLength) { + if ((inputWordStartPos + inputWordLength) < inputSize) { if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) { return FLAG_MULTIPLE_SUGGEST_SKIP; } @@ -564,31 +585,31 @@ int UnigramDictionary::getSubStringSuggestion( freqArray[i], wordLengthArray[i]); } AKLOGI("Split two words: freq = %d, length = %d, %d, isSpace ? %d", pairFreq, - inputLength, tempOutputWordLength, isSpaceProximity); + inputSize, tempOutputWordLength, isSpaceProximity); } - addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue()); + addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue(), + Dictionary::KIND_CORRECTION); } return FLAG_MULTIPLE_SUGGEST_CONTINUE; } void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const bool useFullEditDistance, const int inputLength, - Correction *correction, WordsPriorityQueuePool* queuePool, - const bool hasAutoCorrectionCandidate, const int startInputPos, const int startWordIndex, - const int outputWordLength, int *freqArray, int* wordLengthArray, - unsigned short* outputWord) { + const bool useFullEditDistance, const int inputSize, Correction *correction, + WordsPriorityQueuePool *queuePool, const bool hasAutoCorrectionCandidate, + const int startInputPos, const int startWordIndex, const int outputWordLength, + int *freqArray, int *wordLengthArray, unsigned short *outputWord) const { if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) { // Return if the last word index return; } if (startWordIndex >= 1 && (hasAutoCorrectionCandidate - || inputLength < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) { + || inputSize < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) { // Do not suggest 3+ words if already has auto correction candidate return; } - for (int i = startInputPos + 1; i < inputLength; ++i) { + for (int i = startInputPos + 1; i < inputSize; ++i) { if (DEBUG_CORRECTION_FREQ) { AKLOGI("Multi words(%d), start in %d sep %d start out %d", startWordIndex, startInputPos, i, outputWordLength); @@ -599,7 +620,7 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, int inputWordStartPos = startInputPos; int inputWordLength = i - startInputPos; const int suggestionFlag = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, - codes, useFullEditDistance, correction, queuePool, inputLength, + codes, useFullEditDistance, correction, queuePool, inputSize, hasAutoCorrectionCandidate, startWordIndex, inputWordStartPos, inputWordLength, outputWordLength, true /* not used */, freqArray, wordLengthArray, outputWord, &tempOutputWordLength); @@ -616,14 +637,14 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, // Next word // Missing space inputWordStartPos = i; - inputWordLength = inputLength - i; - if(getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, - useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, + inputWordLength = inputSize - i; + if (getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, + useFullEditDistance, correction, queuePool, inputSize, hasAutoCorrectionCandidate, startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength, false /* missing space */, freqArray, wordLengthArray, outputWord, 0) != FLAG_MULTIPLE_SUGGEST_CONTINUE) { getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes, - useFullEditDistance, inputLength, correction, queuePool, + useFullEditDistance, inputSize, correction, queuePool, hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1, tempOutputWordLength, freqArray, wordLengthArray, outputWord); } @@ -646,7 +667,7 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, AKLOGI("Do mistyped space correction"); } getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, - useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, + useFullEditDistance, correction, queuePool, inputSize, hasAutoCorrectionCandidate, startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength, true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0); } @@ -654,10 +675,10 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const bool useFullEditDistance, const int inputLength, - Correction *correction, WordsPriorityQueuePool* queuePool, - const bool hasAutoCorrectionCandidate) { - if (inputLength >= MAX_WORD_LENGTH) return; + const bool useFullEditDistance, const int inputSize, + Correction *correction, WordsPriorityQueuePool *queuePool, + const bool hasAutoCorrectionCandidate) const { + if (inputSize >= MAX_WORD_LENGTH) return; if (DEBUG_DICT) { AKLOGI("--- Suggest multiple words"); } @@ -670,7 +691,7 @@ void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximit const int startInputPos = 0; const int startWordIndex = 0; getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes, - useFullEditDistance, inputLength, correction, queuePool, hasAutoCorrectionCandidate, + useFullEditDistance, inputSize, correction, queuePool, hasAutoCorrectionCandidate, startInputPos, startWordIndex, outputWordLength, freqArray, wordLengthArray, outputWord); } @@ -678,13 +699,13 @@ void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximit // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous // interface. inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, - const int inputLength, ProximityInfo *proximityInfo, unsigned short *word) { - uint16_t inWord[inputLength]; + const int inputSize, Correction *correction, unsigned short *word) const { + uint16_t inWord[inputSize]; - for (int i = 0; i < inputLength; ++i) { - inWord[i] = (uint16_t)proximityInfo->getPrimaryCharAt(startInputIndex + i); + for (int i = 0; i < inputSize; ++i) { + inWord[i] = (uint16_t)correction->getPrimaryCharAt(startInputIndex + i); } - return getMostFrequentWordLikeInner(inWord, inputLength, word); + return getMostFrequentWordLikeInner(inWord, inputSize, word); } // This function will take the position of a character array within a CharGroup, @@ -700,13 +721,13 @@ inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, // In and out parameters may point to the same location. This function takes care // not to use any input parameters after it wrote into its outputs. static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, - const uint8_t* const root, const int startPos, - const uint16_t* const inWord, const int startInputIndex, - int32_t* outNewWord, int* outInputIndex, int* outPos) { - const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags)); + const uint8_t *const root, const int startPos, const uint16_t *const inWord, + const int startInputIndex, const int inputSize, int32_t *outNewWord, int *outInputIndex, + int *outPos) { + const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); int pos = startPos; - int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - int32_t baseChar = toBaseLowerCase(character); + int32_t codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); + int32_t baseChar = toBaseLowerCase(codePoint); const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]); if (baseChar != wChar) { @@ -715,18 +736,18 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, return false; } int inputIndex = startInputIndex; - outNewWord[inputIndex] = character; + outNewWord[inputIndex] = codePoint; if (hasMultipleChars) { - character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); - while (NOT_A_CHARACTER != character) { - baseChar = toBaseLowerCase(character); - if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) { + codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); + while (NOT_A_CODE_POINT != codePoint) { + baseChar = toBaseLowerCase(codePoint); + if (inputIndex + 1 >= inputSize || toBaseLowerCase(inWord[++inputIndex]) != baseChar) { *outPos = BinaryFormat::skipOtherCharacters(root, pos); *outInputIndex = startInputIndex; return false; } - outNewWord[inputIndex] = character; - character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + outNewWord[inputIndex] = codePoint; + codePoint = BinaryFormat::getCodePointAndForwardPointer(root, &pos); } } *outInputIndex = inputIndex + 1; @@ -738,11 +759,12 @@ static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, // It will compare the frequency to the max frequency, and if greater, will // copy the word into the output buffer. In output value maxFreq, it will // write the new maximum frequency if it changed. -static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length, - short unsigned int* outWord, int* maxFreq) { +static inline void onTerminalWordLike(const int freq, int32_t *newWord, const int length, + short unsigned int *outWord, int *maxFreq) { if (freq > *maxFreq) { - for (int q = 0; q < length; ++q) + for (int q = 0; q < length; ++q) { outWord[q] = newWord[q]; + } outWord[length] = 0; *maxFreq = freq; } @@ -750,30 +772,33 @@ static inline void onTerminalWordLike(const int freq, int32_t* newWord, const in // Will find the highest frequency of the words like the one passed as an argument, // that is, everything that only differs by case/accents. -int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord, - const int length, short unsigned int* outWord) { +int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t *const inWord, + const int inputSize, short unsigned int *outWord) const { int32_t newWord[MAX_WORD_LENGTH_INTERNAL]; int depth = 0; int maxFreq = -1; - const uint8_t* const root = DICT_ROOT; + const uint8_t *const root = DICT_ROOT; + int stackChildCount[MAX_WORD_LENGTH_INTERNAL]; + int stackInputIndex[MAX_WORD_LENGTH_INTERNAL]; + int stackSiblingPos[MAX_WORD_LENGTH_INTERNAL]; int startPos = 0; - mStackChildCount[0] = BinaryFormat::getGroupCountAndForwardPointer(root, &startPos); - mStackInputIndex[0] = 0; - mStackSiblingPos[0] = startPos; + stackChildCount[0] = BinaryFormat::getGroupCountAndForwardPointer(root, &startPos); + stackInputIndex[0] = 0; + stackSiblingPos[0] = startPos; while (depth >= 0) { - const int charGroupCount = mStackChildCount[depth]; - int pos = mStackSiblingPos[depth]; + const int charGroupCount = stackChildCount[depth]; + int pos = stackSiblingPos[depth]; for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) { - int inputIndex = mStackInputIndex[depth]; + int inputIndex = stackInputIndex[depth]; const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); // Test whether all chars in this group match with the word we are searching for. If so, - // we want to traverse its children (or if the length match, evaluate its frequency). + // we want to traverse its children (or if the inputSize match, evaluate its frequency). // Note that this function will output the position regardless, but will only write // into inputIndex if there is a match. const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord, - inputIndex, newWord, &inputIndex, &pos); - if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) { + inputIndex, inputSize, newWord, &inputIndex, &pos); + if (isAlike && (BinaryFormat::FLAG_IS_TERMINAL & flags) && (inputIndex == inputSize)) { const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq); } @@ -782,18 +807,18 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos); // If we had a match and the word has children, we want to traverse them. We don't have // to traverse words longer than the one we are searching for, since they will not match - // anyway, so don't traverse unless inputIndex < length. - if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) { + // anyway, so don't traverse unless inputIndex < inputSize. + if (isAlike && (-1 != childrenNodePos) && (inputIndex < inputSize)) { // Save position for this depth, to get back to this once children are done - mStackChildCount[depth] = charGroupIndex; - mStackSiblingPos[depth] = siblingPos; + stackChildCount[depth] = charGroupIndex; + stackSiblingPos[depth] = siblingPos; // Prepare stack values for next depth ++depth; int childrenPos = childrenNodePos; - mStackChildCount[depth] = + stackChildCount[depth] = BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos); - mStackSiblingPos[depth] = childrenPos; - mStackInputIndex[depth] = inputIndex; + stackSiblingPos[depth] = childrenPos; + stackInputIndex[depth] = inputIndex; pos = childrenPos; // Go to the next depth level. ++depth; @@ -808,18 +833,25 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor return maxFreq; } -int UnigramDictionary::getFrequency(const int32_t* const inWord, const int length) const { - const uint8_t* const root = DICT_ROOT; - int pos = BinaryFormat::getTerminalPosition(root, inWord, length); +int UnigramDictionary::getFrequency(const int32_t *const inWord, const int length) const { + const uint8_t *const root = DICT_ROOT; + int pos = BinaryFormat::getTerminalPosition(root, inWord, length, + false /* forceLowerCaseSearch */); if (NOT_VALID_WORD == pos) { return NOT_A_PROBABILITY; } const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); + if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) { + // If this is not a word, or if it's a blacklisted entry, it should behave as + // having no frequency outside of the suggestion process (where it should be used + // for shortcuts). + return NOT_A_PROBABILITY; + } + const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); if (hasMultipleChars) { pos = BinaryFormat::skipOtherCharacters(root, pos); } else { - BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); + BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); } const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); return unigramFreq; @@ -848,7 +880,7 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs inline bool UnigramDictionary::processCurrentNode(const int initialPos, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction, int *newCount, int *newChildrenPosition, int *nextSiblingPosition, - WordsPriorityQueuePool *queuePool, const int currentWordIndex) { + WordsPriorityQueuePool *queuePool, const int currentWordIndex) const { if (DEBUG_DICT) { correction->checkState(); } @@ -863,8 +895,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children) // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos); - const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); - const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags)); + const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); + const bool isTerminalNode = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags)); bool needsToInvokeOnTerminal = false; @@ -873,23 +905,23 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, // else if FLAG_IS_TERMINAL: the frequency // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address // Note that you can't have a node that both is not a terminal and has no children. - int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); - assert(NOT_A_CHARACTER != c); + int32_t c = BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos); + assert(NOT_A_CODE_POINT != c); // We are going to loop through each character and make it look like it's a different // node each time. To do that, we will process characters in this node in order until - // we find the character terminator. This is signalled by getCharCode* returning - // NOT_A_CHARACTER. + // we find the character terminator. This is signalled by getCodePoint* returning + // NOT_A_CODE_POINT. // As a special case, if there is only one character in this node, we must not read the - // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags. + // next bytes so we will simulate the NOT_A_CODE_POINT return by testing the flags. // This way, each loop run will look like a "virtual node". do { // We prefetch the next char. If 'c' is the last char of this node, we will have - // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node + // NOT_A_CODE_POINT in the next char. From this we can decide whether this virtual node // should behave as a terminal or not and whether we have children. const int32_t nextc = hasMultipleChars - ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER; - const bool isLastChar = (NOT_A_CHARACTER == nextc); + ? BinaryFormat::getCodePointAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CODE_POINT; + const bool isLastChar = (NOT_A_CODE_POINT == nextc); // If there are more chars in this nodes, then this virtual node is not a terminal. // If we are on the last char, this virtual node is a terminal if this node is. const bool isTerminal = isLastChar && isTerminalNode; @@ -918,9 +950,9 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, // Prepare for the next character. Promote the prefetched char to current char - the loop // will take care of prefetching the next. If we finally found our last char, nextc will - // contain NOT_A_CHARACTER. + // contain NOT_A_CODE_POINT. c = nextc; - } while (NOT_A_CHARACTER != c); + } while (NOT_A_CODE_POINT != c); if (isTerminalNode) { // The frequency should be here, because we come here only if this is actually @@ -982,5 +1014,4 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, *newChildrenPosition = childrenPos; return true; } - } // namespace latinime diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h index a1a8299e5..57129bb07 100644 --- a/native/jni/src/unigram_dictionary.h +++ b/native/jni/src/unigram_dictionary.h @@ -19,53 +19,19 @@ #include <map> #include <stdint.h> -#include "correction.h" -#include "correction_state.h" #include "defines.h" -#include "proximity_info.h" -#include "words_priority_queue.h" -#include "words_priority_queue_pool.h" namespace latinime { +class Correction; +class ProximityInfo; class TerminalAttributes; +class WordsPriorityQueuePool; + class UnigramDictionary { typedef struct { int first; int second; int replacement; } digraph_t; public: - // Mask and flags for children address type selection. - static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; - static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; - static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; - static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; - static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; - - // Flag for single/multiple char group - static const int FLAG_HAS_MULTIPLE_CHARS = 0x20; - - // Flag for terminal groups - static const int FLAG_IS_TERMINAL = 0x10; - - // Flag for shortcut targets presence - static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08; - // Flag for bigram presence - static const int FLAG_HAS_BIGRAMS = 0x04; - - // Attribute (bigram/shortcut) related flags: - // Flag for presence of more attributes - static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; - // Flag for sign of offset. If this flag is set, the offset value must be negated. - static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; - - // Mask for attribute frequency, stored on 4 bits inside the flags byte. - static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F; - - // Mask and flags for attribute address type selection. - static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; - static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; - static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; - static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; - // Error tolerances static const int DEFAULT_MAX_ERRORS = 2; static const int MAX_ERRORS_FOR_TWO_WORDS = 1; @@ -73,80 +39,80 @@ class UnigramDictionary { static const int FLAG_MULTIPLE_SUGGEST_ABORT = 0; static const int FLAG_MULTIPLE_SUGGEST_SKIP = 1; static const int FLAG_MULTIPLE_SUGGEST_CONTINUE = 2; - UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, + UnigramDictionary(const uint8_t *const streamStart, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, const unsigned int flags); - int getFrequency(const int32_t* const inWord, const int length) const; + int getFrequency(const int32_t *const inWord, const int length) const; int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; - int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, - Correction *correction, const int *xcoordinates, const int *ycoordinates, - const int *codes, const int codesSize, const std::map<int, int> *bigramMap, - const uint8_t *bigramFilter, const bool useFullEditDistance, unsigned short *outWords, - int *frequencies); + int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, + const int *ycoordinates, const int *codes, const int codesSize, + const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, + const bool useFullEditDistance, unsigned short *outWords, int *frequencies, + int *outputTypes) const; virtual ~UnigramDictionary(); private: + DISALLOW_IMPLICIT_CONSTRUCTORS(UnigramDictionary); void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const int inputLength, + const int *ycoordinates, const int *codes, const int inputSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const bool useFullEditDistance, Correction *correction, - WordsPriorityQueuePool *queuePool); + WordsPriorityQueuePool *queuePool) const; int getDigraphReplacement(const int *codes, const int i, const int codesSize, - const digraph_t* const digraphs, const unsigned int digraphsSize) const; + const digraph_t *const digraphs, const unsigned int digraphsSize) const; void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, - const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, + const int *xcoordinates, const int *ycoordinates, const int *codesBuffer, int *xCoordinatesBuffer, int *yCoordinatesBuffer, const int codesBufferSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, - const bool useFullEditDistance, const int* codesSrc, const int codesRemain, - const int currentDepth, int* codesDest, Correction *correction, - WordsPriorityQueuePool* queuePool, const digraph_t* const digraphs, - const unsigned int digraphsSize); + const bool useFullEditDistance, const int *codesSrc, const int codesRemain, + const int currentDepth, int *codesDest, Correction *correction, + WordsPriorityQueuePool *queuePool, const digraph_t *const digraphs, + const unsigned int digraphsSize) const; void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const int codesSize, Correction *correction); + const int *ycoordinates, const int *codes, const int codesSize, + Correction *correction) const; void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const std::map<int, int> *bigramMap, - const uint8_t *bigramFilter, const bool useFullEditDistance, const int inputLength, - Correction *correction, WordsPriorityQueuePool* queuePool); + const uint8_t *bigramFilter, const bool useFullEditDistance, const int inputSize, + Correction *correction, WordsPriorityQueuePool *queuePool) const; void getSuggestionCandidates( - const bool useFullEditDistance, const int inputLength, + const bool useFullEditDistance, const int inputSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, - Correction *correction, WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, - const int maxErrors, const int currentWordIndex); + Correction *correction, WordsPriorityQueuePool *queuePool, const bool doAutoCompletion, + const int maxErrors, const int currentWordIndex) const; void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const bool useFullEditDistance, const int inputLength, - Correction *correction, WordsPriorityQueuePool* queuePool, - const bool hasAutoCorrectionCandidate); + const bool useFullEditDistance, const int inputSize, + Correction *correction, WordsPriorityQueuePool *queuePool, + const bool hasAutoCorrectionCandidate) const; void onTerminal(const int freq, const TerminalAttributes& terminalAttributes, Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, - const int currentWordIndex); - bool needsToSkipCurrentNode(const unsigned short c, - const int inputIndex, const int skipPos, const int depth); + const int currentWordIndex) const; // Process a node by considering proximity, missing and excessive character bool processCurrentNode(const int initialPos, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction, int *newCount, int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, - const int currentWordIndex); - int getMostFrequentWordLike(const int startInputIndex, const int inputLength, - ProximityInfo *proximityInfo, unsigned short *word); - int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, - short unsigned int *outWord); + const int currentWordIndex) const; + int getMostFrequentWordLike(const int startInputIndex, const int inputSize, + Correction *correction, unsigned short *word) const; + int getMostFrequentWordLikeInner(const uint16_t *const inWord, const int inputSize, + short unsigned int *outWord) const; int getSubStringSuggestion( ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, Correction *correction, - WordsPriorityQueuePool* queuePool, const int inputLength, + WordsPriorityQueuePool *queuePool, const int inputSize, const bool hasAutoCorrectionCandidate, const int currentWordIndex, const int inputWordStartPos, const int inputWordLength, const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, - int *wordLengthArray, unsigned short* outputWord, int *outputWordLength); + int *wordLengthArray, unsigned short *outputWord, int *outputWordLength) const; void getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, - const bool useFullEditDistance, const int inputLength, - Correction *correction, WordsPriorityQueuePool* queuePool, + const bool useFullEditDistance, const int inputSize, + Correction *correction, WordsPriorityQueuePool *queuePool, const bool hasAutoCorrectionCandidate, const int startPos, const int startWordIndex, - const int outputWordLength, int *freqArray, int* wordLengthArray, - unsigned short* outputWord); + const int outputWordLength, int *freqArray, int *wordLengthArray, + unsigned short *outputWord) const; - const uint8_t* const DICT_ROOT; + const uint8_t *const DICT_ROOT; const int MAX_WORD_LENGTH; const int MAX_WORDS; const int TYPED_LETTER_MULTIPLIER; @@ -158,13 +124,6 @@ class UnigramDictionary { static const digraph_t GERMAN_UMLAUT_DIGRAPHS[]; static const digraph_t FRENCH_LIGATURES_DIGRAPHS[]; - - // Still bundled members - unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];// TODO: remove - int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove - int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove - int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove }; } // namespace latinime - #endif // LATINIME_UNIGRAM_DICTIONARY_H diff --git a/native/jni/src/words_priority_queue.h b/native/jni/src/words_priority_queue.h index 7629251d6..19efa5da3 100644 --- a/native/jni/src/words_priority_queue.h +++ b/native/jni/src/words_priority_queue.h @@ -18,8 +18,9 @@ #define LATINIME_WORDS_PRIORITY_QUEUE_H #include <cstring> // for memcpy() -#include <iostream> #include <queue> + +#include "correction.h" #include "defines.h" namespace latinime { @@ -32,31 +33,32 @@ class WordsPriorityQueue { unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; int mWordLength; bool mUsed; + int mType; - void setParams(int score, unsigned short* word, int wordLength) { + void setParams(int score, unsigned short *word, int wordLength, int type) { mScore = score; mWordLength = wordLength; memcpy(mWord, word, sizeof(unsigned short) * wordLength); mUsed = true; + mType = type; } }; - WordsPriorityQueue(int maxWords, int maxWordLength) : - MAX_WORDS((unsigned int) maxWords), MAX_WORD_LENGTH( - (unsigned int) maxWordLength) { - mSuggestedWords = new SuggestedWord[maxWordLength]; + WordsPriorityQueue(int maxWords, int maxWordLength) + : mSuggestions(), MAX_WORDS(static_cast<unsigned int>(maxWords)), + MAX_WORD_LENGTH(static_cast<unsigned int>(maxWordLength)), + mSuggestedWords(new SuggestedWord[maxWordLength]), mHighestSuggestedWord(0) { for (int i = 0; i < maxWordLength; ++i) { mSuggestedWords[i].mUsed = false; } - mHighestSuggestedWord = 0; } - ~WordsPriorityQueue() { + virtual ~WordsPriorityQueue() { delete[] mSuggestedWords; } - void push(int score, unsigned short* word, int wordLength) { - SuggestedWord* sw = 0; + void push(int score, unsigned short *word, int wordLength, int type) { + SuggestedWord *sw = 0; if (mSuggestions.size() >= MAX_WORDS) { sw = mSuggestions.top(); const int minScore = sw->mScore; @@ -68,9 +70,9 @@ class WordsPriorityQueue { } } if (sw == 0) { - sw = getFreeSuggestedWord(score, word, wordLength); + sw = getFreeSuggestedWord(score, word, wordLength, type); } else { - sw->setParams(score, word, wordLength); + sw->setParams(score, word, wordLength, type); } if (sw == 0) { AKLOGE("SuggestedWord is accidentally null."); @@ -86,21 +88,21 @@ class WordsPriorityQueue { } } - SuggestedWord* top() { + SuggestedWord *top() { if (mSuggestions.empty()) return 0; - SuggestedWord* sw = mSuggestions.top(); + SuggestedWord *sw = mSuggestions.top(); return sw; } - int outputSuggestions(const unsigned short* before, const int beforeLength, - int *frequencies, unsigned short *outputChars) { + int outputSuggestions(const unsigned short *before, const int beforeLength, + int *frequencies, unsigned short *outputChars, int* outputTypes) { mHighestSuggestedWord = 0; const unsigned int size = min( MAX_WORDS, static_cast<unsigned int>(mSuggestions.size())); - SuggestedWord* swBuffer[size]; + SuggestedWord *swBuffer[size]; int index = size - 1; while (!mSuggestions.empty() && index >= 0) { - SuggestedWord* sw = mSuggestions.top(); + SuggestedWord *sw = mSuggestions.top(); if (DEBUG_WORDS_PRIORITY_QUEUE) { AKLOGI("dump word. %d", sw->mScore); DUMP_WORD(sw->mWord, sw->mWordLength); @@ -110,11 +112,11 @@ class WordsPriorityQueue { --index; } if (size >= 2) { - SuggestedWord* nsMaxSw = 0; + SuggestedWord *nsMaxSw = 0; unsigned int maxIndex = 0; float maxNs = 0; for (unsigned int i = 0; i < size; ++i) { - SuggestedWord* tempSw = swBuffer[i]; + SuggestedWord *tempSw = swBuffer[i]; if (!tempSw) { continue; } @@ -126,22 +128,23 @@ class WordsPriorityQueue { } } if (maxIndex > 0 && nsMaxSw) { - memmove(&swBuffer[1], &swBuffer[0], maxIndex * sizeof(SuggestedWord*)); + memmove(&swBuffer[1], &swBuffer[0], maxIndex * sizeof(SuggestedWord *)); swBuffer[0] = nsMaxSw; } } for (unsigned int i = 0; i < size; ++i) { - SuggestedWord* sw = swBuffer[i]; + SuggestedWord *sw = swBuffer[i]; if (!sw) { AKLOGE("SuggestedWord is null %d", i); continue; } const unsigned int wordLength = sw->mWordLength; - char* targetAdr = (char*) outputChars + i * MAX_WORD_LENGTH * sizeof(short); + unsigned short *targetAddress = outputChars + i * MAX_WORD_LENGTH; frequencies[i] = sw->mScore; - memcpy(targetAdr, sw->mWord, (wordLength) * sizeof(short)); + outputTypes[i] = sw->mType; + memcpy(targetAddress, sw->mWord, wordLength * sizeof(unsigned short)); if (wordLength < MAX_WORD_LENGTH) { - ((unsigned short*) targetAdr)[wordLength] = 0; + targetAddress[wordLength] = 0; } sw->mUsed = false; } @@ -155,7 +158,7 @@ class WordsPriorityQueue { void clear() { mHighestSuggestedWord = 0; while (!mSuggestions.empty()) { - SuggestedWord* sw = mSuggestions.top(); + SuggestedWord *sw = mSuggestions.top(); if (DEBUG_WORDS_PRIORITY_QUEUE) { AKLOGI("Clear word. %d", sw->mScore); DUMP_WORD(sw->mWord, sw->mWordLength); @@ -172,8 +175,8 @@ class WordsPriorityQueue { DUMP_WORD(mHighestSuggestedWord->mWord, mHighestSuggestedWord->mWordLength); } - float getHighestNormalizedScore(const unsigned short* before, const int beforeLength, - unsigned short** outWord, int *outScore, int *outLength) { + float getHighestNormalizedScore(const unsigned short *before, const int beforeLength, + unsigned short **outWord, int *outScore, int *outLength) { if (!mHighestSuggestedWord) { return 0.0; } @@ -182,27 +185,28 @@ class WordsPriorityQueue { } private: + DISALLOW_IMPLICIT_CONSTRUCTORS(WordsPriorityQueue); struct wordComparator { bool operator ()(SuggestedWord * left, SuggestedWord * right) { return left->mScore > right->mScore; } }; - SuggestedWord* getFreeSuggestedWord(int score, unsigned short* word, - int wordLength) { + SuggestedWord *getFreeSuggestedWord(int score, unsigned short *word, + int wordLength, int type) { for (unsigned int i = 0; i < MAX_WORD_LENGTH; ++i) { if (!mSuggestedWords[i].mUsed) { - mSuggestedWords[i].setParams(score, word, wordLength); + mSuggestedWords[i].setParams(score, word, wordLength, type); return &mSuggestedWords[i]; } } return 0; } - static float getNormalizedScore(SuggestedWord* sw, const unsigned short* before, - const int beforeLength, unsigned short** outWord, int *outScore, int *outLength) { + static float getNormalizedScore(SuggestedWord *sw, const unsigned short *before, + const int beforeLength, unsigned short **outWord, int *outScore, int *outLength) { const int score = sw->mScore; - unsigned short* word = sw->mWord; + unsigned short *word = sw->mWord; const int wordLength = sw->mWordLength; if (outScore) { *outScore = score; @@ -217,14 +221,13 @@ class WordsPriorityQueue { before, beforeLength, word, wordLength, score); } - typedef std::priority_queue<SuggestedWord*, std::vector<SuggestedWord*>, + typedef std::priority_queue<SuggestedWord *, std::vector<SuggestedWord *>, wordComparator> Suggestions; Suggestions mSuggestions; const unsigned int MAX_WORDS; const unsigned int MAX_WORD_LENGTH; - SuggestedWord* mSuggestedWords; - SuggestedWord* mHighestSuggestedWord; + SuggestedWord *mSuggestedWords; + SuggestedWord *mHighestSuggestedWord; }; -} - +} // namespace latinime #endif // LATINIME_WORDS_PRIORITY_QUEUE_H diff --git a/native/jni/src/words_priority_queue_pool.h b/native/jni/src/words_priority_queue_pool.h index 210b5a848..2d52903e0 100644 --- a/native/jni/src/words_priority_queue_pool.h +++ b/native/jni/src/words_priority_queue_pool.h @@ -17,20 +17,20 @@ #ifndef LATINIME_WORDS_PRIORITY_QUEUE_POOL_H #define LATINIME_WORDS_PRIORITY_QUEUE_POOL_H -#include <assert.h> -#include <new> +#include <cassert> #include "words_priority_queue.h" namespace latinime { class WordsPriorityQueuePool { public: - WordsPriorityQueuePool(int mainQueueMaxWords, int subQueueMaxWords, int maxWordLength) { - // Note: using placement new() requires the caller to call the destructor explicitly. - mMasterQueue = new(mMasterQueueBuf) WordsPriorityQueue(mainQueueMaxWords, maxWordLength); + WordsPriorityQueuePool(int mainQueueMaxWords, int subQueueMaxWords, int maxWordLength) + // Note: using placement new() requires the caller to call the destructor explicitly. + : mMasterQueue(new(mMasterQueueBuf) WordsPriorityQueue( + mainQueueMaxWords, maxWordLength)) { for (int i = 0, subQueueBufOffset = 0; i < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS * SUB_QUEUE_MAX_COUNT; - ++i, subQueueBufOffset += sizeof(WordsPriorityQueue)) { + ++i, subQueueBufOffset += static_cast<int>(sizeof(WordsPriorityQueue))) { mSubQueues[i] = new(mSubQueueBuf + subQueueBufOffset) WordsPriorityQueue(subQueueMaxWords, maxWordLength); } @@ -44,11 +44,11 @@ class WordsPriorityQueuePool { } } - WordsPriorityQueue* getMasterQueue() { + WordsPriorityQueue *getMasterQueue() { return mMasterQueue; } - WordsPriorityQueue* getSubQueue(const int wordIndex, const int inputWordLength) { + WordsPriorityQueue *getSubQueue(const int wordIndex, const int inputWordLength) { if (wordIndex >= MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) { return 0; } @@ -70,7 +70,7 @@ class WordsPriorityQueuePool { inline void clearSubQueue(const int wordIndex) { for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { - WordsPriorityQueue* queue = getSubQueue(wordIndex, i); + WordsPriorityQueue *queue = getSubQueue(wordIndex, i); if (queue) { queue->clear(); } @@ -85,12 +85,12 @@ class WordsPriorityQueuePool { } private: - WordsPriorityQueue* mMasterQueue; - WordsPriorityQueue* mSubQueues[SUB_QUEUE_MAX_COUNT * MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; + DISALLOW_IMPLICIT_CONSTRUCTORS(WordsPriorityQueuePool); char mMasterQueueBuf[sizeof(WordsPriorityQueue)]; - char mSubQueueBuf[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - * SUB_QUEUE_MAX_COUNT * sizeof(WordsPriorityQueue)]; + char mSubQueueBuf[SUB_QUEUE_MAX_COUNT * MULTIPLE_WORDS_SUGGESTION_MAX_WORDS + * sizeof(WordsPriorityQueue)]; + WordsPriorityQueue *mMasterQueue; + WordsPriorityQueue *mSubQueues[SUB_QUEUE_MAX_COUNT * MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; }; -} - +} // namespace latinime #endif // LATINIME_WORDS_PRIORITY_QUEUE_POOL_H |