diff options
Diffstat (limited to 'native/src')
-rw-r--r-- | native/src/bigram_dictionary.cpp | 31 | ||||
-rw-r--r-- | native/src/bigram_dictionary.h | 5 | ||||
-rw-r--r-- | native/src/binary_format.h | 209 | ||||
-rw-r--r-- | native/src/char_utils.h | 2 | ||||
-rw-r--r-- | native/src/debug.h | 3 | ||||
-rw-r--r-- | native/src/defines.h | 57 | ||||
-rw-r--r-- | native/src/dictionary.cpp | 41 | ||||
-rw-r--r-- | native/src/dictionary.h | 9 | ||||
-rw-r--r-- | native/src/proximity_info.cpp | 4 | ||||
-rw-r--r-- | native/src/proximity_info.h | 10 | ||||
-rw-r--r-- | native/src/unigram_dictionary.cpp | 1082 | ||||
-rw-r--r-- | native/src/unigram_dictionary.h | 129 |
12 files changed, 1244 insertions, 338 deletions
diff --git a/native/src/bigram_dictionary.cpp b/native/src/bigram_dictionary.cpp index 5ec310f07..d11aee28e 100644 --- a/native/src/bigram_dictionary.cpp +++ b/native/src/bigram_dictionary.cpp @@ -30,8 +30,10 @@ BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion), HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) { - if (DEBUG_DICT) LOGI("BigramDictionary - constructor"); - if (DEBUG_DICT) LOGI("Has Bigram : %d", hasBigram); + if (DEBUG_DICT) { + LOGI("BigramDictionary - constructor"); + LOGI("Has Bigram : %d", hasBigram); + } } BigramDictionary::~BigramDictionary() { @@ -40,8 +42,10 @@ BigramDictionary::~BigramDictionary() { bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency) { word[length] = 0; if (DEBUG_DICT) { +#ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = word[i]; +#endif LOGI("Bigram: Found word = %s, freq = %d :", s, frequency); } @@ -54,7 +58,9 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ } insertAt++; } - if (DEBUG_DICT) LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); + if (DEBUG_DICT) { + LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); + } if (insertAt < mMaxBigrams) { memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]), (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]), @@ -68,7 +74,9 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ *dest++ = *word++; } *dest = 0; // NULL terminate - if (DEBUG_DICT) LOGI("Bigram: Added word at %d", insertAt); + if (DEBUG_DICT) { + LOGI("Bigram: Added word at %d", insertAt); + } return true; } return false; @@ -105,9 +113,10 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i mMaxBigrams = maxBigrams; if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) { - int pos = mParentDictionary->isValidWordRec( - DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength); - if (DEBUG_DICT) LOGI("Pos -> %d", pos); + int pos = mParentDictionary->getBigramPosition(prevWord, prevWordLength); + if (DEBUG_DICT) { + LOGI("Pos -> %d", pos); + } if (pos < 0) { return 0; } @@ -151,7 +160,9 @@ void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequenc } pos = followDownBranchAddress; // pos start at count int count = DICT[pos] & 0xFF; - if (DEBUG_DICT) LOGI("count - %d",count); + if (DEBUG_DICT) { + LOGI("count - %d",count); + } pos++; for (int i = 0; i < count; i++) { // pos at data @@ -225,7 +236,9 @@ void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequenc } depth++; if (followDownBranchAddress == 0) { - if (DEBUG_DICT) LOGI("ERROR!!! Cannot find bigram!!"); + if (DEBUG_DICT) { + LOGI("ERROR!!! Cannot find bigram!!"); + } break; } } diff --git a/native/src/bigram_dictionary.h b/native/src/bigram_dictionary.h index d658b93e6..c07458a38 100644 --- a/native/src/bigram_dictionary.h +++ b/native/src/bigram_dictionary.h @@ -50,6 +50,7 @@ private: int *mInputCodes; int mInputLength; }; -// ---------------------------------------------------------------------------- -}; // namespace latinime + +} // namespace latinime + #endif // LATINIME_BIGRAM_DICTIONARY_H diff --git a/native/src/binary_format.h b/native/src/binary_format.h new file mode 100644 index 000000000..e9f108e25 --- /dev/null +++ b/native/src/binary_format.h @@ -0,0 +1,209 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BINARY_FORMAT_H +#define LATINIME_BINARY_FORMAT_H + +namespace latinime { + +class BinaryFormat { +private: + const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; + const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; + const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; + +public: + static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); + static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); + static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); + static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos); + static int skipOtherCharacters(const uint8_t* const dict, const int pos); + static int skipAttributes(const uint8_t* const dict, const int pos); + static int skipChildrenPosition(const uint8_t flags, const int pos); + static int skipFrequency(const uint8_t flags, const int pos); + static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos); + static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags, + const int pos); + static int readChildrenPosition(const uint8_t* const dict, const uint8_t flags, const int pos); + static bool hasChildrenInFlags(const uint8_t flags); + static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags, + int *pos); +}; + +inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { + return dict[(*pos)++]; +} + +inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t* const dict, int* pos) { + return dict[(*pos)++]; +} + +inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos) { + const int origin = *pos; + const int32_t character = dict[origin]; + if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { + if (character == CHARACTER_ARRAY_TERMINATOR) { + *pos = origin + 1; + return NOT_A_CHARACTER; + } else { + *pos = origin + 3; + const int32_t char_1 = character << 16; + const int32_t char_2 = char_1 + (dict[origin + 1] << 8); + return char_2 + dict[origin + 2]; + } + } else { + *pos = origin + 1; + return character; + } +} + +inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t* const dict, + const int pos) { + return dict[pos]; +} + +inline int BinaryFormat::skipOtherCharacters(const uint8_t* const dict, const int pos) { + int currentPos = pos; + int32_t character = dict[currentPos++]; + while (CHARACTER_ARRAY_TERMINATOR != character) { + if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { + currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE; + } + character = dict[currentPos++]; + } + return currentPos; +} + +static inline int attributeAddressSize(const uint8_t flags) { + static const int ATTRIBUTE_ADDRESS_SHIFT = 4; + return (flags & UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; + /* Note: this is a value-dependant optimization of what may probably be + more readably written this way: + switch (flags * UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE) { + case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; + case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; + case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; + default: return 0; + } + */ +} + +inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) { + int currentPos = pos; + uint8_t flags = getFlagsAndForwardPointer(dict, ¤tPos); + while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) { + currentPos += attributeAddressSize(flags); + flags = getFlagsAndForwardPointer(dict, ¤tPos); + } + currentPos += attributeAddressSize(flags); + return currentPos; +} + +static inline int childrenAddressSize(const uint8_t flags) { + static const int CHILDREN_ADDRESS_SHIFT = 6; + return (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; + /* See the note in attributeAddressSize. The same applies here */ +} + +inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) { + return pos + childrenAddressSize(flags); +} + +inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { + return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos; +} + +inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags, + const int pos) { + // This function skips all attributes. The format makes provision for future extension + // with other attributes (notably shortcuts) but for the time being, bigrams are the + // only attributes that may be found in a character group, so we only look at bigrams + // in this version. + if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) { + return skipAttributes(dict, pos); + } else { + return pos; + } +} + +inline int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t* const dict, + const uint8_t flags, const int pos) { + int currentPos = pos; + currentPos = skipChildrenPosition(flags, currentPos); + currentPos = skipAllAttributes(dict, flags, currentPos); + return currentPos; +} + +inline int BinaryFormat::readChildrenPosition(const uint8_t* const dict, const uint8_t flags, + const int pos) { + int offset = 0; + switch (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags) { + case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: + offset = dict[pos]; + break; + case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: + offset = dict[pos] << 8; + offset += dict[pos + 1]; + break; + case UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: + offset = dict[pos] << 16; + offset += dict[pos + 1] << 8; + offset += dict[pos + 2]; + break; + default: + // If we come here, it means we asked for the children of a word with + // no children. + return -1; + } + return pos + offset; +} + +inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { + return (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS + != (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); +} + +inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* const dict, + const uint8_t flags, int *pos) { + int offset = 0; + const int origin = *pos; + switch (UnigramDictionary::MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { + case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + offset = dict[origin]; + *pos = origin + 1; + break; + case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + offset = dict[origin] << 8; + offset += dict[origin + 1]; + *pos = origin + 2; + break; + case UnigramDictionary::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + offset = dict[origin] << 16; + offset += dict[origin + 1] << 8; + offset += dict[origin + 2]; + *pos = origin + 3; + break; + } + if (UnigramDictionary::FLAG_ATTRIBUTE_OFFSET_NEGATIVE & flags) { + return origin - offset; + } else { + return origin + offset; + } +} + +} // namespace latinime + +#endif // LATINIME_BINARY_FORMAT_H diff --git a/native/src/char_utils.h b/native/src/char_utils.h index 921ecb4a5..a69a35e7a 100644 --- a/native/src/char_utils.h +++ b/native/src/char_utils.h @@ -21,6 +21,6 @@ namespace latinime { unsigned short latin_tolower(unsigned short c); -}; // namespace latinime +} // namespace latinime #endif // LATINIME_CHAR_UTILS_H diff --git a/native/src/debug.h b/native/src/debug.h index ae629b222..38b2f107a 100644 --- a/native/src/debug.h +++ b/native/src/debug.h @@ -28,6 +28,7 @@ static inline unsigned char* convertToUnibyteString(unsigned short* input, unsig output[i] = 0; return output; } + static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned short* input, unsigned char* output, const unsigned int length, unsigned char c) { int i = 0; @@ -37,6 +38,7 @@ static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned s output[i] = 0; return output; } + static inline void LOGI_S16(unsigned short* string, const unsigned int length) { unsigned char tmp_buffer[length]; convertToUnibyteString(string, tmp_buffer, length); @@ -46,6 +48,7 @@ static inline void LOGI_S16(unsigned short* string, const unsigned int length) { // TODO : refactor this in a blocking log or something. // usleep(10); } + static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int length, unsigned char c) { unsigned char tmp_buffer[length+1]; diff --git a/native/src/defines.h b/native/src/defines.h index 00cbb6c9e..a516190af 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -18,18 +18,7 @@ #ifndef LATINIME_DEFINES_H #define LATINIME_DEFINES_H -#ifdef FLAG_DBG -#include <cutils/log.h> -#ifndef LOG_TAG -#define LOG_TAG "LatinIME: " -#endif -#define DEBUG_DICT true -#define DEBUG_DICT_FULL false -#define DEBUG_SHOW_FOUND_WORD DEBUG_DICT_FULL -#define DEBUG_NODE DEBUG_DICT_FULL -#define DEBUG_TRACE DEBUG_DICT_FULL -#define DEBUG_PROXIMITY_INFO true - +#ifdef FLAG_DO_PROFILE // Profiler #include <time.h> #define PROF_BUF_SIZE 100 @@ -76,16 +65,7 @@ static void prof_out(void) { } } -#else // FLAG_DBG -#define LOGE -#define LOGI -#define DEBUG_DICT false -#define DEBUG_DICT_FULL false -#define DEBUG_SHOW_FOUND_WORD false -#define DEBUG_NODE false -#define DEBUG_TRACE false -#define DEBUG_PROXIMITY_INFO false - +#else // FLAG_DO_PROFILE #define PROF_BUF_SIZE 0 #define PROF_RESET #define PROF_COUNT(prof_buf_id) @@ -97,6 +77,30 @@ static void prof_out(void) { #define PROF_CLOCKOUT(prof_buf_id) #define PROF_OUTALL +#endif // FLAG_DO_PROFILE + +#ifdef FLAG_DBG +#include <cutils/log.h> +#ifndef LOG_TAG +#define LOG_TAG "LatinIME: " +#endif +#define DEBUG_DICT true +#define DEBUG_DICT_FULL false +#define DEBUG_SHOW_FOUND_WORD DEBUG_DICT_FULL +#define DEBUG_NODE DEBUG_DICT_FULL +#define DEBUG_TRACE DEBUG_DICT_FULL +#define DEBUG_PROXIMITY_INFO true + +#else // FLAG_DBG +#define LOGE(fmt, ...) +#define LOGI(fmt, ...) +#define DEBUG_DICT false +#define DEBUG_DICT_FULL false +#define DEBUG_SHOW_FOUND_WORD false +#define DEBUG_NODE false +#define DEBUG_TRACE false +#define DEBUG_PROXIMITY_INFO false + #endif // FLAG_DBG #ifndef U_SHORT_MAX @@ -126,8 +130,11 @@ static void prof_out(void) { #define FLAG_BIGRAM_FREQ 0x7F #define DICTIONARY_VERSION_MIN 200 +// TODO: remove this constant when the switch to the new dict format is over #define DICTIONARY_HEADER_SIZE 2 +#define NEW_DICTIONARY_HEADER_SIZE 5 #define NOT_VALID_WORD -99 +#define NOT_A_CHARACTER -1 #define KEYCODE_SPACE ' ' @@ -138,12 +145,14 @@ static void prof_out(void) { #define SUGGEST_WORDS_WITH_SPACE_PROXIMITY true // The following "rate"s are used as a multiplier before dividing by 100, so they are in percent. -#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 70 -#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80 +#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80 +#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X 12 +#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 67 #define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75 #define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75 #define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120 +#define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java // This is only used for the size of array. Not to be used in c functions. diff --git a/native/src/dictionary.cpp b/native/src/dictionary.cpp index d69cb2a53..9e32ee80f 100644 --- a/native/src/dictionary.cpp +++ b/native/src/dictionary.cpp @@ -53,45 +53,16 @@ bool Dictionary::hasBigram() { return ((mDict[1] & 0xFF) == 1); } -// TODO: use uint16_t instead of unsigned short bool Dictionary::isValidWord(unsigned short *word, int length) { + return mUnigramDictionary->isValidWord(word, length); +} + +int Dictionary::getBigramPosition(unsigned short *word, int length) { if (IS_LATEST_DICT_VERSION) { - return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD); + return mUnigramDictionary->getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length); } else { - return (isValidWordRec(0, word, 0, length) != NOT_VALID_WORD); + return mUnigramDictionary->getBigramPosition(0, word, 0, length); } } -int Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) { - // returns address of bigram data of that word - // return -99 if not found - - int count = Dictionary::getCount(mDict, &pos); - unsigned short currentChar = (unsigned short) word[offset]; - for (int j = 0; j < count; j++) { - unsigned short c = Dictionary::getChar(mDict, &pos); - int terminal = Dictionary::getTerminal(mDict, &pos); - int childPos = Dictionary::getAddress(mDict, &pos); - if (c == currentChar) { - if (offset == length - 1) { - if (terminal) { - return (pos+1); - } - } else { - if (childPos != 0) { - int t = isValidWordRec(childPos, word, offset + 1, length); - if (t > 0) { - return t; - } - } - } - } - if (terminal) { - Dictionary::getFreq(mDict, IS_LATEST_DICT_VERSION, &pos); - } - // There could be two instances of each alphabet - upper and lower case. So continue - // looking ... - } - return NOT_VALID_WORD; -} } // namespace latinime diff --git a/native/src/dictionary.h b/native/src/dictionary.h index 13b2a2816..3dc577a56 100644 --- a/native/src/dictionary.h +++ b/native/src/dictionary.h @@ -43,7 +43,6 @@ public: } bool isValidWord(unsigned short *word, int length); - int isValidWordRec(int pos, unsigned short *word, int offset, int length); void *getDict() { return (void *)mDict; } int getDictSize() { return mDictSize; } int getMmapFd() { return mMmapFd; } @@ -63,6 +62,9 @@ public: const int pos, unsigned short *c, int *childrenPosition, bool *terminal, int *freq); + // TODO: delete this + int getBigramPosition(unsigned short *word, int length); + private: bool hasBigram(); @@ -79,7 +81,6 @@ private: BigramDictionary *mBigramDictionary; }; -// ---------------------------------------------------------------------------- // public static utility methods // static inline methods should be defined in the header file inline unsigned short Dictionary::getChar(const unsigned char *dict, int *pos) { @@ -132,7 +133,6 @@ inline int Dictionary::getFreq(const unsigned char *dict, return freq; } - inline int Dictionary::wideStrLen(unsigned short *str) { if (!str) return 0; unsigned short *end = str; @@ -156,5 +156,6 @@ inline int Dictionary::setDictionaryValues(const unsigned char *dict, return position; } -}; // namespace latinime +} // namespace latinime + #endif // LATINIME_DICTIONARY_H diff --git a/native/src/proximity_info.cpp b/native/src/proximity_info.cpp index 102123c3c..209c31e6e 100644 --- a/native/src/proximity_info.cpp +++ b/native/src/proximity_info.cpp @@ -22,6 +22,7 @@ #include "proximity_info.h" namespace latinime { + ProximityInfo::ProximityInfo(const int maxProximityCharsSize, const int keyboardWidth, const int keyboardHeight, const int gridWidth, const int gridHeight, const uint32_t *proximityCharsArray) @@ -61,4 +62,5 @@ bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { } return false; } -} // namespace latinime + +} // namespace latinime diff --git a/native/src/proximity_info.h b/native/src/proximity_info.h index 0f1201866..327cd0940 100644 --- a/native/src/proximity_info.h +++ b/native/src/proximity_info.h @@ -32,14 +32,16 @@ public: bool hasSpaceProximity(const int x, const int y) const; private: int getStartIndexFromCoordinates(const int x, const int y) const; - const int CELL_WIDTH; - const int CELL_HEIGHT; + const int MAX_PROXIMITY_CHARS_SIZE; const int KEYBOARD_WIDTH; const int KEYBOARD_HEIGHT; const int GRID_WIDTH; const int GRID_HEIGHT; - const int MAX_PROXIMITY_CHARS_SIZE; + const int CELL_WIDTH; + const int CELL_HEIGHT; uint32_t *mProximityCharsArray; }; -}; // namespace latinime + +} // namespace latinime + #endif // LATINIME_PROXIMITY_INFO_H diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 30fbaeae1..698584e54 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -16,8 +16,6 @@ */ #include <assert.h> -#include <fcntl.h> -#include <stdio.h> #include <string.h> #define LOG_TAG "LatinIME: unigram_dictionary.cpp" @@ -27,6 +25,10 @@ #include "dictionary.h" #include "unigram_dictionary.h" +#ifdef NEW_DICTIONARY_FORMAT +#include "binary_format.h" +#endif // NEW_DICTIONARY_FORMAT + namespace latinime { const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = @@ -34,16 +36,29 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = { 'o', 'e' }, { 'u', 'e' } }; -UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier, +// TODO: check the header +UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion) - : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), +#ifndef NEW_DICTIONARY_FORMAT + : DICT_ROOT(streamStart), +#else // NEW_DICTIONARY_FORMAT + : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE), +#endif // NEW_DICTIONARY_FORMAT + MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion), TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), +#ifndef NEW_DICTIONARY_FORMAT ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0), +#else // NEW_DICTIONARY_FORMAT + // TODO : remove this variable. + ROOT_POS(0), +#endif // NEW_DICTIONARY_FORMAT BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)), MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) { - if (DEBUG_DICT) LOGI("UnigramDictionary - constructor"); + if (DEBUG_DICT) { + LOGI("UnigramDictionary - constructor"); + } } UnigramDictionary::~UnigramDictionary() {} @@ -151,6 +166,15 @@ int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const if (DEBUG_DICT) { LOGI("Returning %d words", suggestedWordsCount); + /// Print the returned words + for (int j = 0; j < suggestedWordsCount; ++j) { +#ifdef FLAG_DBG + short unsigned int* w = mOutputChars + j * MAX_WORD_LENGTH; + char s[MAX_WORD_LENGTH]; + for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; +#endif + LOGI("%s %i", s, mFrequencies[j]); + } LOGI("Next letters: "); for (int k = 0; k < NEXT_LETTERS_SIZE; k++) { if (mNextLettersFrequency[k] > 0) { @@ -183,7 +207,9 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, // Suggestion with missing character if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) { for (int i = 0; i < codesSize; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest missing characters %d", i); + } getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH); } } @@ -194,7 +220,9 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) { for (int i = 0; i < codesSize; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest excessive characters %d", i); + } getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH); } } @@ -205,7 +233,9 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, // Only suggest words that length is mInputLength if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) { for (int i = 0; i < codesSize; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest transposed characters %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest transposed characters %d", i); + } getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1); } } @@ -216,22 +246,27 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) { for (int i = 1; i < codesSize; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest missing space characters %d", i); + } getMissingSpaceWords(mInputLength, i); } } PROF_END(5); PROF_START(6); - if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY) { + if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) { // The first and last "mistyped spaces" are taken care of by excessive character handling for (int i = 1; i < codesSize - 1; ++i) { - if (DEBUG_DICT) LOGI("--- Suggest words with proximity space %d", i); + if (DEBUG_DICT) { + LOGI("--- Suggest words with proximity space %d", i); + } const int x = xcoordinates[i]; const int y = ycoordinates[i]; - if (DEBUG_PROXIMITY_INFO) + if (DEBUG_PROXIMITY_INFO) { LOGI("Input[%d] x = %d, y = %d, has space proximity = %d", i, x, y, proximityInfo->hasSpaceProximity(x, y)); + } if (proximityInfo->hasSpaceProximity(x, y)) { getMistypedSpaceWords(mInputLength, i); } @@ -242,7 +277,9 @@ void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, void UnigramDictionary::initSuggestions(const int *codes, const int codesSize, unsigned short *outWords, int *frequencies) { - if (DEBUG_DICT) LOGI("initSuggest"); + if (DEBUG_DICT) { + LOGI("initSuggest"); + } mFrequencies = frequencies; mOutputChars = outWords; mInputCodes = codes; @@ -250,40 +287,46 @@ void UnigramDictionary::initSuggestions(const int *codes, const int codesSize, mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; } -void UnigramDictionary::registerNextLetter( - unsigned short c, int *nextLetters, int nextLettersSize) { +static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) { if (c < nextLettersSize) { nextLetters[c]++; } } // TODO: We need to optimize addWord by using STL or something +// TODO: This needs to take an const unsigned short* and not tinker with its contents bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) { word[length] = 0; if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) { +#ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = word[i]; +#endif LOGI("Found word = %s, freq = %d", s, frequency); } if (length > MAX_WORD_LENGTH) { - if (DEBUG_DICT) LOGI("Exceeded max word length."); + if (DEBUG_DICT) { + LOGI("Exceeded max word length."); + } return false; } // Find the right insertion point int insertAt = 0; while (insertAt < MAX_WORDS) { - if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency - && length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) { + // TODO: How should we sort words with the same frequency? + if (frequency > mFrequencies[insertAt]) { break; } insertAt++; } if (insertAt < MAX_WORDS) { if (DEBUG_DICT) { +#ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = word[i]; - LOGI("Added word = %s, freq = %d", s, frequency); +#endif + LOGI("Added word = %s, freq = %d, %d", s, frequency, S_INT_MAX); } memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]), (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]), @@ -297,13 +340,15 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) *dest++ = *word++; } *dest = 0; // NULL terminate - if (DEBUG_DICT) LOGI("Added word at %d", insertAt); + if (DEBUG_DICT) { + LOGI("Added word at %d", insertAt); + } return true; } return false; } -unsigned short UnigramDictionary::toBaseLowerCase(unsigned short c) { +static inline unsigned short toBaseLowerCase(unsigned short c) { if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { c = BASE_CHARS[c]; } @@ -315,7 +360,7 @@ unsigned short UnigramDictionary::toBaseLowerCase(unsigned short c) { return c; } -bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) { +bool UnigramDictionary::sameAsTyped(const unsigned short *word, int length) const { if (length != mInputLength) { return false; } @@ -343,8 +388,8 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, assert(missingPos < mInputLength); } int rootPosition = ROOT_POS; - // Get the number of child of root, then increment the position - int childCount = Dictionary::getCount(DICT, &rootPosition); + // Get the number of children of root, then increment the position + int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); int depth = 0; mStackChildCount[0] = childCount; @@ -353,6 +398,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, mStackInputIndex[0] = 0; mStackDiffs[0] = 0; mStackSiblingPos[0] = rootPosition; + mStackOutputIndex[0] = 0; // Depth first search while (depth >= 0) { @@ -363,14 +409,15 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, int inputIndex = mStackInputIndex[depth]; int diffs = mStackDiffs[depth]; int siblingPos = mStackSiblingPos[depth]; + int outputIndex = mStackOutputIndex[depth]; int firstChildPos; // depth will never be greater than maxDepth because in that case, // needsToTraverseChildrenNodes should be false - const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, + const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex, maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos, excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs, - &siblingPos); + &siblingPos, &outputIndex); // Update next sibling pos mStackSiblingPos[depth] = siblingPos; if (needsToTraverseChildrenNodes) { @@ -382,6 +429,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, mStackInputIndex[depth] = inputIndex; mStackDiffs[depth] = diffs; mStackSiblingPos[depth] = firstChildPos; + mStackOutputIndex[depth] = outputIndex; } } else { // Goes to parent sibling node @@ -390,114 +438,128 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos, } } -inline static void multiplyRate(const int rate, int *freq) { - if (rate > 1000000) { - *freq = (*freq / 100) * rate; - } else { - *freq = *freq * rate / 100; - } +static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; +static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { + return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); } -bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, - const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos, - const int secondWordLength) { - if (inputLength >= MAX_WORD_LENGTH) return false; - if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos - || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) - return false; - const int newWordLength = firstWordLength + secondWordLength + 1; - // Allocating variable length array on stack - unsigned short word[newWordLength]; - const int firstFreq = getBestWordFreq(firstWordStartPos, firstWordLength, mWord); - if (DEBUG_DICT) LOGI("First freq: %d", firstFreq); - if (firstFreq <= 0) return false; +static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; +inline static void multiplyIntCapped(const int multiplier, int *base) { + const int temp = *base; + if (temp != S_INT_MAX) { + // Branch if multiplier == 2 for the optimization + if (multiplier == 2) { + *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; + } else { + const int tempRetval = temp * multiplier; + *base = tempRetval >= temp ? tempRetval : S_INT_MAX; + } + } +} - for (int i = 0; i < firstWordLength; ++i) { - word[i] = mWord[i]; +inline static int powerIntCapped(const int base, const int n) { + if (base == 2) { + return n < 31 ? 1 << n : S_INT_MAX; + } else { + int ret = base; + for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); + return ret; } +} - const int secondFreq = getBestWordFreq(secondWordStartPos, secondWordLength, mWord); - if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq); - if (secondFreq <= 0) return false; +inline static void multiplyRate(const int rate, int *freq) { + if (*freq != S_INT_MAX) { + if (*freq > 1000000) { + *freq /= 100; + multiplyIntCapped(rate, freq); + } else { + multiplyIntCapped(rate, freq); + *freq /= 100; + } + } +} - word[firstWordLength] = SPACE; - for (int i = (firstWordLength + 1); i < newWordLength; ++i) { - word[i] = mWord[i - firstWordLength - 1]; +inline static int calcFreqForSplitTwoWords( + const int typedLetterMultiplier, const int firstWordLength, const int secondWordLength, + const int firstFreq, const int secondFreq, const bool isSpaceProximity) { + if (firstWordLength == 0 || secondWordLength == 0) { + return 0; + } + const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); + int tempFirstFreq = firstFreq; + multiplyRate(firstDemotionRate, &tempFirstFreq); + + const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); + int tempSecondFreq = secondFreq; + multiplyRate(secondDemotionRate, &tempSecondFreq); + + const int totalLength = firstWordLength + secondWordLength; + + // Promote pairFreq with multiplying by 2, because the word length is the same as the typed + // length. + int totalFreq = tempFirstFreq + tempSecondFreq; + + // This is a workaround to try offsetting the not-enough-demotion which will be done in + // calcNormalizedScore in Utils.java. + // In calcNormalizedScore the score will be demoted by (1 - 1 / length) + // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by + // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) + const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); + multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); + + // At this moment, totalFreq is calculated by the following formula: + // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) + // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) + + multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); + + // This is another workaround to offset the demotion which will be done in + // calcNormalizedScore in Utils.java. + // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote + // the same amount because we already have adjusted the synthetic freq of this "missing or + // mistyped space" suggestion candidate above in this method. + const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); + multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); + + if (isSpaceProximity) { + // A word pair with one space proximity correction + if (DEBUG_DICT) { + LOGI("Found a word pair with space proximity correction."); + } + multiplyIntCapped(typedLetterMultiplier, &totalFreq); + multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); } - int pairFreq = ((firstFreq + secondFreq) / 2); - for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER; - multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &pairFreq); - addWord(word, newWordLength, pairFreq); - return true; + multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + return totalFreq; } bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) { return getSplitTwoWordsSuggestion( - inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos); + inputLength, 0, missingSpacePos, missingSpacePos, inputLength - missingSpacePos, false); } bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) { return getSplitTwoWordsSuggestion( inputLength, 0, spaceProximityPos, spaceProximityPos + 1, - inputLength - spaceProximityPos - 1); -} - -// Keep this for comparing spec to new getWords -void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos, - const int excessivePos, const int transposedPos,int *nextLetters, - const int nextLettersSize) { - int initialPosition = initialPos; - const int count = Dictionary::getCount(DICT, &initialPosition); - getWordsRec(count, initialPosition, 0, - min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH), - mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters, - nextLettersSize); + inputLength - spaceProximityPos - 1, true); } -void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth, - const int maxDepth, const bool traverseAllNodes, const int matchWeight, - const int inputIndex, const int diffs, const int skipPos, const int excessivePos, - const int transposedPos, int *nextLetters, const int nextLettersSize) { - int siblingPos = pos; - for (int i = 0; i < childrenCount; ++i) { - int newCount; - int newChildPosition; - const int newDepth = depth + 1; - bool newTraverseAllNodes; - int newMatchRate; - int newInputIndex; - int newDiffs; - int newSiblingPos; - const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth, - traverseAllNodes, matchWeight, inputIndex, diffs, - skipPos, excessivePos, transposedPos, - nextLetters, nextLettersSize, - &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate, - &newInputIndex, &newDiffs, &newSiblingPos); - siblingPos = newSiblingPos; - - if (needsToTraverseChildrenNodes) { - getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes, - newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos, - nextLetters, nextLettersSize); - } - } -} - -static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; -static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { - return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); -} inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth, const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos, const int freq, const bool sameLength) const { // TODO: Demote by edit distance int finalFreq = freq * matchWeight; if (skipPos >= 0) { - if (mInputLength >= 3) { - multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE * - (mInputLength - 2) / (mInputLength - 1), &finalFreq); + if (mInputLength >= 2) { + const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE + * (10 * mInputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) + / (10 * mInputLength + - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); + if (DEBUG_DICT_FULL) { + LOGI("Demotion rate for missing character is %d.", demotionRate); + } + multiplyRate(demotionRate, &finalFreq); } else { finalFreq = 0; } @@ -511,40 +573,31 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int } } int lengthFreq = TYPED_LETTER_MULTIPLIER; - for (int i = 0; i < depth; ++i) lengthFreq *= TYPED_LETTER_MULTIPLIER; + multiplyIntCapped(powerIntCapped(TYPED_LETTER_MULTIPLIER, depth), &lengthFreq); if (lengthFreq == matchWeight) { + // Full exact match if (depth > 1) { - if (DEBUG_DICT) LOGI("Found full matched word."); + if (DEBUG_DICT) { + LOGI("Found full matched word."); + } multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); } if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) { finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); } + } else if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0 && depth > 0) { + // A word with proximity corrections + if (DEBUG_DICT) { + LOGI("Found one proximity correction."); + } + multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &finalFreq); + multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); } - if (sameLength) finalFreq *= FULL_WORD_MULTIPLIER; - return finalFreq; -} - -inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength( - unsigned short *word, const int inputIndex, const int depth, const int matchWeight, - int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos, - const int transposedPos, const int freq) { - const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos, excessivePos, - transposedPos, freq, false); - if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq); - if (depth >= mInputLength && skipPos < 0) { - registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize); + if (DEBUG_DICT) { + LOGI("calc: %d, %d", depth, sameLength); } -} - -inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength( - unsigned short *word, const int inputIndex, const int depth, const int matchWeight, - const int skipPos, const int excessivePos, const int transposedPos, const int freq) { - if (sameAsTyped(word, depth + 1)) return; - const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos, - excessivePos, transposedPos, freq, true); - // Proximity collection will promote a word of the same length as what user typed. - if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq); + if (sameLength) multiplyIntCapped(FULL_WORD_MULTIPLIER, &finalFreq); + return finalFreq; } inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c, @@ -577,7 +630,6 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex return false; } - // In the following function, c is the current character of the dictionary word // currently examined. // currentChars is an array containing the keys close to the character the @@ -620,96 +672,115 @@ inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId return UNRELATED_CHAR; } -inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth, - const int maxDepth, const bool traverseAllNodes, int matchWeight, int inputIndex, - const int diffs, const int skipPos, const int excessivePos, const int transposedPos, - int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition, - bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs, - int *nextSiblingPosition) { - if (DEBUG_DICT) { - int inputCount = 0; - if (skipPos >= 0) ++inputCount; - if (excessivePos >= 0) ++inputCount; - if (transposedPos >= 0) ++inputCount; - assert(inputCount <= 1); - } - unsigned short c; - int childPosition; - bool terminal; - int freq; - bool isSameAsUserTypedLength = false; +inline void UnigramDictionary::onTerminal(unsigned short int* word, const int depth, + const uint8_t* const root, const uint8_t flags, const int pos, + const int inputIndex, const int matchWeight, const int skipPos, + const int excessivePos, const int transposedPos, const int freq, const bool sameLength, + int* nextLetters, const int nextLettersSize) { - if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex; + const bool isSameAsTyped = sameLength ? sameAsTyped(word, depth + 1) : false; + if (isSameAsTyped) return; - *nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c, - &childPosition, &terminal, &freq); + if (depth >= MIN_SUGGEST_DEPTH) { + const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos, + excessivePos, transposedPos, freq, sameLength); + if (!isSameAsTyped) + addWord(word, depth + 1, finalFreq); + } - const bool needsToTraverseChildrenNodes = childPosition != 0; + if (sameLength && depth >= mInputLength && skipPos < 0) { + registerNextLetter(word[mInputLength], nextLetters, nextLettersSize); + } +} - // If we are only doing traverseAllNodes, no need to look at the typed characters. - if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) { - mWord[depth] = c; - if (traverseAllNodes && terminal) { - onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, inputIndex, depth, - matchWeight, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos, - freq); - } - if (!needsToTraverseChildrenNodes) return false; - *newTraverseAllNodes = traverseAllNodes; - *newMatchRate = matchWeight; - *newDiffs = diffs; - *newInputIndex = inputIndex; - } else { - const int *currentChars = getInputCharsAt(inputIndex); +bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength, + const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos, + const int secondWordLength, const bool isSpaceProximity) { + if (inputLength >= MAX_WORD_LENGTH) return false; + if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos + || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) + return false; + const int newWordLength = firstWordLength + secondWordLength + 1; + // Allocating variable length array on stack + unsigned short word[newWordLength]; + const int firstFreq = getMostFrequentWordLike(firstWordStartPos, firstWordLength, mWord); + if (DEBUG_DICT) { + LOGI("First freq: %d", firstFreq); + } + if (firstFreq <= 0) return false; - if (transposedPos >= 0) { - if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS; - if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS; - } + for (int i = 0; i < firstWordLength; ++i) { + word[i] = mWord[i]; + } - int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos, - transposedPos); - if (UNRELATED_CHAR == matchedProximityCharId) return false; - mWord[depth] = c; - // If inputIndex is greater than mInputLength, that means there is no - // proximity chars. So, we don't need to check proximity. - if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { - matchWeight = matchWeight * TYPED_LETTER_MULTIPLIER; - } - bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 - || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2); - if (isSameAsUserTypedLength && terminal) { - onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, inputIndex, depth, matchWeight, - skipPos, excessivePos, transposedPos, freq); - } - if (!needsToTraverseChildrenNodes) return false; - // Start traversing all nodes after the index exceeds the user typed length - *newTraverseAllNodes = isSameAsUserTypedLength; - *newMatchRate = matchWeight; - *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); - *newInputIndex = inputIndex + 1; + const int secondFreq = getMostFrequentWordLike(secondWordStartPos, secondWordLength, mWord); + if (DEBUG_DICT) { + LOGI("Second freq: %d", secondFreq); } - // Optimization: Prune out words that are too long compared to how much was typed. - if (depth >= maxDepth || *newDiffs > mMaxEditDistance) { - return false; + if (secondFreq <= 0) return false; + + word[firstWordLength] = SPACE; + for (int i = (firstWordLength + 1); i < newWordLength; ++i) { + word[i] = mWord[i - firstWordLength - 1]; } - // If inputIndex is greater than mInputLength, that means there are no proximity chars. - // TODO: Check if this can be isSameAsUserTypedLength only. - if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) { - *newTraverseAllNodes = true; + int pairFreq = calcFreqForSplitTwoWords(TYPED_LETTER_MULTIPLIER, firstWordLength, + secondWordLength, firstFreq, secondFreq, isSpaceProximity); + if (DEBUG_DICT) { + LOGI("Split two words: %d, %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength, + TYPED_LETTER_MULTIPLIER); } - // get the count of nodes and increment childAddress. - *newCount = Dictionary::getCount(DICT, &childPosition); - *newChildPosition = childPosition; - if (DEBUG_DICT) assert(needsToTraverseChildrenNodes); - return needsToTraverseChildrenNodes; + addWord(word, newWordLength, pairFreq); + return true; +} + +#ifndef NEW_DICTIONARY_FORMAT +// The following functions will be entirely replaced with new implementations. +void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos, + const int excessivePos, const int transposedPos,int *nextLetters, + const int nextLettersSize) { + int initialPosition = initialPos; + const int count = Dictionary::getCount(DICT_ROOT, &initialPosition); + getWordsRec(count, initialPosition, 0, + min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH), + mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters, + nextLettersSize); } -inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength, - unsigned short *word) { +void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth, + const int maxDepth, const bool traverseAllNodes, const int matchWeight, + const int inputIndex, const int diffs, const int skipPos, const int excessivePos, + const int transposedPos, int *nextLetters, const int nextLettersSize) { + int siblingPos = pos; + for (int i = 0; i < childrenCount; ++i) { + int newCount; + int newChildPosition; + bool newTraverseAllNodes; + int newMatchRate; + int newInputIndex; + int newDiffs; + int newSiblingPos; + int newOutputIndex; + const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth, + traverseAllNodes, matchWeight, inputIndex, diffs, + skipPos, excessivePos, transposedPos, + nextLetters, nextLettersSize, + &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate, + &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex); + siblingPos = newSiblingPos; + + if (needsToTraverseChildrenNodes) { + getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes, + newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos, + nextLetters, nextLettersSize); + } + } +} + +inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, + const int inputLength, unsigned short *word) { int pos = ROOT_POS; - int count = Dictionary::getCount(DICT, &pos); + int count = Dictionary::getCount(DICT_ROOT, &pos); int maxFreq = 0; int depth = 0; unsigned short newWord[MAX_WORD_LENGTH_INTERNAL]; @@ -734,9 +805,11 @@ inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const i if (newFreq > maxFreq) { for (int i = 0; i < inputLength; ++i) word[i] = newWord[i]; if (DEBUG_DICT && DEBUG_NODE) { +#ifdef FLAG_DBG char s[inputLength + 1]; for (int i = 0; i < inputLength; ++i) s[i] = word[i]; s[inputLength] = 0; +#endif LOGI("New missing space word found: %d > %d (%s), %d, %d", newFreq, maxFreq, s, inputLength, depth); } @@ -765,10 +838,12 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh const int inputIndex = startInputIndex + depth; const int *currentChars = getInputCharsAt(inputIndex); unsigned short c; - *siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c, - newChildPosition, newTerminal, newFreq); + *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos, + &c, newChildPosition, newTerminal, newFreq); const unsigned int inputC = currentChars[0]; - if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX); + if (DEBUG_DICT) { + assert(inputC <= U_SHORT_MAX); + } const unsigned short baseLowerC = toBaseLowerCase(c); const bool matched = (inputC == baseLowerC || inputC == c); const bool hasChild = *newChildPosition != 0; @@ -776,10 +851,12 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh word[depth] = c; if (DEBUG_DICT && DEBUG_NODE) { LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq); - if (*newTerminal) LOGI("Terminal %d", *newFreq); + if (*newTerminal) { + LOGI("Terminal %d", *newFreq); + } } if (hasChild) { - *newCount = Dictionary::getCount(DICT, newChildPosition); + *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition); return true; } else { return false; @@ -791,4 +868,575 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh return false; } } + +// TODO: use uint32_t instead of unsigned short +bool UnigramDictionary::isValidWord(unsigned short *word, int length) { + if (IS_LATEST_DICT_VERSION) { + return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD); + } else { + return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD); + } +} + + +// Require strict exact match. +int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset, + int length) const { + // returns address of bigram data of that word + // return -99 if not found + + int count = Dictionary::getCount(DICT_ROOT, &pos); + unsigned short currentChar = (unsigned short) word[offset]; + for (int j = 0; j < count; j++) { + unsigned short c = Dictionary::getChar(DICT_ROOT, &pos); + int terminal = Dictionary::getTerminal(DICT_ROOT, &pos); + int childPos = Dictionary::getAddress(DICT_ROOT, &pos); + if (c == currentChar) { + if (offset == length - 1) { + if (terminal) { + return (pos+1); + } + } else { + if (childPos != 0) { + int t = getBigramPosition(childPos, word, offset + 1, length); + if (t > 0) { + return t; + } + } + } + } + if (terminal) { + Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos); + } + // There could be two instances of each alphabet - upper and lower case. So continue + // looking ... + } + return NOT_VALID_WORD; +} + +// The following functions will be modified. +inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth, + const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex, + const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos, + int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition, + bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs, + int *nextSiblingPosition, int *nextOutputIndex) { + if (DEBUG_DICT) { + int inputCount = 0; + if (skipPos >= 0) ++inputCount; + if (excessivePos >= 0) ++inputCount; + if (transposedPos >= 0) ++inputCount; + assert(inputCount <= 1); + } + unsigned short c; + int childPosition; + bool terminal; + int freq; + bool isSameAsUserTypedLength = false; + + const int pos = initialPos; + const int depth = initialDepth; + const int traverseAllNodes = initialTraverseAllNodes; + const int diffs = initialDiffs; + + const uint8_t flags = 0; // No flags for now + + if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex; + + *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos, + &c, &childPosition, &terminal, &freq); + *nextOutputIndex = depth + 1; + + const bool needsToTraverseChildrenNodes = childPosition != 0; + + // If we are only doing traverseAllNodes, no need to look at the typed characters. + if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) { + mWord[depth] = c; + if (traverseAllNodes && terminal) { + onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, + excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize); + } + if (!needsToTraverseChildrenNodes) return false; + *newTraverseAllNodes = traverseAllNodes; + *newMatchRate = matchWeight; + *newDiffs = diffs; + *newInputIndex = inputIndex; + } else { + const int *currentChars = getInputCharsAt(inputIndex); + + if (transposedPos >= 0) { + if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS; + if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS; + } + + int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos, + transposedPos); + if (UNRELATED_CHAR == matchedProximityCharId) return false; + mWord[depth] = c; + // If inputIndex is greater than mInputLength, that means there is no + // proximity chars. So, we don't need to check proximity. + if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { + multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight); + } + bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 + || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2); + if (isSameAsUserTypedLength && terminal) { + onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, + excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize); + } + if (!needsToTraverseChildrenNodes) return false; + // Start traversing all nodes after the index exceeds the user typed length + *newTraverseAllNodes = isSameAsUserTypedLength; + *newMatchRate = matchWeight; + *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); + *newInputIndex = inputIndex + 1; + } + // Optimization: Prune out words that are too long compared to how much was typed. + if (depth >= maxDepth || *newDiffs > mMaxEditDistance) { + return false; + } + + // If inputIndex is greater than mInputLength, that means there are no proximity chars. + // TODO: Check if this can be isSameAsUserTypedLength only. + if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) { + *newTraverseAllNodes = true; + } + // get the count of nodes and increment childAddress. + *newCount = Dictionary::getCount(DICT_ROOT, &childPosition); + *newChildPosition = childPosition; + if (DEBUG_DICT) assert(needsToTraverseChildrenNodes); + return needsToTraverseChildrenNodes; +} + +#else // NEW_DICTIONARY_FORMAT + +// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous +// interface. +inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, + const int inputLength, unsigned short *word) { + uint16_t inWord[inputLength]; + + for (int i = 0; i < inputLength; ++i) { + inWord[i] = *getInputCharsAt(startInputIndex + i); + } + return getMostFrequentWordLikeInner(inWord, inputLength, word); +} + +// This function will take the position of a character array within a CharGroup, +// and check it actually like-matches the word in inWord starting at startInputIndex, +// that is, it matches it with case and accents squashed. +// The function returns true if there was a full match, false otherwise. +// The function will copy on-the-fly the characters in the CharGroup to outNewWord. +// It will also place the end position of the array in outPos; in outInputIndex, +// it will place the index of the first char AFTER the match if there was a match, +// and the initial position if there was not. It makes sense because if there was +// a match we want to continue searching, but if there was not, we want to go to +// the next CharGroup. +// In and out parameters may point to the same location. This function takes care +// not to use any input parameters after it wrote into its outputs. +static inline bool testCharGroupForContinuedLikeness(const uint8_t flags, + const uint8_t* const root, const int startPos, + const uint16_t* const inWord, const int startInputIndex, + int32_t* outNewWord, int* outInputIndex, int* outPos) { + const bool hasMultipleChars = (0 != (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags)); + int pos = startPos; + int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + int32_t baseChar = toBaseLowerCase(character); + const uint16_t wChar = toBaseLowerCase(inWord[startInputIndex]); + + if (baseChar != wChar) { + *outPos = hasMultipleChars ? BinaryFormat::skipOtherCharacters(root, pos) : pos; + *outInputIndex = startInputIndex; + return false; + } + int inputIndex = startInputIndex; + outNewWord[inputIndex] = character; + if (hasMultipleChars) { + character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + while (NOT_A_CHARACTER != character) { + baseChar = toBaseLowerCase(character); + if (toBaseLowerCase(inWord[++inputIndex]) != baseChar) { + *outPos = BinaryFormat::skipOtherCharacters(root, pos); + *outInputIndex = startInputIndex; + return false; + } + outNewWord[inputIndex] = character; + character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + } + } + *outInputIndex = inputIndex + 1; + *outPos = pos; + return true; +} + +// This function is invoked when a word like the word searched for is found. +// It will compare the frequency to the max frequency, and if greater, will +// copy the word into the output buffer. In output value maxFreq, it will +// write the new maximum frequency if it changed. +static inline void onTerminalWordLike(const int freq, int32_t* newWord, const int length, + short unsigned int* outWord, int* maxFreq) { + if (freq > *maxFreq) { + for (int q = 0; q < length; ++q) + outWord[q] = newWord[q]; + outWord[length] = 0; + *maxFreq = freq; + } +} + +// Will find the highest frequency of the words like the one passed as an argument, +// that is, everything that only differs by case/accents. +int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWord, + const int length, short unsigned int* outWord) { + int32_t newWord[MAX_WORD_LENGTH_INTERNAL]; + int depth = 0; + int maxFreq = -1; + const uint8_t* const root = DICT_ROOT; + + mStackChildCount[0] = root[0]; + mStackInputIndex[0] = 0; + mStackSiblingPos[0] = 1; + while (depth >= 0) { + const int charGroupCount = mStackChildCount[depth]; + int pos = mStackSiblingPos[depth]; + for (int charGroupIndex = charGroupCount - 1; charGroupIndex >= 0; --charGroupIndex) { + int inputIndex = mStackInputIndex[depth]; + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); + // Test whether all chars in this group match with the word we are searching for. If so, + // we want to traverse its children (or if the length match, evaluate its frequency). + // Note that this function will output the position regardless, but will only write + // into inputIndex if there is a match. + const bool isAlike = testCharGroupForContinuedLikeness(flags, root, pos, inWord, + inputIndex, newWord, &inputIndex, &pos); + if (isAlike && (FLAG_IS_TERMINAL & flags) && (inputIndex == length)) { + const int frequency = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); + onTerminalWordLike(frequency, newWord, inputIndex, outWord, &maxFreq); + } + pos = BinaryFormat::skipFrequency(flags, pos); + const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); + const int childrenNodePos = BinaryFormat::readChildrenPosition(root, flags, pos); + // If we had a match and the word has children, we want to traverse them. We don't have + // to traverse words longer than the one we are searching for, since they will not match + // anyway, so don't traverse unless inputIndex < length. + if (isAlike && (-1 != childrenNodePos) && (inputIndex < length)) { + // Save position for this depth, to get back to this once children are done + mStackChildCount[depth] = charGroupIndex; + mStackSiblingPos[depth] = siblingPos; + // Prepare stack values for next depth + ++depth; + int childrenPos = childrenNodePos; + mStackChildCount[depth] = + BinaryFormat::getGroupCountAndForwardPointer(root, &childrenPos); + mStackSiblingPos[depth] = childrenPos; + mStackInputIndex[depth] = inputIndex; + pos = childrenPos; + // Go to the next depth level. + ++depth; + break; + } else { + // No match, or no children, or word too long to ever match: go the next sibling. + pos = siblingPos; + } + } + --depth; + } + return maxFreq; +} + +// This function gets the frequency of the exact matching word in the dictionary. +// If no match is found, it returns -1. +int UnigramDictionary::getFrequency(const uint16_t* const inWord, const int length) const { + int pos = 0; + int wordPos = 0; + const uint8_t* const root = DICT_ROOT; + + while (true) { + // If we already traversed the tree further than the word is long, there means + // there was no match (or we would have found it). + if (wordPos > length) return -1; + int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); + const uint16_t wChar = inWord[wordPos]; + while (true) { + // If there are no more character groups in this node, it means we could not + // find a matching character for this depth, therefore there is no match. + if (0 >= charGroupCount) return -1; + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); + int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + if (character == wChar) { + // This is the correct node. Only one character group may start with the same + // char within a node, so either we found our match in this node, or there is + // no match and we can return -1. So we will check all the characters in this + // character group indeed does match. + if (FLAG_HAS_MULTIPLE_CHARS & flags) { + character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + while (NOT_A_CHARACTER != character) { + ++wordPos; + // If we shoot the length of the word we search for, or if we find a single + // character that does not match, as explained above, it means the word is + // not in the dictionary (by virtue of this chargroup being the only one to + // match the word on the first character, but not matching the whole word). + if (wordPos > length) return -1; + if (inWord[wordPos] != character) return -1; + character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos); + } + } + // If we come here we know that so far, we do match. Either we are on a terminal + // and we match the length, in which case we found it, or we traverse children. + // If we don't match the length AND don't have children, then a word in the + // dictionary fully matches a prefix of the searched word but not the full word. + ++wordPos; + if (FLAG_IS_TERMINAL & flags) { + if (wordPos == length) { + return BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); + } + pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos); + } + if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) + return -1; + // We have children and we are still shorter than the word we are searching for, so + // we need to traverse children. Put the pointer on the children position, and + // break + pos = BinaryFormat::readChildrenPosition(root, flags, pos); + break; + } else { + // This chargroup does not match, so skip the remaining part and go to the next. + if (FLAG_HAS_MULTIPLE_CHARS & flags) { + pos = BinaryFormat::skipOtherCharacters(root, pos); + } + pos = BinaryFormat::skipFrequency(flags, pos); + pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); + } + --charGroupCount; + } + } +} + +bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const { + return -1 != getFrequency(inWord, length); +} + +int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize, + unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams, + int maxAlternatives) { + // TODO: add implementation. + return 0; +} + +// TODO: remove this function. +int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset, + int length) const { + return -1; +} + +// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not. +// If the return value is false, then the caller should read in the output "nextSiblingPosition" +// to find out the address of the next sibling node and pass it to a new call of processCurrentNode. +// It is worthy to note that when false is returned, the output values other than +// nextSiblingPosition are undefined. +// If the return value is true, then the caller must proceed to traverse the children of this +// node. processCurrentNode will output the information about the children: their count in +// newCount, their position in newChildrenPosition, the traverseAllNodes flag in +// newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the +// diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into +// newOutputIndex. Please also note the following caveat: processCurrentNode does not know when +// there aren't any more nodes at this level, it merely returns the address of the first byte after +// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any +// given level, as output into newCount when traversing this level's parent. +inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth, + const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex, + const int initialDiffs, const int skipPos, const int excessivePos, const int transposedPos, + int *nextLetters, const int nextLettersSize, int *newCount, int *newChildrenPosition, + bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs, + int *nextSiblingPosition, int *newOutputIndex) { + if (DEBUG_DICT) { + int inputCount = 0; + if (skipPos >= 0) ++inputCount; + if (excessivePos >= 0) ++inputCount; + if (transposedPos >= 0) ++inputCount; + assert(inputCount <= 1); + } + int pos = initialPos; + int depth = initialDepth; + int traverseAllNodes = initialTraverseAllNodes; + int diffs = initialDiffs; + + // Flags contain the following information: + // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits: + // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address + // is on the specified number of bytes. + // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address. + // - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not. + // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children) + // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos); + const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); + + // This gets only ONE character from the stream. Next there will be: + // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node + // else if FLAG_IS_TERMINAL: the frequency + // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address + // Note that you can't have a node that both is not a terminal and has no children. + int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); + assert(NOT_A_CHARACTER != c); + + // We are going to loop through each character and make it look like it's a different + // node each time. To do that, we will process characters in this node in order until + // we find the character terminator. This is signalled by getCharCode* returning + // NOT_A_CHARACTER. + // As a special case, if there is only one character in this node, we must not read the + // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags. + // This way, each loop run will look like a "virtual node". + do { + // We prefetch the next char. If 'c' is the last char of this node, we will have + // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node + // should behave as a terminal or not and whether we have children. + const int32_t nextc = hasMultipleChars + ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER; + const bool isLastChar = (NOT_A_CHARACTER == nextc); + // If there are more chars in this nodes, then this virtual node is not a terminal. + // If we are on the last char, this virtual node is a terminal if this node is. + const bool isTerminal = isLastChar && (0 != (FLAG_IS_TERMINAL & flags)); + // If there are more chars in this node, then this virtual node has children. + // If we are on the last char, this virtual node has children if this node has. + const bool hasChildren = (!isLastChar) || BinaryFormat::hasChildrenInFlags(flags); + + // This has to be done for each virtual char (this forwards the "inputIndex" which + // is the index in the user-inputted chars, as read by getInputCharsAt. + if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex; + if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) { + mWord[depth] = c; + if (traverseAllNodes && isTerminal) { + // The frequency should be here, because we come here only if this is actually + // a terminal node, and we are on its last char. + const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); + onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, + excessivePos, transposedPos, freq, false, nextLetters, nextLettersSize); + } + if (!hasChildren) { + // If we don't have children here, that means we finished processing all + // characters of this node (we are on the last virtual node), AND we are in + // traverseAllNodes mode, which means we are searching for *completions*. We + // should skip the frequency if we have a terminal, and report the position + // of the next sibling. We don't have to return other values because we are + // returning false, as in "don't traverse children". + if (isTerminal) pos = BinaryFormat::skipFrequency(flags, pos); + *nextSiblingPosition = + BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + return false; + } + } else { + const int *currentChars = getInputCharsAt(inputIndex); + + if (transposedPos >= 0) { + if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS; + if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS; + } + + const int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, + excessivePos, transposedPos); + if (UNRELATED_CHAR == matchedProximityCharId) { + // We found that this is an unrelated character, so we should give up traversing + // this node and its children entirely. + // However we may not be on the last virtual node yet so we skip the remaining + // characters in this node, the frequency if it's there, read the next sibling + // position to output it, then return false. + // We don't have to output other values because we return false, as in + // "don't traverse children". + if (!isLastChar) { + pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); + } + pos = BinaryFormat::skipFrequency(flags, pos); + *nextSiblingPosition = + BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + return false; + } + mWord[depth] = c; + // If inputIndex is greater than mInputLength, that means there is no + // proximity chars. So, we don't need to check proximity. + if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { + multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight); + } + const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1 + || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2); + if (isSameAsUserTypedLength && isTerminal) { + const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); + onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight, skipPos, + excessivePos, transposedPos, freq, true, nextLetters, nextLettersSize); + } + // This character matched the typed character (enough to traverse the node at least) + // so we just evaluated it. Now we should evaluate this virtual node's children - that + // is, if it has any. If it has no children, we're done here - so we skip the end of + // the node, output the siblings position, and return false "don't traverse children". + // Note that !hasChildren implies isLastChar, so we know we don't have to skip any + // remaining char in this group for there can't be any. + if (!hasChildren) { + pos = BinaryFormat::skipFrequency(flags, pos); + *nextSiblingPosition = + BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + return false; + } + // Start traversing all nodes after the index exceeds the user typed length + traverseAllNodes = isSameAsUserTypedLength; + diffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0); + // Finally, we are ready to go to the next character, the next "virtual node". + // We should advance the input index. + // We do this in this branch of the 'if traverseAllNodes' because we are still matching + // characters to input; the other branch is not matching them but searching for + // completions, this is why it does not have to do it. + ++inputIndex; + } + // Optimization: Prune out words that are too long compared to how much was typed. + if (depth >= maxDepth || diffs > mMaxEditDistance) { + // We are giving up parsing this node and its children. Skip the rest of the node, + // output the sibling position, and return that we don't want to traverse children. + if (!isLastChar) { + pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); + } + pos = BinaryFormat::skipFrequency(flags, pos); + *nextSiblingPosition = + BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + return false; + } + + // Prepare for the next character. Promote the prefetched char to current char - the loop + // will take care of prefetching the next. If we finally found our last char, nextc will + // contain NOT_A_CHARACTER. + c = nextc; + // Also, the next char is one "virtual node" depth more than this char. + ++depth; + } while (NOT_A_CHARACTER != c); + + // If inputIndex is greater than mInputLength, that means there are no proximity chars. + // Here, that's all we are interested in so we don't need to check for isSameAsUserTypedLength. + if (mInputLength <= *newInputIndex) { + traverseAllNodes = true; + } + + // All the output values that are purely computation by this function are held in local + // variables. Output them to the caller. + *newTraverseAllNodes = traverseAllNodes; + *newMatchRate = matchWeight; + *newDiffs = diffs; + *newInputIndex = inputIndex; + *newOutputIndex = depth; + + // Now we finished processing this node, and we want to traverse children. If there are no + // children, we can't come here. + assert(BinaryFormat::hasChildrenInFlags(flags)); + + // If this node was a terminal it still has the frequency under the pointer (it may have been + // read, but not skipped - see readFrequencyWithoutMovingPointer). + // Next come the children position, then possibly attributes (attributes are bigrams only for + // now, maybe something related to shortcuts in the future). + // Once this is read, we still need to output the number of nodes in the immediate children of + // this node, so we read and output it before returning true, as in "please traverse children". + pos = BinaryFormat::skipFrequency(flags, pos); + int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos); + *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); + *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos); + *newChildrenPosition = childrenPos; + return true; +} + +#endif // NEW_DICTIONARY_FORMAT + } // namespace latinime diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index 3d3007ce0..dcc8f2a9a 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -17,9 +17,14 @@ #ifndef LATINIME_UNIGRAM_DICTIONARY_H #define LATINIME_UNIGRAM_DICTIONARY_H +#include <stdint.h> #include "defines.h" #include "proximity_info.h" +#ifndef NULL +#define NULL 0 +#endif + namespace latinime { class UnigramDictionary { @@ -31,8 +36,52 @@ class UnigramDictionary { } ProximityType; public: - UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier, - int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion); +#ifdef NEW_DICTIONARY_FORMAT + + // Mask and flags for children address type selection. + static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; + static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; + static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; + static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; + static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; + + // Flag for single/multiple char group + static const int FLAG_HAS_MULTIPLE_CHARS = 0x20; + + // Flag for terminal groups + static const int FLAG_IS_TERMINAL = 0x10; + + // Flag for bigram presence + static const int FLAG_HAS_BIGRAMS = 0x04; + + // Attribute (bigram/shortcut) related flags: + // Flag for presence of more attributes + static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; + // Flag for sign of offset. If this flag is set, the offset value must be negated. + static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; + + // Mask for attribute frequency, stored on 4 bits inside the flags byte. + static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F; + + // Mask and flags for attribute address type selection. + static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; + static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; +#endif // NEW_DICTIONARY_FORMAT + + UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, + int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, + const bool isLatestDictVersion); +#ifndef NEW_DICTIONARY_FORMAT + bool isValidWord(unsigned short *word, int length); +#else // NEW_DICTIONARY_FORMAT + bool isValidWord(const uint16_t* const inWord, const int length) const; + int getBigrams(unsigned short *word, int length, int *codes, int codesSize, + unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams, + int maxAlternatives); +#endif // NEW_DICTIONARY_FORMAT + int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; int getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, const int flags, unsigned short *outWords, int *frequencies); @@ -52,59 +101,58 @@ private: void getSuggestionCandidates(const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, const int nextLettersSize, const int maxDepth); - void getVersionNumber(); - bool checkIfDictVersionIsLatest(); - int getAddress(int *pos); - int getFreq(int *pos); - int wideStrLen(unsigned short *str); - bool sameAsTyped(unsigned short *word, int length); + bool sameAsTyped(const unsigned short *word, int length) const; bool addWord(unsigned short *word, int length, int frequency); - unsigned short toBaseLowerCase(unsigned short c); - void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth, - const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs, - const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, - const int nextLettersSize); bool getSplitTwoWordsSuggestion(const int inputLength, const int firstWordStartPos, const int firstWordLength, - const int secondWordStartPos, const int secondWordLength); + const int secondWordStartPos, const int secondWordLength, const bool isSpaceProximity); bool getMissingSpaceWords(const int inputLength, const int missingSpacePos); bool getMistypedSpaceWords(const int inputLength, const int spaceProximityPos); - // Keep getWordsOld for comparing performance between getWords and getWordsOld - void getWordsOld(const int initialPos, const int inputLength, const int skipPos, - const int excessivePos, const int transposedPos, int *nextLetters, - const int nextLettersSize); - void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize); int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos, const int excessivePos, const int transposedPos, const int freq, const bool sameLength) const; - void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word, - const int inputIndex, const int depth, const int snr, int *nextLetters, - const int nextLettersSize, const int skipPos, const int excessivePos, - const int transposedPos, const int freq); - void onTerminalWhenUserTypedLengthIsSameAsInputLength(unsigned short *word, - const int inputIndex, const int depth, const int snr, const int skipPos, - const int excessivePos, const int transposedPos, const int freq); + void onTerminal(unsigned short int* word, const int depth, + const uint8_t* const root, const uint8_t flags, const int pos, + const int inputIndex, const int matchWeight, const int skipPos, + const int excessivePos, const int transposedPos, const int freq, const bool sameLength, + int *nextLetters, const int nextLettersSize); bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); ProximityType getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos, const int excessivePos, const int transposedPos); // Process a node by considering proximity, missing and excessive character - bool processCurrentNode(const int pos, const int depth, - const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex, - const int diffs, const int skipPos, const int excessivePos, const int transposedPos, - int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition, - bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs, - int *nextSiblingPosition); - int getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word); - // Process a node by considering missing space - bool processCurrentNodeForExactMatch(const int firstChildPos, - const int startInputIndex, const int depth, unsigned short *word, - int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos); + bool processCurrentNode(const int initialPos, const int initialDepth, + const int maxDepth, const bool initialTraverseAllNodes, const int snr, int inputIndex, + const int initialDiffs, const int skipPos, const int excessivePos, + const int transposedPos, int *nextLetters, const int nextLettersSize, int *newCount, + int *newChildPosition, bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, + int *newDiffs, int *nextSiblingPosition, int *nextOutputIndex); bool existsAdjacentProximityChars(const int inputIndex, const int inputLength) const; inline const int* getInputCharsAt(const int index) const { return mInputCodes + (index * MAX_PROXIMITY_CHARS); } - const unsigned char *DICT; + int getMostFrequentWordLike(const int startInputIndex, const int inputLength, + unsigned short *word); +#ifndef NEW_DICTIONARY_FORMAT + void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth, + const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs, + const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, + const int nextLettersSize); + // Keep getWordsOld for comparing performance between getWords and getWordsOld + void getWordsOld(const int initialPos, const int inputLength, const int skipPos, + const int excessivePos, const int transposedPos, int *nextLetters, + const int nextLettersSize); + // Process a node by considering missing space + bool processCurrentNodeForExactMatch(const int firstChildPos, + const int startInputIndex, const int depth, unsigned short *word, + int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos); +#else // NEW_DICTIONARY_FORMAT + int getFrequency(const uint16_t* const inWord, const int length) const; + int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, + short unsigned int* outWord); +#endif // NEW_DICTIONARY_FORMAT + + const uint8_t* const DICT_ROOT; const int MAX_WORD_LENGTH; const int MAX_WORDS; const int MAX_PROXIMITY_CHARS; @@ -138,11 +186,10 @@ private: int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL]; int mStackDiffs[MAX_WORD_LENGTH_INTERNAL]; int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL]; + int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL]; int mNextLettersFrequency[NEXT_LETTERS_SIZE]; }; -// ---------------------------------------------------------------------------- - -}; // namespace latinime +} // namespace latinime #endif // LATINIME_UNIGRAM_DICTIONARY_H |