diff options
Diffstat (limited to 'native/src')
-rw-r--r-- | native/src/debug.h | 11 | ||||
-rw-r--r-- | native/src/dictionary.h | 4 | ||||
-rw-r--r-- | native/src/unigram_dictionary.cpp | 154 | ||||
-rw-r--r-- | native/src/unigram_dictionary.h | 30 |
4 files changed, 163 insertions, 36 deletions
diff --git a/native/src/debug.h b/native/src/debug.h index e5572e1a5..ae629b222 100644 --- a/native/src/debug.h +++ b/native/src/debug.h @@ -55,4 +55,15 @@ static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int leng // usleep(10); } +static inline void printDebug(const char* tag, int* codes, int codesSize, int MAX_PROXIMITY_CHARS) { + unsigned char *buf = (unsigned char*)malloc((1 + codesSize) * sizeof(*buf)); + + buf[codesSize] = 0; + while (--codesSize >= 0) + buf[codesSize] = (unsigned char)codes[codesSize * MAX_PROXIMITY_CHARS]; + LOGI("%s, WORD = %s", tag, buf); + + free(buf); +} + #endif // LATINIME_DEBUG_H diff --git a/native/src/dictionary.h b/native/src/dictionary.h index fbbb8312b..13b2a2816 100644 --- a/native/src/dictionary.h +++ b/native/src/dictionary.h @@ -29,9 +29,9 @@ public: Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives); int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates, - int *codes, int codesSize, unsigned short *outWords, int *frequencies) { + int *codes, int codesSize, int flags, unsigned short *outWords, int *frequencies) { return mUnigramDictionary->getSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, - codesSize, outWords, frequencies); + codesSize, flags, outWords, frequencies); } // TODO: Call mBigramDictionary instead of mUnigramDictionary diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 72b0f361a..9aa36b064 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -29,20 +29,136 @@ namespace latinime { +const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = + { { 'a', 'e' }, + { 'o', 'e' }, + { 'u', 'e' } }; + UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion) : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion), TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), - ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) { + ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0), + BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)) { if (DEBUG_DICT) LOGI("UnigramDictionary - constructor"); } UnigramDictionary::~UnigramDictionary() {} -int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, - int *ycoordinates, int *codes, int codesSize, unsigned short *outWords, int *frequencies) { +static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize, + const int MAX_PROXIMITY_CHARS) { + return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize; +} + +bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const { + + // There can't be a digraph if we don't have at least 2 characters to examine + if (i + 2 > codesSize) return false; + + // Search for the first char of some digraph + int lastDigraphIndex = -1; + const int thisChar = codes[i * MAX_PROXIMITY_CHARS]; + for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1; + lastDigraphIndex >= 0; --lastDigraphIndex) { + if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break; + } + // No match: return early + if (lastDigraphIndex < 0) return false; + + // It's an interesting digraph if the second char matches too. + return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS]; +} + +// Mostly the same arguments as the non-recursive version, except: +// codes is the original value. It points to the start of the work buffer, and gets passed as is. +// codesSize is the size of the user input (thus, it is the size of codesSrc). +// codesDest is the current point in the work buffer. +// codesSrc is the current point in the user-input, original, content-unmodified buffer. +// codesRemain is the remaining size in codesSrc. +void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo, + const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, + const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain, + int* codesDest, unsigned short* outWords, int* frequencies) { + + for (int i = 0; i < codesRemain; ++i) { + if (isDigraph(codesSrc, i, codesRemain)) { + // Found a digraph. We will try both spellings. eg. the word is "pruefen" + + // Copy the word up to the first char of the digraph, then continue processing + // on the remaining part of the word, skipping the second char of the digraph. + // In our example, copy "pru" and continue running on "fen" + memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR); + getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, + codesBufferSize, flags, codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, + codesRemain - i - 1, codesDest + i * MAX_PROXIMITY_CHARS, + outWords, frequencies); + + // Copy the second char of the digraph in place, then continue processing on + // the remaining part of the word. + // In our example, after "pru" in the buffer copy the "e", and continue running on "fen" + memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS, + BYTES_IN_ONE_CHAR); + getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, + codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS, codesRemain - i, + codesDest + i * MAX_PROXIMITY_CHARS, outWords, frequencies); + return; + } + } + + // If we come here, we hit the end of the word: let's check it against the dictionary. + // In our example, we'll come here once for "prufen" and then once for "pruefen". + // If the word contains several digraphs, we'll come it for the product of them. + // eg. if the word is "ueberpruefen" we'll test, in order, against + // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen". + const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain; + if (0 != remainingBytes) + memcpy(codesDest, codesSrc, remainingBytes); + + getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer, + (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies); +} + +int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates, + const int *ycoordinates, const int *codes, const int codesSize, const int flags, + unsigned short *outWords, int *frequencies) { + + if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags) + { // Incrementally tune the word and try all possibilities + int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)]; + getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer, + codesSize, flags, codes, codesSize, codesBuffer, outWords, frequencies); + } else { // Normal processing + getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize, + outWords, frequencies); + } + + PROF_START(6); + // Get the word count + int suggestedWordsCount = 0; + while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) { + suggestedWordsCount++; + } + + if (DEBUG_DICT) { + LOGI("Returning %d words", suggestedWordsCount); + LOGI("Next letters: "); + for (int k = 0; k < NEXT_LETTERS_SIZE; k++) { + if (mNextLettersFrequency[k] > 0) { + LOGI("%c = %d,", k, mNextLettersFrequency[k]); + } + } + } + PROF_END(6); + PROF_CLOSE; + return suggestedWordsCount; +} + +void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo, + const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize, + unsigned short *outWords, int *frequencies) { + PROF_OPEN; PROF_START(0); initSuggestions(codes, codesSize, outWords, frequencies); @@ -103,30 +219,10 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int *xcoordi } } PROF_END(5); - - PROF_START(6); - // Get the word count - int suggestedWordsCount = 0; - while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) { - suggestedWordsCount++; - } - - if (DEBUG_DICT) { - LOGI("Returning %d words", suggestedWordsCount); - LOGI("Next letters: "); - for (int k = 0; k < NEXT_LETTERS_SIZE; k++) { - if (mNextLettersFrequency[k] > 0) { - LOGI("%c = %d,", k, mNextLettersFrequency[k]); - } - } - } - PROF_END(6); - PROF_CLOSE; - return suggestedWordsCount; } -void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords, - int *frequencies) { +void UnigramDictionary::initSuggestions(const int *codes, const int codesSize, + unsigned short *outWords, int *frequencies) { if (DEBUG_DICT) LOGI("initSuggest"); mFrequencies = frequencies; mOutputChars = outWords; @@ -204,7 +300,7 @@ bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) { if (length != mInputLength) { return false; } - int *inputCodes = mInputCodes; + const int *inputCodes = mInputCodes; while (length--) { if ((unsigned int) *inputCodes != (unsigned int) *word) { return false; @@ -423,7 +519,7 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex const int currentChar = *getInputCharsAt(inputIndex); const int leftIndex = inputIndex - 1; if (leftIndex >= 0) { - int *leftChars = getInputCharsAt(leftIndex); + const int *leftChars = getInputCharsAt(leftIndex); int i = 0; while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) { if (leftChars[i++] == currentChar) return true; @@ -431,7 +527,7 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex } const int rightIndex = inputIndex + 1; if (rightIndex < inputLength) { - int *rightChars = getInputCharsAt(rightIndex); + const int *rightChars = getInputCharsAt(rightIndex); int i = 0; while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) { if (rightChars[i++] == currentChar) return true; @@ -523,7 +619,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth *newDiffs = diffs; *newInputIndex = inputIndex; } else { - int *currentChars = getInputCharsAt(inputIndex); + const int *currentChars = getInputCharsAt(inputIndex); if (transposedPos >= 0) { if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS; diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index e84875b59..a95984520 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -33,12 +33,22 @@ class UnigramDictionary { public: UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion); - int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates, - int *codes, int codesSize, unsigned short *outWords, int *frequencies); + int getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates, + const int *ycoordinates, const int *codes, const int codesSize, const int flags, + unsigned short *outWords, int *frequencies); ~UnigramDictionary(); private: - void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies); + void getWordSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates, + const int *ycoordinates, const int *codes, const int codesSize, + unsigned short *outWords, int *frequencies); + bool isDigraph(const int* codes, const int i, const int codesSize) const; + void getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo, + const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, + const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain, + int* codesDest, unsigned short* outWords, int* frequencies); + void initSuggestions(const int *codes, const int codesSize, unsigned short *outWords, + int *frequencies); void getSuggestionCandidates(const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters, const int nextLettersSize, const int maxDepth); @@ -86,7 +96,7 @@ private: const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos); bool existsAdjacentProximityChars(const int inputIndex, const int inputLength); - inline int* getInputCharsAt(const int index) { + inline const int* getInputCharsAt(const int index) { return mInputCodes + (index * MAX_PROXIMITY_CHARS); } const unsigned char *DICT; @@ -97,10 +107,20 @@ private: const int TYPED_LETTER_MULTIPLIER; const int FULL_WORD_MULTIPLIER; const int ROOT_POS; + const unsigned int BYTES_IN_ONE_CHAR; + + // Flags for special processing + // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java + // or something very bad (like, the apocalypse) will happen. + // Please update both at the same time. + enum { + REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1 + }; + static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[]; int *mFrequencies; unsigned short *mOutputChars; - int *mInputCodes; + const int *mInputCodes; int mInputLength; // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; |