aboutsummaryrefslogtreecommitdiffstats
path: root/native/src
diff options
context:
space:
mode:
Diffstat (limited to 'native/src')
-rw-r--r--native/src/bigram_dictionary.cpp26
-rw-r--r--native/src/debug.h69
-rw-r--r--native/src/defines.h24
-rw-r--r--native/src/dictionary.cpp4
-rw-r--r--native/src/dictionary.h10
-rw-r--r--native/src/proximity_info.cpp64
-rw-r--r--native/src/proximity_info.h45
-rw-r--r--native/src/unigram_dictionary.cpp393
-rw-r--r--native/src/unigram_dictionary.h46
9 files changed, 558 insertions, 123 deletions
diff --git a/native/src/bigram_dictionary.cpp b/native/src/bigram_dictionary.cpp
index 5ec310f07..36761b88d 100644
--- a/native/src/bigram_dictionary.cpp
+++ b/native/src/bigram_dictionary.cpp
@@ -30,8 +30,10 @@ BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
: DICT(dict), MAX_WORD_LENGTH(maxWordLength),
MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
- if (DEBUG_DICT) LOGI("BigramDictionary - constructor");
- if (DEBUG_DICT) LOGI("Has Bigram : %d", hasBigram);
+ if (DEBUG_DICT) {
+ LOGI("BigramDictionary - constructor");
+ LOGI("Has Bigram : %d", hasBigram);
+ }
}
BigramDictionary::~BigramDictionary() {
@@ -54,7 +56,9 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
}
insertAt++;
}
- if (DEBUG_DICT) LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams);
+ if (DEBUG_DICT) {
+ LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams);
+ }
if (insertAt < mMaxBigrams) {
memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
(char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
@@ -68,7 +72,9 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
*dest++ = *word++;
}
*dest = 0; // NULL terminate
- if (DEBUG_DICT) LOGI("Bigram: Added word at %d", insertAt);
+ if (DEBUG_DICT) {
+ LOGI("Bigram: Added word at %d", insertAt);
+ }
return true;
}
return false;
@@ -107,7 +113,9 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i
if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
int pos = mParentDictionary->isValidWordRec(
DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
- if (DEBUG_DICT) LOGI("Pos -> %d", pos);
+ if (DEBUG_DICT) {
+ LOGI("Pos -> %d", pos);
+ }
if (pos < 0) {
return 0;
}
@@ -151,7 +159,9 @@ void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequenc
}
pos = followDownBranchAddress; // pos start at count
int count = DICT[pos] & 0xFF;
- if (DEBUG_DICT) LOGI("count - %d",count);
+ if (DEBUG_DICT) {
+ LOGI("count - %d",count);
+ }
pos++;
for (int i = 0; i < count; i++) {
// pos at data
@@ -225,7 +235,9 @@ void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequenc
}
depth++;
if (followDownBranchAddress == 0) {
- if (DEBUG_DICT) LOGI("ERROR!!! Cannot find bigram!!");
+ if (DEBUG_DICT) {
+ LOGI("ERROR!!! Cannot find bigram!!");
+ }
break;
}
}
diff --git a/native/src/debug.h b/native/src/debug.h
new file mode 100644
index 000000000..ae629b222
--- /dev/null
+++ b/native/src/debug.h
@@ -0,0 +1,69 @@
+/*
+**
+** Copyright 2011, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+** http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifndef LATINIME_DEBUG_H
+#define LATINIME_DEBUG_H
+
+#include "defines.h"
+
+// Narrows a 16-bit string to bytes by keeping only the low 8 bits of every unit.
+// Conversion stops at the first zero unit or once the unit at index `length` has been
+// converted, whichever comes first, so up to length + 1 units are read from `input` and up to
+// length + 2 bytes (terminator included) are written to `output`. Returns `output`.
+// NOTE(review): the inclusive bound looks like it may be off by one - confirm with callers.
+static inline unsigned char* convertToUnibyteString(unsigned short* input, unsigned char* output,
+        const unsigned int length) {
+    unsigned int pos = 0;
+    while (pos <= length && input[pos] != 0) {
+        output[pos] = input[pos] & 0xFF;
+        ++pos;
+    }
+    output[pos] = 0;
+    return output;
+}
+// Narrows a 16-bit string to bytes (low 8 bits of each unit, units at indices 0..length,
+// stopping early at a zero unit), then overwrites the last converted byte with `c`.
+// `output` must have room for length + 2 bytes. Returns `output`.
+// An input that starts with a zero unit yields an empty string: there is no last byte to replace.
+static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned short* input,
+        unsigned char* output, const unsigned int length, unsigned char c) {
+    unsigned int pos = 0;
+    while (pos <= length && input[pos] != 0) {
+        output[pos] = input[pos] & 0xFF;
+        ++pos;
+    }
+    // Without this guard an empty input stored `c` at output[-1], one byte before the buffer.
+    if (pos > 0) {
+        output[pos - 1] = c;
+    }
+    output[pos] = 0;
+    return output;
+}
+// Logs a 16-bit string through LOGI after narrowing it with convertToUnibyteString.
+static inline void LOGI_S16(unsigned short* string, const unsigned int length) {
+    // convertToUnibyteString converts the units at indices 0..length and then appends a
+    // terminator, so it can write length + 2 bytes. A buffer of only `length` bytes was overrun.
+    unsigned char tmp_buffer[length + 2];
+    convertToUnibyteString(string, tmp_buffer, length);
+    LOGI(">> %s", tmp_buffer);
+    // The log facility is throwing out log that comes too fast. The following
+    // is a dirty way of slowing down processing so that we can see all log.
+    // TODO : refactor this in a blocking log or something.
+    // usleep(10);
+}
+// Logs a 16-bit string through LOGI after narrowing it with
+// convertToUnibyteStringAndReplaceLastChar, which puts `c` in place of the last converted unit.
+static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int length,
+        unsigned char c) {
+    // The helper converts the units at indices 0..length and then appends a terminator, so it
+    // can write length + 2 bytes. With length + 1 bytes the terminator landed past the end.
+    unsigned char tmp_buffer[length + 2];
+    convertToUnibyteStringAndReplaceLastChar(string, tmp_buffer, length, c);
+    LOGI(">> %s", tmp_buffer);
+    // Likewise
+    // usleep(10);
+}
+
+// Logs `tag` followed by the word formed by the first code of every input position.
+// `codes` is read with a stride of MAX_PROXIMITY_CHARS entries per position; only the entry at
+// offset 0 of each position is used, truncated to one byte.
+// Nothing is logged when codesSize is negative or the scratch buffer cannot be allocated.
+static inline void printDebug(const char* tag, int* codes, int codesSize, int MAX_PROXIMITY_CHARS) {
+    // A negative size would make buf[codesSize] below index before the allocation.
+    if (codesSize < 0) return;
+    unsigned char *buf = (unsigned char*)malloc((1 + codesSize) * sizeof(*buf));
+    if (!buf) return;
+
+    buf[codesSize] = 0;
+    while (--codesSize >= 0)
+        buf[codesSize] = (unsigned char)codes[codesSize * MAX_PROXIMITY_CHARS];
+    LOGI("%s, WORD = %s", tag, buf);
+
+    free(buf);
+}
+
+#endif // LATINIME_DEBUG_H
diff --git a/native/src/defines.h b/native/src/defines.h
index c1eaf0df2..926120703 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -28,6 +28,7 @@
#define DEBUG_SHOW_FOUND_WORD DEBUG_DICT_FULL
#define DEBUG_NODE DEBUG_DICT_FULL
#define DEBUG_TRACE DEBUG_DICT_FULL
+#define DEBUG_PROXIMITY_INFO true
// Profiler
#include <time.h>
@@ -76,13 +77,14 @@ static void prof_out(void) {
}
#else // FLAG_DBG
-#define LOGE
-#define LOGI
+#define LOGE(fmt, ...)
+#define LOGI(fmt, ...)
#define DEBUG_DICT false
#define DEBUG_DICT_FULL false
#define DEBUG_SHOW_FOUND_WORD false
#define DEBUG_NODE false
#define DEBUG_TRACE false
+#define DEBUG_PROXIMITY_INFO false
#define PROF_BUF_SIZE 0
#define PROF_RESET
@@ -100,6 +102,9 @@ static void prof_out(void) {
#ifndef U_SHORT_MAX
// Parenthesized so the shift cannot re-associate with operators at the expansion site
// (unparenthesized, U_SHORT_MAX - 1 expanded to 1 << (16 - 1)).
#define U_SHORT_MAX (1 << 16)
#endif
+#ifndef S_INT_MAX
+#define S_INT_MAX 2147483647 // ((1 << 31) - 1)
+#endif
// Define this to use mmap() for dictionary loading. Undefine to use malloc() instead of mmap().
// We measured and compared performance of both, and found mmap() is fairly good in terms of
@@ -124,33 +129,40 @@ static void prof_out(void) {
#define DICTIONARY_HEADER_SIZE 2
#define NOT_VALID_WORD -99
+#define KEYCODE_SPACE ' '
+
#define SUGGEST_WORDS_WITH_MISSING_CHARACTER true
#define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true
#define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
#define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
+#define SUGGEST_WORDS_WITH_SPACE_PROXIMITY true
// The following "rate"s are used as a multiplier before dividing by 100, so they are in percent.
-#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 75
+#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 70
#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80
#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
#define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75
#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
#define FULL_MATCHED_WORDS_PROMOTION_RATE 120
-// This is used as a bare multiplier (not subject to /100)
-#define FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER 2
-
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions.
#define MAX_WORD_LENGTH_INTERNAL 48
#define MAX_DEPTH_MULTIPLIER 3
+// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
+// word in the dictionary
+#define DEFAULT_MAX_UMLAUT_SEARCH_DEPTH 5
+
// Minimum suggest depth for one word for all cases except for missing space suggestions.
#define MIN_SUGGEST_DEPTH 1
#define MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION 3
#define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3
+// The size of next letters frequency array. Zero will disable the feature.
+#define NEXT_LETTERS_SIZE 0
+
#define min(a,b) ((a)<(b)?(a):(b))
#endif // LATINIME_DEFINES_H
diff --git a/native/src/dictionary.cpp b/native/src/dictionary.cpp
index fe3375706..d69cb2a53 100644
--- a/native/src/dictionary.cpp
+++ b/native/src/dictionary.cpp
@@ -23,6 +23,7 @@
namespace latinime {
+// TODO: Change the type of all keyCodes to uint32_t
Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust,
int typedLetterMultiplier, int fullWordMultiplier,
int maxWordLength, int maxWords, int maxAlternatives)
@@ -53,8 +54,7 @@ bool Dictionary::hasBigram() {
}
// TODO: use uint16_t instead of unsigned short
-bool Dictionary::isValidWord(unsigned short *word, int length)
-{
+bool Dictionary::isValidWord(unsigned short *word, int length) {
if (IS_LATEST_DICT_VERSION) {
return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
} else {
diff --git a/native/src/dictionary.h b/native/src/dictionary.h
index cef1cf9eb..13b2a2816 100644
--- a/native/src/dictionary.h
+++ b/native/src/dictionary.h
@@ -19,6 +19,7 @@
#include "bigram_dictionary.h"
#include "defines.h"
+#include "proximity_info.h"
#include "unigram_dictionary.h"
namespace latinime {
@@ -27,10 +28,10 @@ class Dictionary {
public:
Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler,
int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives);
- int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
- int *nextLetters, int nextLettersSize) {
- return mUnigramDictionary->getSuggestions(codes, codesSize, outWords, frequencies,
- nextLetters, nextLettersSize);
+ // Forwards the request, with every argument unchanged, to mUnigramDictionary->getSuggestions()
+ // and returns that call's result.
+ int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates,
+ int *codes, int codesSize, int flags, unsigned short *outWords, int *frequencies) {
+ return mUnigramDictionary->getSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
+ codesSize, flags, outWords, frequencies);
+ }
// TODO: Call mBigramDictionary instead of mUnigramDictionary
@@ -40,6 +41,7 @@ public:
return mBigramDictionary->getBigrams(word, length, codes, codesSize, outWords, frequencies,
maxWordLength, maxBigrams, maxAlternatives);
}
+
bool isValidWord(unsigned short *word, int length);
int isValidWordRec(int pos, unsigned short *word, int offset, int length);
void *getDict() { return (void *)mDict; }
diff --git a/native/src/proximity_info.cpp b/native/src/proximity_info.cpp
new file mode 100644
index 000000000..102123c3c
--- /dev/null
+++ b/native/src/proximity_info.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#define LOG_TAG "LatinIME: proximity_info.cpp"
+
+#include "proximity_info.h"
+
+namespace latinime {
+// Builds the proximity table for a keyboard covered by a gridWidth x gridHeight grid.
+// Each cell is CELL_WIDTH x CELL_HEIGHT (keyboard size divided by grid size, rounded up) and owns
+// maxProximityCharsSize consecutive entries of the table. proximityCharsArray is copied, so the
+// caller keeps ownership of its own array.
+ProximityInfo::ProximityInfo(const int maxProximityCharsSize, const int keyboardWidth,
+        const int keyboardHeight, const int gridWidth, const int gridHeight,
+        const uint32_t *proximityCharsArray)
+        : MAX_PROXIMITY_CHARS_SIZE(maxProximityCharsSize), KEYBOARD_WIDTH(keyboardWidth),
+          KEYBOARD_HEIGHT(keyboardHeight), GRID_WIDTH(gridWidth), GRID_HEIGHT(gridHeight),
+          CELL_WIDTH((keyboardWidth + gridWidth - 1) / gridWidth),
+          CELL_HEIGHT((keyboardHeight + gridHeight - 1) / gridHeight) {
+    const int len = GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE;
+    mProximityCharsArray = new uint32_t[len];
+    if (DEBUG_PROXIMITY_INFO) {
+        LOGI("Create proximity info array %d", len);
+    }
+    memcpy(mProximityCharsArray, proximityCharsArray, len * sizeof(mProximityCharsArray[0]));
+}
+
+// Releases the table copy made by the constructor.
+ProximityInfo::~ProximityInfo() {
+    delete[] mProximityCharsArray;
+}
+
+// Returns the index in mProximityCharsArray of the first entry of the grid cell containing
+// (x, y). Cells are stored row by row, MAX_PROXIMITY_CHARS_SIZE entries each. The coordinates
+// are not validated here.
+inline int ProximityInfo::getStartIndexFromCoordinates(const int x, const int y) const {
+    return ((y / CELL_HEIGHT) * GRID_WIDTH + (x / CELL_WIDTH))
+            * MAX_PROXIMITY_CHARS_SIZE;
+}
+
+// Returns true when KEYCODE_SPACE is one of the codes stored for the grid cell containing
+// (x, y), and false otherwise, including when (x, y) does not fall on any grid cell.
+bool ProximityInfo::hasSpaceProximity(const int x, const int y) const {
+    // Coordinates that miss the grid used to be turned into an index anyway: the loop below then
+    // read outside mProximityCharsArray or, for an over-wide x, inspected a cell of the next row.
+    // The cell-size test also keeps the divisions from dividing by zero.
+    if (CELL_WIDTH <= 0 || CELL_HEIGHT <= 0 || x < 0 || y < 0
+            || x / CELL_WIDTH >= GRID_WIDTH || y / CELL_HEIGHT >= GRID_HEIGHT) {
+        if (DEBUG_PROXIMITY_INFO) {
+            LOGI("hasSpaceProximity: illegal coordinates (%d, %d)", x, y);
+        }
+        return false;
+    }
+    const int startIndex = getStartIndexFromCoordinates(x, y);
+    if (DEBUG_PROXIMITY_INFO) {
+        LOGI("hasSpaceProximity: index %d", startIndex);
+    }
+    for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) {
+        if (DEBUG_PROXIMITY_INFO) {
+            LOGI("Index: %d", mProximityCharsArray[startIndex + i]);
+        }
+        if (mProximityCharsArray[startIndex + i] == KEYCODE_SPACE) {
+            return true;
+        }
+    }
+    return false;
+}
+} // namespace latinime
diff --git a/native/src/proximity_info.h b/native/src/proximity_info.h
new file mode 100644
index 000000000..c2062e8c5
--- /dev/null
+++ b/native/src/proximity_info.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PROXIMITY_INFO_H
+#define LATINIME_PROXIMITY_INFO_H
+
+#include <stdint.h>
+
+#include "defines.h"
+
+namespace latinime {
+
+// Holds, for every cell of a grid laid over the keyboard, the key codes considered close to
+// that cell. The table handed to the constructor is copied into storage owned by this object.
+class ProximityInfo {
+public:
+    ProximityInfo(const int maxProximityCharsSize, const int keyboardWidth,
+            const int keyboardHeight, const int gridWidth, const int gridHeight,
+            const uint32_t *proximityCharsArray);
+    ~ProximityInfo();
+    // Returns true when KEYCODE_SPACE is among the codes stored for the cell containing (x, y).
+    bool hasSpaceProximity(const int x, const int y) const;
+private:
+    // Copying is disabled (declared, never defined): the destructor delete[]s
+    // mProximityCharsArray, so a member-wise copy would release the same array twice.
+    ProximityInfo(const ProximityInfo &);
+    ProximityInfo &operator=(const ProximityInfo &);
+    // Index in mProximityCharsArray of the first entry of the cell containing (x, y).
+    int getStartIndexFromCoordinates(const int x, const int y) const;
+    const int MAX_PROXIMITY_CHARS_SIZE;  // number of entries stored per grid cell
+    const int KEYBOARD_WIDTH;
+    const int KEYBOARD_HEIGHT;
+    const int GRID_WIDTH;
+    const int GRID_HEIGHT;
+    const int CELL_WIDTH;   // KEYBOARD_WIDTH / GRID_WIDTH, rounded up
+    const int CELL_HEIGHT;  // KEYBOARD_HEIGHT / GRID_HEIGHT, rounded up
+    uint32_t *mProximityCharsArray;  // owned; GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE
+};
+} // namespace latinime
+#endif // LATINIME_PROXIMITY_INFO_H
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index dfbe8228e..c18829014 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -29,20 +29,146 @@
namespace latinime {
+// Two-letter sequences recognized by isDigraph(); each entry is { first letter, second letter }.
+const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
+ { { 'a', 'e' },
+ { 'o', 'e' },
+ { 'u', 'e' } };
+
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
const bool isLatestDictVersion)
- : DICT(dict), MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords),
+ : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
- ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
- if (DEBUG_DICT) LOGI("UnigramDictionary - constructor");
+ ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
+ BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)),
+ MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
+ if (DEBUG_DICT) {
+ LOGI("UnigramDictionary - constructor");
+ }
}
UnigramDictionary::~UnigramDictionary() {}
-int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,
- int *frequencies, int *nextLetters, int nextLettersSize) {
+// Returns the size in bytes of codesSize input positions, where every position stores
+// MAX_PROXIMITY_CHARS codes of the type `codes` points to. `codes` is only used for its
+// element size and is never dereferenced.
+static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
+        const int MAX_PROXIMITY_CHARS) {
+    return codesSize * (MAX_PROXIMITY_CHARS * sizeof(*codes));
+}
+
+// Tells whether input positions i and i + 1 spell one of GERMAN_UMLAUT_DIGRAPHS. Only the first
+// code of each position (offset 0 of its MAX_PROXIMITY_CHARS slots) is examined.
+bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
+
+    // A digraph needs two characters, so position i must not be the last one.
+    if (codesSize < i + 2) return false;
+
+    const int firstCode = codes[i * MAX_PROXIMITY_CHARS];
+    const int digraphCount = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]);
+    // Walk the table from its end looking for an entry that starts with firstCode; the answer
+    // is whether that entry's second letter matches the next input position.
+    for (int entry = digraphCount - 1; entry >= 0; --entry) {
+        if (GERMAN_UMLAUT_DIGRAPHS[entry].first == firstCode) {
+            return GERMAN_UMLAUT_DIGRAPHS[entry].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
+        }
+    }
+    // No entry starts with firstCode.
+    return false;
+}
+
+// Looks for the first digraph in the remaining input and, when one is found, recurses twice:
+// once with the second letter of the digraph dropped and once with it kept. When no digraph is
+// left (or currentDepth has reached MAX_UMLAUT_SEARCH_DEPTH) the rest of the input is copied to
+// the work buffer and the assembled word is handed to getWordSuggestions().
+// Mostly the same arguments as the non-recursive version, except:
+// codesBuffer points to the start of the work buffer, and gets passed as is.
+// codesBufferSize is forwarded unchanged to every recursive call; it is not otherwise read here.
+// codesSrc is the current point in the user-input, original, content-unmodified buffer.
+// codesRemain is the remaining size in codesSrc.
+// currentDepth is the number of digraphs already split on the current path.
+// codesDest is the current point in the work buffer.
+void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
+ const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
+ const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
+ const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies) {
+
+ if (currentDepth < MAX_UMLAUT_SEARCH_DEPTH) {
+ for (int i = 0; i < codesRemain; ++i) {
+ if (isDigraph(codesSrc, i, codesRemain)) {
+ // Found a digraph. We will try both spellings. eg. the word is "pruefen"
+
+ // Copy the word up to the first char of the digraph, then continue processing
+ // on the remaining part of the word, skipping the second char of the digraph.
+ // In our example, copy "pru" and continue running on "fen"
+ // Make i the index of the second char of the digraph for simplicity. Forgetting
+ // to do that results in an infinite recursion so take care!
+ ++i;
+ memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
+ getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
+ codesBuffer, codesBufferSize, flags,
+ codesSrc + (i + 1) * MAX_PROXIMITY_CHARS, codesRemain - i - 1,
+ currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS, outWords,
+ frequencies);
+
+ // Copy the second char of the digraph in place, then continue processing on
+ // the remaining part of the word.
+ // In our example, after "pru" in the buffer copy the "e", and continue on "fen"
+ memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
+ BYTES_IN_ONE_CHAR);
+ getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
+ codesBuffer, codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS,
+ codesRemain - i, currentDepth + 1, codesDest + i * MAX_PROXIMITY_CHARS,
+ outWords, frequencies);
+ // Both spellings of this digraph have been explored by the two calls above.
+ return;
+ }
+ }
+ }
+
+ // If we come here, we hit the end of the word: let's check it against the dictionary.
+ // In our example, we'll come here once for "prufen" and then once for "pruefen".
+ // If the word contains several digraphs, we'll come here once per combination of them.
+ // eg. if the word is "ueberpruefen" we'll test, in order, against
+ // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
+ const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
+ if (0 != remainingBytes)
+ memcpy(codesDest, codesSrc, remainingBytes);
+
+ // (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS is the number of characters already in the
+ // work buffer before codesDest; adding codesRemain gives the length of the assembled word.
+ getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
+ (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
+}
+
+// Computes suggestions for the typed codes and returns how many were found, i.e. the number of
+// leading entries of mFrequencies that are greater than zero (at most MAX_WORDS).
+// When `flags` has REQUIRES_GERMAN_UMLAUT_PROCESSING set, the input is first expanded by
+// getWordWithDigraphSuggestionsRec(); otherwise getWordSuggestions() runs on the input as typed.
+int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
+        const int *ycoordinates, const int *codes, const int codesSize, const int flags,
+        unsigned short *outWords, int *frequencies) {
+
+    if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags) {
+        // Incrementally tune the word and try all possibilities.
+        // getCodesBufferSize() returns a size in bytes, so it is converted to a number of ints
+        // before being used as an array length; used as is, it reserved sizeof(int) times more
+        // stack than the work buffer needs. The + 1 keeps the length non-zero for empty input.
+        const unsigned int codesBufferLength =
+                getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS) / sizeof(codes[0]) + 1;
+        int codesBuffer[codesBufferLength];
+        getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
+                codesSize, flags, codes, codesSize, 0, codesBuffer, outWords, frequencies);
+    } else { // Normal processing
+        getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
+                outWords, frequencies);
+    }
+
+    PROF_START(20);
+    // Get the word count
+    int suggestedWordsCount = 0;
+    while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
+        suggestedWordsCount++;
+    }
+
+    if (DEBUG_DICT) {
+        LOGI("Returning %d words", suggestedWordsCount);
+        LOGI("Next letters: ");
+        for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
+            if (mNextLettersFrequency[k] > 0) {
+                LOGI("%c = %d,", k, mNextLettersFrequency[k]);
+            }
+        }
+    }
+    PROF_END(20);
+    PROF_CLOSE;
+    return suggestedWordsCount;
+}
+
+void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,
+ const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
+ unsigned short *outWords, int *frequencies) {
+
PROF_OPEN;
PROF_START(0);
initSuggestions(codes, codesSize, outWords, frequencies);
@@ -52,14 +178,16 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
PROF_END(0);
PROF_START(1);
- getSuggestionCandidates(-1, -1, -1, nextLetters, nextLettersSize, MAX_DEPTH);
+ getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);
PROF_END(1);
PROF_START(2);
// Suggestion with missing character
if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
for (int i = 0; i < codesSize; ++i) {
- if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);
+ if (DEBUG_DICT) {
+ LOGI("--- Suggest missing characters %d", i);
+ }
getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);
}
}
@@ -70,7 +198,9 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER
&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {
for (int i = 0; i < codesSize; ++i) {
- if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);
+ if (DEBUG_DICT) {
+ LOGI("--- Suggest excessive characters %d", i);
+ }
getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);
}
}
@@ -81,7 +211,9 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
// Only suggest words that length is mInputLength
if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
for (int i = 0; i < codesSize; ++i) {
- if (DEBUG_DICT) LOGI("--- Suggest transposed characters %d", i);
+ if (DEBUG_DICT) {
+ LOGI("--- Suggest transposed characters %d", i);
+ }
getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);
}
}
@@ -92,36 +224,40 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
for (int i = 1; i < codesSize; ++i) {
- if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);
+ if (DEBUG_DICT) {
+ LOGI("--- Suggest missing space characters %d", i);
+ }
getMissingSpaceWords(mInputLength, i);
}
}
PROF_END(5);
PROF_START(6);
- // Get the word count
- int suggestedWordsCount = 0;
- while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
- suggestedWordsCount++;
- }
-
- if (DEBUG_DICT) {
- LOGI("Returning %d words", suggestedWordsCount);
- LOGI("Next letters: ");
- for (int k = 0; k < nextLettersSize; k++) {
- if (nextLetters[k] > 0) {
- LOGI("%c = %d,", k, nextLetters[k]);
+ if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY) {
+ // The first and last "mistyped spaces" are taken care of by excessive character handling
+ for (int i = 1; i < codesSize - 1; ++i) {
+ if (DEBUG_DICT) {
+ LOGI("--- Suggest words with proximity space %d", i);
+ }
+ const int x = xcoordinates[i];
+ const int y = ycoordinates[i];
+ if (DEBUG_PROXIMITY_INFO) {
+ LOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
+ i, x, y, proximityInfo->hasSpaceProximity(x, y));
+ }
+ if (proximityInfo->hasSpaceProximity(x, y)) {
+ getMistypedSpaceWords(mInputLength, i);
}
}
}
PROF_END(6);
- PROF_CLOSE;
- return suggestedWordsCount;
}
-void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,
- int *frequencies) {
- if (DEBUG_DICT) LOGI("initSuggest");
+void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,
+ unsigned short *outWords, int *frequencies) {
+ if (DEBUG_DICT) {
+ LOGI("initSuggest");
+ }
mFrequencies = frequencies;
mOutputChars = outWords;
mInputCodes = codes;
@@ -145,7 +281,9 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
LOGI("Found word = %s, freq = %d", s, frequency);
}
if (length > MAX_WORD_LENGTH) {
- if (DEBUG_DICT) LOGI("Exceeded max word length.");
+ if (DEBUG_DICT) {
+ LOGI("Exceeded max word length.");
+ }
return false;
}
@@ -176,13 +314,15 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
*dest++ = *word++;
}
*dest = 0; // NULL terminate
- if (DEBUG_DICT) LOGI("Added word at %d", insertAt);
+ if (DEBUG_DICT) {
+ LOGI("Added word at %d", insertAt);
+ }
return true;
}
return false;
}
-unsigned short UnigramDictionary::toLowerCase(unsigned short c) {
+unsigned short UnigramDictionary::toBaseLowerCase(unsigned short c) {
if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
c = BASE_CHARS[c];
}
@@ -198,7 +338,7 @@ bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {
if (length != mInputLength) {
return false;
}
- int *inputCodes = mInputCodes;
+ const int *inputCodes = mInputCodes;
while (length--) {
if ((unsigned int) *inputCodes != (unsigned int) *word) {
return false;
@@ -238,7 +378,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
if (mStackChildCount[depth] > 0) {
--mStackChildCount[depth];
bool traverseAllNodes = mStackTraverseAll[depth];
- int snr = mStackNodeFreq[depth];
+ int matchWeight = mStackNodeFreq[depth];
int inputIndex = mStackInputIndex[depth];
int diffs = mStackDiffs[depth];
int siblingPos = mStackSiblingPos[depth];
@@ -246,9 +386,10 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
// depth will never be greater than maxDepth because in that case,
// needsToTraverseChildrenNodes should be false
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,
- maxDepth, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
- transposedPos, nextLetters, nextLettersSize, &childCount, &firstChildPos,
- &traverseAllNodes, &snr, &inputIndex, &diffs, &siblingPos);
+ maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
+ excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
+ &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
+ &siblingPos);
// Update next sibling pos
mStackSiblingPos[depth] = siblingPos;
if (needsToTraverseChildrenNodes) {
@@ -256,7 +397,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
++depth;
mStackChildCount[depth] = childCount;
mStackTraverseAll[depth] = traverseAllNodes;
- mStackNodeFreq[depth] = snr;
+ mStackNodeFreq[depth] = matchWeight;
mStackInputIndex[depth] = inputIndex;
mStackDiffs[depth] = diffs;
mStackSiblingPos[depth] = firstChildPos;
@@ -276,27 +417,35 @@ inline static void multiplyRate(const int rate, int *freq) {
}
}
-bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
- if (missingSpacePos <= 0 || missingSpacePos >= inputLength
- || inputLength >= MAX_WORD_LENGTH) return false;
- const int newWordLength = inputLength + 1;
+bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
+ const int firstWordStartPos, const int firstWordLength, const int secondWordStartPos,
+ const int secondWordLength) {
+ if (inputLength >= MAX_WORD_LENGTH) return false;
+ if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
+ || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
+ return false;
+ const int newWordLength = firstWordLength + secondWordLength + 1;
// Allocating variable length array on stack
unsigned short word[newWordLength];
- const int firstFreq = getBestWordFreq(0, missingSpacePos, mWord);
- if (DEBUG_DICT) LOGI("First freq: %d", firstFreq);
+ const int firstFreq = getBestWordFreq(firstWordStartPos, firstWordLength, mWord);
+ if (DEBUG_DICT) {
+ LOGI("First freq: %d", firstFreq);
+ }
if (firstFreq <= 0) return false;
- for (int i = 0; i < missingSpacePos; ++i) {
+ for (int i = 0; i < firstWordLength; ++i) {
word[i] = mWord[i];
}
- const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);
- if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);
+ const int secondFreq = getBestWordFreq(secondWordStartPos, secondWordLength, mWord);
+ if (DEBUG_DICT) {
+ LOGI("Second freq: %d", secondFreq);
+ }
if (secondFreq <= 0) return false;
- word[missingSpacePos] = SPACE;
- for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {
- word[i] = mWord[i - missingSpacePos - 1];
+ word[firstWordLength] = SPACE;
+ for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
+ word[i] = mWord[i - firstWordLength - 1];
}
int pairFreq = ((firstFreq + secondFreq) / 2);
@@ -306,6 +455,17 @@ bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int mi
return true;
}
+// Tries the input as two words with a space omitted just before missingSpacePos: the first word
+// covers positions [0, missingSpacePos) and the second word covers the rest of the input.
+// Returns the result of getSplitTwoWordsSuggestion().
+bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
+    const int secondWordLength = inputLength - missingSpacePos;
+    return getSplitTwoWordsSuggestion(inputLength, 0, missingSpacePos, missingSpacePos,
+            secondWordLength);
+}
+
+// Tries the input as two words with the character at spaceProximityPos standing in for a space:
+// the first word covers positions [0, spaceProximityPos), that character is skipped, and the
+// second word covers everything after it. Returns the result of getSplitTwoWordsSuggestion().
+bool UnigramDictionary::getMistypedSpaceWords(const int inputLength, const int spaceProximityPos) {
+    const int secondWordStartPos = spaceProximityPos + 1;
+    const int secondWordLength = inputLength - spaceProximityPos - 1;
+    return getSplitTwoWordsSuggestion(inputLength, 0, spaceProximityPos, secondWordStartPos,
+            secondWordLength);
+}
+
// Keep this for comparing spec to new getWords
void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
const int excessivePos, const int transposedPos,int *nextLetters,
@@ -319,40 +479,52 @@ void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength,
}
void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
- const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
- const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
- int *nextLetters, const int nextLettersSize) {
+ const int maxDepth, const bool traverseAllNodes, const int matchWeight,
+ const int inputIndex, const int diffs, const int skipPos, const int excessivePos,
+ const int transposedPos, int *nextLetters, const int nextLettersSize) {
int siblingPos = pos;
for (int i = 0; i < childrenCount; ++i) {
int newCount;
int newChildPosition;
const int newDepth = depth + 1;
bool newTraverseAllNodes;
- int newSnr;
+ int newMatchRate;
int newInputIndex;
int newDiffs;
int newSiblingPos;
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
- traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, transposedPos,
+ traverseAllNodes, matchWeight, inputIndex, diffs,
+ skipPos, excessivePos, transposedPos,
nextLetters, nextLettersSize,
- &newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,
+ &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
&newInputIndex, &newDiffs, &newSiblingPos);
siblingPos = newSiblingPos;
if (needsToTraverseChildrenNodes) {
getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
- newSnr, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
+ newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
nextLetters, nextLettersSize);
}
}
}
+static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
+static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
+ return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
+}
inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
- const int snr, const int skipPos, const int excessivePos, const int transposedPos,
- const int freq, const bool sameLength) {
+ const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
+ const int freq, const bool sameLength) const {
// TODO: Demote by edit distance
- int finalFreq = freq * snr;
- if (skipPos >= 0) multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE, &finalFreq);
+ int finalFreq = freq * matchWeight;
+ if (skipPos >= 0) {
+ if (mInputLength >= 3) {
+ multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE *
+ (mInputLength - 2) / (mInputLength - 1), &finalFreq);
+ } else {
+ finalFreq = 0;
+ }
+ }
if (transposedPos >= 0) multiplyRate(
WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
if (excessivePos >= 0) {
@@ -363,24 +535,26 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
}
int lengthFreq = TYPED_LETTER_MULTIPLIER;
for (int i = 0; i < depth; ++i) lengthFreq *= TYPED_LETTER_MULTIPLIER;
- if (lengthFreq == snr) {
+ if (lengthFreq == matchWeight) {
if (depth > 1) {
- if (DEBUG_DICT) LOGI("Found full matched word.");
+ if (DEBUG_DICT) {
+ LOGI("Found full matched word.");
+ }
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
}
if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
- finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER;
+ finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
}
}
- if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
+ if (sameLength) finalFreq *= FULL_WORD_MULTIPLIER;
return finalFreq;
}
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
- unsigned short *word, const int inputIndex, const int depth, const int snr,
+ unsigned short *word, const int inputIndex, const int depth, const int matchWeight,
int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos,
const int transposedPos, const int freq) {
- const int finalFreq = calculateFinalFreq(inputIndex, depth, snr, skipPos, excessivePos,
+ const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos, excessivePos,
transposedPos, freq, false);
if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
if (depth >= mInputLength && skipPos < 0) {
@@ -389,10 +563,10 @@ inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLe
}
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
- unsigned short *word, const int inputIndex, const int depth, const int snr,
+ unsigned short *word, const int inputIndex, const int depth, const int matchWeight,
const int skipPos, const int excessivePos, const int transposedPos, const int freq) {
if (sameAsTyped(word, depth + 1)) return;
- const int finalFreq = calculateFinalFreq(inputIndex, depth, snr, skipPos,
+ const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,
excessivePos, transposedPos, freq, true);
// Proximity collection will promote a word of the same length as what user typed.
if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
@@ -400,18 +574,18 @@ inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
const int inputIndex, const int skipPos, const int depth) {
- const unsigned short userTypedChar = (mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS))[0];
+ const unsigned short userTypedChar = getInputCharsAt(inputIndex)[0];
// Skip the ' or other letter and continue deeper
return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
}
inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,
- const int inputLength) {
+ const int inputLength) const {
if (inputIndex < 0 || inputIndex >= inputLength) return false;
const int currentChar = *getInputCharsAt(inputIndex);
const int leftIndex = inputIndex - 1;
if (leftIndex >= 0) {
- int *leftChars = getInputCharsAt(leftIndex);
+ const int *leftChars = getInputCharsAt(leftIndex);
int i = 0;
while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
if (leftChars[i++] == currentChar) return true;
@@ -419,7 +593,7 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
}
const int rightIndex = inputIndex + 1;
if (rightIndex < inputLength) {
- int *rightChars = getInputCharsAt(rightIndex);
+ const int *rightChars = getInputCharsAt(rightIndex);
int i = 0;
while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
if (rightChars[i++] == currentChar) return true;
@@ -428,32 +602,54 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
return false;
}
+
+// In the following function, c is the current character of the dictionary word
+// currently examined.
+// currentChars is an array containing the keys close to the character the
+// user actually typed at the same position. We want to see if c is in it: if so,
+// then the word contains at that position a character close to what the user
+// typed.
+// What the user typed is actually the first character of the array.
+// Note: accented characters do not have a proximity list, so they are alone
+// in their list. The non-accented version of the character should be considered
+// "close", but not the other keys close to the non-accented version.
inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
const int *currentChars, const unsigned short c, const int skipPos,
const int excessivePos, const int transposedPos) {
- const unsigned short lowerC = toLowerCase(c);
- int j = 0;
+ const unsigned short baseLowerC = toBaseLowerCase(c);
+
+ // The first char in the array is what the user typed. If it matches right away,
+ // that means the user typed that same char for this position.
+ if (currentChars[0] == baseLowerC || currentChars[0] == c)
+ return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
+
+ // If any of skipPos/excessivePos/transposedPos is set, do not check close characters at all.
+ if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
+ return UNRELATED_CHAR;
+
+ // If the non-accented, lowercased version of that first character matches c,
+ // then we have a non-accented version of the accented character the user
+ // typed. Treat it as a close char.
+ if (toBaseLowerCase(currentChars[0]) == baseLowerC)
+ return NEAR_PROXIMITY_CHAR;
+
+ // Neither an exact nor an accent-alike match: search the list of close keys.
+ int j = 1;
while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
- const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
- // If skipPos is defined, not to search proximity collections.
- // First char is what user typed.
- if (matched) {
- if (j > 0) return NEAR_PROXIMITY_CHAR;
- return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
- } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
- // Not to check proximity characters
- return UNRELATED_CHAR;
- }
+ const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
+ if (matched) return NEAR_PROXIMITY_CHAR;
++j;
}
+
+ // c was not found in the list: signal this as an unrelated character.
return UNRELATED_CHAR;
}
inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
- const int maxDepth, const bool traverseAllNodes, int snr, int inputIndex,
+ const int maxDepth, const bool traverseAllNodes, int matchWeight, int inputIndex,
const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
- bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
+ bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
int *nextSiblingPosition) {
if (DEBUG_DICT) {
int inputCount = 0;
@@ -480,15 +676,16 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
mWord[depth] = c;
if (traverseAllNodes && terminal) {
onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, inputIndex, depth,
- snr, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos, freq);
+ matchWeight, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos,
+ freq);
}
if (!needsToTraverseChildrenNodes) return false;
*newTraverseAllNodes = traverseAllNodes;
- *newSnr = snr;
+ *newMatchRate = matchWeight;
*newDiffs = diffs;
*newInputIndex = inputIndex;
} else {
- int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
+ const int *currentChars = getInputCharsAt(inputIndex);
if (transposedPos >= 0) {
if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
@@ -502,18 +699,18 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
// If inputIndex is greater than mInputLength, that means there is no
// proximity chars. So, we don't need to check proximity.
if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
- snr = snr * TYPED_LETTER_MULTIPLIER;
+ matchWeight = matchWeight * TYPED_LETTER_MULTIPLIER;
}
bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
if (isSameAsUserTypedLength && terminal) {
- onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, inputIndex, depth, snr,
+ onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, inputIndex, depth, matchWeight,
skipPos, excessivePos, transposedPos, freq);
}
if (!needsToTraverseChildrenNodes) return false;
// Start traversing all nodes after the index exceeds the user typed length
*newTraverseAllNodes = isSameAsUserTypedLength;
- *newSnr = snr;
+ *newMatchRate = matchWeight;
*newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
*newInputIndex = inputIndex + 1;
}
@@ -591,20 +788,24 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh
const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
const int inputIndex = startInputIndex + depth;
- const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
+ const int *currentChars = getInputCharsAt(inputIndex);
unsigned short c;
*siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c,
newChildPosition, newTerminal, newFreq);
const unsigned int inputC = currentChars[0];
- if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX);
- const unsigned short lowerC = toLowerCase(c);
- const bool matched = (inputC == lowerC || inputC == c);
+ if (DEBUG_DICT) {
+ assert(inputC <= U_SHORT_MAX);
+ }
+ const unsigned short baseLowerC = toBaseLowerCase(c);
+ const bool matched = (inputC == baseLowerC || inputC == c);
const bool hasChild = *newChildPosition != 0;
if (matched) {
word[depth] = c;
if (DEBUG_DICT && DEBUG_NODE) {
LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
- if (*newTerminal) LOGI("Terminal %d", *newFreq);
+ if (*newTerminal) {
+ LOGI("Terminal %d", *newFreq);
+ }
}
if (hasChild) {
*newCount = Dictionary::getCount(DICT, newChildPosition);
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 90c98149b..3d3007ce0 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -18,6 +18,7 @@
#define LATINIME_UNIGRAM_DICTIONARY_H
#include "defines.h"
+#include "proximity_info.h"
namespace latinime {
@@ -32,12 +33,22 @@ class UnigramDictionary {
public:
UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier,
int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion);
- int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
- int *nextLetters, int nextLettersSize);
+ int getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
+ const int *ycoordinates, const int *codes, const int codesSize, const int flags,
+ unsigned short *outWords, int *frequencies);
~UnigramDictionary();
private:
- void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies);
+ void getWordSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
+ const int *ycoordinates, const int *codes, const int codesSize,
+ unsigned short *outWords, int *frequencies);
+ bool isDigraph(const int* codes, const int i, const int codesSize) const;
+ void getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
+ const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
+ const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
+ const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
+ void initSuggestions(const int *codes, const int codesSize, unsigned short *outWords,
+ int *frequencies);
void getSuggestionCandidates(const int skipPos, const int excessivePos,
const int transposedPos, int *nextLetters, const int nextLettersSize,
const int maxDepth);
@@ -48,19 +59,24 @@ private:
int wideStrLen(unsigned short *str);
bool sameAsTyped(unsigned short *word, int length);
bool addWord(unsigned short *word, int length, int frequency);
- unsigned short toLowerCase(unsigned short c);
+ unsigned short toBaseLowerCase(unsigned short c);
void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
const int nextLettersSize);
+ bool getSplitTwoWordsSuggestion(const int inputLength,
+ const int firstWordStartPos, const int firstWordLength,
+ const int secondWordStartPos, const int secondWordLength);
bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
+ bool getMistypedSpaceWords(const int inputLength, const int spaceProximityPos);
// Keep getWordsOld for comparing performance between getWords and getWordsOld
void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
const int excessivePos, const int transposedPos, int *nextLetters,
const int nextLettersSize);
void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos,
- const int excessivePos, const int transposedPos, const int freq, const bool sameLength);
+ const int excessivePos, const int transposedPos, const int freq,
+ const bool sameLength) const;
void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word,
const int inputIndex, const int depth, const int snr, int *nextLetters,
const int nextLettersSize, const int skipPos, const int excessivePos,
@@ -84,8 +100,10 @@ private:
bool processCurrentNodeForExactMatch(const int firstChildPos,
const int startInputIndex, const int depth, unsigned short *word,
int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
- bool existsAdjacentProximityChars(const int inputIndex, const int inputLength);
- int* getInputCharsAt(const int index) {return mInputCodes + (index * MAX_PROXIMITY_CHARS);}
+ bool existsAdjacentProximityChars(const int inputIndex, const int inputLength) const;
+ inline const int* getInputCharsAt(const int index) const {
+ return mInputCodes + (index * MAX_PROXIMITY_CHARS);
+ }
const unsigned char *DICT;
const int MAX_WORD_LENGTH;
const int MAX_WORDS;
@@ -94,10 +112,21 @@ private:
const int TYPED_LETTER_MULTIPLIER;
const int FULL_WORD_MULTIPLIER;
const int ROOT_POS;
+ const unsigned int BYTES_IN_ONE_CHAR;
+ const int MAX_UMLAUT_SEARCH_DEPTH;
+
+ // Flags for special processing
+ // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
+ // or something very bad (like, the apocalypse) will happen.
+ // Please update both at the same time.
+ enum {
+ REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1
+ };
+ static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];
int *mFrequencies;
unsigned short *mOutputChars;
- int *mInputCodes;
+ const int *mInputCodes;
int mInputLength;
// MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
@@ -109,6 +138,7 @@ private:
int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];
int mStackDiffs[MAX_WORD_LENGTH_INTERNAL];
int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];
+ int mNextLettersFrequency[NEXT_LETTERS_SIZE];
};
// ----------------------------------------------------------------------------