aboutsummaryrefslogtreecommitdiffstats
path: root/native/src
diff options
context:
space:
mode:
Diffstat (limited to 'native/src')
-rw-r--r--native/src/debug.h58
-rw-r--r--native/src/defines.h9
-rw-r--r--native/src/dictionary.h6
-rw-r--r--native/src/unigram_dictionary.cpp125
-rw-r--r--native/src/unigram_dictionary.h6
5 files changed, 146 insertions, 58 deletions
diff --git a/native/src/debug.h b/native/src/debug.h
new file mode 100644
index 000000000..e5572e1a5
--- /dev/null
+++ b/native/src/debug.h
@@ -0,0 +1,58 @@
+/*
+**
+** Copyright 2011, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+** http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifndef LATINIME_DEBUG_H
+#define LATINIME_DEBUG_H
+
+#include "defines.h"
+
+static inline unsigned char* convertToUnibyteString(unsigned short* input, unsigned char* output,
+ const unsigned int length) {
+ int i = 0;
+ for (; i <= length && input[i] != 0; ++i)
+ output[i] = input[i] & 0xFF;
+ output[i] = 0;
+ return output;
+}
+static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned short* input,
+ unsigned char* output, const unsigned int length, unsigned char c) {
+ int i = 0;
+ for (; i <= length && input[i] != 0; ++i)
+ output[i] = input[i] & 0xFF;
+ output[i-1] = c;
+ output[i] = 0;
+ return output;
+}
+static inline void LOGI_S16(unsigned short* string, const unsigned int length) {
+ unsigned char tmp_buffer[length];
+ convertToUnibyteString(string, tmp_buffer, length);
+ LOGI(">> %s", tmp_buffer);
+ // The log facility is throwing out log that comes too fast. The following
+ // is a dirty way of slowing down processing so that we can see all log.
+ // TODO : refactor this in a blocking log or something.
+ // usleep(10);
+}
+static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int length,
+ unsigned char c) {
+ unsigned char tmp_buffer[length+1];
+ convertToUnibyteStringAndReplaceLastChar(string, tmp_buffer, length, c);
+ LOGI(">> %s", tmp_buffer);
+ // Likewise
+ // usleep(10);
+}
+
+#endif // LATINIME_DEBUG_H
diff --git a/native/src/defines.h b/native/src/defines.h
index c1eaf0df2..918028afe 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -100,6 +100,9 @@ static void prof_out(void) {
#ifndef U_SHORT_MAX
#define U_SHORT_MAX 1 << 16
#endif
+#ifndef S_INT_MAX
+#define S_INT_MAX ((1 << 31) - 1)
+#endif
// Define this to use mmap() for dictionary loading. Undefine to use malloc() instead of mmap().
// We measured and compared performance of both, and found mmap() is fairly good in terms of
@@ -137,9 +140,6 @@ static void prof_out(void) {
#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
#define FULL_MATCHED_WORDS_PROMOTION_RATE 120
-// This is used as a bare multiplier (not subject to /100)
-#define FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER 2
-
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions.
#define MAX_WORD_LENGTH_INTERNAL 48
@@ -151,6 +151,9 @@ static void prof_out(void) {
#define MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION 3
#define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3
+// The size of next letters frequency array. Zero will disable the feature.
+#define NEXT_LETTERS_SIZE 0
+
#define min(a,b) ((a)<(b)?(a):(b))
#endif // LATINIME_DEFINES_H
diff --git a/native/src/dictionary.h b/native/src/dictionary.h
index cef1cf9eb..941bd191a 100644
--- a/native/src/dictionary.h
+++ b/native/src/dictionary.h
@@ -27,10 +27,8 @@ class Dictionary {
public:
Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler,
int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives);
- int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
- int *nextLetters, int nextLettersSize) {
- return mUnigramDictionary->getSuggestions(codes, codesSize, outWords, frequencies,
- nextLetters, nextLettersSize);
+ int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies) {
+ return mUnigramDictionary->getSuggestions(codes, codesSize, outWords, frequencies);
}
// TODO: Call mBigramDictionary instead of mUnigramDictionary
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index dfbe8228e..0ea650629 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -32,7 +32,7 @@ namespace latinime {
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
const bool isLatestDictVersion)
- : DICT(dict), MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords),
+ : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
@@ -42,7 +42,7 @@ UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterM
UnigramDictionary::~UnigramDictionary() {}
int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,
- int *frequencies, int *nextLetters, int nextLettersSize) {
+ int *frequencies) {
PROF_OPEN;
PROF_START(0);
initSuggestions(codes, codesSize, outWords, frequencies);
@@ -52,7 +52,7 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
PROF_END(0);
PROF_START(1);
- getSuggestionCandidates(-1, -1, -1, nextLetters, nextLettersSize, MAX_DEPTH);
+ getSuggestionCandidates(-1, -1, -1, mNextLettersFrequency, NEXT_LETTERS_SIZE, MAX_DEPTH);
PROF_END(1);
PROF_START(2);
@@ -108,9 +108,9 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
if (DEBUG_DICT) {
LOGI("Returning %d words", suggestedWordsCount);
LOGI("Next letters: ");
- for (int k = 0; k < nextLettersSize; k++) {
- if (nextLetters[k] > 0) {
- LOGI("%c = %d,", k, nextLetters[k]);
+ for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
+ if (mNextLettersFrequency[k] > 0) {
+ LOGI("%c = %d,", k, mNextLettersFrequency[k]);
}
}
}
@@ -182,7 +182,7 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
return false;
}
-unsigned short UnigramDictionary::toLowerCase(unsigned short c) {
+unsigned short UnigramDictionary::toBaseLowerCase(unsigned short c) {
if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
c = BASE_CHARS[c];
}
@@ -238,7 +238,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
if (mStackChildCount[depth] > 0) {
--mStackChildCount[depth];
bool traverseAllNodes = mStackTraverseAll[depth];
- int snr = mStackNodeFreq[depth];
+ int matchWeight = mStackNodeFreq[depth];
int inputIndex = mStackInputIndex[depth];
int diffs = mStackDiffs[depth];
int siblingPos = mStackSiblingPos[depth];
@@ -246,9 +246,10 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
// depth will never be greater than maxDepth because in that case,
// needsToTraverseChildrenNodes should be false
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,
- maxDepth, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
- transposedPos, nextLetters, nextLettersSize, &childCount, &firstChildPos,
- &traverseAllNodes, &snr, &inputIndex, &diffs, &siblingPos);
+ maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
+ excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
+ &firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
+ &siblingPos);
// Update next sibling pos
mStackSiblingPos[depth] = siblingPos;
if (needsToTraverseChildrenNodes) {
@@ -256,7 +257,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
++depth;
mStackChildCount[depth] = childCount;
mStackTraverseAll[depth] = traverseAllNodes;
- mStackNodeFreq[depth] = snr;
+ mStackNodeFreq[depth] = matchWeight;
mStackInputIndex[depth] = inputIndex;
mStackDiffs[depth] = diffs;
mStackSiblingPos[depth] = firstChildPos;
@@ -319,39 +320,44 @@ void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength,
}
void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
- const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
- const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
- int *nextLetters, const int nextLettersSize) {
+ const int maxDepth, const bool traverseAllNodes, const int matchWeight,
+ const int inputIndex, const int diffs, const int skipPos, const int excessivePos,
+ const int transposedPos, int *nextLetters, const int nextLettersSize) {
int siblingPos = pos;
for (int i = 0; i < childrenCount; ++i) {
int newCount;
int newChildPosition;
const int newDepth = depth + 1;
bool newTraverseAllNodes;
- int newSnr;
+ int newMatchRate;
int newInputIndex;
int newDiffs;
int newSiblingPos;
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
- traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, transposedPos,
+ traverseAllNodes, matchWeight, inputIndex, diffs,
+ skipPos, excessivePos, transposedPos,
nextLetters, nextLettersSize,
- &newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,
+ &newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
&newInputIndex, &newDiffs, &newSiblingPos);
siblingPos = newSiblingPos;
if (needsToTraverseChildrenNodes) {
getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
- newSnr, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
+ newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
nextLetters, nextLettersSize);
}
}
}
+static const int TWO_31ST_DIV_255 = ((1 << 31) - 1) / 255;
+static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
+ return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
+}
inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
- const int snr, const int skipPos, const int excessivePos, const int transposedPos,
+ const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
const int freq, const bool sameLength) {
// TODO: Demote by edit distance
- int finalFreq = freq * snr;
+ int finalFreq = freq * matchWeight;
if (skipPos >= 0) multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE, &finalFreq);
if (transposedPos >= 0) multiplyRate(
WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
@@ -363,13 +369,13 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
}
int lengthFreq = TYPED_LETTER_MULTIPLIER;
for (int i = 0; i < depth; ++i) lengthFreq *= TYPED_LETTER_MULTIPLIER;
- if (lengthFreq == snr) {
+ if (lengthFreq == matchWeight) {
if (depth > 1) {
if (DEBUG_DICT) LOGI("Found full matched word.");
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
}
if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
- finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER;
+ finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
}
}
if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
@@ -377,10 +383,10 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
}
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
- unsigned short *word, const int inputIndex, const int depth, const int snr,
+ unsigned short *word, const int inputIndex, const int depth, const int matchWeight,
int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos,
const int transposedPos, const int freq) {
- const int finalFreq = calculateFinalFreq(inputIndex, depth, snr, skipPos, excessivePos,
+ const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos, excessivePos,
transposedPos, freq, false);
if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
if (depth >= mInputLength && skipPos < 0) {
@@ -389,10 +395,10 @@ inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLe
}
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
- unsigned short *word, const int inputIndex, const int depth, const int snr,
+ unsigned short *word, const int inputIndex, const int depth, const int matchWeight,
const int skipPos, const int excessivePos, const int transposedPos, const int freq) {
if (sameAsTyped(word, depth + 1)) return;
- const int finalFreq = calculateFinalFreq(inputIndex, depth, snr, skipPos,
+ const int finalFreq = calculateFinalFreq(inputIndex, depth, matchWeight, skipPos,
excessivePos, transposedPos, freq, true);
// Proximity collection will promote a word of the same length as what user typed.
if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
@@ -428,32 +434,54 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
return false;
}
+
+// In the following function, c is the current character of the dictionary word
+// currently examined.
+// currentChars is an array containing the keys close to the character the
+// user actually typed at the same position. We want to see if c is in it: if so,
+// then the word contains at that position a character close to what the user
+// typed.
+// What the user typed is actually the first character of the array.
+// Notice : accented characters do not have a proximity list, so they are alone
+// in their list. The non-accented version of the character should be considered
+// "close", but not the other keys close to the non-accented version.
inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
const int *currentChars, const unsigned short c, const int skipPos,
const int excessivePos, const int transposedPos) {
- const unsigned short lowerC = toLowerCase(c);
- int j = 0;
+ const unsigned short baseLowerC = toBaseLowerCase(c);
+
+ // The first char in the array is what user typed. If it matches right away,
+ // that means the user typed that same char for this pos.
+ if (currentChars[0] == baseLowerC || currentChars[0] == c)
+ return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
+
+ // If one of those is true, we should not check for close characters at all.
+ if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
+ return UNRELATED_CHAR;
+
+ // If the non-accented, lowercased version of that first character matches c,
+ // then we have a non-accented version of the accented character the user
+ // typed. Treat it as a close char.
+ if (toBaseLowerCase(currentChars[0]) == baseLowerC)
+ return NEAR_PROXIMITY_CHAR;
+
+ // Not an exact nor an accent-alike match: search the list of close keys
+ int j = 1;
while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
- const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
- // If skipPos is defined, not to search proximity collections.
- // First char is what user typed.
- if (matched) {
- if (j > 0) return NEAR_PROXIMITY_CHAR;
- return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
- } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
- // Not to check proximity characters
- return UNRELATED_CHAR;
- }
+ const bool matched = (currentChars[j] == baseLowerC || currentChars[j] == c);
+ if (matched) return NEAR_PROXIMITY_CHAR;
++j;
}
+
+ // Was not included, signal this as an unrelated character.
return UNRELATED_CHAR;
}
inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
- const int maxDepth, const bool traverseAllNodes, int snr, int inputIndex,
+ const int maxDepth, const bool traverseAllNodes, int matchWeight, int inputIndex,
const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
- bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
+ bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
int *nextSiblingPosition) {
if (DEBUG_DICT) {
int inputCount = 0;
@@ -480,11 +508,12 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
mWord[depth] = c;
if (traverseAllNodes && terminal) {
onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, inputIndex, depth,
- snr, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos, freq);
+ matchWeight, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos,
+ freq);
}
if (!needsToTraverseChildrenNodes) return false;
*newTraverseAllNodes = traverseAllNodes;
- *newSnr = snr;
+ *newMatchRate = matchWeight;
*newDiffs = diffs;
*newInputIndex = inputIndex;
} else {
@@ -502,18 +531,18 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
// If inputIndex is greater than mInputLength, that means there is no
// proximity chars. So, we don't need to check proximity.
if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
- snr = snr * TYPED_LETTER_MULTIPLIER;
+ matchWeight = matchWeight * TYPED_LETTER_MULTIPLIER;
}
bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
|| (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
if (isSameAsUserTypedLength && terminal) {
- onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, inputIndex, depth, snr,
+ onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, inputIndex, depth, matchWeight,
skipPos, excessivePos, transposedPos, freq);
}
if (!needsToTraverseChildrenNodes) return false;
// Start traversing all nodes after the index exceeds the user typed length
*newTraverseAllNodes = isSameAsUserTypedLength;
- *newSnr = snr;
+ *newMatchRate = matchWeight;
*newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
*newInputIndex = inputIndex + 1;
}
@@ -597,8 +626,8 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh
newChildPosition, newTerminal, newFreq);
const unsigned int inputC = currentChars[0];
if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX);
- const unsigned short lowerC = toLowerCase(c);
- const bool matched = (inputC == lowerC || inputC == c);
+ const unsigned short baseLowerC = toBaseLowerCase(c);
+ const bool matched = (inputC == baseLowerC || inputC == c);
const bool hasChild = *newChildPosition != 0;
if (matched) {
word[depth] = c;
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 90c98149b..db40646e1 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -32,8 +32,7 @@ class UnigramDictionary {
public:
UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier,
int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion);
- int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
- int *nextLetters, int nextLettersSize);
+ int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies);
~UnigramDictionary();
private:
@@ -48,7 +47,7 @@ private:
int wideStrLen(unsigned short *str);
bool sameAsTyped(unsigned short *word, int length);
bool addWord(unsigned short *word, int length, int frequency);
- unsigned short toLowerCase(unsigned short c);
+ unsigned short toBaseLowerCase(unsigned short c);
void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
@@ -109,6 +108,7 @@ private:
int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];
int mStackDiffs[MAX_WORD_LENGTH_INTERNAL];
int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];
+ int mNextLettersFrequency[NEXT_LETTERS_SIZE];
};
// ----------------------------------------------------------------------------