aboutsummaryrefslogtreecommitdiffstats
path: root/native/src
diff options
context:
space:
mode:
Diffstat (limited to 'native/src')
-rw-r--r--native/src/bigram_dictionary.cpp3
-rw-r--r--native/src/dictionary.cpp41
-rw-r--r--native/src/dictionary.h4
-rw-r--r--native/src/unigram_dictionary.cpp97
-rw-r--r--native/src/unigram_dictionary.h20
5 files changed, 97 insertions, 68 deletions
diff --git a/native/src/bigram_dictionary.cpp b/native/src/bigram_dictionary.cpp
index 36761b88d..11e6dc250 100644
--- a/native/src/bigram_dictionary.cpp
+++ b/native/src/bigram_dictionary.cpp
@@ -111,8 +111,7 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i
mMaxBigrams = maxBigrams;
if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
- int pos = mParentDictionary->isValidWordRec(
- DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
+ int pos = mParentDictionary->getBigramPosition(prevWord, prevWordLength);
if (DEBUG_DICT) {
LOGI("Pos -> %d", pos);
}
diff --git a/native/src/dictionary.cpp b/native/src/dictionary.cpp
index d9ef8f3b7..9e32ee80f 100644
--- a/native/src/dictionary.cpp
+++ b/native/src/dictionary.cpp
@@ -53,45 +53,16 @@ bool Dictionary::hasBigram() {
return ((mDict[1] & 0xFF) == 1);
}
-// TODO: use uint32_t instead of unsigned short
bool Dictionary::isValidWord(unsigned short *word, int length) {
+ return mUnigramDictionary->isValidWord(word, length);
+}
+
+int Dictionary::getBigramPosition(unsigned short *word, int length) {
if (IS_LATEST_DICT_VERSION) {
- return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
+ return mUnigramDictionary->getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length);
} else {
- return (isValidWordRec(0, word, 0, length) != NOT_VALID_WORD);
+ return mUnigramDictionary->getBigramPosition(0, word, 0, length);
}
}
-int Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
- // returns address of bigram data of that word
- // return -99 if not found
-
- int count = Dictionary::getCount(mDict, &pos);
- unsigned short currentChar = (unsigned short) word[offset];
- for (int j = 0; j < count; j++) {
- unsigned short c = Dictionary::getChar(mDict, &pos);
- int terminal = Dictionary::getTerminal(mDict, &pos);
- int childPos = Dictionary::getAddress(mDict, &pos);
- if (c == currentChar) {
- if (offset == length - 1) {
- if (terminal) {
- return (pos+1);
- }
- } else {
- if (childPos != 0) {
- int t = isValidWordRec(childPos, word, offset + 1, length);
- if (t > 0) {
- return t;
- }
- }
- }
- }
- if (terminal) {
- Dictionary::getFreq(mDict, IS_LATEST_DICT_VERSION, &pos);
- }
- // There could be two instances of each alphabet - upper and lower case. So continue
- // looking ...
- }
- return NOT_VALID_WORD;
-}
} // namespace latinime
diff --git a/native/src/dictionary.h b/native/src/dictionary.h
index 13b2a2816..1b41f69dd 100644
--- a/native/src/dictionary.h
+++ b/native/src/dictionary.h
@@ -43,7 +43,6 @@ public:
}
bool isValidWord(unsigned short *word, int length);
- int isValidWordRec(int pos, unsigned short *word, int offset, int length);
void *getDict() { return (void *)mDict; }
int getDictSize() { return mDictSize; }
int getMmapFd() { return mMmapFd; }
@@ -63,6 +62,9 @@ public:
const int pos, unsigned short *c, int *childrenPosition,
bool *terminal, int *freq);
+ // TODO: delete this
+ int getBigramPosition(unsigned short *word, int length);
+
private:
bool hasBigram();
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index e49e95b81..aa159b533 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -16,8 +16,6 @@
*/
#include <assert.h>
-#include <fcntl.h>
-#include <stdio.h>
#include <string.h>
#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
@@ -34,10 +32,12 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
{ 'o', 'e' },
{ 'u', 'e' } };
-UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
+// TODO: check the header
+UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
const bool isLatestDictVersion)
- : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
+ : DICT_ROOT(streamStart),
+ MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
@@ -265,8 +265,7 @@ void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,
mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
}
-void UnigramDictionary::registerNextLetter(
- unsigned short c, int *nextLetters, int nextLettersSize) {
+static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
if (c < nextLettersSize) {
nextLetters[c]++;
}
@@ -290,8 +289,8 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
// Find the right insertion point
int insertAt = 0;
while (insertAt < MAX_WORDS) {
- if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency
- && length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {
+ // TODO: How should we sort words with the same frequency?
+ if (frequency > mFrequencies[insertAt]) {
break;
}
insertAt++;
@@ -322,7 +321,7 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
return false;
}
-unsigned short UnigramDictionary::toBaseLowerCase(unsigned short c) {
+static inline unsigned short toBaseLowerCase(unsigned short c) {
if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
c = BASE_CHARS[c];
}
@@ -363,7 +362,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
}
int rootPosition = ROOT_POS;
// Get the number of child of root, then increment the position
- int childCount = Dictionary::getCount(DICT, &rootPosition);
+ int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
int depth = 0;
mStackChildCount[0] = childCount;
@@ -372,6 +371,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
mStackInputIndex[0] = 0;
mStackDiffs[0] = 0;
mStackSiblingPos[0] = rootPosition;
+ mStackOutputIndex[0] = 0;
// Depth first search
while (depth >= 0) {
@@ -382,14 +382,15 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
int inputIndex = mStackInputIndex[depth];
int diffs = mStackDiffs[depth];
int siblingPos = mStackSiblingPos[depth];
+ int outputIndex = mStackOutputIndex[depth];
int firstChildPos;
// depth will never be greater than maxDepth because in that case,
// needsToTraverseChildrenNodes should be false
- const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,
+ const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, outputIndex,
maxDepth, traverseAllNodes, matchWeight, inputIndex, diffs, skipPos,
excessivePos, transposedPos, nextLetters, nextLettersSize, &childCount,
&firstChildPos, &traverseAllNodes, &matchWeight, &inputIndex, &diffs,
- &siblingPos);
+ &siblingPos, &outputIndex);
// Update next sibling pos
mStackSiblingPos[depth] = siblingPos;
if (needsToTraverseChildrenNodes) {
@@ -401,6 +402,7 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
mStackInputIndex[depth] = inputIndex;
mStackDiffs[depth] = diffs;
mStackSiblingPos[depth] = firstChildPos;
+ mStackOutputIndex[depth] = outputIndex;
}
} else {
// Goes to parent sibling node
@@ -562,7 +564,7 @@ void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength,
const int excessivePos, const int transposedPos,int *nextLetters,
const int nextLettersSize) {
int initialPosition = initialPos;
- const int count = Dictionary::getCount(DICT, &initialPosition);
+ const int count = Dictionary::getCount(DICT_ROOT, &initialPosition);
getWordsRec(count, initialPosition, 0,
min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
@@ -577,22 +579,22 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
for (int i = 0; i < childrenCount; ++i) {
int newCount;
int newChildPosition;
- const int newDepth = depth + 1;
bool newTraverseAllNodes;
int newMatchRate;
int newInputIndex;
int newDiffs;
int newSiblingPos;
+ int newOutputIndex;
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
traverseAllNodes, matchWeight, inputIndex, diffs,
skipPos, excessivePos, transposedPos,
nextLetters, nextLettersSize,
&newCount, &newChildPosition, &newTraverseAllNodes, &newMatchRate,
- &newInputIndex, &newDiffs, &newSiblingPos);
+ &newInputIndex, &newDiffs, &newSiblingPos, &newOutputIndex);
siblingPos = newSiblingPos;
if (needsToTraverseChildrenNodes) {
- getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
+ getWordsRec(newCount, newChildPosition, newOutputIndex, maxDepth, newTraverseAllNodes,
newMatchRate, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
nextLetters, nextLettersSize);
}
@@ -754,7 +756,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
- int *nextSiblingPosition) {
+ int *nextSiblingPosition, int *nextOutputIndex) {
if (DEBUG_DICT) {
int inputCount = 0;
if (skipPos >= 0) ++inputCount;
@@ -770,8 +772,9 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
- *nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,
- &childPosition, &terminal, &freq);
+ *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
+ &c, &childPosition, &terminal, &freq);
+ *nextOutputIndex = depth + 1;
const bool needsToTraverseChildrenNodes = childPosition != 0;
@@ -829,7 +832,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
*newTraverseAllNodes = true;
}
// get the count of nodes and increment childAddress.
- *newCount = Dictionary::getCount(DICT, &childPosition);
+ *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
*newChildPosition = childPosition;
if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
return needsToTraverseChildrenNodes;
@@ -838,7 +841,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,
unsigned short *word) {
int pos = ROOT_POS;
- int count = Dictionary::getCount(DICT, &pos);
+ int count = Dictionary::getCount(DICT_ROOT, &pos);
int maxFreq = 0;
int depth = 0;
unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
@@ -894,8 +897,8 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh
const int inputIndex = startInputIndex + depth;
const int *currentChars = getInputCharsAt(inputIndex);
unsigned short c;
- *siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c,
- newChildPosition, newTerminal, newFreq);
+ *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
+ &c, newChildPosition, newTerminal, newFreq);
const unsigned int inputC = currentChars[0];
if (DEBUG_DICT) {
assert(inputC <= U_SHORT_MAX);
@@ -912,7 +915,7 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh
}
}
if (hasChild) {
- *newCount = Dictionary::getCount(DICT, newChildPosition);
+ *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
return true;
} else {
return false;
@@ -924,4 +927,50 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh
return false;
}
}
+
+// TODO: use uint32_t instead of unsigned short
+bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
+ if (IS_LATEST_DICT_VERSION) {
+ return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
+ } else {
+ return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
+ }
+}
+
+
+// Require strict exact match.
+int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
+ int length) const {
+ // returns address of bigram data of that word
+ // return -99 if not found
+
+ int count = Dictionary::getCount(DICT_ROOT, &pos);
+ unsigned short currentChar = (unsigned short) word[offset];
+ for (int j = 0; j < count; j++) {
+ unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
+ int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
+ int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
+ if (c == currentChar) {
+ if (offset == length - 1) {
+ if (terminal) {
+ return (pos+1);
+ }
+ } else {
+ if (childPos != 0) {
+ int t = getBigramPosition(childPos, word, offset + 1, length);
+ if (t > 0) {
+ return t;
+ }
+ }
+ }
+ }
+ if (terminal) {
+ Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
+ }
+ // There could be two instances of each alphabet - upper and lower case. So continue
+ // looking ...
+ }
+ return NOT_VALID_WORD;
+}
+
} // namespace latinime
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index dd1b89042..c47db1ad2 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -17,9 +17,14 @@
#ifndef LATINIME_UNIGRAM_DICTIONARY_H
#define LATINIME_UNIGRAM_DICTIONARY_H
+#include <stdint.h>
#include "defines.h"
#include "proximity_info.h"
+#ifndef NULL
+#define NULL 0
+#endif
+
namespace latinime {
class UnigramDictionary {
@@ -31,8 +36,11 @@ class UnigramDictionary {
} ProximityType;
public:
- UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier,
- int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion);
+ UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
+ int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
+ const bool isLatestDictVersion);
+ bool isValidWord(unsigned short *word, int length);
+ int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
int getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
const int *ycoordinates, const int *codes, const int codesSize, const int flags,
unsigned short *outWords, int *frequencies);
@@ -59,7 +67,6 @@ private:
int wideStrLen(unsigned short *str);
bool sameAsTyped(unsigned short *word, int length);
bool addWord(unsigned short *word, int length, int frequency);
- unsigned short toBaseLowerCase(unsigned short c);
void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
@@ -73,7 +80,6 @@ private:
void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
const int excessivePos, const int transposedPos, int *nextLetters,
const int nextLettersSize);
- void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos,
const int excessivePos, const int transposedPos, const int freq,
const bool sameLength) const;
@@ -94,7 +100,7 @@ private:
const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
- int *nextSiblingPosition);
+ int *nextSiblingPosition, int *nextOutputIndex);
int getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word);
// Process a node by considering missing space
bool processCurrentNodeForExactMatch(const int firstChildPos,
@@ -104,7 +110,8 @@ private:
inline const int* getInputCharsAt(const int index) const {
return mInputCodes + (index * MAX_PROXIMITY_CHARS);
}
- const unsigned char *DICT;
+
+ const uint8_t* const DICT_ROOT;
const int MAX_WORD_LENGTH;
const int MAX_WORDS;
const int MAX_PROXIMITY_CHARS;
@@ -138,6 +145,7 @@ private:
int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];
int mStackDiffs[MAX_WORD_LENGTH_INTERNAL];
int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];
+ int mStackOutputIndex[MAX_WORD_LENGTH_INTERNAL];
int mNextLettersFrequency[NEXT_LETTERS_SIZE];
};