aboutsummaryrefslogtreecommitdiffstats
path: root/native/src/dictionary.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'native/src/dictionary.cpp')
-rw-r--r--native/src/dictionary.cpp317
1 files changed, 298 insertions, 19 deletions
diff --git a/native/src/dictionary.cpp b/native/src/dictionary.cpp
index 6e6f44182..1a39f585b4 100644
--- a/native/src/dictionary.cpp
+++ b/native/src/dictionary.cpp
@@ -19,21 +19,18 @@
#include <fcntl.h>
#include <sys/mman.h>
#include <string.h>
-#include <cutils/log.h>
-
-#include <unicode/uchar.h>
-
-//#define USE_ASSET_MANAGER
-
-#ifdef USE_ASSET_MANAGER
-#include <utils/AssetManager.h>
-#include <utils/Asset.h>
-#endif
+//#define LOG_TAG "dictionary.cpp"
+//#include <cutils/log.h>
+#define LOGI
#include "dictionary.h"
#include "basechars.h"
+#include "char_utils.h"
#define DEBUG_DICT 0
+#define DICTIONARY_VERSION_MIN 200
+#define DICTIONARY_HEADER_SIZE 2
+#define NOT_VALID_WORD -99
namespace latinime {
@@ -42,6 +39,7 @@ Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultip
mDict = (unsigned char*) dict;
mTypedLetterMultiplier = typedLetterMultiplier;
mFullWordMultiplier = fullWordMultiplier;
+ getVersionNumber();
}
Dictionary::~Dictionary()
@@ -65,7 +63,11 @@ int Dictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWor
mNextLettersFrequencies = nextLetters;
mNextLettersSize = nextLettersSize;
- getWordsRec(0, 0, mInputLength * 3, false, 1, 0, 0);
+ if (checkIfDictVersionIsLatest()) {
+ getWordsRec(DICTIONARY_HEADER_SIZE, 0, mInputLength * 3, false, 1, 0, 0);
+ } else {
+ getWordsRec(0, 0, mInputLength * 3, false, 1, 0, 0);
+ }
// Get the word count
suggWords = 0;
@@ -92,6 +94,21 @@ Dictionary::registerNextLetter(unsigned short c)
}
}
+void
+Dictionary::getVersionNumber()
+{
+ mVersion = (mDict[0] & 0xFF);
+ mBigram = (mDict[1] & 0xFF);
+ LOGI("IN NATIVE SUGGEST Version: %d Bigram : %d \n", mVersion, mBigram);
+}
+
+// Checks whether it has the latest dictionary or the old dictionary
+bool
+Dictionary::checkIfDictVersionIsLatest()
+{
+ return (mVersion >= DICTIONARY_VERSION_MIN) && (mBigram == 1 || mBigram == 0);
+}
+
unsigned short
Dictionary::getChar(int *pos)
{
@@ -120,6 +137,28 @@ Dictionary::getAddress(int *pos)
}
int
+Dictionary::getFreq(int *pos)
+{
+ int freq = mDict[(*pos)++] & 0xFF;
+
+ if (checkIfDictVersionIsLatest()) {
+ // skipping bigram
+ int bigramExist = (mDict[*pos] & FLAG_BIGRAM_READ);
+ if (bigramExist > 0) {
+ int nextBigramExist = 1;
+ while (nextBigramExist > 0) {
+ (*pos) += 3;
+ nextBigramExist = (mDict[(*pos)++] & FLAG_BIGRAM_CONTINUED);
+ }
+ } else {
+ (*pos)++;
+ }
+ }
+
+ return freq;
+}
+
+int
Dictionary::wideStrLen(unsigned short *str)
{
if (!str) return 0;
@@ -168,6 +207,46 @@ Dictionary::addWord(unsigned short *word, int length, int frequency)
return false;
}
+bool
+Dictionary::addWordBigram(unsigned short *word, int length, int frequency)
+{
+ word[length] = 0;
+ if (DEBUG_DICT) {
+ char s[length + 1];
+ for (int i = 0; i <= length; i++) s[i] = word[i];
+ LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
+ }
+
+ // Find the right insertion point
+ int insertAt = 0;
+ while (insertAt < mMaxBigrams) {
+ if (frequency > mBigramFreq[insertAt]
+ || (mBigramFreq[insertAt] == frequency
+ && length < wideStrLen(mBigramChars + insertAt * mMaxWordLength))) {
+ break;
+ }
+ insertAt++;
+ }
+ LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
+ if (insertAt < mMaxBigrams) {
+ memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
+ (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
+ (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
+ mBigramFreq[insertAt] = frequency;
+ memmove((char*) mBigramChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
+ (char*) mBigramChars + (insertAt ) * mMaxWordLength * sizeof(short),
+ (mMaxBigrams - insertAt - 1) * sizeof(short) * mMaxWordLength);
+ unsigned short *dest = mBigramChars + (insertAt ) * mMaxWordLength;
+ while (length--) {
+ *dest++ = *word++;
+ }
+ *dest = 0; // NULL terminate
+ if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
+ return true;
+ }
+ return false;
+}
+
unsigned short
Dictionary::toLowerCase(unsigned short c) {
if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
@@ -176,7 +255,7 @@ Dictionary::toLowerCase(unsigned short c) {
if (c >='A' && c <= 'Z') {
c |= 32;
} else if (c > 127) {
- c = u_tolower(c);
+ c = latin_tolower(c);
}
return c;
}
@@ -220,12 +299,17 @@ Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int s
}
for (int i = 0; i < count; i++) {
+ // -- at char
unsigned short c = getChar(&pos);
+ // -- at flag/add
unsigned short lowerC = toLowerCase(c);
bool terminal = getTerminal(&pos);
int childrenAddress = getAddress(&pos);
+ // -- after address or flag
int freq = 1;
if (terminal) freq = getFreq(&pos);
+ // -- after add or freq
+
// If we are only doing completions, no need to look at the typed characters.
if (completion) {
mWord[depth] = c;
@@ -239,7 +323,7 @@ Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int s
getWordsRec(childrenAddress, depth + 1, maxDepth,
completion, snr, inputIndex, diffs);
}
- } else if (c == QUOTE && currentChars[0] != QUOTE || mSkipPos == depth) {
+ } else if ((c == QUOTE && currentChars[0] != QUOTE) || mSkipPos == depth) {
// Skip the ' or other letter and continue deeper
mWord[depth] = c;
if (childrenAddress != 0) {
@@ -277,14 +361,208 @@ Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int s
}
}
+int
+Dictionary::getBigramAddress(int *pos, bool advance)
+{
+ int address = 0;
+
+ address += (mDict[*pos] & 0x3F) << 16;
+ address += (mDict[*pos + 1] & 0xFF) << 8;
+ address += (mDict[*pos + 2] & 0xFF);
+
+ if (advance) {
+ *pos += 3;
+ }
+
+ return address;
+}
+
+int
+Dictionary::getBigramFreq(int *pos)
+{
+ int freq = mDict[(*pos)++] & FLAG_BIGRAM_FREQ;
+
+ return freq;
+}
+
+
+int
+Dictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, int codesSize,
+ unsigned short *bigramChars, int *bigramFreq, int maxWordLength, int maxBigrams,
+ int maxAlternatives)
+{
+ mBigramFreq = bigramFreq;
+ mBigramChars = bigramChars;
+ mInputCodes = codes;
+ mInputLength = codesSize;
+ mMaxWordLength = maxWordLength;
+ mMaxBigrams = maxBigrams;
+ mMaxAlternatives = maxAlternatives;
+
+ if (mBigram == 1 && checkIfDictVersionIsLatest()) {
+ int pos = isValidWordRec(DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
+ LOGI("Pos -> %d\n", pos);
+ if (pos < 0) {
+ return 0;
+ }
+
+ int bigramCount = 0;
+ int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
+ if (bigramExist > 0) {
+ int nextBigramExist = 1;
+ while (nextBigramExist > 0 && bigramCount < maxBigrams) {
+ int bigramAddress = getBigramAddress(&pos, true);
+ int frequency = (FLAG_BIGRAM_FREQ & mDict[pos]);
+ // search for all bigrams and store them
+ searchForTerminalNode(bigramAddress, frequency);
+ nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
+ bigramCount++;
+ }
+ }
+
+ return bigramCount;
+ }
+ return 0;
+}
+
+void
+Dictionary::searchForTerminalNode(int addressLookingFor, int frequency)
+{
+ // track word with such address and store it in an array
+ unsigned short word[mMaxWordLength];
+
+ int pos;
+ int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
+ bool found = false;
+ char followingChar = ' ';
+ int depth = -1;
+
+ while(!found) {
+ bool followDownAddressSearchStop = false;
+ bool firstAddress = true;
+ bool haveToSearchAll = true;
+
+ if (depth >= 0) {
+ word[depth] = (unsigned short) followingChar;
+ }
+ pos = followDownBranchAddress; // pos start at count
+ int count = mDict[pos] & 0xFF;
+ LOGI("count - %d\n",count);
+ pos++;
+ for (int i = 0; i < count; i++) {
+ // pos at data
+ pos++;
+ // pos now at flag
+ if (!getFirstBitOfByte(&pos)) { // non-terminal
+ if (!followDownAddressSearchStop) {
+ int addr = getBigramAddress(&pos, false);
+ if (addr > addressLookingFor) {
+ followDownAddressSearchStop = true;
+ if (firstAddress) {
+ firstAddress = false;
+ haveToSearchAll = true;
+ } else if (!haveToSearchAll) {
+ break;
+ }
+ } else {
+ followDownBranchAddress = addr;
+ followingChar = (char)(0xFF & mDict[pos-1]);
+ if (firstAddress) {
+ firstAddress = false;
+ haveToSearchAll = false;
+ }
+ }
+ }
+ pos += 3;
+ } else if (getFirstBitOfByte(&pos)) { // terminal
+ if (addressLookingFor == (pos-1)) { // found !!
+ depth++;
+ word[depth] = (0xFF & mDict[pos-1]);
+ found = true;
+ break;
+ }
+ if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
+ if (!followDownAddressSearchStop) {
+ int addr = getBigramAddress(&pos, false);
+ if (addr > addressLookingFor) {
+ followDownAddressSearchStop = true;
+ if (firstAddress) {
+ firstAddress = false;
+ haveToSearchAll = true;
+ } else if (!haveToSearchAll) {
+ break;
+ }
+ } else {
+ followDownBranchAddress = addr;
+ followingChar = (char)(0xFF & mDict[pos-1]);
+ if (firstAddress) {
+ firstAddress = false;
+ haveToSearchAll = true;
+ }
+ }
+ }
+ pos += 4;
+ } else { // freq only (2 byte)
+ pos += 2;
+ }
+
+ // skipping bigram
+ int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
+ if (bigramExist > 0) {
+ int nextBigramExist = 1;
+ while (nextBigramExist > 0) {
+ pos += 3;
+ nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
+ }
+ } else {
+ pos++;
+ }
+ }
+ }
+ depth++;
+ if (followDownBranchAddress == 0) {
+ LOGI("ERROR!!! Cannot find bigram!!");
+ break;
+ }
+ }
+ if (checkFirstCharacter(word)) {
+ addWordBigram(word, depth, frequency);
+ }
+}
+
bool
-Dictionary::isValidWord(unsigned short *word, int length)
+Dictionary::checkFirstCharacter(unsigned short *word)
{
- return isValidWordRec(0, word, 0, length);
+ // Checks whether this word starts with same character or neighboring characters of
+ // what user typed.
+
+ int *inputCodes = mInputCodes;
+ int maxAlt = mMaxAlternatives;
+ while (maxAlt > 0) {
+ if ((unsigned int) *inputCodes == (unsigned int) *word) {
+ return true;
+ }
+ inputCodes++;
+ maxAlt--;
+ }
+ return false;
}
bool
+Dictionary::isValidWord(unsigned short *word, int length)
+{
+ if (checkIfDictVersionIsLatest()) {
+ return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
+ } else {
+ return (isValidWordRec(0, word, 0, length) != NOT_VALID_WORD);
+ }
+}
+
+int
Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
+ // returns address of bigram data of that word
+ // return -99 if not found
+
int count = getCount(&pos);
unsigned short currentChar = (unsigned short) word[offset];
for (int j = 0; j < count; j++) {
@@ -294,12 +572,13 @@ Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length
if (c == currentChar) {
if (offset == length - 1) {
if (terminal) {
- return true;
+ return (pos+1);
}
} else {
if (childPos != 0) {
- if (isValidWordRec(childPos, word, offset + 1, length)) {
- return true;
+ int t = isValidWordRec(childPos, word, offset + 1, length);
+ if (t > 0) {
+ return t;
}
}
}
@@ -310,7 +589,7 @@ Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length
// There could be two instances of each alphabet - upper and lower case. So continue
// looking ...
}
- return false;
+ return NOT_VALID_WORD;
}