aboutsummaryrefslogtreecommitdiffstats
path: root/native/src/unigram_dictionary.cpp
diff options
context:
space:
mode:
authorJean Chalard <jchalard@google.com>2011-06-16 22:33:41 +0900
committerJean Chalard <jchalard@google.com>2011-06-16 22:33:41 +0900
commit8124e64dccb2027fcdcf19b76490769087f55cc2 (patch)
tree33ca9611b1844e9a8e26014efd4f0090843ab316 /native/src/unigram_dictionary.cpp
parent3f4385511b100b369c1c0b89acf532398c3ecb30 (diff)
downloadlatinime-8124e64dccb2027fcdcf19b76490769087f55cc2.tar.gz
latinime-8124e64dccb2027fcdcf19b76490769087f55cc2.tar.xz
latinime-8124e64dccb2027fcdcf19b76490769087f55cc2.zip
New dict format, step 2
Move some methods around and make static some methods Bug: 4392433 Change-Id: I2bbe98aec118a416d21d1e293638e1d324505b9b
Diffstat (limited to 'native/src/unigram_dictionary.cpp')
-rw-r--r--native/src/unigram_dictionary.cpp48
1 files changed, 45 insertions, 3 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index cb8f5076c..91e3c8134 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -265,8 +265,7 @@ void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,
mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
}
-void UnigramDictionary::registerNextLetter(
- unsigned short c, int *nextLetters, int nextLettersSize) {
+static inline void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
if (c < nextLettersSize) {
nextLetters[c]++;
}
@@ -322,7 +321,7 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
return false;
}
-unsigned short UnigramDictionary::toBaseLowerCase(unsigned short c) {
+static inline unsigned short toBaseLowerCase(unsigned short c) {
if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
c = BASE_CHARS[c];
}
@@ -924,4 +923,47 @@ inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstCh
return false;
}
}
+
+// TODO: use uint32_t instead of unsigned short
+bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
+ if (IS_LATEST_DICT_VERSION) {
+ return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
+ } else {
+ return (isValidWordRec(0, word, 0, length) != NOT_VALID_WORD);
+ }
+}
+
+int UnigramDictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
+ // returns address of bigram data of that word
+ // return -99 if not found
+
+ int count = Dictionary::getCount(DICT_ROOT, &pos);
+ unsigned short currentChar = (unsigned short) word[offset];
+ for (int j = 0; j < count; j++) {
+ unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
+ int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
+ int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
+ if (c == currentChar) {
+ if (offset == length - 1) {
+ if (terminal) {
+ return (pos+1);
+ }
+ } else {
+ if (childPos != 0) {
+ int t = isValidWordRec(childPos, word, offset + 1, length);
+ if (t > 0) {
+ return t;
+ }
+ }
+ }
+ }
+ if (terminal) {
+ Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
+ }
+ // There could be two instances of each alphabet - upper and lower case. So continue
+ // looking ...
+ }
+ return NOT_VALID_WORD;
+}
+
} // namespace latinime