aboutsummaryrefslogtreecommitdiffstats
path: root/native/src
diff options
context:
space:
mode:
Diffstat (limited to 'native/src')
-rw-r--r--native/src/binary_format.h73
-rw-r--r--native/src/unigram_dictionary.cpp79
-rw-r--r--native/src/unigram_dictionary.h3
3 files changed, 74 insertions, 81 deletions
diff --git a/native/src/binary_format.h b/native/src/binary_format.h
index 7deec27d3..a946b1ee3 100644
--- a/native/src/binary_format.h
+++ b/native/src/binary_format.h
@@ -48,6 +48,8 @@ public:
static bool hasChildrenInFlags(const uint8_t flags);
static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
int *pos);
+ static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
+ const int length);
};
inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
@@ -217,6 +219,77 @@ inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* con
}
}
+// Looks up the exact word inWord[0..length) in the dictionary rooted at root.
+// Returns the byte position of the last chargroup of the matching word, that is the position
+// its flags are read from, or NOT_VALID_WORD if no terminal matches the word exactly.
+// root:   start of the root node of the dictionary.
+// inWord: the characters to search for; it is only ever read at indices below length.
+// length: the number of characters in inWord; an empty word never matches.
+inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
+        const uint16_t* const inWord, const int length) {
+    int pos = 0;
+    int wordPos = 0;
+
+    while (true) {
+        // If the whole word is consumed and we are still descending, this means there was no
+        // match (or we would have found it). Checked before reading inWord to stay in bounds.
+        if (wordPos >= length) return NOT_VALID_WORD;
+        int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
+        const uint16_t wChar = inWord[wordPos];
+        while (true) {
+            // If there are no more character groups in this node, it means we could not
+            // find a matching character for this depth, therefore there is no match.
+            if (0 >= charGroupCount) return NOT_VALID_WORD;
+            const int charGroupPos = pos;
+            const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
+            int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
+            if (character == wChar) {
+                // This is the correct group. Only one character group may start with the same
+                // char within a node, so either the match goes through this group or there is
+                // none. Check that all the other characters of this group do match as well.
+                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+                    character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
+                    while (NOT_A_CHARACTER != character) {
+                        ++wordPos;
+                        // If the group has more characters than the word has left, or if one
+                        // character differs, the word is not in the dictionary: this chargroup
+                        // is the only one matching the word on its first character.
+                        if (wordPos >= length) return NOT_VALID_WORD;
+                        if (inWord[wordPos] != character) return NOT_VALID_WORD;
+                        character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
+                    }
+                }
+                // So far we do match. Either we are on a terminal and we match the length, in
+                // which case we found it, or we traverse children. If we don't match the
+                // length AND have no children, the dictionary only holds a prefix of the word.
+                ++wordPos;
+                if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
+                    if (wordPos == length) {
+                        return charGroupPos;
+                    }
+                    pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
+                }
+                if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
+                        == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
+                    return NOT_VALID_WORD;
+                }
+                // We have children and are still shorter than the searched word: put the
+                // pointer on the children position and break to match the next character.
+                pos = BinaryFormat::readChildrenPosition(root, flags, pos);
+                break;
+            } else {
+                // This chargroup does not match, so skip the remaining part and go to the next.
+                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+                    pos = BinaryFormat::skipOtherCharacters(root, pos);
+                }
+                pos = BinaryFormat::skipFrequency(flags, pos);
+                pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
+            }
+            --charGroupCount;
+        }
+    }
+}
+
} // namespace latinime
#endif // LATINIME_BINARY_FORMAT_H
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 64d41f32e..bccd37a61 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -1055,85 +1055,8 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor
return maxFreq;
}
-// This function gets the frequency of the exact matching word in the dictionary.
-// If no match is found, it returns NOT_VALID_WORD.
-static inline int getFrequency(const uint8_t* const root, const uint16_t* const inWord,
- const int length) {
- int pos = 0;
- int wordPos = 0;
-
- while (true) {
- // If we already traversed the tree further than the word is long, there means
- // there was no match (or we would have found it).
- if (wordPos > length) return NOT_VALID_WORD;
- int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
- const uint16_t wChar = inWord[wordPos];
- while (true) {
- // If there are no more character groups in this node, it means we could not
- // find a matching character for this depth, therefore there is no match.
- if (0 >= charGroupCount) return NOT_VALID_WORD;
- const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
- int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
- if (character == wChar) {
- // This is the correct node. Only one character group may start with the same
- // char within a node, so either we found our match in this node, or there is
- // no match and we can return NOT_VALID_WORD. So we will check all the characters
- // in this character group indeed does match.
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
- character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
- while (NOT_A_CHARACTER != character) {
- ++wordPos;
- // If we shoot the length of the word we search for, or if we find a single
- // character that does not match, as explained above, it means the word is
- // not in the dictionary (by virtue of this chargroup being the only one to
- // match the word on the first character, but not matching the whole word).
- if (wordPos > length) return NOT_VALID_WORD;
- if (inWord[wordPos] != character) return NOT_VALID_WORD;
- character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
- }
- }
- // If we come here we know that so far, we do match. Either we are on a terminal
- // and we match the length, in which case we found it, or we traverse children.
- // If we don't match the length AND don't have children, then a word in the
- // dictionary fully matches a prefix of the searched word but not the full word.
- ++wordPos;
- if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
- if (wordPos == length) {
- return BinaryFormat::readFrequencyWithoutMovingPointer(root, pos);
- }
- pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
- }
- if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
- == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
- return NOT_VALID_WORD;
- }
- // We have children and we are still shorter than the word we are searching for, so
- // we need to traverse children. Put the pointer on the children position, and
- // break
- pos = BinaryFormat::readChildrenPosition(root, flags, pos);
- break;
- } else {
- // This chargroup does not match, so skip the remaining part and go to the next.
- if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
- pos = BinaryFormat::skipOtherCharacters(root, pos);
- }
- pos = BinaryFormat::skipFrequency(flags, pos);
- pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
- }
- --charGroupCount;
- }
- }
-}
-
bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
- return NOT_VALID_WORD != getFrequency(DICT_ROOT, inWord, length);
-}
-
-int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize,
- unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
- int maxAlternatives) {
- // TODO: add implementation.
- return 0;
+ return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);
}
// TODO: remove this function.
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 55771eeb8..97198ef13 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -71,9 +71,6 @@ public:
bool isValidWord(unsigned short *word, int length);
#else // NEW_DICTIONARY_FORMAT
bool isValidWord(const uint16_t* const inWord, const int length) const;
- int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
- unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
- int maxAlternatives);
#endif // NEW_DICTIONARY_FORMAT
int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,