diff options
Diffstat (limited to 'native/src/binary_format.h')
-rw-r--r-- | native/src/binary_format.h | 147 |
1 files changed, 147 insertions, 0 deletions
diff --git a/native/src/binary_format.h b/native/src/binary_format.h index a946b1ee3..6f65088db 100644 --- a/native/src/binary_format.h +++ b/native/src/binary_format.h @@ -50,6 +50,8 @@ public: int *pos); static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord, const int length); + static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, + uint16_t* outWord); }; inline int BinaryFormat::detectFormat(const uint8_t* const dict) { @@ -290,6 +292,151 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, } } +// This function searches for a terminal in the dictionary by its address. +// Due to the fact that words are ordered in the dictionary in a strict breadth-first order, +// it is possible to check for this with advantageous complexity. For each node, we search +// for groups with children and compare the children address with the address we look for. +// When we shoot the address we look for, it means the word we look for is in the children +// of the previous group. The only tricky part is the fact that if we arrive at the end of a +// node with the last group's children address still less than what we are searching for, we +// must descend the last group's children (for example, if the word we are searching for starts +// with a z, it's the last group of the root node, so all children addresses will be smaller +// than the address we look for, and we have to descend the z node). +/* Parameters : + * root: the dictionary buffer + * address: the byte position of the last chargroup of the word we are searching for (this is + * what is stored as the "bigram address" in each bigram) + * outword: an array to write the found word, with MAX_WORD_LENGTH size. + * Return value : the length of the word, of 0 if the word was not found. + */ +inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address, + const int maxDepth, uint16_t* outWord) { + int pos = 0; + int wordPos = 0; + + // One iteration of the outer loop iterates through nodes. As stated above, we will only + // traverse nodes that are actually a part of the terminal we are searching, so each time + // we enter this loop we are one depth level further than last time. + // The only reason we count nodes is because we want to reduce the probability of infinite + // looping in case there is a bug. Since we know there is an upper bound to the depth we are + // supposed to traverse, it does not hurt to count iterations. + for (int loopCount = maxDepth; loopCount > 0; --loopCount) { + int lastCandidateGroupPos = 0; + // Let's loop through char groups in this node searching for either the terminal + // or one of its ascendants. + for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0; + --charGroupCount) { + const int startPos = pos; + const uint8_t flags = getFlagsAndForwardPointer(root, &pos); + const int32_t character = getCharCodeAndForwardPointer(root, &pos); + if (address == startPos) { + // We found the address. Copy the rest of the word in the buffer and return + // the length. + outWord[wordPos] = character; + if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { + int32_t nextChar = getCharCodeAndForwardPointer(root, &pos); + // We count chars in order to avoid infinite loops if the file is broken or + // if there is some other bug + int charCount = maxDepth; + while (-1 != nextChar && --charCount > 0) { + outWord[++wordPos] = nextChar; + nextChar = getCharCodeAndForwardPointer(root, &pos); + } + } + return ++wordPos; + } + // We need to skip past this char group, so skip any remaining chars after the + // first and possibly the frequency. + if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) { + pos = skipOtherCharacters(root, pos); + } + pos = skipFrequency(flags, pos); + + // The fact that this group has children is very important. Since we already know + // that this group does not match, if it has no children we know it is irrelevant + // to what we are searching for. + const bool hasChildren = (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != + (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)); + // We will write in `found' whether we have passed the children address we are + // searching for. For example if we search for "beer", the children of b are less + // than the address we are searching for and the children of c are greater. When we + // come here for c, we realize this is too big, and that we should descend b. + bool found; + if (hasChildren) { + // Here comes the tricky part. First, read the children position. + const int childrenPos = readChildrenPosition(root, flags, pos); + if (childrenPos > address) { + // If the children pos is greater than address, it means the previous chargroup, + // which address is stored in lastCandidateGroupPos, was the right one. + found = true; + } else if (1 >= charGroupCount) { + // However if we are on the LAST group of this node, and we have NOT shot the + // address we should descend THIS node. So we trick the lastCandidateGroupPos + // so that we will descend this node, not the previous one. + lastCandidateGroupPos = startPos; + found = true; + } else { + // Else, we should continue looking. + found = false; + } + } else { + // Even if we don't have children here, we could still be on the last group of this + // node. If this is the case, we should descend the last group that had children, + // and their address is already in lastCandidateGroup. + found = (1 >= charGroupCount); + } + + if (found) { + // Okay, we found the group we should descend. Its address is in + // the lastCandidateGroupPos variable, so we just re-read it. + if (0 != lastCandidateGroupPos) { + const uint8_t lastFlags = + getFlagsAndForwardPointer(root, &lastCandidateGroupPos); + const int32_t lastChar = + getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); + // We copy all the characters in this group to the buffer + outWord[wordPos] = lastChar; + if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & lastFlags) { + int32_t nextChar = + getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); + int charCount = maxDepth; + while (-1 != nextChar && --charCount > 0) { + outWord[++wordPos] = nextChar; + nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos); + } + } + ++wordPos; + // Now we only need to branch to the children address. Skip the frequency if + // it's there, read pos, and break to resume the search at pos. + lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos); + pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos); + break; + } else { + // Here is a little tricky part: we come here if we found out that all children + // addresses in this group are bigger than the address we are searching for. + // Should we conclude the word is not in the dictionary? No! It could still be + // one of the remaining chargroups in this node, so we have to keep looking in + // this node until we find it (or we realize it's not there either, in which + // case it's actually not in the dictionary). Pass the end of this group, ready + // to start the next one. + pos = skipChildrenPosAndAttributes(root, flags, pos); + } + } else { + // If we did not find it, we should record the last children address for the next + // iteration. + if (hasChildren) lastCandidateGroupPos = startPos; + // Now skip the end of this group (children pos and the attributes if any) so that + // our pos is after the end of this char group, at the start of the next one. + pos = skipChildrenPosAndAttributes(root, flags, pos); + } + + } + } + // If we have looked through all the chargroups and found no match, the address is + // not the address of a terminal in this dictionary. + return 0; +} + } // namespace latinime #endif // LATINIME_BINARY_FORMAT_H |