about | summary | refs | log | tree | commit | diff | stats
path: root/native/jni/src
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src')
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h | 34
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp | 89
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h | 15
-rw-r--r-- native/jni/src/utils/char_utils.cpp | 3
4 files changed, 115 insertions, 26 deletions
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
index faaf44162..e4847fcf9 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
@@ -30,18 +30,19 @@ namespace latinime {
class PtNodeParams {
public:
// Invalid PtNode.
- PtNodeParams() : mHeadPos(NOT_A_DICT_POS), mFlags(0), mParentPos(NOT_A_DICT_POS),
- mCodePointCount(0), mCodePoints(), mTerminalIdFieldPos(NOT_A_DICT_POS),
- mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), mProbabilityFieldPos(NOT_A_DICT_POS),
- mProbability(NOT_A_PROBABILITY), mChildrenPosFieldPos(NOT_A_DICT_POS),
- mChildrenPos(NOT_A_DICT_POS), mBigramLinkedNodePos(NOT_A_DICT_POS),
- mShortcutPos(NOT_A_DICT_POS), mBigramPos(NOT_A_DICT_POS),
- mSiblingPos(NOT_A_DICT_POS) {}
+ PtNodeParams() : mHeadPos(NOT_A_DICT_POS), mFlags(0), mHasMovedFlag(false),
+ mParentPos(NOT_A_DICT_POS), mCodePointCount(0), mCodePoints(),
+ mTerminalIdFieldPos(NOT_A_DICT_POS), mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID),
+ mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY),
+ mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS),
+ mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS),
+ mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) {}
PtNodeParams(const PtNodeParams& ptNodeParams)
: mHeadPos(ptNodeParams.mHeadPos), mFlags(ptNodeParams.mFlags),
- mParentPos(ptNodeParams.mParentPos), mCodePointCount(ptNodeParams.mCodePointCount),
- mCodePoints(), mTerminalIdFieldPos(ptNodeParams.mTerminalIdFieldPos),
+ mHasMovedFlag(ptNodeParams.mHasMovedFlag), mParentPos(ptNodeParams.mParentPos),
+ mCodePointCount(ptNodeParams.mCodePointCount), mCodePoints(),
+ mTerminalIdFieldPos(ptNodeParams.mTerminalIdFieldPos),
mTerminalId(ptNodeParams.mTerminalId),
mProbabilityFieldPos(ptNodeParams.mProbabilityFieldPos),
mProbability(ptNodeParams.mProbability),
@@ -58,7 +59,7 @@ class PtNodeParams {
const int codePointCount, const int *const codePoints, const int probability,
const int childrenPos, const int shortcutPos, const int bigramPos,
const int siblingPos)
- : mHeadPos(headPos), mFlags(flags), mParentPos(NOT_A_DICT_POS),
+ : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(false), mParentPos(NOT_A_DICT_POS),
mCodePointCount(codePointCount), mCodePoints(), mTerminalIdFieldPos(NOT_A_DICT_POS),
mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID),
mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability),
@@ -73,7 +74,7 @@ class PtNodeParams {
const int parentPos, const int codePointCount, const int *const codePoints,
const int terminalIdFieldPos, const int terminalId, const int probability,
const int childrenPosFieldPos, const int childrenPos, const int siblingPos)
- : mHeadPos(headPos), mFlags(flags), mParentPos(parentPos),
+ : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos),
mCodePointCount(codePointCount), mCodePoints(),
mTerminalIdFieldPos(terminalIdFieldPos), mTerminalId(terminalId),
mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability),
@@ -87,8 +88,8 @@ class PtNodeParams {
PtNodeParams(const PtNodeParams *const ptNodeParams,
const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos,
const int codePointCount, const int *const codePoints, const int probability)
- : mHeadPos(ptNodeParams->getHeadPos()), mFlags(flags), mParentPos(parentPos),
- mCodePointCount(codePointCount), mCodePoints(),
+ : mHeadPos(ptNodeParams->getHeadPos()), mFlags(flags), mHasMovedFlag(true),
+ mParentPos(parentPos), mCodePointCount(codePointCount), mCodePoints(),
mTerminalIdFieldPos(ptNodeParams->getTerminalIdFieldPos()),
mTerminalId(ptNodeParams->getTerminalId()),
mProbabilityFieldPos(ptNodeParams->getProbabilityFieldPos()),
@@ -104,7 +105,7 @@ class PtNodeParams {
PtNodeParams(const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos,
const int codePointCount, const int *const codePoints, const int probability)
- : mHeadPos(NOT_A_DICT_POS), mFlags(flags), mParentPos(parentPos),
+ : mHeadPos(NOT_A_DICT_POS), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos),
mCodePointCount(codePointCount), mCodePoints(),
mTerminalIdFieldPos(NOT_A_DICT_POS),
mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID),
@@ -126,11 +127,11 @@ class PtNodeParams {
// Flags
AK_FORCE_INLINE bool isDeleted() const {
- return DynamicPtReadingUtils::isDeleted(mFlags);
+ return mHasMovedFlag && DynamicPtReadingUtils::isDeleted(mFlags);
}
AK_FORCE_INLINE bool willBecomeNonTerminal() const {
- return DynamicPtReadingUtils::willBecomeNonTerminal(mFlags);
+ return mHasMovedFlag && DynamicPtReadingUtils::willBecomeNonTerminal(mFlags);
}
AK_FORCE_INLINE bool hasChildren() const {
@@ -224,6 +225,7 @@ class PtNodeParams {
const int mHeadPos;
const PatriciaTrieReadingUtils::NodeFlags mFlags;
+ const bool mHasMovedFlag;
const int mParentPos;
const uint8_t mCodePointCount;
int mCodePoints[MAX_WORD_LENGTH];
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
index 8172e70b6..fa5993090 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
@@ -20,6 +20,7 @@
#include "defines.h"
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h"
+#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
@@ -303,4 +304,92 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
return siblingPos;
}
+const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoints,
+ const int codePointCount) const {
+ const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount,
+ false /* forceLowerCaseSearch */);
+ if (ptNodePos == NOT_A_DICT_POS) {
+ AKLOGE("getWordProperty was called for invalid word.");
+ return WordProperty();
+ }
+ const PtNodeParams ptNodeParams = mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos);
+ std::vector<int> codePointVector(ptNodeParams.getCodePoints(),
+ ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount());
+ // Fetch bigram information.
+ std::vector<WordProperty::BigramProperty> bigrams;
+ const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
+ int bigramWord1CodePoints[MAX_WORD_LENGTH];
+ BinaryDictionaryBigramsIterator bigramsIt(getBigramsStructurePolicy(), bigramListPos);
+ while (bigramsIt.hasNext()) {
+ // Fetch the next bigram information and forward the iterator.
+ bigramsIt.next();
+ // Skip the entry if the entry has been deleted. This never happens for ver2 dicts.
+ if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {
+ int word1Probability = NOT_A_PROBABILITY;
+ int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
+ bigramsIt.getBigramPos(), MAX_WORD_LENGTH, bigramWord1CodePoints,
+ &word1Probability);
+ std::vector<int> word1(bigramWord1CodePoints,
+ bigramWord1CodePoints + word1CodePointCount);
+ bigrams.push_back(WordProperty::BigramProperty(&word1, bigramsIt.getProbability(),
+ NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */));
+ }
+ }
+ // Fetch shortcut information.
+ std::vector<WordProperty::ShortcutProperty> shortcuts;
+ int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
+ if (shortcutPos != NOT_A_DICT_POS) {
+ int shortcutTargetCodePoints[MAX_WORD_LENGTH];
+ ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mDictRoot, &shortcutPos);
+ bool hasNext = true;
+ while (hasNext) {
+ const ShortcutListReadingUtils::ShortcutFlags shortcutFlags =
+ ShortcutListReadingUtils::getFlagsAndForwardPointer(mDictRoot, &shortcutPos);
+ hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags);
+ const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget(
+ mDictRoot, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos);
+ std::vector<int> shortcutTarget(shortcutTargetCodePoints,
+ shortcutTargetCodePoints + shortcutTargetLength);
+ const int shortcutProbability =
+ ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags);
+ shortcuts.push_back(
+ WordProperty::ShortcutProperty(&shortcutTarget, shortcutProbability));
+ }
+ }
+ return WordProperty(&codePointVector, ptNodeParams.isNotAWord(),
+ ptNodeParams.isBlacklisted(), ptNodeParams.hasBigrams(),
+ ptNodeParams.hasShortcutTargets(), ptNodeParams.getProbability(),
+ NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */,
+ &bigrams, &shortcuts);
+}
+
+int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints) {
+ if (token == 0) {
+ // Start iterating the dictionary.
+ mTerminalPtNodePositionsForIteratingWords.clear();
+ DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
+ &mTerminalPtNodePositionsForIteratingWords);
+ DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
+ }
+ const int terminalPtNodePositionsVectorSize =
+ static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
+ if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
+ AKLOGE("Given token %d is invalid.", token);
+ return 0;
+ }
+ const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
+ int unigramProbability = NOT_A_PROBABILITY;
+ getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, MAX_WORD_LENGTH,
+ outCodePoints, &unigramProbability);
+ const int nextToken = token + 1;
+ if (nextToken >= terminalPtNodePositionsVectorSize) {
+ // All words have been iterated.
+ mTerminalPtNodePositionsForIteratingWords.clear();
+ return 0;
+ }
+ return nextToken;
+}
+
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
index 1ce7f85d4..8fbca2612 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
@@ -18,6 +18,7 @@
#define LATINIME_PATRICIA_TRIE_POLICY_H
#include <stdint.h>
+#include <vector>
#include "defines.h"
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
@@ -44,7 +45,8 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
- mHeaderPolicy.getSize()),
mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot),
mPtNodeReader(mDictRoot, mDictBufferSize, &mBigramListPolicy, &mShortcutListPolicy),
- mPtNodeArrayReader(mDictRoot, mDictBufferSize) {}
+ mPtNodeArrayReader(mDictRoot, mDictBufferSize),
+ mTerminalPtNodePositionsForIteratingWords() {}
AK_FORCE_INLINE int getRootPosition() const {
return 0;
@@ -128,15 +130,9 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
}
const WordProperty getWordProperty(const int *const codePoints,
- const int codePointCount) const {
- // getWordProperty is not supported.
- return WordProperty();
- }
+ const int codePointCount) const;
- int getNextWordAndNextToken(const int token, int *const outCodePoints) {
- // getNextWordAndNextToken is not supported.
- return 0;
- }
+ int getNextWordAndNextToken(const int token, int *const outCodePoints);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy);
@@ -149,6 +145,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
const ShortcutListPolicy mShortcutListPolicy;
const Ver2ParticiaTrieNodeReader mPtNodeReader;
const Ver2PtNodeArrayReader mPtNodeArrayReader;
+ std::vector<int> mTerminalPtNodePositionsForIteratingWords;
int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos,
DicNodeVector *const childDicNodes) const;
diff --git a/native/jni/src/utils/char_utils.cpp b/native/jni/src/utils/char_utils.cpp
index d41fc8924..adc474b4c 100644
--- a/native/jni/src/utils/char_utils.cpp
+++ b/native/jni/src/utils/char_utils.cpp
@@ -1118,7 +1118,8 @@ static int compare_pair_capital(const void *a, const void *b) {
/* U+0118 */ 0x0045, 0x0065, 0x0045, 0x0065, 0x0047, 0x0067, 0x0047, 0x0067,
/* U+0120 */ 0x0047, 0x0067, 0x0047, 0x0067, 0x0048, 0x0068, 0x0126, 0x0127,
/* U+0128 */ 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069,
- /* U+0130 */ 0x0049, 0x0131, 0x0049, 0x0069, 0x004A, 0x006A, 0x004B, 0x006B,
+ // U+0131: Manually changed from 0131 to 0049
+ /* U+0130 */ 0x0049, 0x0049, 0x0049, 0x0069, 0x004A, 0x006A, 0x004B, 0x006B,
/* U+0138 */ 0x0138, 0x004C, 0x006C, 0x004C, 0x006C, 0x004C, 0x006C, 0x004C,
/* U+0140 */ 0x006C, 0x004C, 0x006C, 0x004E, 0x006E, 0x004E, 0x006E, 0x004E,
// U+0141: Manually changed from 0141 to 004C