aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h | 5
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp | 16
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp | 6
-rw-r--r-- native/jni/src/utils/char_utils.cpp | 3
-rw-r--r-- native/jni/src/utils/char_utils.h | 7
5 files changed, 30 insertions, 7 deletions
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
index 91192fc57..bef401f87 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
@@ -23,6 +23,7 @@
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
+#include "utils/char_utils.h"
namespace latinime {
@@ -158,6 +159,10 @@ class PtNodeParams {
return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags);
}
+ AK_FORCE_INLINE bool representsNonWordInfo() const { // NOTE(review): despite the name, this is true when the node has code points and the first one IS inside the Unicode range.
+ return getCodePointCount() > 0 && CharUtils::isInUnicodeSpace(getCodePoints()[0]); // The count check guards the [0] access; callers currently negate this result before skipping a node - confirm the intended polarity before changing either side.
+ }
+
// Parent node position
AK_FORCE_INLINE int getParentPos() const {
return mParentPos;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
index b3af1f47a..30dcfba37 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
@@ -24,6 +24,7 @@
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
+#include "utils/char_utils.h"
namespace latinime {
@@ -318,12 +319,15 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(),
getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
- childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
- PatriciaTrieReadingUtils::isTerminal(flags),
- PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
- PatriciaTrieReadingUtils::isBlacklisted(flags)
- || PatriciaTrieReadingUtils::isNotAWord(flags),
- mergedNodeCodePointCount, mergedNodeCodePoints);
+ // Skip PtNodes that don't start with a Unicode code point; they represent non-word information.
+ if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
+ childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
+ PatriciaTrieReadingUtils::isTerminal(flags),
+ PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
+ PatriciaTrieReadingUtils::isBlacklisted(flags)
+ || PatriciaTrieReadingUtils::isNotAWord(flags),
+ mergedNodeCodePointCount, mergedNodeCodePoints);
+ }
return siblingPos;
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
index 2fb3decee..8373dc549 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
@@ -59,13 +59,17 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
// valid terminal DicNode.
isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
}
+ readingHelper.readNextSiblingNode(ptNodeParams);
+ if (!ptNodeParams.representsNonWordInfo()) {
+ // Skip PtNodes that represent non-word information.
+ continue;
+ }
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
ptNodeParams.hasChildren(),
ptNodeParams.isBlacklisted()
|| ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints());
- readingHelper.readNextSiblingNode(ptNodeParams);
}
if (readingHelper.isError()) {
mIsCorrupted = true;
diff --git a/native/jni/src/utils/char_utils.cpp b/native/jni/src/utils/char_utils.cpp
index adc474b4c..b17e0847d 100644
--- a/native/jni/src/utils/char_utils.cpp
+++ b/native/jni/src/utils/char_utils.cpp
@@ -22,6 +22,9 @@
namespace latinime {
+const int CharUtils::MIN_UNICODE_CODE_POINT = 0; // Inclusive lower bound tested by isInUnicodeSpace().
+const int CharUtils::MAX_UNICODE_CODE_POINT = 0x10FFFF; // Inclusive upper bound tested by isInUnicodeSpace().
+
struct LatinCapitalSmallPair {
unsigned short capital;
unsigned short small;
diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h
index 239419d5b..634c45b04 100644
--- a/native/jni/src/utils/char_utils.h
+++ b/native/jni/src/utils/char_utils.h
@@ -86,12 +86,19 @@ class CharUtils {
return spaceCount;
}
+ static AK_FORCE_INLINE bool isInUnicodeSpace(const int codePoint) { // True iff codePoint lies in [MIN_UNICODE_CODE_POINT, MAX_UNICODE_CODE_POINT], both ends inclusive.
+ return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT;
+ }
+
static unsigned short latin_tolower(const unsigned short c);
static const std::vector<int> EMPTY_STRING;
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
+ static const int MIN_UNICODE_CODE_POINT;
+ static const int MAX_UNICODE_CODE_POINT;
+
/**
* Table mapping most combined Latin, Greek, and Cyrillic characters
* to their base characters. If c is in range, BASE_CHARS[c] == c