aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h19
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp27
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h5
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp7
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp21
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h15
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp21
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h5
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h7
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp3
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h25
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp2
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h1
-rw-r--r--native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp15
18 files changed, 133 insertions, 56 deletions
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
index 87cf0cd3b..daf40d4f9 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
@@ -65,7 +65,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
+ &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)),
+ mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
// Constructs header information using an attribute map.
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
@@ -97,7 +98,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
+ &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)),
+ mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
// Copy header information
HeaderPolicy(const HeaderPolicy *const headerPolicy)
@@ -118,7 +120,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mForgettingCurveDurationToLevelDown(
headerPolicy->mForgettingCurveDurationToLevelDown),
mMaxUnigramCount(headerPolicy->mMaxUnigramCount),
- mMaxBigramCount(headerPolicy->mMaxBigramCount) {}
+ mMaxBigramCount(headerPolicy->mMaxBigramCount),
+ mCodePointTable(headerPolicy->mCodePointTable) {}
// Temporary dummy header.
HeaderPolicy()
@@ -128,7 +131,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
- mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {}
+ mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0),
+ mCodePointTable(nullptr) {}
~HeaderPolicy() {}
@@ -139,6 +143,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
switch (mDictFormatVersion) {
case FormatUtils::VERSION_2:
return FormatUtils::VERSION_2;
+ case FormatUtils::VERSION_201:
+ return FormatUtils::VERSION_201;
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
case FormatUtils::VERSION_4:
@@ -250,6 +256,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return mDictFormatVersion >= FormatUtils::VERSION_4;
}
+ const int *getCodePointTable() const {
+ return mCodePointTable;
+ }
+
private:
DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
@@ -295,6 +305,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
const int mForgettingCurveDurationToLevelDown;
const int mMaxUnigramCount;
const int mMaxBigramCount;
+ const int *const mCodePointTable;
const std::vector<int> readLocale() const;
float readMultipleWordCostMultiplier() const;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
index d2c3d2fe0..41a8b13b8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
@@ -18,6 +18,7 @@
#include <cctype>
#include <cstdio>
+#include <memory>
#include <vector>
#include "defines.h"
@@ -34,12 +35,13 @@ namespace latinime {
const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11;
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
-const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256;
+const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048;
const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
+const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable";
const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
@@ -73,20 +75,32 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
return;
}
int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH];
- int valueBuffer[MAX_ATTRIBUTE_VALUE_LENGTH];
+ std::unique_ptr<int[]> valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]);
while (pos < headerSize) {
+ // The values in the header don't use the code point table for their encoding.
const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
- MAX_ATTRIBUTE_KEY_LENGTH, keyBuffer, &pos);
+ MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos);
std::vector<int> key;
key.insert(key.end(), keyBuffer, keyBuffer + keyLength);
const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
- MAX_ATTRIBUTE_VALUE_LENGTH, valueBuffer, &pos);
+ MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos);
std::vector<int> value;
- value.insert(value.end(), valueBuffer, valueBuffer + valueLength);
+ value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength);
headerAttributes->insert(AttributeMap::value_type(key, value));
}
}
+/* static */ const int *HeaderReadWriteUtils::readCodePointTable(
+ AttributeMap *const headerAttributes) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector);
+ AttributeMap::const_iterator it = headerAttributes->find(keyVector);
+ if (it == headerAttributes->end()) {
+ return nullptr;
+ }
+ return it->second.data();
+}
+
/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
int *const writingPos) {
@@ -96,7 +110,8 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
}
switch (version) {
case FormatUtils::VERSION_2:
- // Version 2 dictionary writing is not supported.
+ case FormatUtils::VERSION_201:
+ // Version 2 or 201 dictionary writing is not supported.
return false;
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
case FormatUtils::VERSION_4:
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h
index 1ab2eec69..5dd91b26c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h
@@ -46,6 +46,9 @@ class HeaderReadWriteUtils {
static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
+ static const int *readCodePointTable(
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
+
static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer,
const FormatUtils::FORMAT_VERSION version, int *const writingPos);
@@ -101,6 +104,8 @@ class HeaderReadWriteUtils {
static const int HEADER_FLAG_SIZE;
static const int HEADER_SIZE_FIELD_SIZE;
+ static const char *const CODE_POINT_TABLE_KEY;
+
// Value for the "flags" field. It's unused at the moment.
static const DictionaryFlags NO_FLAGS;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp
index 82399f190..5c639b19c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp
@@ -23,6 +23,7 @@
#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h"
@@ -59,8 +60,8 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
const int parentPos =
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
int codePoints[MAX_WORD_LENGTH];
- const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
- dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos);
+ const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
+ dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
int terminalIdFieldPos = NOT_A_DICT_POS;
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
int probability = NOT_A_PROBABILITY;
@@ -98,7 +99,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
// The destination position is stored at the same place as the parent position.
return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos);
} else {
- return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints,
+ return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints,
terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos,
newSiblingNodePos);
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
index 9fa93efc9..372c9e36f 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
@@ -114,7 +114,8 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
mmappedBuffer->getReadOnlyByteArrayView());
switch (formatVersion) {
case FormatUtils::VERSION_2:
- AKLOGE("Given path is a directory but the format is version 2. path: %s", path);
+ case FormatUtils::VERSION_201:
+ AKLOGE("Given path is a directory but the format is version 2 or 201. path: %s", path);
break;
case FormatUtils::VERSION_4: {
return newPolicyForV4Dict<backward::v402::Ver4DictConstants,
@@ -175,6 +176,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
}
switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) {
case FormatUtils::VERSION_2:
+ case FormatUtils::VERSION_201:
return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
new PatriciaTriePolicy(std::move(mmappedBuffer)));
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp
index e64a13cc4..6a498b2f4 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp
@@ -61,19 +61,20 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
}
/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
- int *const pos) {
- return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos);
+ const int *const codePointTable, int *const pos) {
+ return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos);
}
// Returns the number of read characters.
/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
- const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) {
+ const NodeFlags flags, const int maxLength, const int *const codePointTable,
+ int *const outBuffer, int *const pos) {
int length = 0;
if (hasMultipleChars(flags)) {
- length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer,
- pos);
+ length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable,
+ outBuffer, pos);
} else {
- const int codePoint = getCodePointAndAdvancePosition(buffer, pos);
+ const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos);
if (codePoint == NOT_A_CODE_POINT) {
// CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
// CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
@@ -92,12 +93,12 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
// Returns the number of skipped characters.
/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
- const int maxLength, int *const pos) {
+ const int maxLength, const int *const codePointTable, int *const pos) {
if (hasMultipleChars(flags)) {
return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
} else {
if (maxLength > 0) {
- getCodePointAndAdvancePosition(buffer, pos);
+ getCodePointAndAdvancePosition(buffer, codePointTable, pos);
return 1;
} else {
return 0;
@@ -134,7 +135,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
- const DictionaryBigramsStructurePolicy *const bigramPolicy,
+ const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable,
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
int *const outBigramPos, int *const outSiblingPos) {
@@ -142,7 +143,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
*outFlags = flags;
*outCodePointCount = getCharsAndAdvancePosition(
- dictBuf, flags, MAX_WORD_LENGTH, outCodePoint, &readingPos);
+ dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos);
*outProbability = isTerminal(flags) ?
readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
*outChildrenPos = hasChildrenInFlags(flags) ?
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h
index c3f09c3b1..a69ec4435 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h
@@ -34,15 +34,17 @@ class PatriciaTrieReadingUtils {
static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos);
- static int getCodePointAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+ static int getCodePointAndAdvancePosition(const uint8_t *const buffer,
+ const int *const codePointTable, int *const pos);
// Returns the number of read characters.
static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags,
- const int maxLength, int *const outBuffer, int *const pos);
+ const int maxLength, const int *const codePointTable, int *const outBuffer,
+ int *const pos);
// Returns the number of skipped characters.
static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
- const int maxLength, int *const pos);
+ const int maxLength, const int *const codePointTable, int *const pos);
static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos);
@@ -106,9 +108,10 @@ class PatriciaTrieReadingUtils {
static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
const DictionaryBigramsStructurePolicy *const bigramPolicy,
- NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
- int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
- int *const outBigramPos, int *const outSiblingPos);
+ const int *const codePointTable, NodeFlags *const outFlags,
+ int *const outCodePointCount, int *const outCodePoint, int *const outProbability,
+ int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos,
+ int *const outSiblingPos);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp
index 7cb7dff9a..40b872055 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp
@@ -45,7 +45,9 @@ const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15;
/* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer,
const int maxLength, int *const outWord, int *const pos) {
- return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength, outWord, pos);
+ // TODO: Use codePointTable for shortcuts.
+ return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength,
+ nullptr /* codePointTable */, outWord, pos);
}
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
index 64b767dac..6e7dba9ff 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
@@ -81,6 +81,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
int pos = getRootPosition();
int wordPos = 0;
+ const int *const codePointTable = mHeaderPolicy.getCodePointTable();
// One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
// only traverse PtNodes that are actually a part of the terminal we are searching, so each
// time we enter this loop we are one depth level further than last time.
@@ -112,21 +113,21 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
const PatriciaTrieReadingUtils::NodeFlags flags =
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
- mBuffer.data(), &pos);
+ mBuffer.data(), codePointTable, &pos);
if (ptNodePos == startPos) {
// We found the position. Copy the rest of the code points in the buffer and return
// the length.
outCodePoints[wordPos] = character;
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
- mBuffer.data(), &pos);
+ mBuffer.data(), codePointTable, &pos);
// We count code points in order to avoid infinite loops if the file is broken
// or if there is some other bug
int charCount = maxCodePointCount;
while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
outCodePoints[++wordPos] = nextChar;
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
- mBuffer.data(), &pos);
+ mBuffer.data(), codePointTable, &pos);
}
}
*outUnigramProbability =
@@ -138,7 +139,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
// first and possibly the probability.
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
- &pos);
+ codePointTable, &pos);
}
if (PatriciaTrieReadingUtils::isTerminal(flags)) {
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
@@ -189,17 +190,17 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
mBuffer.data(), &lastCandidatePtNodePos);
const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
- mBuffer.data(), &lastCandidatePtNodePos);
+ mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
// We copy all the characters in this PtNode to the buffer
outCodePoints[wordPos] = lastChar;
if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
- mBuffer.data(), &lastCandidatePtNodePos);
+ mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
int charCount = maxCodePointCount;
while (-1 != nextChar && --charCount > 0) {
outCodePoints[++wordPos] = nextChar;
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
- mBuffer.data(), &lastCandidatePtNodePos);
+ mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
}
}
++wordPos;
@@ -404,9 +405,11 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
int shortcutPos = NOT_A_DICT_POS;
int bigramPos = NOT_A_DICT_POS;
int siblingPos = NOT_A_DICT_POS;
+ const int *const codePointTable = mHeaderPolicy.getCodePointTable();
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
- &mBigramListPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
- &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
+ &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount,
+ mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos,
+ &siblingPos);
// Skip PtNodes don't start with Unicode code point because they represent non-word information.
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
index 70e8d847e..5f179513f 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
@@ -43,10 +43,11 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
: mMmappedBuffer(std::move(mmappedBuffer)),
mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(),
- FormatUtils::VERSION_2),
+ FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())),
mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())),
mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer),
- mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy),
+ mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy,
+ mHeaderPolicy.getCodePointTable()),
mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(),
mIsCorrupted(false) {}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp
index 74cdf7929..dc0ed96d0 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp
@@ -38,8 +38,8 @@ const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNo
int bigramPos = NOT_A_DICT_POS;
int siblingPos = NOT_A_DICT_POS;
PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortuctPolicy,
- mBigramPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability,
- &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
+ mBigramPolicy, mCodePointTable, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
+ &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
if (mergedNodeCodePointCount <= 0) {
AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount);
ASSERT(false);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h
index 0f6769dc8..24ec5bcca 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h
@@ -33,8 +33,10 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader {
public:
Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer,
const DictionaryBigramsStructurePolicy *const bigramPolicy,
- const DictionaryShortcutsStructurePolicy *const shortcutPolicy)
- : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy) {}
+ const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
+ const int *const codePointTable)
+ : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortuctPolicy(shortcutPolicy),
+ mCodePointTable(codePointTable) {}
virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const;
@@ -44,6 +46,7 @@ class Ver2ParticiaTrieNodeReader : public PtNodeReader {
const ReadOnlyByteArrayView mBuffer;
const DictionaryBigramsStructurePolicy *const mBigramPolicy;
const DictionaryShortcutsStructurePolicy *const mShortuctPolicy;
+ const int *const mCodePointTable;
};
} // namespace latinime
#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
index 731092efd..d795239fc 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
@@ -16,6 +16,7 @@
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h"
@@ -51,7 +52,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
int codePoints[MAX_WORD_LENGTH];
const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
- dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos);
+ dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
int terminalIdFieldPos = NOT_A_DICT_POS;
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
int probability = NOT_A_PROBABILITY;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp
index ecbe7922c..da2c30cd6 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp
@@ -42,8 +42,10 @@ void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxC
if (readingPosIsInAdditionalBuffer) {
*pos -= mOriginalBuffer.size();
}
+ // Code point table is not used for dynamic format.
*outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition(
- getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos);
+ getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount,
+ nullptr /* codePointTable */, outCodePoints, pos);
if (readingPosIsInAdditionalBuffer) {
*pos += mOriginalBuffer.size();
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h
index 4b3c98988..abb979050 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h
@@ -147,11 +147,18 @@ class ByteArrayUtils {
*/
static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
int p = pos;
- return readCodePointAndAdvancePosition(buffer, &p);
+ return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p);
}
static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
- const uint8_t *const buffer, int *const pos) {
+ const uint8_t *const buffer, const int *const codePointTable, int *const pos) {
+ /*
+ * codePointTable is an array to convert the most frequent characters in this dictionary to
+ * 1 byte code points. It is only made of the original code points of the most frequent
+ * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters.
+ * The original code points are restored by picking the code points at the indices of the
+ * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte.
+ */
const uint8_t firstByte = readUint8(buffer, *pos);
if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
@@ -162,6 +169,9 @@ class ByteArrayUtils {
}
} else {
*pos += 1;
+ if (codePointTable) {
+ return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE];
+ }
return firstByte;
}
}
@@ -173,12 +183,13 @@ class ByteArrayUtils {
*/
// Returns the length of the string.
static int readStringAndAdvancePosition(const uint8_t *const buffer,
- const int maxLength, int *const outBuffer, int *const pos) {
+ const int maxLength, const int *const codePointTable, int *const outBuffer,
+ int *const pos) {
int length = 0;
- int codePoint = readCodePointAndAdvancePosition(buffer, pos);
+ int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
outBuffer[length++] = codePoint;
- codePoint = readCodePointAndAdvancePosition(buffer, pos);
+ codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
}
return length;
}
@@ -187,9 +198,9 @@ class ByteArrayUtils {
static int advancePositionToBehindString(
const uint8_t *const buffer, const int maxLength, int *const pos) {
int length = 0;
- int codePoint = readCodePointAndAdvancePosition(buffer, pos);
+ int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
- codePoint = readCodePointAndAdvancePosition(buffer, pos);
+ codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
length++;
}
return length;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp
index e6e7167c2..0cffe569d 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp
@@ -29,6 +29,8 @@ const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
switch (formatVersion) {
case VERSION_2:
return VERSION_2;
+ case VERSION_201:
+ return VERSION_201;
case VERSION_4_ONLY_FOR_TESTING:
return VERSION_4_ONLY_FOR_TESTING;
case VERSION_4:
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h
index 51ad9877c..96310086b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h
@@ -32,6 +32,7 @@ class FormatUtils {
enum FORMAT_VERSION {
// These MUST have the same values as the relevant constants in FormatSpec.java.
VERSION_2 = 2,
+ VERSION_201 = 201,
VERSION_4_ONLY_FOR_TESTING = 399,
VERSION_4 = 402,
VERSION_4_DEV = 403,
diff --git a/native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp
index a1c310d8a..c201e0d00 100644
--- a/native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp
+++ b/native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp
@@ -23,6 +23,19 @@
namespace latinime {
namespace {
+TEST(ByteArrayUtilsTest, TestReadCodePointTable) {
+ const int codePointTable[] = { 0x6f, 0x6b };
+ const uint8_t buffer[] = { 0x20u, 0x21u, 0x00u, 0x01u, 0x00u };
+ int pos = 0;
+ // Expect the first entry of codePointTable
+ EXPECT_EQ(0x6f, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
+ // Expect the second entry of codePointTable
+ EXPECT_EQ(0x6b, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
+ // Expect the original code point from buffer[2] to buffer[4], 0x100
+ // It isn't picked from the codePointTable, since it exceeds the range of the codePointTable.
+ EXPECT_EQ(0x100, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos));
+}
+
TEST(ByteArrayUtilsTest, TestReadInt) {
const uint8_t buffer[] = { 0x1u, 0x8Au, 0x0u, 0xAAu };
@@ -67,7 +80,7 @@ TEST(ByteArrayUtilsTest, TestReadCodePoint) {
int pos = 0;
int codePointArray[3];
- EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH,
+ EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH, nullptr,
codePointArray, &pos));
EXPECT_EQ(0x10FF00, codePointArray[0]);
EXPECT_EQ(0x20, codePointArray[1]);