diff options
Diffstat (limited to 'native/jni/src/suggest/policyimpl/dictionary/header')
-rw-r--r-- | native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp | 58 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h | 81 |
2 files changed, 59 insertions, 80 deletions
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp index 300e96c4e..a2a0f11b4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp @@ -18,6 +18,8 @@ #include <algorithm> +#include "utils/ngram_utils.h" + namespace latinime { // Note that these are corresponding definitions in Java side in DictionaryHeader. @@ -28,9 +30,11 @@ const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; const char *const HeaderPolicy::DATE_KEY = "date"; const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; -const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT"; -const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT"; -const char *const HeaderPolicy::TRIGRAM_COUNT_KEY = "TRIGRAM_COUNT"; +const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] = + {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT"}; +const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] = + {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT"}; +const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000}; const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; // Historical info is information that is needed to support decaying such as timestamp, level and // count. 
@@ -39,18 +43,10 @@ const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; -const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_ENTRY_COUNT"; -const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_ENTRY_COUNT"; -const char *const HeaderPolicy::MAX_TRIGRAM_COUNT_KEY = "MAX_TRIGRAM_ENTRY_COUNT"; - const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3; -const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000; -const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 30000; -const int HeaderPolicy::DEFAULT_MAX_TRIGRAM_COUNT = 30000; - // Used for logging. Question mark is used to indicate that the key is not found. void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const { @@ -126,15 +122,22 @@ bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTim return true; } +namespace { + +int getIndexFromNgramType(const NgramType ngramType) { /* Converts an NgramType to the int used to index NGRAM_COUNT_KEYS, MAX_NGRAM_COUNT_KEYS and DEFAULT_MAX_NGRAM_COUNTS (three entries each). NOTE(review): assumes the NgramType values are 0, 1, 2 in unigram/bigram/trigram order; confirm against utils/ngram_utils.h. */ + return static_cast<int>(ngramType); +} + +} // namespace + void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts, const int extendedRegionSize, DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const { - HeaderReadWriteUtils::setIntAttribute(outAttributeMap, UNIGRAM_COUNT_KEY, - entryCounts.getUnigramCount()); - HeaderReadWriteUtils::setIntAttribute(outAttributeMap, BIGRAM_COUNT_KEY, - entryCounts.getBigramCount()); - HeaderReadWriteUtils::setIntAttribute(outAttributeMap, TRIGRAM_COUNT_KEY, - entryCounts.getTrigramCount()); + for (const auto ngramType : AllNgramTypes::ASCENDING) { + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, 
NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], + entryCounts.getNgramCount(ngramType)); + } HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY, extendedRegionSize); // Set the current time as the generation time. @@ -155,4 +158,25 @@ void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, return attributeMap; } +/* Non-static const member: reads mAttributeMap. For each type in AllNgramTypes::ASCENDING, reads the NGRAM_COUNT_KEYS attribute (passing 0 as the default value) and returns the collected EntryCounts. */ const EntryCounts HeaderPolicy::readNgramCounts() const { + MutableEntryCounters entryCounters; + for (const auto ngramType : AllNgramTypes::ASCENDING) { + const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */); + entryCounters.setNgramCount(ngramType, entryCount); + } + return entryCounters.getEntryCounts(); +} + +/* Non-static const member: reads mAttributeMap. For each type in AllNgramTypes::ASCENDING, reads the MAX_NGRAM_COUNT_KEYS attribute (passing the matching DEFAULT_MAX_NGRAM_COUNTS entry as the default value) and returns the collected EntryCounts. */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const { + MutableEntryCounters entryCounters; + for (const auto ngramType : AllNgramTypes::ASCENDING) { + const int index = getIndexFromNgramType(ngramType); + const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]); + entryCounters.setNgramCount(ngramType, maxEntryCount); + } + return entryCounters.getEntryCounts(); +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 7a5acd7d5..f76931baa 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -46,12 +46,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), - 
mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - UNIGRAM_COUNT_KEY, 0 /* defaultValue */)), - mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - BIGRAM_COUNT_KEY, 0 /* defaultValue */)), - mTrigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - TRIGRAM_COUNT_KEY, 0 /* defaultValue */)), + mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( @@ -59,12 +54,6 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), - mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), - mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)), - mMaxTrigramCount(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, MAX_TRIGRAM_COUNT_KEY, DEFAULT_MAX_TRIGRAM_COUNT)), mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} // Constructs header information using an attribute map. 
@@ -82,18 +71,13 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), - mUnigramCount(0), mBigramCount(0), mTrigramCount(0), mExtendedRegionSize(0), + mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), + mExtendedRegionSize(0), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), - mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), - mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)), - mMaxTrigramCount(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, MAX_TRIGRAM_COUNT_KEY, DEFAULT_MAX_TRIGRAM_COUNT)), mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} // Copy header information @@ -105,15 +89,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing), mIsDecayingDict(headerPolicy->mIsDecayingDict), mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime), - mUnigramCount(headerPolicy->mUnigramCount), mBigramCount(headerPolicy->mBigramCount), - mTrigramCount(headerPolicy->mTrigramCount), + mNgramCounts(headerPolicy->mNgramCounts), + mMaxNgramCounts(headerPolicy->mMaxNgramCounts), mExtendedRegionSize(headerPolicy->mExtendedRegionSize), mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords), mForgettingCurveProbabilityValuesTableId( 
headerPolicy->mForgettingCurveProbabilityValuesTableId), - mMaxUnigramCount(headerPolicy->mMaxUnigramCount), - mMaxBigramCount(headerPolicy->mMaxBigramCount), - mMaxTrigramCount(headerPolicy->mMaxTrigramCount), mCodePointTable(headerPolicy->mCodePointTable) {} // Temporary dummy header. @@ -121,10 +102,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), - mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), mTrigramCount(0), + mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), - mForgettingCurveProbabilityValuesTableId(0), mMaxUnigramCount(0), mMaxBigramCount(0), - mMaxTrigramCount(0), mCodePointTable(nullptr) {} + mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {} ~HeaderPolicy() {} @@ -186,16 +166,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mLastDecayedTime; } - AK_FORCE_INLINE int getUnigramCount() const { - return mUnigramCount; + AK_FORCE_INLINE const EntryCounts &getNgramCounts() const { /* Returns the stored mNgramCounts by const reference. */ + return mNgramCounts; } - AK_FORCE_INLINE int getBigramCount() const { - return mBigramCount; - } - - AK_FORCE_INLINE int getTrigramCount() const { - return mTrigramCount; + AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const { /* Returns the stored mMaxNgramCounts. NOTE(review): this returns EntryCounts by value, while getNgramCounts() returns a const reference; confirm the copy is intended. */ + return mMaxNgramCounts; } AK_FORCE_INLINE int getExtendedRegionSize() const { @@ -219,18 +195,6 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mForgettingCurveProbabilityValuesTableId; } - AK_FORCE_INLINE int getMaxUnigramCount() const { - return mMaxUnigramCount; - } - - AK_FORCE_INLINE int getMaxBigramCount() const { - return mMaxBigramCount; - } - - AK_FORCE_INLINE int getMaxTrigramCount() const { - return mMaxTrigramCount; - } - void 
readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const; @@ -262,24 +226,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { static const char *const IS_DECAYING_DICT_KEY; static const char *const DATE_KEY; static const char *const LAST_DECAYED_TIME_KEY; - static const char *const UNIGRAM_COUNT_KEY; - static const char *const BIGRAM_COUNT_KEY; - static const char *const TRIGRAM_COUNT_KEY; + static const char *const NGRAM_COUNT_KEYS[]; + static const char *const MAX_NGRAM_COUNT_KEYS[]; + static const int DEFAULT_MAX_NGRAM_COUNTS[]; static const char *const EXTENDED_REGION_SIZE_KEY; static const char *const HAS_HISTORICAL_INFO_KEY; static const char *const LOCALE_KEY; static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; - static const char *const MAX_UNIGRAM_COUNT_KEY; - static const char *const MAX_BIGRAM_COUNT_KEY; - static const char *const MAX_TRIGRAM_COUNT_KEY; static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; - static const int DEFAULT_MAX_UNIGRAM_COUNT; - static const int DEFAULT_MAX_BIGRAM_COUNT; - static const int DEFAULT_MAX_TRIGRAM_COUNT; const FormatUtils::FORMAT_VERSION mDictFormatVersion; const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; @@ -291,21 +249,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { const bool mIsDecayingDict; const int mDate; const int mLastDecayedTime; - const int mUnigramCount; - const int mBigramCount; - const int mTrigramCount; + const EntryCounts mNgramCounts; + const EntryCounts mMaxNgramCounts; const int mExtendedRegionSize; const bool mHasHistoricalInfoOfWords; const int mForgettingCurveProbabilityValuesTableId; - const int mMaxUnigramCount; - 
const int mMaxBigramCount; - const int mMaxTrigramCount; const int *const mCodePointTable; const std::vector<int> readLocale() const; float readMultipleWordCostMultiplier() const; bool readRequiresGermanUmlautProcessing() const; - + const EntryCounts readNgramCounts() const; + const EntryCounts readMaxNgramCounts() const; static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes( const uint8_t *const dictBuf); }; |