diff options
Diffstat (limited to 'native/jni/src/dictionary/header')
-rw-r--r-- | native/jni/src/dictionary/header/header_policy.cpp | 183 | ||||
-rw-r--r-- | native/jni/src/dictionary/header/header_policy.h | 268 | ||||
-rw-r--r-- | native/jni/src/dictionary/header/header_read_write_utils.cpp | 248 | ||||
-rw-r--r-- | native/jni/src/dictionary/header/header_read_write_utils.h | 123 |
4 files changed, 822 insertions, 0 deletions
diff --git a/native/jni/src/dictionary/header/header_policy.cpp b/native/jni/src/dictionary/header/header_policy.cpp new file mode 100644 index 000000000..d4f84d39f --- /dev/null +++ b/native/jni/src/dictionary/header/header_policy.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/header/header_policy.h" + +#include <algorithm> + +#include "utils/ngram_utils.h" + +namespace latinime { + +// Note that these are corresponding definitions in Java side in DictionaryHeader. +const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE"; +const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = + "REQUIRES_GERMAN_UMLAUT_PROCESSING"; +// TODO: Change attribute string to "IS_DECAYING_DICT". +const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; +const char *const HeaderPolicy::DATE_KEY = "date"; +const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; +const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] = + {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT", "QUADGRAM_COUNT"}; +const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] = + {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT", + "MAX_QUADGRAM_ENTRY_COUNT"}; +const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000, 30000}; +const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; +// Historical info is information that is needed to support decaying such as timestamp, level and +// count. +const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO"; +const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration +const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = + "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; + +const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; +const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; +const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3; + +// Used for logging. Question mark is used to indicate that the key is not found. +void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue, + int outValueSize) const { + if (outValueSize <= 0) return; + if (outValueSize == 1) { + outValue[0] = '\0'; + return; + } + std::vector<int> keyCodePointVector; + HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector); + DictionaryHeaderStructurePolicy::AttributeMap::const_iterator it = + mAttributeMap.find(keyCodePointVector); + if (it == mAttributeMap.end()) { + // The key was not found. + outValue[0] = '?'; + outValue[1] = '\0'; + return; + } + const int terminalIndex = std::min(static_cast<int>(it->second.size()), outValueSize - 1); + for (int i = 0; i < terminalIndex; ++i) { + outValue[i] = it->second[i]; + } + outValue[terminalIndex] = '\0'; +} + +const std::vector<int> HeaderPolicy::readLocale() const { + return HeaderReadWriteUtils::readCodePointVectorAttributeValue(&mAttributeMap, LOCALE_KEY); +} + +float HeaderPolicy::readMultipleWordCostMultiplier() const { + const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE); + if (demotionRate <= 0) { + return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); + } + return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate); +} + +bool HeaderPolicy::readRequiresGermanUmlautProcessing() const { + return HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, + REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false); +} + +bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, + const EntryCounts &entryCounts, const int extendedRegionSize, + BufferWithExtendableBuffer *const outBuffer) const { + int writingPos = 0; + DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap); + fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite); + if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion, + &writingPos)) { + return false; + } + if (!HeaderReadWriteUtils::writeDictionaryFlags(outBuffer, mDictionaryFlags, + &writingPos)) { + return false; + } + // Temporarily writes a dummy header size. + int headerSizeFieldPos = writingPos; + if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, 0 /* size */, + &writingPos)) { + return false; + } + if (!HeaderReadWriteUtils::writeHeaderAttributes(outBuffer, &attributeMapToWrite, + &writingPos)) { + return false; + } + // Writes the actual header size. + if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, writingPos, + &headerSizeFieldPos)) { + return false; + } + return true; +} + +namespace { + +int getIndexFromNgramType(const NgramType ngramType) { + return static_cast<int>(ngramType); +} + +} // namespace + +void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, + const EntryCounts &entryCounts, const int extendedRegionSize, + DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const { + for (const auto ngramType : AllNgramTypes::ASCENDING) { + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, + NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], + entryCounts.getNgramCount(ngramType)); + } + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY, + extendedRegionSize); + // Set the current time as the generation time. + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, DATE_KEY, + TimeKeeper::peekCurrentTime()); + HeaderReadWriteUtils::setCodePointVectorAttribute(outAttributeMap, LOCALE_KEY, mLocale); + if (updatesLastDecayedTime) { + // Set current time as the last updated time. + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_DECAYED_TIME_KEY, + TimeKeeper::peekCurrentTime()); + } +} + +/* static */ DictionaryHeaderStructurePolicy::AttributeMap + HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) { + DictionaryHeaderStructurePolicy::AttributeMap attributeMap; + HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap); + return attributeMap; +} + +/* static */ const EntryCounts HeaderPolicy::readNgramCounts() const { + MutableEntryCounters entryCounters; + for (const auto ngramType : AllNgramTypes::ASCENDING) { + const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */); + entryCounters.setNgramCount(ngramType, entryCount); + } + return entryCounters.getEntryCounts(); +} + +/* static */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const { + MutableEntryCounters entryCounters; + for (const auto ngramType : AllNgramTypes::ASCENDING) { + const int index = getIndexFromNgramType(ngramType); + const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]); + entryCounters.setNgramCount(ngramType, maxEntryCount); + } + return entryCounters.getEntryCounts(); +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/header/header_policy.h b/native/jni/src/dictionary/header/header_policy.h new file mode 100644 index 000000000..47cc9196a --- /dev/null +++ b/native/jni/src/dictionary/header/header_policy.h @@ -0,0 +1,268 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_HEADER_POLICY_H +#define LATINIME_HEADER_POLICY_H + +#include <cstdint> + +#include "defines.h" +#include "dictionary/header/header_read_write_utils.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/utils/entry_counters.h" +#include "dictionary/utils/format_utils.h" +#include "utils/char_utils.h" +#include "utils/time_keeper.h" + +namespace latinime { + +class HeaderPolicy : public DictionaryHeaderStructurePolicy { + public: + // Reads information from existing dictionary buffer. + HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion) + : mDictFormatVersion(formatVersion), + mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)), + mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), + mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), + mLocale(readLocale()), + mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), + mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), + mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, + IS_DECAYING_DICT_KEY, false /* defaultValue */)), + mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), + mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), + mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( + &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), + mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, + DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), + mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} + + // Constructs header information using an attribute map. + HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, + const std::vector<int> &locale, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) + : mDictFormatVersion(dictFormatVersion), + mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( + attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale), + mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), + mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), + mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, + IS_DECAYING_DICT_KEY, false /* defaultValue */)), + mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), + mExtendedRegionSize(0), + mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( + &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), + mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, + DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), + mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} + + // Copy header information + HeaderPolicy(const HeaderPolicy *const headerPolicy) + : mDictFormatVersion(headerPolicy->mDictFormatVersion), + mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize), + mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale), + mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier), + mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing), + mIsDecayingDict(headerPolicy->mIsDecayingDict), + mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime), + mNgramCounts(headerPolicy->mNgramCounts), + mMaxNgramCounts(headerPolicy->mMaxNgramCounts), + mExtendedRegionSize(headerPolicy->mExtendedRegionSize), + mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords), + mForgettingCurveProbabilityValuesTableId( + headerPolicy->mForgettingCurveProbabilityValuesTableId), + mCodePointTable(headerPolicy->mCodePointTable) {} + + // Temporary dummy header. + HeaderPolicy() + : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), + mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), + mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), + mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(), + mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), + mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {} + + ~HeaderPolicy() {} + + virtual int getFormatVersionNumber() const { + // Conceptually this converts the symbolic value we use in the code into the + // hardcoded of the bytes in the file. But we want the constants to be the + // same so we use them for both here. + switch (mDictFormatVersion) { + case FormatUtils::VERSION_2: + case FormatUtils::VERSION_201: + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + return FormatUtils::UNKNOWN_VERSION; + case FormatUtils::VERSION_202: + return FormatUtils::VERSION_202; + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + return FormatUtils::VERSION_4_ONLY_FOR_TESTING; + case FormatUtils::VERSION_402: + return FormatUtils::VERSION_402; + case FormatUtils::VERSION_403: + return FormatUtils::VERSION_403; + default: + return FormatUtils::UNKNOWN_VERSION; + } + } + + AK_FORCE_INLINE bool isValid() const { + // Decaying dictionary must have historical information. + if (!mIsDecayingDict) { + return true; + } + if (mHasHistoricalInfoOfWords) { + return true; + } else { + return false; + } + } + + AK_FORCE_INLINE int getSize() const { + return mSize; + } + + AK_FORCE_INLINE float getMultiWordCostMultiplier() const { + return mMultiWordCostMultiplier; + } + + AK_FORCE_INLINE bool isDecayingDict() const { + return mIsDecayingDict; + } + + AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { + return mRequiresGermanUmlautProcessing; + } + + AK_FORCE_INLINE int getDate() const { + return mDate; + } + + AK_FORCE_INLINE int getLastDecayedTime() const { + return mLastDecayedTime; + } + + AK_FORCE_INLINE const EntryCounts &getNgramCounts() const { + return mNgramCounts; + } + + AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const { + return mMaxNgramCounts; + } + + AK_FORCE_INLINE int getExtendedRegionSize() const { + return mExtendedRegionSize; + } + + AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const { + return mHasHistoricalInfoOfWords; + } + + AK_FORCE_INLINE bool shouldBoostExactMatches() const { + // TODO: Investigate better ways to handle exact matches for personalized dictionaries. + return !isDecayingDict(); + } + + const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const { + return &mAttributeMap; + } + + AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const { + return mForgettingCurveProbabilityValuesTableId; + } + + void readHeaderValueOrQuestionMark(const char *const key, + int *outValue, int outValueSize) const; + + bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, + const EntryCounts &entryCounts, const int extendedRegionSize, + BufferWithExtendableBuffer *const outBuffer) const; + + void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts, + const int extendedRegionSize, + DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const; + + AK_FORCE_INLINE const std::vector<int> *getLocale() const { + return &mLocale; + } + + bool supportsBeginningOfSentence() const { + return mDictFormatVersion >= FormatUtils::VERSION_402; + } + + const int *getCodePointTable() const { + return mCodePointTable; + } + + private: + DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); + + static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; + static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; + static const char *const IS_DECAYING_DICT_KEY; + static const char *const DATE_KEY; + static const char *const LAST_DECAYED_TIME_KEY; + static const char *const NGRAM_COUNT_KEYS[]; + static const char *const MAX_NGRAM_COUNT_KEYS[]; + static const int DEFAULT_MAX_NGRAM_COUNTS[]; + static const char *const EXTENDED_REGION_SIZE_KEY; + static const char *const HAS_HISTORICAL_INFO_KEY; + static const char *const LOCALE_KEY; + static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; + static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; + static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; + static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; + static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; + static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; + + const FormatUtils::FORMAT_VERSION mDictFormatVersion; + const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; + const int mSize; + DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap; + const std::vector<int> mLocale; + const float mMultiWordCostMultiplier; + const bool mRequiresGermanUmlautProcessing; + const bool mIsDecayingDict; + const int mDate; + const int mLastDecayedTime; + const EntryCounts mNgramCounts; + const EntryCounts mMaxNgramCounts; + const int mExtendedRegionSize; + const bool mHasHistoricalInfoOfWords; + const int mForgettingCurveProbabilityValuesTableId; + const int *const mCodePointTable; + + const std::vector<int> readLocale() const; + float readMultipleWordCostMultiplier() const; + bool readRequiresGermanUmlautProcessing() const; + const EntryCounts readNgramCounts() const; + const EntryCounts readMaxNgramCounts() const; + static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes( + const uint8_t *const dictBuf); +}; +} // namespace latinime +#endif /* LATINIME_HEADER_POLICY_H */ diff --git a/native/jni/src/dictionary/header/header_read_write_utils.cpp b/native/jni/src/dictionary/header/header_read_write_utils.cpp new file mode 100644 index 000000000..779f8b8c3 --- /dev/null +++ b/native/jni/src/dictionary/header/header_read_write_utils.cpp @@ -0,0 +1,248 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/header/header_read_write_utils.h" + +#include <cctype> +#include <cstdio> +#include <memory> +#include <vector> + +#include "defines.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/byte_array_utils.h" + +namespace latinime { + +// Number of base-10 digits in the largest integer + 1 to leave room for a zero terminator. +// As such, this is the maximum number of characters will be needed to represent an int as a +// string, including the terminator; this is used as the size of a string buffer large enough to +// hold any value that is intended to fit in an integer, e.g. in the code that reads the header +// of the binary dictionary where a {key,value} string pair scheme is used. +const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11; + +const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256; +const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048; + +const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4; +const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2; +const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2; +const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4; +const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable"; + +const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0; + +typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; + +/* static */ int HeaderReadWriteUtils::getHeaderSize(const uint8_t *const dictBuf) { + // See the format of the header in the comment in + // BinaryDictionaryFormatUtils::detectFormatVersion() + return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE + + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE); +} + +/* static */ HeaderReadWriteUtils::DictionaryFlags + HeaderReadWriteUtils::getFlags(const uint8_t *const dictBuf) { + return ByteArrayUtils::readUint16(dictBuf, + HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE); +} + +/* static */ HeaderReadWriteUtils::DictionaryFlags + HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( + const AttributeMap *const attributeMap) { + return NO_FLAGS; +} + +/* static */ void HeaderReadWriteUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf, + AttributeMap *const headerAttributes) { + const int headerSize = getHeaderSize(dictBuf); + int pos = getHeaderOptionsPosition(); + if (pos == NOT_A_DICT_POS) { + // The header doesn't have header options. + return; + } + int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH]; + std::unique_ptr<int[]> valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]); + while (pos < headerSize) { + // The values in the header don't use the code point table for their encoding. + const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, + MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos); + std::vector<int> key; + key.insert(key.end(), keyBuffer, keyBuffer + keyLength); + const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, + MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos); + std::vector<int> value; + value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength); + headerAttributes->insert(AttributeMap::value_type(key, value)); + } +} + +/* static */ const int *HeaderReadWriteUtils::readCodePointTable( + AttributeMap *const headerAttributes) { + AttributeMap::key_type keyVector; + insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector); + AttributeMap::const_iterator it = headerAttributes->find(keyVector); + if (it == headerAttributes->end()) { + return nullptr; + } + return it->second.data(); +} + +/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion( + BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version, + int *const writingPos) { + if (!buffer->writeUintAndAdvancePosition(FormatUtils::MAGIC_NUMBER, HEADER_MAGIC_NUMBER_SIZE, + writingPos)) { + return false; + } + switch (version) { + case FormatUtils::VERSION_2: + case FormatUtils::VERSION_201: + case FormatUtils::VERSION_202: + // None of the static dictionaries (v2x) support writing + return false; + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + case FormatUtils::VERSION_402: + case FormatUtils::VERSION_403: + return buffer->writeUintAndAdvancePosition(version /* data */, + HEADER_DICTIONARY_VERSION_SIZE, writingPos); + default: + return false; + } +} + +/* static */ bool HeaderReadWriteUtils::writeDictionaryFlags( + BufferWithExtendableBuffer *const buffer, const DictionaryFlags flags, + int *const writingPos) { + return buffer->writeUintAndAdvancePosition(flags, HEADER_FLAG_SIZE, writingPos); +} + +/* static */ bool HeaderReadWriteUtils::writeDictionaryHeaderSize( + BufferWithExtendableBuffer *const buffer, const int size, int *const writingPos) { + return buffer->writeUintAndAdvancePosition(size, HEADER_SIZE_FIELD_SIZE, writingPos); +} + +/* static */ bool HeaderReadWriteUtils::writeHeaderAttributes( + BufferWithExtendableBuffer *const buffer, const AttributeMap *const headerAttributes, + int *const writingPos) { + for (AttributeMap::const_iterator it = headerAttributes->begin(); + it != headerAttributes->end(); ++it) { + if (it->first.empty() || it->second.empty()) { + continue; + } + // Write a key. + if (!buffer->writeCodePointsAndAdvancePosition(&(it->first.at(0)), it->first.size(), + true /* writesTerminator */, writingPos)) { + return false; + } + // Write a value. + if (!buffer->writeCodePointsAndAdvancePosition(&(it->second.at(0)), it->second.size(), + true /* writesTerminator */, writingPos)) { + return false; + } + } + return true; +} + +/* static */ void HeaderReadWriteUtils::setCodePointVectorAttribute( + AttributeMap *const headerAttributes, const char *const key, + const std::vector<int> &value) { + AttributeMap::key_type keyVector; + insertCharactersIntoVector(key, &keyVector); + (*headerAttributes)[keyVector] = value; +} + +/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes, + const char *const key, const bool value) { + setIntAttribute(headerAttributes, key, value ? 1 : 0); +} + +/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes, + const char *const key, const int value) { + AttributeMap::key_type keyVector; + insertCharactersIntoVector(key, &keyVector); + setIntAttributeInner(headerAttributes, &keyVector, value); +} + +/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const int value) { + AttributeMap::mapped_type valueVector; + char charBuf[LARGEST_INT_DIGIT_COUNT]; + snprintf(charBuf, sizeof(charBuf), "%d", value); + insertCharactersIntoVector(charBuf, &valueVector); + (*headerAttributes)[*key] = valueVector; +} + +/* static */ const std::vector<int> HeaderReadWriteUtils::readCodePointVectorAttributeValue( + const AttributeMap *const headerAttributes, const char *const key) { + AttributeMap::key_type keyVector; + insertCharactersIntoVector(key, &keyVector); + AttributeMap::const_iterator it = headerAttributes->find(keyVector); + if (it == headerAttributes->end()) { + return std::vector<int>(); + } else { + return it->second; + } +} + +/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue( + const AttributeMap *const headerAttributes, const char *const key, + const bool defaultValue) { + const int intDefaultValue = defaultValue ? 1 : 0; + const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue); + return intValue != 0; +} + +/* static */ int HeaderReadWriteUtils::readIntAttributeValue( + const AttributeMap *const headerAttributes, const char *const key, + const int defaultValue) { + AttributeMap::key_type keyVector; + insertCharactersIntoVector(key, &keyVector); + return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue); +} + +/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner( + const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, + const int defaultValue) { + AttributeMap::const_iterator it = headerAttributes->find(*key); + if (it != headerAttributes->end()) { + int value = 0; + bool isNegative = false; + for (size_t i = 0; i < it->second.size(); ++i) { + if (i == 0 && it->second.at(i) == '-') { + isNegative = true; + } else { + if (!isdigit(it->second.at(i))) { + // If not a number. + return defaultValue; + } + value *= 10; + value += it->second.at(i) - '0'; + } + } + return isNegative ? -value : value; + } + return defaultValue; +} + +/* static */ void HeaderReadWriteUtils::insertCharactersIntoVector(const char *const characters, + std::vector<int> *const vector) { + for (int i = 0; characters[i]; ++i) { + vector->push_back(characters[i]); + } +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/header/header_read_write_utils.h b/native/jni/src/dictionary/header/header_read_write_utils.h new file mode 100644 index 000000000..f67d614df --- /dev/null +++ b/native/jni/src/dictionary/header/header_read_write_utils.h @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_HEADER_READ_WRITE_UTILS_H +#define LATINIME_HEADER_READ_WRITE_UTILS_H + +#include <cstdint> + +#include "defines.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/utils/format_utils.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class HeaderReadWriteUtils { + public: + typedef uint16_t DictionaryFlags; + + static int getHeaderSize(const uint8_t *const dictBuf); + + static DictionaryFlags getFlags(const uint8_t *const dictBuf); + + static AK_FORCE_INLINE int getHeaderOptionsPosition() { + return HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE + + HEADER_SIZE_FIELD_SIZE; + } + + static DictionaryFlags createAndGetDictionaryFlagsUsingAttributeMap( + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); + + static void fetchAllHeaderAttributes(const uint8_t *const dictBuf, + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes); + + static const int *readCodePointTable( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes); + + static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer, + const FormatUtils::FORMAT_VERSION version, int *const writingPos); + + static bool writeDictionaryFlags(BufferWithExtendableBuffer *const buffer, + const DictionaryFlags flags, int *const writingPos); + + static bool writeDictionaryHeaderSize(BufferWithExtendableBuffer *const buffer, + const int size, int *const writingPos); + + static bool writeHeaderAttributes(BufferWithExtendableBuffer *const buffer, + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + int *const writingPos); + + /** + * Methods for header attributes. + */ + static void setCodePointVectorAttribute( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key, const std::vector<int> &value); + + static void setBoolAttribute( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key, const bool value); + + static void setIntAttribute( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key, const int value); + + static const std::vector<int> readCodePointVectorAttributeValue( + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key); + + static bool readBoolAttributeValue( + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key, const bool defaultValue); + + static int readIntAttributeValue( + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key, const int defaultValue); + + static void insertCharactersIntoVector(const char *const characters, + DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils); + + static const int LARGEST_INT_DIGIT_COUNT; + static const int MAX_ATTRIBUTE_KEY_LENGTH; + static const int MAX_ATTRIBUTE_VALUE_LENGTH; + + static const int HEADER_MAGIC_NUMBER_SIZE; + static const int HEADER_DICTIONARY_VERSION_SIZE; + static const int HEADER_FLAG_SIZE; + static const int HEADER_SIZE_FIELD_SIZE; + + static const char *const CODE_POINT_TABLE_KEY; + + // Value for the "flags" field. It's unused at the moment. + static const DictionaryFlags NO_FLAGS; + + static void setIntAttributeInner( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key, + const int value); + + static int readIntAttributeValueInner( + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key, + const int defaultValue); +}; +} +#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */ |