aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni/src/dictionary/header
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src/dictionary/header')
-rw-r--r--native/jni/src/dictionary/header/header_policy.cpp183
-rw-r--r--native/jni/src/dictionary/header/header_policy.h268
-rw-r--r--native/jni/src/dictionary/header/header_read_write_utils.cpp248
-rw-r--r--native/jni/src/dictionary/header/header_read_write_utils.h123
4 files changed, 822 insertions, 0 deletions
diff --git a/native/jni/src/dictionary/header/header_policy.cpp b/native/jni/src/dictionary/header/header_policy.cpp
new file mode 100644
index 000000000..d4f84d39f
--- /dev/null
+++ b/native/jni/src/dictionary/header/header_policy.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/header/header_policy.h"
+
+#include <algorithm>
+
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+// Note that these are corresponding definitions in Java side in DictionaryHeader.
+const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
+const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY =
+ "REQUIRES_GERMAN_UMLAUT_PROCESSING";
+// TODO: Change attribute string to "IS_DECAYING_DICT".
+const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE";
+const char *const HeaderPolicy::DATE_KEY = "date";
+const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME";
+const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] =
+ {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT", "QUADGRAM_COUNT"};
+const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] =
+ {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT",
+ "MAX_QUADGRAM_ENTRY_COUNT"};
+const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000, 30000};
+const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
+// Historical info is information that is needed to support decaying such as timestamp, level and
+// count.
+const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO";
+const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration
+const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
+ "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
+
+const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
+const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
+const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3;
+
+// Used for logging. Question mark is used to indicate that the key is not found.
+void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,
+ int outValueSize) const {
+ if (outValueSize <= 0) return;
+ if (outValueSize == 1) {
+ outValue[0] = '\0';
+ return;
+ }
+ std::vector<int> keyCodePointVector;
+ HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector);
+ DictionaryHeaderStructurePolicy::AttributeMap::const_iterator it =
+ mAttributeMap.find(keyCodePointVector);
+ if (it == mAttributeMap.end()) {
+ // The key was not found.
+ outValue[0] = '?';
+ outValue[1] = '\0';
+ return;
+ }
+ const int terminalIndex = std::min(static_cast<int>(it->second.size()), outValueSize - 1);
+ for (int i = 0; i < terminalIndex; ++i) {
+ outValue[i] = it->second[i];
+ }
+ outValue[terminalIndex] = '\0';
+}
+
+const std::vector<int> HeaderPolicy::readLocale() const {
+ return HeaderReadWriteUtils::readCodePointVectorAttributeValue(&mAttributeMap, LOCALE_KEY);
+}
+
+float HeaderPolicy::readMultipleWordCostMultiplier() const {
+ const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
+ if (demotionRate <= 0) {
+ return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+ }
+ return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
+}
+
+bool HeaderPolicy::readRequiresGermanUmlautProcessing() const {
+ return HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
+ REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false);
+}
+
+bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
+ const EntryCounts &entryCounts, const int extendedRegionSize,
+ BufferWithExtendableBuffer *const outBuffer) const {
+ int writingPos = 0;
+ DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap);
+ fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite);
+ if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion,
+ &writingPos)) {
+ return false;
+ }
+ if (!HeaderReadWriteUtils::writeDictionaryFlags(outBuffer, mDictionaryFlags,
+ &writingPos)) {
+ return false;
+ }
+ // Temporarily writes a dummy header size.
+ int headerSizeFieldPos = writingPos;
+ if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, 0 /* size */,
+ &writingPos)) {
+ return false;
+ }
+ if (!HeaderReadWriteUtils::writeHeaderAttributes(outBuffer, &attributeMapToWrite,
+ &writingPos)) {
+ return false;
+ }
+ // Writes the actual header size.
+ if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, writingPos,
+ &headerSizeFieldPos)) {
+ return false;
+ }
+ return true;
+}
+
+namespace {
+
+int getIndexFromNgramType(const NgramType ngramType) {
+ return static_cast<int>(ngramType);
+}
+
+} // namespace
+
+void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime,
+ const EntryCounts &entryCounts, const int extendedRegionSize,
+ DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const {
+ for (const auto ngramType : AllNgramTypes::ASCENDING) {
+ HeaderReadWriteUtils::setIntAttribute(outAttributeMap,
+ NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)],
+ entryCounts.getNgramCount(ngramType));
+ }
+ HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY,
+ extendedRegionSize);
+ // Set the current time as the generation time.
+ HeaderReadWriteUtils::setIntAttribute(outAttributeMap, DATE_KEY,
+ TimeKeeper::peekCurrentTime());
+ HeaderReadWriteUtils::setCodePointVectorAttribute(outAttributeMap, LOCALE_KEY, mLocale);
+ if (updatesLastDecayedTime) {
+ // Set current time as the last updated time.
+ HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_DECAYED_TIME_KEY,
+ TimeKeeper::peekCurrentTime());
+ }
+}
+
+/* static */ DictionaryHeaderStructurePolicy::AttributeMap
+ HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) {
+ DictionaryHeaderStructurePolicy::AttributeMap attributeMap;
+ HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap);
+ return attributeMap;
+}
+
+/* static */ const EntryCounts HeaderPolicy::readNgramCounts() const {
+ MutableEntryCounters entryCounters;
+ for (const auto ngramType : AllNgramTypes::ASCENDING) {
+ const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */);
+ entryCounters.setNgramCount(ngramType, entryCount);
+ }
+ return entryCounters.getEntryCounts();
+}
+
+/* static */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const {
+ MutableEntryCounters entryCounters;
+ for (const auto ngramType : AllNgramTypes::ASCENDING) {
+ const int index = getIndexFromNgramType(ngramType);
+ const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]);
+ entryCounters.setNgramCount(ngramType, maxEntryCount);
+ }
+ return entryCounters.getEntryCounts();
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/header/header_policy.h b/native/jni/src/dictionary/header/header_policy.h
new file mode 100644
index 000000000..47cc9196a
--- /dev/null
+++ b/native/jni/src/dictionary/header/header_policy.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_HEADER_POLICY_H
+#define LATINIME_HEADER_POLICY_H
+
+#include <cstdint>
+
+#include "defines.h"
+#include "dictionary/header/header_read_write_utils.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/utils/entry_counters.h"
+#include "dictionary/utils/format_utils.h"
+#include "utils/char_utils.h"
+#include "utils/time_keeper.h"
+
+namespace latinime {
+
+class HeaderPolicy : public DictionaryHeaderStructurePolicy {
+ public:
+ // Reads information from existing dictionary buffer.
+ HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion)
+ : mDictFormatVersion(formatVersion),
+ mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)),
+ mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
+ mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
+ mLocale(readLocale()),
+ mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
+ mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
+ mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
+ IS_DECAYING_DICT_KEY, false /* defaultValue */)),
+ mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
+ mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
+ mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
+ mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)),
+ mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
+ &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
+ mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
+ DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
+ mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
+
+ // Constructs header information using an attribute map.
+ HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
+ const std::vector<int> &locale,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap)
+ : mDictFormatVersion(dictFormatVersion),
+ mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
+ attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale),
+ mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
+ mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
+ mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
+ IS_DECAYING_DICT_KEY, false /* defaultValue */)),
+ mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
+ mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
+ mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
+ mExtendedRegionSize(0),
+ mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
+ &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
+ mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
+ DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
+ mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
+
+ // Copy header information
+ HeaderPolicy(const HeaderPolicy *const headerPolicy)
+ : mDictFormatVersion(headerPolicy->mDictFormatVersion),
+ mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize),
+ mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale),
+ mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier),
+ mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing),
+ mIsDecayingDict(headerPolicy->mIsDecayingDict),
+ mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime),
+ mNgramCounts(headerPolicy->mNgramCounts),
+ mMaxNgramCounts(headerPolicy->mMaxNgramCounts),
+ mExtendedRegionSize(headerPolicy->mExtendedRegionSize),
+ mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords),
+ mForgettingCurveProbabilityValuesTableId(
+ headerPolicy->mForgettingCurveProbabilityValuesTableId),
+ mCodePointTable(headerPolicy->mCodePointTable) {}
+
+ // Temporary dummy header.
+ HeaderPolicy()
+ : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0),
+ mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f),
+ mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
+ mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(),
+ mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
+ mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {}
+
+ ~HeaderPolicy() {}
+
+ virtual int getFormatVersionNumber() const {
+ // Conceptually this converts the symbolic value we use in the code into the
+ // hardcoded of the bytes in the file. But we want the constants to be the
+ // same so we use them for both here.
+ switch (mDictFormatVersion) {
+ case FormatUtils::VERSION_2:
+ case FormatUtils::VERSION_201:
+ AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
+ return FormatUtils::UNKNOWN_VERSION;
+ case FormatUtils::VERSION_202:
+ return FormatUtils::VERSION_202;
+ case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+ return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
+ case FormatUtils::VERSION_402:
+ return FormatUtils::VERSION_402;
+ case FormatUtils::VERSION_403:
+ return FormatUtils::VERSION_403;
+ default:
+ return FormatUtils::UNKNOWN_VERSION;
+ }
+ }
+
+ AK_FORCE_INLINE bool isValid() const {
+ // Decaying dictionary must have historical information.
+ if (!mIsDecayingDict) {
+ return true;
+ }
+ if (mHasHistoricalInfoOfWords) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ AK_FORCE_INLINE int getSize() const {
+ return mSize;
+ }
+
+ AK_FORCE_INLINE float getMultiWordCostMultiplier() const {
+ return mMultiWordCostMultiplier;
+ }
+
+ AK_FORCE_INLINE bool isDecayingDict() const {
+ return mIsDecayingDict;
+ }
+
+ AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const {
+ return mRequiresGermanUmlautProcessing;
+ }
+
+ AK_FORCE_INLINE int getDate() const {
+ return mDate;
+ }
+
+ AK_FORCE_INLINE int getLastDecayedTime() const {
+ return mLastDecayedTime;
+ }
+
+ AK_FORCE_INLINE const EntryCounts &getNgramCounts() const {
+ return mNgramCounts;
+ }
+
+ AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const {
+ return mMaxNgramCounts;
+ }
+
+ AK_FORCE_INLINE int getExtendedRegionSize() const {
+ return mExtendedRegionSize;
+ }
+
+ AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const {
+ return mHasHistoricalInfoOfWords;
+ }
+
+ AK_FORCE_INLINE bool shouldBoostExactMatches() const {
+ // TODO: Investigate better ways to handle exact matches for personalized dictionaries.
+ return !isDecayingDict();
+ }
+
+ const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const {
+ return &mAttributeMap;
+ }
+
+ AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const {
+ return mForgettingCurveProbabilityValuesTableId;
+ }
+
+ void readHeaderValueOrQuestionMark(const char *const key,
+ int *outValue, int outValueSize) const;
+
+ bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
+ const EntryCounts &entryCounts, const int extendedRegionSize,
+ BufferWithExtendableBuffer *const outBuffer) const;
+
+ void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts,
+ const int extendedRegionSize,
+ DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const;
+
+ AK_FORCE_INLINE const std::vector<int> *getLocale() const {
+ return &mLocale;
+ }
+
+ bool supportsBeginningOfSentence() const {
+ return mDictFormatVersion >= FormatUtils::VERSION_402;
+ }
+
+ const int *getCodePointTable() const {
+ return mCodePointTable;
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
+
+ static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
+ static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
+ static const char *const IS_DECAYING_DICT_KEY;
+ static const char *const DATE_KEY;
+ static const char *const LAST_DECAYED_TIME_KEY;
+ static const char *const NGRAM_COUNT_KEYS[];
+ static const char *const MAX_NGRAM_COUNT_KEYS[];
+ static const int DEFAULT_MAX_NGRAM_COUNTS[];
+ static const char *const EXTENDED_REGION_SIZE_KEY;
+ static const char *const HAS_HISTORICAL_INFO_KEY;
+ static const char *const LOCALE_KEY;
+ static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY;
+ static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY;
+ static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY;
+ static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
+ static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
+ static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
+
+ const FormatUtils::FORMAT_VERSION mDictFormatVersion;
+ const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
+ const int mSize;
+ DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap;
+ const std::vector<int> mLocale;
+ const float mMultiWordCostMultiplier;
+ const bool mRequiresGermanUmlautProcessing;
+ const bool mIsDecayingDict;
+ const int mDate;
+ const int mLastDecayedTime;
+ const EntryCounts mNgramCounts;
+ const EntryCounts mMaxNgramCounts;
+ const int mExtendedRegionSize;
+ const bool mHasHistoricalInfoOfWords;
+ const int mForgettingCurveProbabilityValuesTableId;
+ const int *const mCodePointTable;
+
+ const std::vector<int> readLocale() const;
+ float readMultipleWordCostMultiplier() const;
+ bool readRequiresGermanUmlautProcessing() const;
+ const EntryCounts readNgramCounts() const;
+ const EntryCounts readMaxNgramCounts() const;
+ static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes(
+ const uint8_t *const dictBuf);
+};
+} // namespace latinime
+#endif /* LATINIME_HEADER_POLICY_H */
diff --git a/native/jni/src/dictionary/header/header_read_write_utils.cpp b/native/jni/src/dictionary/header/header_read_write_utils.cpp
new file mode 100644
index 000000000..779f8b8c3
--- /dev/null
+++ b/native/jni/src/dictionary/header/header_read_write_utils.cpp
@@ -0,0 +1,248 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/header/header_read_write_utils.h"
+
+#include <cctype>
+#include <cstdio>
+#include <memory>
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+// Number of base-10 digits in the largest integer + 1 to leave room for a zero terminator.
+// As such, this is the maximum number of characters will be needed to represent an int as a
+// string, including the terminator; this is used as the size of a string buffer large enough to
+// hold any value that is intended to fit in an integer, e.g. in the code that reads the header
+// of the binary dictionary where a {key,value} string pair scheme is used.
+const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11;
+
+const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
+const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048;
+
+const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
+const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
+const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
+const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
+const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable";
+
+const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
+
+typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
+
+/* static */ int HeaderReadWriteUtils::getHeaderSize(const uint8_t *const dictBuf) {
+ // See the format of the header in the comment in
+ // BinaryDictionaryFormatUtils::detectFormatVersion()
+ return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE
+ + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE);
+}
+
+/* static */ HeaderReadWriteUtils::DictionaryFlags
+ HeaderReadWriteUtils::getFlags(const uint8_t *const dictBuf) {
+ return ByteArrayUtils::readUint16(dictBuf,
+ HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE);
+}
+
+/* static */ HeaderReadWriteUtils::DictionaryFlags
+ HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
+ const AttributeMap *const attributeMap) {
+ return NO_FLAGS;
+}
+
+/* static */ void HeaderReadWriteUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf,
+ AttributeMap *const headerAttributes) {
+ const int headerSize = getHeaderSize(dictBuf);
+ int pos = getHeaderOptionsPosition();
+ if (pos == NOT_A_DICT_POS) {
+ // The header doesn't have header options.
+ return;
+ }
+ int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH];
+ std::unique_ptr<int[]> valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]);
+ while (pos < headerSize) {
+ // The values in the header don't use the code point table for their encoding.
+ const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
+ MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos);
+ std::vector<int> key;
+ key.insert(key.end(), keyBuffer, keyBuffer + keyLength);
+ const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
+ MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos);
+ std::vector<int> value;
+ value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength);
+ headerAttributes->insert(AttributeMap::value_type(key, value));
+ }
+}
+
+/* static */ const int *HeaderReadWriteUtils::readCodePointTable(
+ AttributeMap *const headerAttributes) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector);
+ AttributeMap::const_iterator it = headerAttributes->find(keyVector);
+ if (it == headerAttributes->end()) {
+ return nullptr;
+ }
+ return it->second.data();
+}
+
+/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
+ BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
+ int *const writingPos) {
+ if (!buffer->writeUintAndAdvancePosition(FormatUtils::MAGIC_NUMBER, HEADER_MAGIC_NUMBER_SIZE,
+ writingPos)) {
+ return false;
+ }
+ switch (version) {
+ case FormatUtils::VERSION_2:
+ case FormatUtils::VERSION_201:
+ case FormatUtils::VERSION_202:
+ // None of the static dictionaries (v2x) support writing
+ return false;
+ case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+ case FormatUtils::VERSION_402:
+ case FormatUtils::VERSION_403:
+ return buffer->writeUintAndAdvancePosition(version /* data */,
+ HEADER_DICTIONARY_VERSION_SIZE, writingPos);
+ default:
+ return false;
+ }
+}
+
+/* static */ bool HeaderReadWriteUtils::writeDictionaryFlags(
+ BufferWithExtendableBuffer *const buffer, const DictionaryFlags flags,
+ int *const writingPos) {
+ return buffer->writeUintAndAdvancePosition(flags, HEADER_FLAG_SIZE, writingPos);
+}
+
+/* static */ bool HeaderReadWriteUtils::writeDictionaryHeaderSize(
+ BufferWithExtendableBuffer *const buffer, const int size, int *const writingPos) {
+ return buffer->writeUintAndAdvancePosition(size, HEADER_SIZE_FIELD_SIZE, writingPos);
+}
+
+/* static */ bool HeaderReadWriteUtils::writeHeaderAttributes(
+ BufferWithExtendableBuffer *const buffer, const AttributeMap *const headerAttributes,
+ int *const writingPos) {
+ for (AttributeMap::const_iterator it = headerAttributes->begin();
+ it != headerAttributes->end(); ++it) {
+ if (it->first.empty() || it->second.empty()) {
+ continue;
+ }
+ // Write a key.
+ if (!buffer->writeCodePointsAndAdvancePosition(&(it->first.at(0)), it->first.size(),
+ true /* writesTerminator */, writingPos)) {
+ return false;
+ }
+ // Write a value.
+ if (!buffer->writeCodePointsAndAdvancePosition(&(it->second.at(0)), it->second.size(),
+ true /* writesTerminator */, writingPos)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+/* static */ void HeaderReadWriteUtils::setCodePointVectorAttribute(
+ AttributeMap *const headerAttributes, const char *const key,
+ const std::vector<int> &value) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(key, &keyVector);
+ (*headerAttributes)[keyVector] = value;
+}
+
+/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes,
+ const char *const key, const bool value) {
+ setIntAttribute(headerAttributes, key, value ? 1 : 0);
+}
+
+/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes,
+ const char *const key, const int value) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(key, &keyVector);
+ setIntAttributeInner(headerAttributes, &keyVector, value);
+}
+
+/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes,
+ const AttributeMap::key_type *const key, const int value) {
+ AttributeMap::mapped_type valueVector;
+ char charBuf[LARGEST_INT_DIGIT_COUNT];
+ snprintf(charBuf, sizeof(charBuf), "%d", value);
+ insertCharactersIntoVector(charBuf, &valueVector);
+ (*headerAttributes)[*key] = valueVector;
+}
+
+/* static */ const std::vector<int> HeaderReadWriteUtils::readCodePointVectorAttributeValue(
+ const AttributeMap *const headerAttributes, const char *const key) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(key, &keyVector);
+ AttributeMap::const_iterator it = headerAttributes->find(keyVector);
+ if (it == headerAttributes->end()) {
+ return std::vector<int>();
+ } else {
+ return it->second;
+ }
+}
+
+/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue(
+ const AttributeMap *const headerAttributes, const char *const key,
+ const bool defaultValue) {
+ const int intDefaultValue = defaultValue ? 1 : 0;
+ const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue);
+ return intValue != 0;
+}
+
+/* static */ int HeaderReadWriteUtils::readIntAttributeValue(
+ const AttributeMap *const headerAttributes, const char *const key,
+ const int defaultValue) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(key, &keyVector);
+ return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue);
+}
+
+/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner(
+ const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
+ const int defaultValue) {
+ AttributeMap::const_iterator it = headerAttributes->find(*key);
+ if (it != headerAttributes->end()) {
+ int value = 0;
+ bool isNegative = false;
+ for (size_t i = 0; i < it->second.size(); ++i) {
+ if (i == 0 && it->second.at(i) == '-') {
+ isNegative = true;
+ } else {
+ if (!isdigit(it->second.at(i))) {
+ // If not a number.
+ return defaultValue;
+ }
+ value *= 10;
+ value += it->second.at(i) - '0';
+ }
+ }
+ return isNegative ? -value : value;
+ }
+ return defaultValue;
+}
+
+/* static */ void HeaderReadWriteUtils::insertCharactersIntoVector(const char *const characters,
+ std::vector<int> *const vector) {
+ for (int i = 0; characters[i]; ++i) {
+ vector->push_back(characters[i]);
+ }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/header/header_read_write_utils.h b/native/jni/src/dictionary/header/header_read_write_utils.h
new file mode 100644
index 000000000..f67d614df
--- /dev/null
+++ b/native/jni/src/dictionary/header/header_read_write_utils.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_HEADER_READ_WRITE_UTILS_H
+#define LATINIME_HEADER_READ_WRITE_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/utils/format_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+class HeaderReadWriteUtils {
+ public:
+ typedef uint16_t DictionaryFlags;
+
+ static int getHeaderSize(const uint8_t *const dictBuf);
+
+ static DictionaryFlags getFlags(const uint8_t *const dictBuf);
+
+ static AK_FORCE_INLINE int getHeaderOptionsPosition() {
+ return HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE
+ + HEADER_SIZE_FIELD_SIZE;
+ }
+
+ static DictionaryFlags createAndGetDictionaryFlagsUsingAttributeMap(
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap);
+
+ static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
+
+ static const int *readCodePointTable(
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
+
+ static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer,
+ const FormatUtils::FORMAT_VERSION version, int *const writingPos);
+
+ static bool writeDictionaryFlags(BufferWithExtendableBuffer *const buffer,
+ const DictionaryFlags flags, int *const writingPos);
+
+ static bool writeDictionaryHeaderSize(BufferWithExtendableBuffer *const buffer,
+ const int size, int *const writingPos);
+
+ static bool writeHeaderAttributes(BufferWithExtendableBuffer *const buffer,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ int *const writingPos);
+
+ /**
+ * Methods for header attributes.
+ */
+ static void setCodePointVectorAttribute(
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const char *const key, const std::vector<int> &value);
+
+ static void setBoolAttribute(
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const char *const key, const bool value);
+
+ static void setIntAttribute(
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const char *const key, const int value);
+
+ static const std::vector<int> readCodePointVectorAttributeValue(
+ const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const char *const key);
+
+ static bool readBoolAttributeValue(
+ const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const char *const key, const bool defaultValue);
+
+ static int readIntAttributeValue(
+ const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const char *const key, const int defaultValue);
+
+ static void insertCharactersIntoVector(const char *const characters,
+ DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils);
+
+ static const int LARGEST_INT_DIGIT_COUNT;
+ static const int MAX_ATTRIBUTE_KEY_LENGTH;
+ static const int MAX_ATTRIBUTE_VALUE_LENGTH;
+
+ static const int HEADER_MAGIC_NUMBER_SIZE;
+ static const int HEADER_DICTIONARY_VERSION_SIZE;
+ static const int HEADER_FLAG_SIZE;
+ static const int HEADER_SIZE_FIELD_SIZE;
+
+ static const char *const CODE_POINT_TABLE_KEY;
+
+ // Value for the "flags" field. It's unused at the moment.
+ static const DictionaryFlags NO_FLAGS;
+
+ static void setIntAttributeInner(
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key,
+ const int value);
+
+ static int readIntAttributeValueInner(
+ const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key,
+ const int defaultValue);
+};
+}
+#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */