aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni/src/dictionary
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src/dictionary')
-rw-r--r--native/jni/src/dictionary/header/header_policy.cpp183
-rw-r--r--native/jni/src/dictionary/header/header_policy.h268
-rw-r--r--native/jni/src/dictionary/header/header_read_write_utils.cpp248
-rw-r--r--native/jni/src/dictionary/header/header_read_write_utils.h123
-rw-r--r--native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h42
-rw-r--r--native/jni/src/dictionary/interface/dictionary_header_structure_policy.h63
-rw-r--r--native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h46
-rw-r--r--native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h124
-rw-r--r--native/jni/src/dictionary/interface/ngram_listener.h42
-rw-r--r--native/jni/src/dictionary/property/historical_info.h59
-rw-r--r--native/jni/src/dictionary/property/ngram_context.cpp123
-rw-r--r--native/jni/src/dictionary/property/ngram_context.h78
-rw-r--r--native/jni/src/dictionary/property/ngram_property.h62
-rw-r--r--native/jni/src/dictionary/property/unigram_property.h137
-rw-r--r--native/jni/src/dictionary/property/word_attributes.h68
-rw-r--r--native/jni/src/dictionary/property/word_property.h62
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/Readme.txt1
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp289
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h94
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp226
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h135
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h110
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/dict_content.h47
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp170
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h74
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h90
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp199
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h101
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h88
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp50
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h124
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp111
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h73
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h118
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp157
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h152
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp81
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h84
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp110
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h79
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp442
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h150
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp662
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h181
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp39
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h52
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp307
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h140
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp90
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h57
-rw-r--r--native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp209
-rw-r--r--native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h64
-rw-r--r--native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp95
-rw-r--r--native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h68
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp144
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h173
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp321
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h282
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp72
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h83
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp299
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h96
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp132
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h79
-rw-r--r--native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp164
-rw-r--r--native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h133
-rw-r--r--native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h45
-rw-r--r--native/jni/src/dictionary/structure/pt_common/pt_node_params.h262
-rw-r--r--native/jni/src/dictionary/structure/pt_common/pt_node_reader.h40
-rw-r--r--native/jni/src/dictionary/structure/pt_common/pt_node_writer.h97
-rw-r--r--native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp53
-rw-r--r--native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h72
-rw-r--r--native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h59
-rw-r--r--native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp526
-rw-r--r--native/jni/src/dictionary/structure/v2/patricia_trie_policy.h180
-rw-r--r--native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h73
-rw-r--r--native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp52
-rw-r--r--native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h52
-rw-r--r--native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp54
-rw-r--r--native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h43
-rw-r--r--native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp34
-rw-r--r--native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h77
-rw-r--r--native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp478
-rw-r--r--native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h258
-rw-r--r--native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp32
-rw-r--r--native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h101
-rw-r--r--native/jni/src/dictionary/structure/v4/content/probability_entry.h176
-rw-r--r--native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp182
-rw-r--r--native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h92
-rw-r--r--native/jni/src/dictionary/structure/v4/content/single_dict_content.h64
-rw-r--r--native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp40
-rw-r--r--native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h91
-rw-r--r--native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp98
-rw-r--r--native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h63
-rw-r--r--native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h106
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp194
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h132
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp72
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_dict_constants.h75
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp91
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h55
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp354
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h108
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp603
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h149
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp28
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h37
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp185
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h76
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp79
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h42
-rw-r--r--native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h69
-rw-r--r--native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h61
-rw-r--r--native/jni/src/dictionary/utils/bloom_filter.h69
-rw-r--r--native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp170
-rw-r--r--native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h125
-rw-r--r--native/jni/src/dictionary/utils/byte_array_utils.cpp25
-rw-r--r--native/jni/src/dictionary/utils/byte_array_utils.h290
-rw-r--r--native/jni/src/dictionary/utils/dict_file_writing_utils.cpp144
-rw-r--r--native/jni/src/dictionary/utils/dict_file_writing_utils.h67
-rw-r--r--native/jni/src/dictionary/utils/entry_counters.h89
-rw-r--r--native/jni/src/dictionary/utils/file_utils.cpp171
-rw-r--r--native/jni/src/dictionary/utils/file_utils.h60
-rw-r--r--native/jni/src/dictionary/utils/forgetting_curve_utils.cpp234
-rw-r--r--native/jni/src/dictionary/utils/forgetting_curve_utils.h112
-rw-r--r--native/jni/src/dictionary/utils/format_utils.cpp71
-rw-r--r--native/jni/src/dictionary/utils/format_utils.h59
-rw-r--r--native/jni/src/dictionary/utils/mmapped_buffer.cpp98
-rw-r--r--native/jni/src/dictionary/utils/mmapped_buffer.h76
-rw-r--r--native/jni/src/dictionary/utils/multi_bigram_map.cpp100
-rw-r--r--native/jni/src/dictionary/utils/multi_bigram_map.h84
-rw-r--r--native/jni/src/dictionary/utils/probability_utils.cpp23
-rw-r--r--native/jni/src/dictionary/utils/probability_utils.h69
-rw-r--r--native/jni/src/dictionary/utils/sparse_table.cpp101
-rw-r--r--native/jni/src/dictionary/utils/sparse_table.h60
-rw-r--r--native/jni/src/dictionary/utils/trie_map.cpp460
-rw-r--r--native/jni/src/dictionary/utils/trie_map.h399
137 files changed, 18091 insertions, 0 deletions
diff --git a/native/jni/src/dictionary/header/header_policy.cpp b/native/jni/src/dictionary/header/header_policy.cpp
new file mode 100644
index 000000000..d4f84d39f
--- /dev/null
+++ b/native/jni/src/dictionary/header/header_policy.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/header/header_policy.h"
+
+#include <algorithm>
+
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+// Note that these are corresponding definitions in Java side in DictionaryHeader.
+const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
+const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY =
+ "REQUIRES_GERMAN_UMLAUT_PROCESSING";
+// TODO: Change attribute string to "IS_DECAYING_DICT".
+const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE";
+const char *const HeaderPolicy::DATE_KEY = "date";
+const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME";
+const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] =
+ {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT", "QUADGRAM_COUNT"};
+const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] =
+ {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT",
+ "MAX_QUADGRAM_ENTRY_COUNT"};
+const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000, 30000};
+const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
+// Historical info is information that is needed to support decaying such as timestamp, level and
+// count.
+const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO";
+const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration
+const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
+ "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
+
+const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
+const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
+const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3;
+
+// Used for logging. Question mark is used to indicate that the key is not found.
+void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,
+ int outValueSize) const {
+ if (outValueSize <= 0) return;
+ if (outValueSize == 1) {
+ outValue[0] = '\0';
+ return;
+ }
+ std::vector<int> keyCodePointVector;
+ HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector);
+ DictionaryHeaderStructurePolicy::AttributeMap::const_iterator it =
+ mAttributeMap.find(keyCodePointVector);
+ if (it == mAttributeMap.end()) {
+ // The key was not found.
+ outValue[0] = '?';
+ outValue[1] = '\0';
+ return;
+ }
+ const int terminalIndex = std::min(static_cast<int>(it->second.size()), outValueSize - 1);
+ for (int i = 0; i < terminalIndex; ++i) {
+ outValue[i] = it->second[i];
+ }
+ outValue[terminalIndex] = '\0';
+}
+
+const std::vector<int> HeaderPolicy::readLocale() const {
+ return HeaderReadWriteUtils::readCodePointVectorAttributeValue(&mAttributeMap, LOCALE_KEY);
+}
+
+float HeaderPolicy::readMultipleWordCostMultiplier() const {
+ const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
+ if (demotionRate <= 0) {
+ return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+ }
+ return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
+}
+
+bool HeaderPolicy::readRequiresGermanUmlautProcessing() const {
+ return HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
+ REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false);
+}
+
+bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
+ const EntryCounts &entryCounts, const int extendedRegionSize,
+ BufferWithExtendableBuffer *const outBuffer) const {
+ int writingPos = 0;
+ DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap);
+ fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite);
+ if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion,
+ &writingPos)) {
+ return false;
+ }
+ if (!HeaderReadWriteUtils::writeDictionaryFlags(outBuffer, mDictionaryFlags,
+ &writingPos)) {
+ return false;
+ }
+ // Temporarily writes a dummy header size.
+ int headerSizeFieldPos = writingPos;
+ if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, 0 /* size */,
+ &writingPos)) {
+ return false;
+ }
+ if (!HeaderReadWriteUtils::writeHeaderAttributes(outBuffer, &attributeMapToWrite,
+ &writingPos)) {
+ return false;
+ }
+ // Writes the actual header size.
+ if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, writingPos,
+ &headerSizeFieldPos)) {
+ return false;
+ }
+ return true;
+}
+
+namespace {
+
+int getIndexFromNgramType(const NgramType ngramType) {
+ return static_cast<int>(ngramType);
+}
+
+} // namespace
+
+void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime,
+ const EntryCounts &entryCounts, const int extendedRegionSize,
+ DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const {
+ for (const auto ngramType : AllNgramTypes::ASCENDING) {
+ HeaderReadWriteUtils::setIntAttribute(outAttributeMap,
+ NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)],
+ entryCounts.getNgramCount(ngramType));
+ }
+ HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY,
+ extendedRegionSize);
+ // Set the current time as the generation time.
+ HeaderReadWriteUtils::setIntAttribute(outAttributeMap, DATE_KEY,
+ TimeKeeper::peekCurrentTime());
+ HeaderReadWriteUtils::setCodePointVectorAttribute(outAttributeMap, LOCALE_KEY, mLocale);
+ if (updatesLastDecayedTime) {
+ // Set current time as the last updated time.
+ HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_DECAYED_TIME_KEY,
+ TimeKeeper::peekCurrentTime());
+ }
+}
+
+/* static */ DictionaryHeaderStructurePolicy::AttributeMap
+ HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) {
+ DictionaryHeaderStructurePolicy::AttributeMap attributeMap;
+ HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap);
+ return attributeMap;
+}
+
+/* static */ const EntryCounts HeaderPolicy::readNgramCounts() const {
+ MutableEntryCounters entryCounters;
+ for (const auto ngramType : AllNgramTypes::ASCENDING) {
+ const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */);
+ entryCounters.setNgramCount(ngramType, entryCount);
+ }
+ return entryCounters.getEntryCounts();
+}
+
+/* static */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const {
+ MutableEntryCounters entryCounters;
+ for (const auto ngramType : AllNgramTypes::ASCENDING) {
+ const int index = getIndexFromNgramType(ngramType);
+ const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]);
+ entryCounters.setNgramCount(ngramType, maxEntryCount);
+ }
+ return entryCounters.getEntryCounts();
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/header/header_policy.h b/native/jni/src/dictionary/header/header_policy.h
new file mode 100644
index 000000000..47cc9196a
--- /dev/null
+++ b/native/jni/src/dictionary/header/header_policy.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_HEADER_POLICY_H
+#define LATINIME_HEADER_POLICY_H
+
+#include <cstdint>
+
+#include "defines.h"
+#include "dictionary/header/header_read_write_utils.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/utils/entry_counters.h"
+#include "dictionary/utils/format_utils.h"
+#include "utils/char_utils.h"
+#include "utils/time_keeper.h"
+
+namespace latinime {
+
+class HeaderPolicy : public DictionaryHeaderStructurePolicy {
+ public:
+ // Reads information from existing dictionary buffer.
+ HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion)
+ : mDictFormatVersion(formatVersion),
+ mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)),
+ mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
+ mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
+ mLocale(readLocale()),
+ mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
+ mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
+ mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
+ IS_DECAYING_DICT_KEY, false /* defaultValue */)),
+ mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
+ mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
+ mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
+ mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)),
+ mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
+ &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
+ mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
+ DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
+ mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
+
+ // Constructs header information using an attribute map.
+ HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
+ const std::vector<int> &locale,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap)
+ : mDictFormatVersion(dictFormatVersion),
+ mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
+ attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale),
+ mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
+ mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
+ mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
+ IS_DECAYING_DICT_KEY, false /* defaultValue */)),
+ mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
+ mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
+ mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
+ mExtendedRegionSize(0),
+ mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
+ &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
+ mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
+ &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
+ DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
+ mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
+
+ // Copy header information
+ HeaderPolicy(const HeaderPolicy *const headerPolicy)
+ : mDictFormatVersion(headerPolicy->mDictFormatVersion),
+ mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize),
+ mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale),
+ mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier),
+ mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing),
+ mIsDecayingDict(headerPolicy->mIsDecayingDict),
+ mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime),
+ mNgramCounts(headerPolicy->mNgramCounts),
+ mMaxNgramCounts(headerPolicy->mMaxNgramCounts),
+ mExtendedRegionSize(headerPolicy->mExtendedRegionSize),
+ mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords),
+ mForgettingCurveProbabilityValuesTableId(
+ headerPolicy->mForgettingCurveProbabilityValuesTableId),
+ mCodePointTable(headerPolicy->mCodePointTable) {}
+
+ // Temporary dummy header.
+ HeaderPolicy()
+ : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0),
+ mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f),
+ mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
+ mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(),
+ mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
+ mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {}
+
+ ~HeaderPolicy() {}
+
+ virtual int getFormatVersionNumber() const {
+ // Conceptually this converts the symbolic value we use in the code into the
+ // hardcoded of the bytes in the file. But we want the constants to be the
+ // same so we use them for both here.
+ switch (mDictFormatVersion) {
+ case FormatUtils::VERSION_2:
+ case FormatUtils::VERSION_201:
+ AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
+ return FormatUtils::UNKNOWN_VERSION;
+ case FormatUtils::VERSION_202:
+ return FormatUtils::VERSION_202;
+ case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+ return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
+ case FormatUtils::VERSION_402:
+ return FormatUtils::VERSION_402;
+ case FormatUtils::VERSION_403:
+ return FormatUtils::VERSION_403;
+ default:
+ return FormatUtils::UNKNOWN_VERSION;
+ }
+ }
+
+ AK_FORCE_INLINE bool isValid() const {
+ // Decaying dictionary must have historical information.
+ if (!mIsDecayingDict) {
+ return true;
+ }
+ if (mHasHistoricalInfoOfWords) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ AK_FORCE_INLINE int getSize() const {
+ return mSize;
+ }
+
+ AK_FORCE_INLINE float getMultiWordCostMultiplier() const {
+ return mMultiWordCostMultiplier;
+ }
+
+ AK_FORCE_INLINE bool isDecayingDict() const {
+ return mIsDecayingDict;
+ }
+
+ AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const {
+ return mRequiresGermanUmlautProcessing;
+ }
+
+ AK_FORCE_INLINE int getDate() const {
+ return mDate;
+ }
+
+ AK_FORCE_INLINE int getLastDecayedTime() const {
+ return mLastDecayedTime;
+ }
+
+ AK_FORCE_INLINE const EntryCounts &getNgramCounts() const {
+ return mNgramCounts;
+ }
+
+ AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const {
+ return mMaxNgramCounts;
+ }
+
+ AK_FORCE_INLINE int getExtendedRegionSize() const {
+ return mExtendedRegionSize;
+ }
+
+ AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const {
+ return mHasHistoricalInfoOfWords;
+ }
+
+ AK_FORCE_INLINE bool shouldBoostExactMatches() const {
+ // TODO: Investigate better ways to handle exact matches for personalized dictionaries.
+ return !isDecayingDict();
+ }
+
+ const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const {
+ return &mAttributeMap;
+ }
+
+ AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const {
+ return mForgettingCurveProbabilityValuesTableId;
+ }
+
+ void readHeaderValueOrQuestionMark(const char *const key,
+ int *outValue, int outValueSize) const;
+
+ bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
+ const EntryCounts &entryCounts, const int extendedRegionSize,
+ BufferWithExtendableBuffer *const outBuffer) const;
+
+ void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts,
+ const int extendedRegionSize,
+ DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const;
+
+ AK_FORCE_INLINE const std::vector<int> *getLocale() const {
+ return &mLocale;
+ }
+
+ bool supportsBeginningOfSentence() const {
+ return mDictFormatVersion >= FormatUtils::VERSION_402;
+ }
+
+ const int *getCodePointTable() const {
+ return mCodePointTable;
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
+
+ static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
+ static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
+ static const char *const IS_DECAYING_DICT_KEY;
+ static const char *const DATE_KEY;
+ static const char *const LAST_DECAYED_TIME_KEY;
+ static const char *const NGRAM_COUNT_KEYS[];
+ static const char *const MAX_NGRAM_COUNT_KEYS[];
+ static const int DEFAULT_MAX_NGRAM_COUNTS[];
+ static const char *const EXTENDED_REGION_SIZE_KEY;
+ static const char *const HAS_HISTORICAL_INFO_KEY;
+ static const char *const LOCALE_KEY;
+ static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY;
+ static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY;
+ static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY;
+ static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
+ static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
+ static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
+
+ const FormatUtils::FORMAT_VERSION mDictFormatVersion;
+ const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
+ const int mSize;
+ DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap;
+ const std::vector<int> mLocale;
+ const float mMultiWordCostMultiplier;
+ const bool mRequiresGermanUmlautProcessing;
+ const bool mIsDecayingDict;
+ const int mDate;
+ const int mLastDecayedTime;
+ const EntryCounts mNgramCounts;
+ const EntryCounts mMaxNgramCounts;
+ const int mExtendedRegionSize;
+ const bool mHasHistoricalInfoOfWords;
+ const int mForgettingCurveProbabilityValuesTableId;
+ const int *const mCodePointTable;
+
+ const std::vector<int> readLocale() const;
+ float readMultipleWordCostMultiplier() const;
+ bool readRequiresGermanUmlautProcessing() const;
+ const EntryCounts readNgramCounts() const;
+ const EntryCounts readMaxNgramCounts() const;
+ static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes(
+ const uint8_t *const dictBuf);
+};
+} // namespace latinime
+#endif /* LATINIME_HEADER_POLICY_H */
diff --git a/native/jni/src/dictionary/header/header_read_write_utils.cpp b/native/jni/src/dictionary/header/header_read_write_utils.cpp
new file mode 100644
index 000000000..779f8b8c3
--- /dev/null
+++ b/native/jni/src/dictionary/header/header_read_write_utils.cpp
@@ -0,0 +1,248 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/header/header_read_write_utils.h"
+
+#include <cctype>
+#include <cstdio>
+#include <memory>
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+// Number of base-10 digits in the largest integer + 1 to leave room for a zero terminator.
+// As such, this is the maximum number of characters will be needed to represent an int as a
+// string, including the terminator; this is used as the size of a string buffer large enough to
+// hold any value that is intended to fit in an integer, e.g. in the code that reads the header
+// of the binary dictionary where a {key,value} string pair scheme is used.
+const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11;
+
+const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
+const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048;
+
+const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
+const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
+const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
+const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
+const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable";
+
+const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
+
+typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
+
+/* static */ int HeaderReadWriteUtils::getHeaderSize(const uint8_t *const dictBuf) {
+ // See the format of the header in the comment in
+ // BinaryDictionaryFormatUtils::detectFormatVersion()
+ return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE
+ + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE);
+}
+
+/* static */ HeaderReadWriteUtils::DictionaryFlags
+ HeaderReadWriteUtils::getFlags(const uint8_t *const dictBuf) {
+ return ByteArrayUtils::readUint16(dictBuf,
+ HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE);
+}
+
+/* static */ HeaderReadWriteUtils::DictionaryFlags
+ HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
+ const AttributeMap *const attributeMap) {
+ return NO_FLAGS;
+}
+
+/* static */ void HeaderReadWriteUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf,
+ AttributeMap *const headerAttributes) {
+ const int headerSize = getHeaderSize(dictBuf);
+ int pos = getHeaderOptionsPosition();
+ if (pos == NOT_A_DICT_POS) {
+ // The header doesn't have header options.
+ return;
+ }
+ int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH];
+ std::unique_ptr<int[]> valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]);
+ while (pos < headerSize) {
+ // The values in the header don't use the code point table for their encoding.
+ const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
+ MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos);
+ std::vector<int> key;
+ key.insert(key.end(), keyBuffer, keyBuffer + keyLength);
+ const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
+ MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos);
+ std::vector<int> value;
+ value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength);
+ headerAttributes->insert(AttributeMap::value_type(key, value));
+ }
+}
+
+/* static */ const int *HeaderReadWriteUtils::readCodePointTable(
+ AttributeMap *const headerAttributes) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector);
+ AttributeMap::const_iterator it = headerAttributes->find(keyVector);
+ if (it == headerAttributes->end()) {
+ return nullptr;
+ }
+ return it->second.data();
+}
+
+/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
+ BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
+ int *const writingPos) {
+ if (!buffer->writeUintAndAdvancePosition(FormatUtils::MAGIC_NUMBER, HEADER_MAGIC_NUMBER_SIZE,
+ writingPos)) {
+ return false;
+ }
+ switch (version) {
+ case FormatUtils::VERSION_2:
+ case FormatUtils::VERSION_201:
+ case FormatUtils::VERSION_202:
+ // None of the static dictionaries (v2x) support writing
+ return false;
+ case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+ case FormatUtils::VERSION_402:
+ case FormatUtils::VERSION_403:
+ return buffer->writeUintAndAdvancePosition(version /* data */,
+ HEADER_DICTIONARY_VERSION_SIZE, writingPos);
+ default:
+ return false;
+ }
+}
+
+/* static */ bool HeaderReadWriteUtils::writeDictionaryFlags(
+ BufferWithExtendableBuffer *const buffer, const DictionaryFlags flags,
+ int *const writingPos) {
+ return buffer->writeUintAndAdvancePosition(flags, HEADER_FLAG_SIZE, writingPos);
+}
+
+/* static */ bool HeaderReadWriteUtils::writeDictionaryHeaderSize(
+ BufferWithExtendableBuffer *const buffer, const int size, int *const writingPos) {
+ return buffer->writeUintAndAdvancePosition(size, HEADER_SIZE_FIELD_SIZE, writingPos);
+}
+
+/* static */ bool HeaderReadWriteUtils::writeHeaderAttributes(
+ BufferWithExtendableBuffer *const buffer, const AttributeMap *const headerAttributes,
+ int *const writingPos) {
+ for (AttributeMap::const_iterator it = headerAttributes->begin();
+ it != headerAttributes->end(); ++it) {
+ if (it->first.empty() || it->second.empty()) {
+ continue;
+ }
+ // Write a key.
+ if (!buffer->writeCodePointsAndAdvancePosition(&(it->first.at(0)), it->first.size(),
+ true /* writesTerminator */, writingPos)) {
+ return false;
+ }
+ // Write a value.
+ if (!buffer->writeCodePointsAndAdvancePosition(&(it->second.at(0)), it->second.size(),
+ true /* writesTerminator */, writingPos)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+/* static */ void HeaderReadWriteUtils::setCodePointVectorAttribute(
+ AttributeMap *const headerAttributes, const char *const key,
+ const std::vector<int> &value) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(key, &keyVector);
+ (*headerAttributes)[keyVector] = value;
+}
+
+/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes,
+ const char *const key, const bool value) {
+ setIntAttribute(headerAttributes, key, value ? 1 : 0);
+}
+
+/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes,
+ const char *const key, const int value) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(key, &keyVector);
+ setIntAttributeInner(headerAttributes, &keyVector, value);
+}
+
+/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes,
+ const AttributeMap::key_type *const key, const int value) {
+ AttributeMap::mapped_type valueVector;
+ char charBuf[LARGEST_INT_DIGIT_COUNT];
+ snprintf(charBuf, sizeof(charBuf), "%d", value);
+ insertCharactersIntoVector(charBuf, &valueVector);
+ (*headerAttributes)[*key] = valueVector;
+}
+
+/* static */ const std::vector<int> HeaderReadWriteUtils::readCodePointVectorAttributeValue(
+ const AttributeMap *const headerAttributes, const char *const key) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(key, &keyVector);
+ AttributeMap::const_iterator it = headerAttributes->find(keyVector);
+ if (it == headerAttributes->end()) {
+ return std::vector<int>();
+ } else {
+ return it->second;
+ }
+}
+
+/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue(
+ const AttributeMap *const headerAttributes, const char *const key,
+ const bool defaultValue) {
+ const int intDefaultValue = defaultValue ? 1 : 0;
+ const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue);
+ return intValue != 0;
+}
+
+/* static */ int HeaderReadWriteUtils::readIntAttributeValue(
+ const AttributeMap *const headerAttributes, const char *const key,
+ const int defaultValue) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(key, &keyVector);
+ return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue);
+}
+
+/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner(
+ const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
+ const int defaultValue) {
+ AttributeMap::const_iterator it = headerAttributes->find(*key);
+ if (it != headerAttributes->end()) {
+ int value = 0;
+ bool isNegative = false;
+ for (size_t i = 0; i < it->second.size(); ++i) {
+ if (i == 0 && it->second.at(i) == '-') {
+ isNegative = true;
+ } else {
+ if (!isdigit(it->second.at(i))) {
+ // If not a number.
+ return defaultValue;
+ }
+ value *= 10;
+ value += it->second.at(i) - '0';
+ }
+ }
+ return isNegative ? -value : value;
+ }
+ return defaultValue;
+}
+
+/* static */ void HeaderReadWriteUtils::insertCharactersIntoVector(const char *const characters,
+ std::vector<int> *const vector) {
+ for (int i = 0; characters[i]; ++i) {
+ vector->push_back(characters[i]);
+ }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/header/header_read_write_utils.h b/native/jni/src/dictionary/header/header_read_write_utils.h
new file mode 100644
index 000000000..f67d614df
--- /dev/null
+++ b/native/jni/src/dictionary/header/header_read_write_utils.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_HEADER_READ_WRITE_UTILS_H
+#define LATINIME_HEADER_READ_WRITE_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/utils/format_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+class HeaderReadWriteUtils {
+ public:
+ typedef uint16_t DictionaryFlags;
+
+ static int getHeaderSize(const uint8_t *const dictBuf);
+
+ static DictionaryFlags getFlags(const uint8_t *const dictBuf);
+
+ static AK_FORCE_INLINE int getHeaderOptionsPosition() {
+ return HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE
+ + HEADER_SIZE_FIELD_SIZE;
+ }
+
+ static DictionaryFlags createAndGetDictionaryFlagsUsingAttributeMap(
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap);
+
+ static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
+
+ static const int *readCodePointTable(
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
+
+ static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer,
+ const FormatUtils::FORMAT_VERSION version, int *const writingPos);
+
+ static bool writeDictionaryFlags(BufferWithExtendableBuffer *const buffer,
+ const DictionaryFlags flags, int *const writingPos);
+
+ static bool writeDictionaryHeaderSize(BufferWithExtendableBuffer *const buffer,
+ const int size, int *const writingPos);
+
+ static bool writeHeaderAttributes(BufferWithExtendableBuffer *const buffer,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ int *const writingPos);
+
+ /**
+ * Methods for header attributes.
+ */
+ static void setCodePointVectorAttribute(
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const char *const key, const std::vector<int> &value);
+
+ static void setBoolAttribute(
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const char *const key, const bool value);
+
+ static void setIntAttribute(
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const char *const key, const int value);
+
+ static const std::vector<int> readCodePointVectorAttributeValue(
+ const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const char *const key);
+
+ static bool readBoolAttributeValue(
+ const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const char *const key, const bool defaultValue);
+
+ static int readIntAttributeValue(
+ const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const char *const key, const int defaultValue);
+
+ static void insertCharactersIntoVector(const char *const characters,
+ DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils);
+
+ static const int LARGEST_INT_DIGIT_COUNT;
+ static const int MAX_ATTRIBUTE_KEY_LENGTH;
+ static const int MAX_ATTRIBUTE_VALUE_LENGTH;
+
+ static const int HEADER_MAGIC_NUMBER_SIZE;
+ static const int HEADER_DICTIONARY_VERSION_SIZE;
+ static const int HEADER_FLAG_SIZE;
+ static const int HEADER_SIZE_FIELD_SIZE;
+
+ static const char *const CODE_POINT_TABLE_KEY;
+
+ // Value for the "flags" field. It's unused at the moment.
+ static const DictionaryFlags NO_FLAGS;
+
+ static void setIntAttributeInner(
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key,
+ const int value);
+
+ static int readIntAttributeValueInner(
+ const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+ const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key,
+ const int defaultValue);
+};
+}
+#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */
diff --git a/native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h b/native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h
new file mode 100644
index 000000000..aa0d068aa
--- /dev/null
+++ b/native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H
+#define LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H
+
+#include "defines.h"
+
+namespace latinime {
+
+/*
+ * This class abstracts structure of bigrams.
+ */
+class DictionaryBigramsStructurePolicy {
+ public:
+ virtual ~DictionaryBigramsStructurePolicy() {}
+
+ virtual void getNextBigram(int *const outBigramPos, int *const outProbability,
+ bool *const outHasNext, int *const pos) const = 0;
+ virtual bool skipAllBigrams(int *const pos) const = 0;
+
+ protected:
+ DictionaryBigramsStructurePolicy() {}
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(DictionaryBigramsStructurePolicy);
+};
+} // namespace latinime
+#endif /* LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H */
diff --git a/native/jni/src/dictionary/interface/dictionary_header_structure_policy.h b/native/jni/src/dictionary/interface/dictionary_header_structure_policy.h
new file mode 100644
index 000000000..6da390e55
--- /dev/null
+++ b/native/jni/src/dictionary/interface/dictionary_header_structure_policy.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H
+#define LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H
+
+#include <map>
+#include <vector>
+
+#include "defines.h"
+
+namespace latinime {
+
+/*
+ * This class abstracts structure of dictionaries.
+ * Implement this policy to support additional dictionaries.
+ */
+class DictionaryHeaderStructurePolicy {
+ public:
+ typedef std::map<std::vector<int>, std::vector<int>> AttributeMap;
+
+ virtual ~DictionaryHeaderStructurePolicy() {}
+
+ virtual int getFormatVersionNumber() const = 0;
+
+ virtual int getSize() const = 0;
+
+ virtual const AttributeMap *getAttributeMap() const = 0;
+
+ virtual bool requiresGermanUmlautProcessing() const = 0;
+
+ virtual float getMultiWordCostMultiplier() const = 0;
+
+ virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue,
+ int outValueSize) const = 0;
+
+ virtual bool shouldBoostExactMatches() const = 0;
+
+ virtual const std::vector<int> *getLocale() const = 0;
+
+ virtual bool supportsBeginningOfSentence() const = 0;
+
+ protected:
+ DictionaryHeaderStructurePolicy() {}
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(DictionaryHeaderStructurePolicy);
+};
+} // namespace latinime
+#endif /* LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H */
diff --git a/native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h b/native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h
new file mode 100644
index 000000000..40b6c2de1
--- /dev/null
+++ b/native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H
+#define LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H
+
+#include "defines.h"
+
+namespace latinime {
+
+/*
+ * This class abstracts structure of shortcuts.
+ */
+class DictionaryShortcutsStructurePolicy {
+ public:
+ virtual ~DictionaryShortcutsStructurePolicy() {}
+
+ virtual int getStartPos(const int pos) const = 0;
+
+ virtual void getNextShortcut(const int maxCodePointCount, int *const outCodePoint,
+ int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext,
+ int *const pos) const = 0;
+
+ virtual void skipAllShortcuts(int *const pos) const = 0;
+
+ protected:
+ DictionaryShortcutsStructurePolicy() {}
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(DictionaryShortcutsStructurePolicy);
+};
+} // namespace latinime
+#endif /* LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H */
diff --git a/native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h b/native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h
new file mode 100644
index 000000000..ace48491d
--- /dev/null
+++ b/native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICTIONARY_STRUCTURE_POLICY_H
+#define LATINIME_DICTIONARY_STRUCTURE_POLICY_H
+
+#include <memory>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/property/word_attributes.h"
+#include "dictionary/property/word_property.h"
+#include "dictionary/utils/binary_dictionary_shortcut_iterator.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class DicNode;
+class DicNodeVector;
+class DictionaryHeaderStructurePolicy;
+class MultiBigramMap;
+class NgramListener;
+class NgramContext;
+class UnigramProperty;
+
+/*
+ * This class abstracts the structure of dictionaries.
+ * Implement this policy to support additional dictionaries.
+ */
+class DictionaryStructureWithBufferPolicy {
+ public:
+ typedef std::unique_ptr<DictionaryStructureWithBufferPolicy> StructurePolicyPtr;
+
+ virtual ~DictionaryStructureWithBufferPolicy() {}
+
+ virtual int getRootPosition() const = 0;
+
+ virtual void createAndGetAllChildDicNodes(const DicNode *const dicNode,
+ DicNodeVector *const childDicNodes) const = 0;
+
+ virtual int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
+ int *const outCodePoints) const = 0;
+
+ virtual int getWordId(const CodePointArrayView wordCodePoints,
+ const bool forceLowerCaseSearch) const = 0;
+
+ virtual const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
+ const int wordId, MultiBigramMap *const multiBigramMap) const = 0;
+
+ // TODO: Remove
+ virtual int getProbability(const int unigramProbability, const int bigramProbability) const = 0;
+
+ virtual int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const = 0;
+
+ virtual void iterateNgramEntries(const WordIdArrayView prevWordIds,
+ NgramListener *const listener) const = 0;
+
+ virtual BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const = 0;
+
+ virtual const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const = 0;
+
+ // Returns whether the update was success or not.
+ virtual bool addUnigramEntry(const CodePointArrayView wordCodePoints,
+ const UnigramProperty *const unigramProperty) = 0;
+
+ // Returns whether the update was success or not.
+ virtual bool removeUnigramEntry(const CodePointArrayView wordCodePoints) = 0;
+
+ // Returns whether the update was success or not.
+ virtual bool addNgramEntry(const NgramProperty *const ngramProperty) = 0;
+
+ // Returns whether the update was success or not.
+ virtual bool removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints) = 0;
+
+ // Returns whether the update was success or not.
+ virtual bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints, const bool isValidWord,
+ const HistoricalInfo historicalInfo) = 0;
+
+ // Returns whether the flush was success or not.
+ virtual bool flush(const char *const filePath) = 0;
+
+ // Returns whether the GC and flush were success or not.
+ virtual bool flushWithGC(const char *const filePath) = 0;
+
+ virtual bool needsToRunGC(const bool mindsBlockByGC) const = 0;
+
+ // Currently, this method is used only for testing. You may want to consider creating new
+ // dedicated method instead of this if you want to use this in the production.
+ virtual void getProperty(const char *const query, const int queryLength, char *const outResult,
+ const int maxResultLength) = 0;
+
+ virtual const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const = 0;
+
+ // Method to iterate all words in the dictionary.
+ // The returned token has to be used to get the next word. If token is 0, this method newly
+ // starts iterating the dictionary.
+ virtual int getNextWordAndNextToken(const int token, int *const outCodePoints,
+ int *const outCodePointCount) = 0;
+
+ virtual bool isCorrupted() const = 0;
+
+ protected:
+ DictionaryStructureWithBufferPolicy() {}
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(DictionaryStructureWithBufferPolicy);
+};
+} // namespace latinime
+#endif /* LATINIME_DICTIONARY_STRUCTURE_POLICY_H */
diff --git a/native/jni/src/dictionary/interface/ngram_listener.h b/native/jni/src/dictionary/interface/ngram_listener.h
new file mode 100644
index 000000000..2eb5e9fd1
--- /dev/null
+++ b/native/jni/src/dictionary/interface/ngram_listener.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_LISTENER_H
+#define LATINIME_NGRAM_LISTENER_H
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Interface to iterate ngram entries.
+ */
+class NgramListener {
+ public:
+ // ngramProbability is always 0 for v403 decaying dictionary.
+ // TODO: Remove ngramProbability.
+ virtual void onVisitEntry(const int ngramProbability, const int targetWordId) = 0;
+ virtual ~NgramListener() {};
+
+ protected:
+ NgramListener() {}
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(NgramListener);
+
+};
+} // namespace latinime
+#endif /* LATINIME_NGRAM_LISTENER_H */
diff --git a/native/jni/src/dictionary/property/historical_info.h b/native/jni/src/dictionary/property/historical_info.h
new file mode 100644
index 000000000..e5ce1ea25
--- /dev/null
+++ b/native/jni/src/dictionary/property/historical_info.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_HISTORICAL_INFO_H
+#define LATINIME_HISTORICAL_INFO_H
+
+#include "defines.h"
+
+namespace latinime {
+
+class HistoricalInfo {
+ public:
+ // Invalid historical info.
+ HistoricalInfo()
+ : mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0) {}
+
+ HistoricalInfo(const int timestamp, const int level, const int count)
+ : mTimestamp(timestamp), mLevel(level), mCount(count) {}
+
+ bool isValid() const {
+ return mTimestamp != NOT_A_TIMESTAMP;
+ }
+
+ int getTimestamp() const {
+ return mTimestamp;
+ }
+
+ // TODO: Remove
+ int getLevel() const {
+ return mLevel;
+ }
+
+ int getCount() const {
+ return mCount;
+ }
+
+ private:
+ // Default copy constructor is used for using in std::vector.
+ DISALLOW_ASSIGNMENT_OPERATOR(HistoricalInfo);
+
+ const int mTimestamp;
+ const int mLevel;
+ const int mCount;
+};
+} // namespace latinime
+#endif /* LATINIME_HISTORICAL_INFO_H */
diff --git a/native/jni/src/dictionary/property/ngram_context.cpp b/native/jni/src/dictionary/property/ngram_context.cpp
new file mode 100644
index 000000000..7b9c3eff6
--- /dev/null
+++ b/native/jni/src/dictionary/property/ngram_context.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/property/ngram_context.h"
+
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+NgramContext::NgramContext() : mPrevWordCount(0) {}
+
+NgramContext::NgramContext(const NgramContext &ngramContext)
+ : mPrevWordCount(ngramContext.mPrevWordCount) {
+ for (size_t i = 0; i < mPrevWordCount; ++i) {
+ mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i];
+ memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i],
+ sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
+ mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i];
+ }
+}
+
+NgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
+ const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
+ const size_t prevWordCount)
+ : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) {
+ clear();
+ for (size_t i = 0; i < mPrevWordCount; ++i) {
+ if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) {
+ continue;
+ }
+ memmove(mPrevWordCodePoints[i], prevWordCodePoints[i],
+ sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]);
+ mPrevWordCodePointCount[i] = prevWordCodePointCount[i];
+ mIsBeginningOfSentence[i] = isBeginningOfSentence[i];
+ }
+}
+
+NgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
+ const bool isBeginningOfSentence) : mPrevWordCount(1) {
+ clear();
+ if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) {
+ return;
+ }
+ memmove(mPrevWordCodePoints[0], prevWordCodePoints,
+ sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
+ mPrevWordCodePointCount[0] = prevWordCodePointCount;
+ mIsBeginningOfSentence[0] = isBeginningOfSentence;
+}
+
+bool NgramContext::isValid() const {
+ if (mPrevWordCodePointCount[0] > 0) {
+ return true;
+ }
+ if (mIsBeginningOfSentence[0]) {
+ return true;
+ }
+ return false;
+}
+
+const CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const {
+ if (n <= 0 || n > mPrevWordCount) {
+ return CodePointArrayView();
+ }
+ return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]);
+}
+
+bool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const {
+ if (n <= 0 || n > mPrevWordCount) {
+ return false;
+ }
+ return mIsBeginningOfSentence[n - 1];
+}
+
+/* static */ int NgramContext::getWordId(
+ const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
+ const int *const wordCodePoints, const int wordCodePointCount,
+ const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
+ if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
+ return NOT_A_WORD_ID;
+ }
+ int codePoints[MAX_WORD_LENGTH];
+ int codePointCount = wordCodePointCount;
+ memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
+ if (isBeginningOfSentence) {
+ codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount,
+ MAX_WORD_LENGTH);
+ if (codePointCount <= 0) {
+ return NOT_A_WORD_ID;
+ }
+ }
+ const CodePointArrayView codePointArrayView(codePoints, codePointCount);
+ const int wordId = dictStructurePolicy->getWordId(codePointArrayView,
+ false /* forceLowerCaseSearch */);
+ if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) {
+ // Return the id when when the word was found or doesn't try lower case search.
+ return wordId;
+ }
+ // Check bigrams for lower-cased previous word if original was not found. Useful for
+ // auto-capitalized words like "The [current_word]".
+ return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */);
+}
+
+void NgramContext::clear() {
+ for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
+ mPrevWordCodePointCount[i] = 0;
+ mIsBeginningOfSentence[i] = false;
+ }
+}
+} // namespace latinime
diff --git a/native/jni/src/dictionary/property/ngram_context.h b/native/jni/src/dictionary/property/ngram_context.h
new file mode 100644
index 000000000..9b36199c9
--- /dev/null
+++ b/native/jni/src/dictionary/property/ngram_context.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_CONTEXT_H
+#define LATINIME_NGRAM_CONTEXT_H
+
+#include <array>
+
+#include "defines.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class DictionaryStructureWithBufferPolicy;
+
+class NgramContext {
+ public:
+ // No prev word information.
+ NgramContext();
+ // Copy constructor to use this class with std::vector and use this class as a return value.
+ NgramContext(const NgramContext &ngramContext);
+ // Construct from previous words.
+ NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
+ const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
+ const size_t prevWordCount);
+ // Construct from a previous word.
+ NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
+ const bool isBeginningOfSentence);
+
+ size_t getPrevWordCount() const {
+ return mPrevWordCount;
+ }
+ bool isValid() const;
+
+ template<size_t N>
+ const WordIdArrayView getPrevWordIds(
+ const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
+ WordIdArray<N> *const prevWordIdBuffer, const bool tryLowerCaseSearch) const {
+ for (size_t i = 0; i < std::min(mPrevWordCount, N); ++i) {
+ prevWordIdBuffer->at(i) = getWordId(dictStructurePolicy, mPrevWordCodePoints[i],
+ mPrevWordCodePointCount[i], mIsBeginningOfSentence[i], tryLowerCaseSearch);
+ }
+ return WordIdArrayView::fromArray(*prevWordIdBuffer).limit(mPrevWordCount);
+ }
+
+ // n is 1-indexed.
+ const CodePointArrayView getNthPrevWordCodePoints(const size_t n) const;
+ // n is 1-indexed.
+ bool isNthPrevWordBeginningOfSentence(const size_t n) const;
+
+ private:
+ DISALLOW_ASSIGNMENT_OPERATOR(NgramContext);
+
+ static int getWordId(const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
+ const int *const wordCodePoints, const int wordCodePointCount,
+ const bool isBeginningOfSentence, const bool tryLowerCaseSearch);
+ void clear();
+
+ const size_t mPrevWordCount;
+ int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
+ int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+};
+} // namespace latinime
+#endif // LATINIME_NGRAM_CONTEXT_H
diff --git a/native/jni/src/dictionary/property/ngram_property.h b/native/jni/src/dictionary/property/ngram_property.h
new file mode 100644
index 000000000..5f259ec59
--- /dev/null
+++ b/native/jni/src/dictionary/property/ngram_property.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_PROPERTY_H
+#define LATINIME_NGRAM_PROPERTY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/property/ngram_context.h"
+
+namespace latinime {
+
+class NgramProperty {
+ public:
+ NgramProperty(const NgramContext &ngramContext, const std::vector<int> &&targetCodePoints,
+ const int probability, const HistoricalInfo historicalInfo)
+ : mNgramContext(ngramContext), mTargetCodePoints(std::move(targetCodePoints)),
+ mProbability(probability), mHistoricalInfo(historicalInfo) {}
+
+ const NgramContext *getNgramContext() const {
+ return &mNgramContext;
+ }
+
+ const std::vector<int> *getTargetCodePoints() const {
+ return &mTargetCodePoints;
+ }
+
+ int getProbability() const {
+ return mProbability;
+ }
+
+ const HistoricalInfo getHistoricalInfo() const {
+ return mHistoricalInfo;
+ }
+
+ private:
+ // Default copy constructor is used for using in std::vector.
+ DISALLOW_DEFAULT_CONSTRUCTOR(NgramProperty);
+ DISALLOW_ASSIGNMENT_OPERATOR(NgramProperty);
+
+ const NgramContext mNgramContext;
+ const std::vector<int> mTargetCodePoints;
+ const int mProbability;
+ const HistoricalInfo mHistoricalInfo;
+};
+} // namespace latinime
+#endif // LATINIME_NGRAM_PROPERTY_H
diff --git a/native/jni/src/dictionary/property/unigram_property.h b/native/jni/src/dictionary/property/unigram_property.h
new file mode 100644
index 000000000..92f61b85d
--- /dev/null
+++ b/native/jni/src/dictionary/property/unigram_property.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_UNIGRAM_PROPERTY_H
+#define LATINIME_UNIGRAM_PROPERTY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+
+namespace latinime {
+
+class UnigramProperty {
+ public:
+ class ShortcutProperty {
+ public:
+ ShortcutProperty(const std::vector<int> &&targetCodePoints, const int probability)
+ : mTargetCodePoints(std::move(targetCodePoints)),
+ mProbability(probability) {}
+
+ const std::vector<int> *getTargetCodePoints() const {
+ return &mTargetCodePoints;
+ }
+
+ int getProbability() const {
+ return mProbability;
+ }
+
+ private:
+ // Default copy constructor is used for using in std::vector.
+ DISALLOW_DEFAULT_CONSTRUCTOR(ShortcutProperty);
+
+ const std::vector<int> mTargetCodePoints;
+ const int mProbability;
+ };
+
+ UnigramProperty()
+ : mRepresentsBeginningOfSentence(false), mIsNotAWord(false),
+ mIsBlacklisted(false), mIsPossiblyOffensive(false), mProbability(NOT_A_PROBABILITY),
+ mHistoricalInfo(), mShortcuts() {}
+
+ // In contexts which do not support the Blacklisted flag (v2, v4<403)
+ UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+ const bool isPossiblyOffensive, const int probability,
+ const HistoricalInfo historicalInfo, const std::vector<ShortcutProperty> &&shortcuts)
+ : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+ mIsNotAWord(isNotAWord), mIsBlacklisted(false),
+ mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+ mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {}
+
+ // Without shortcuts, in contexts which do not support the Blacklisted flag (v2, v4<403)
+ UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+ const bool isPossiblyOffensive, const int probability,
+ const HistoricalInfo historicalInfo)
+ : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+ mIsNotAWord(isNotAWord), mIsBlacklisted(false),
+ mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+ mHistoricalInfo(historicalInfo), mShortcuts() {}
+
+ // In contexts which DO support the Blacklisted flag (v403)
+ UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+ const bool isBlacklisted, const bool isPossiblyOffensive, const int probability,
+ const HistoricalInfo historicalInfo, const std::vector<ShortcutProperty> &&shortcuts)
+ : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+ mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted),
+ mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+ mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {}
+
+ // Without shortcuts, in contexts which DO support the Blacklisted flag (v403)
+ UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+ const bool isBlacklisted, const bool isPossiblyOffensive, const int probability,
+ const HistoricalInfo historicalInfo)
+ : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+ mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted),
+ mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+ mHistoricalInfo(historicalInfo), mShortcuts() {}
+
+ bool representsBeginningOfSentence() const {
+ return mRepresentsBeginningOfSentence;
+ }
+
+ bool isNotAWord() const {
+ return mIsNotAWord;
+ }
+
+ bool isPossiblyOffensive() const {
+ return mIsPossiblyOffensive;
+ }
+
+ bool isBlacklisted() const {
+ return mIsBlacklisted;
+ }
+
+ bool hasShortcuts() const {
+ return !mShortcuts.empty();
+ }
+
+ int getProbability() const {
+ return mProbability;
+ }
+
+ const HistoricalInfo getHistoricalInfo() const {
+ return mHistoricalInfo;
+ }
+
+ const std::vector<ShortcutProperty> &getShortcuts() const {
+ return mShortcuts;
+ }
+
+ private:
+ // Default copy constructor is used for using as a return value.
+ DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty);
+
+ const bool mRepresentsBeginningOfSentence;
+ const bool mIsNotAWord;
+ const bool mIsBlacklisted;
+ const bool mIsPossiblyOffensive;
+ const int mProbability;
+ const HistoricalInfo mHistoricalInfo;
+ const std::vector<ShortcutProperty> mShortcuts;
+};
+} // namespace latinime
+#endif // LATINIME_UNIGRAM_PROPERTY_H
diff --git a/native/jni/src/dictionary/property/word_attributes.h b/native/jni/src/dictionary/property/word_attributes.h
new file mode 100644
index 000000000..5351e7d7d
--- /dev/null
+++ b/native/jni/src/dictionary/property/word_attributes.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_WORD_ATTRIBUTES_H
+#define LATINIME_WORD_ATTRIBUTES_H
+
+#include "defines.h"
+
+class WordAttributes {
+ public:
+ // Invalid word attributes.
+ WordAttributes()
+ : mProbability(NOT_A_PROBABILITY), mIsBlacklisted(false), mIsNotAWord(false),
+ mIsPossiblyOffensive(false) {}
+
+ WordAttributes(const int probability, const bool isBlacklisted, const bool isNotAWord,
+ const bool isPossiblyOffensive)
+ : mProbability(probability), mIsBlacklisted(isBlacklisted), mIsNotAWord(isNotAWord),
+ mIsPossiblyOffensive(isPossiblyOffensive) {}
+
+ int getProbability() const {
+ return mProbability;
+ }
+
+ bool isBlacklisted() const {
+ return mIsBlacklisted;
+ }
+
+ bool isNotAWord() const {
+ return mIsNotAWord;
+ }
+
+ // Whether or not a word is possibly offensive.
+ // * Static dictionaries <v202, as well as dynamic dictionaries <v403, will set this based on
+ // whether or not the probability of the word is zero.
+ // * Static dictionaries >=v203 will set this based on the IS_POSSIBLY_OFFENSIVE PtNode flag.
+ // * Dynamic dictionaries >=v403 will set this based on the IS_POSSIBLY_OFFENSIVE language model
+ // flag (the PtNode flag IS_BLACKLISTED is ignored and kept as zero)
+ //
+ // See the ::getWordAttributes function for each of these dictionary policies for more details.
+ bool isPossiblyOffensive() const {
+ return mIsPossiblyOffensive;
+ }
+
+ private:
+ DISALLOW_ASSIGNMENT_OPERATOR(WordAttributes);
+
+ int mProbability;
+ bool mIsBlacklisted;
+ bool mIsNotAWord;
+ bool mIsPossiblyOffensive;
+};
+
+ // namespace
+#endif /* LATINIME_WORD_ATTRIBUTES_H */
diff --git a/native/jni/src/dictionary/property/word_property.h b/native/jni/src/dictionary/property/word_property.h
new file mode 100644
index 000000000..3028e020a
--- /dev/null
+++ b/native/jni/src/dictionary/property/word_property.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_WORD_PROPERTY_H
+#define LATINIME_WORD_PROPERTY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/property/unigram_property.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+// This class is used for returning information belonging to a word to java side.
+class WordProperty {
+ public:
+ // Default constructor is used to create an instance that indicates an invalid word.
+ WordProperty()
+ : mCodePoints(), mUnigramProperty(), mNgrams() {}
+
+ WordProperty(const std::vector<int> &&codePoints, const UnigramProperty &unigramProperty,
+ const std::vector<NgramProperty> &ngrams)
+ : mCodePoints(std::move(codePoints)), mUnigramProperty(unigramProperty),
+ mNgrams(ngrams) {}
+
+ const CodePointArrayView getCodePoints() const {
+ return CodePointArrayView(mCodePoints);
+ }
+
+ const UnigramProperty &getUnigramProperty() const {
+ return mUnigramProperty;
+ }
+
+ const std::vector<NgramProperty> &getNgramProperties() const {
+ return mNgrams;
+ }
+
+ private:
+ // Default copy constructor is used for using as a return value.
+ DISALLOW_ASSIGNMENT_OPERATOR(WordProperty);
+
+ const std::vector<int> mCodePoints;
+ const UnigramProperty mUnigramProperty;
+ const std::vector<NgramProperty> mNgrams;
+};
+} // namespace latinime
+#endif // LATINIME_WORD_PROPERTY_H
diff --git a/native/jni/src/dictionary/structure/backward/v402/Readme.txt b/native/jni/src/dictionary/structure/backward/v402/Readme.txt
new file mode 100644
index 000000000..9e29e836c
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/Readme.txt
@@ -0,0 +1 @@
+Files under this directory have been auto generated.
diff --git a/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp
new file mode 100644
index 000000000..60749bce6
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp
@@ -0,0 +1,289 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
+ * Do not edit this file other than updating policy's interface.
+ *
+ * This file was generated from
+ * dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp
+ */
+
+#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
+#include "dictionary/structure/backward/v402/content/bigram_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability,
+ bool *const outHasNext, int *const bigramEntryPos) const {
+ const BigramEntry bigramEntry =
+ mBigramDictContent->getBigramEntryAndAdvancePosition(bigramEntryPos);
+ if (outBigramPos) {
+ // Lookup target PtNode position.
+ *outBigramPos = mTerminalPositionLookupTable->getTerminalPtNodePosition(
+ bigramEntry.getTargetTerminalId());
+ }
+ if (outProbability) {
+ if (bigramEntry.hasHistoricalInfo()) {
+ *outProbability =
+ ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo(),
+ mHeaderPolicy);
+ } else {
+ *outProbability = bigramEntry.getProbability();
+ }
+ }
+ if (outHasNext) {
+ *outHasNext = bigramEntry.hasNext();
+ }
+}
+
+bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId,
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) {
+ // 1. The word has no bigrams yet.
+ // 2. The word has bigrams, and there is the target in the list.
+ // 3. The word has bigrams, and there is an invalid entry that can be reclaimed.
+ // 4. The word has bigrams. We have to append new bigram entry to the list.
+ // 5. Same as 4, but the list is the last entry of the content file.
+ if (outAddedNewEntry) {
+ *outAddedNewEntry = false;
+ }
+ const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
+ if (bigramListPos == NOT_A_DICT_POS) {
+ // Case 1. PtNode that doesn't have a bigram list.
+ // Create new bigram list.
+ if (!mBigramDictContent->createNewBigramList(terminalId)) {
+ return false;
+ }
+ const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
+ newTargetTerminalId);
+ const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry,
+ ngramProperty);
+ // Write an entry.
+ const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId);
+ if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) {
+ return false;
+ }
+ if (outAddedNewEntry) {
+ *outAddedNewEntry = true;
+ }
+ return true;
+ }
+
+ int tailEntryPos = NOT_A_DICT_POS;
+ const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos,
+ &tailEntryPos);
+ if (tailEntryPos != NOT_A_DICT_POS || entryPosToUpdate == NOT_A_DICT_POS) {
+ // Case 4, 5.
+ // Add new entry to the bigram list.
+ if (tailEntryPos == NOT_A_DICT_POS) {
+ // Case 4. Create new bigram list.
+ if (!mBigramDictContent->createNewBigramList(terminalId)) {
+ return false;
+ }
+ const int destPos = mBigramDictContent->getBigramListHeadPos(terminalId);
+ // Copy existing bigram list.
+ if (!mBigramDictContent->copyBigramList(bigramListPos, destPos, &tailEntryPos)) {
+ return false;
+ }
+ }
+ // Write new entry at the tail position of the bigram content.
+ const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
+ newTargetTerminalId);
+ const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
+ &newBigramEntry, ngramProperty);
+ if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) {
+ return false;
+ }
+ // Update has next flag of the tail entry.
+ if (!updateHasNextFlag(true /* hasNext */, tailEntryPos)) {
+ return false;
+ }
+ if (outAddedNewEntry) {
+ *outAddedNewEntry = true;
+ }
+ return true;
+ }
+
+ // Case 2. Overwrite the existing entry. Case 3. Reclaim and reuse the existing invalid entry.
+ const BigramEntry originalBigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate);
+ if (!originalBigramEntry.isValid()) {
+ // Case 3. Reuse the existing invalid entry. outAddedNewEntry is false when an existing
+ // entry is updated.
+ if (outAddedNewEntry) {
+ *outAddedNewEntry = true;
+ }
+ }
+ const BigramEntry updatedBigramEntry =
+ originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId);
+ const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
+ &updatedBigramEntry, ngramProperty);
+ return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate);
+}
+
+bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) {
+ const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
+ if (bigramListPos == NOT_A_DICT_POS) {
+ // Bigram list doesn't exist.
+ return false;
+ }
+ const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos,
+ nullptr /* outTailEntryPos */);
+ if (entryPosToUpdate == NOT_A_DICT_POS) {
+ // Bigram entry doesn't exist.
+ return false;
+ }
+ const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate);
+ if (targetTerminalId != bigramEntry.getTargetTerminalId()) {
+ // Bigram entry doesn't exist.
+ return false;
+ }
+ // Remove bigram entry by marking it as invalid entry and overwriting the original entry.
+ const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry();
+ return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPosToUpdate);
+}
+
+bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId,
+ int *const outBigramCount) {
+ const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
+ if (bigramListPos == NOT_A_DICT_POS) {
+ // Bigram list doesn't exist.
+ return true;
+ }
+ bool hasNext = true;
+ int readingPos = bigramListPos;
+ while (hasNext) {
+ const int entryPos = readingPos;
+ const BigramEntry bigramEntry =
+ mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+ hasNext = bigramEntry.hasNext();
+ if (!bigramEntry.isValid()) {
+ continue;
+ }
+ const int targetPtNodePos = mTerminalPositionLookupTable->getTerminalPtNodePosition(
+ bigramEntry.getTargetTerminalId());
+ if (targetPtNodePos == NOT_A_DICT_POS) {
+ // Invalidate bigram entry.
+ const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry();
+ if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
+ return false;
+ }
+ } else if (bigramEntry.hasHistoricalInfo()) {
+ const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
+ bigramEntry.getHistoricalInfo(), mHeaderPolicy);
+ if (ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy)) {
+ const BigramEntry updatedBigramEntry =
+ bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo);
+ if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
+ return false;
+ }
+ *outBigramCount += 1;
+ } else {
+ // Remove entry.
+ const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry();
+ if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
+ return false;
+ }
+ }
+ } else {
+ *outBigramCount += 1;
+ }
+ }
+ return true;
+}
+
+int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) {
+ const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
+ if (bigramListPos == NOT_A_DICT_POS) {
+ // Bigram list doesn't exist.
+ return 0;
+ }
+ int bigramCount = 0;
+ bool hasNext = true;
+ int readingPos = bigramListPos;
+ while (hasNext) {
+ const BigramEntry bigramEntry =
+ mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+ hasNext = bigramEntry.hasNext();
+ if (bigramEntry.isValid()) {
+ bigramCount++;
+ }
+ }
+ return bigramCount;
+}
+
+int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
+ const int bigramListPos, int *const outTailEntryPos) const {
+ if (outTailEntryPos) {
+ *outTailEntryPos = NOT_A_DICT_POS;
+ }
+ bool hasNext = true;
+ int invalidEntryPos = NOT_A_DICT_POS;
+ int readingPos = bigramListPos;
+ while (hasNext) {
+ const int entryPos = readingPos;
+ const BigramEntry bigramEntry =
+ mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+ hasNext = bigramEntry.hasNext();
+ if (bigramEntry.getTargetTerminalId() == targetTerminalIdToFind) {
+ // Entry with same target is found.
+ return entryPos;
+ } else if (!bigramEntry.isValid()) {
+ // Invalid entry that can be reused is found.
+ invalidEntryPos = entryPos;
+ }
+ if (!hasNext && mBigramDictContent->isContentTailPos(readingPos)) {
+ if (outTailEntryPos) {
+ *outTailEntryPos = entryPos;
+ }
+ }
+ }
+ return invalidEntryPos;
+}
+
+const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom(
+ const BigramEntry *const originalBigramEntry,
+ const NgramProperty *const ngramProperty) const {
+ // TODO: Consolidate historical info and probability.
+ if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
+ const HistoricalInfo &historicalInfoForUpdate = ngramProperty->getHistoricalInfo();
+ const HistoricalInfo updatedHistoricalInfo =
+ ForgettingCurveUtils::createUpdatedHistoricalInfo(
+ originalBigramEntry->getHistoricalInfo(), ngramProperty->getProbability(),
+ &historicalInfoForUpdate, mHeaderPolicy);
+ return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo);
+ } else {
+ return originalBigramEntry->updateProbabilityAndGetEntry(ngramProperty->getProbability());
+ }
+}
+
+bool Ver4BigramListPolicy::updateHasNextFlag(const bool hasNext, const int bigramEntryPos) {
+ const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(bigramEntryPos);
+ const BigramEntry updatedBigramEntry = bigramEntry.updateHasNextAndGetEntry(hasNext);
+ return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, bigramEntryPos);
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h
new file mode 100644
index 000000000..58c88ce8a
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
+ * Do not edit this file other than updating policy's interface.
+ *
+ * This file was generated from
+ * suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H
+#define LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_bigrams_structure_policy.h"
+#include "dictionary/structure/backward/v402/content/bigram_entry.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+class BigramDictContent;
+} // namespace v402
+} // namespace backward
+class NgramProperty;
+namespace backward {
+namespace v402 {
+} // namespace v402
+} // namespace backward
+class HeaderPolicy;
+namespace backward {
+namespace v402 {
+class TerminalPositionLookupTable;
+
+class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
+ public:
+ Ver4BigramListPolicy(BigramDictContent *const bigramDictContent,
+ const TerminalPositionLookupTable *const terminalPositionLookupTable,
+ const HeaderPolicy *const headerPolicy)
+ : mBigramDictContent(bigramDictContent),
+ mTerminalPositionLookupTable(terminalPositionLookupTable),
+ mHeaderPolicy(headerPolicy) {}
+
+ void getNextBigram(int *const outBigramPos, int *const outProbability,
+ bool *const outHasNext, int *const bigramEntryPos) const;
+
+ bool skipAllBigrams(int *const pos) const {
+ // Do nothing because we don't need to skip bigram lists in ver4 dictionaries.
+ return true;
+ }
+
+ bool addNewEntry(const int terminalId, const int newTargetTerminalId,
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
+
+ bool removeEntry(const int terminalId, const int targetTerminalId);
+
+ bool updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId,
+ int *const outBigramCount);
+
+ int getBigramEntryConut(const int terminalId);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy);
+
+ int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos,
+ int *const outTailEntryPos) const;
+
+ const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry,
+ const NgramProperty *const ngramProperty) const;
+
+ bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos);
+
+ BigramDictContent *const mBigramDictContent;
+ const TerminalPositionLookupTable *const mTerminalPositionLookupTable;
+ const HeaderPolicy *const mHeaderPolicy;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp
new file mode 100644
index 000000000..7fa85dec2
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/bigram_dict_content.cpp
+ */
+
+#include "dictionary/structure/backward/v402/content/bigram_dict_content.h"
+
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition(
+ int *const bigramEntryPos) const {
+ const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer();
+ const int bigramEntryTailPos = (*bigramEntryPos) + getBigramEntrySize();
+ if (*bigramEntryPos < 0 || bigramEntryTailPos > bigramListBuffer->getTailPosition()) {
+ AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bigramEntryTailPos: %d, "
+ "bufSize: %d", *bigramEntryPos, bigramEntryTailPos,
+ bigramListBuffer->getTailPosition());
+ ASSERT(false);
+ return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
+ Ver4DictConstants::NOT_A_TERMINAL_ID);
+ }
+ const int bigramFlags = bigramListBuffer->readUintAndAdvancePosition(
+ Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, bigramEntryPos);
+ const bool hasNext = (bigramFlags & Ver4DictConstants::BIGRAM_HAS_NEXT_MASK) != 0;
+ int probability = NOT_A_PROBABILITY;
+ int timestamp = NOT_A_TIMESTAMP;
+ int level = 0;
+ int count = 0;
+ if (mHasHistoricalInfo) {
+ timestamp = bigramListBuffer->readUintAndAdvancePosition(
+ Ver4DictConstants::TIME_STAMP_FIELD_SIZE, bigramEntryPos);
+ level = bigramListBuffer->readUintAndAdvancePosition(
+ Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, bigramEntryPos);
+ count = bigramListBuffer->readUintAndAdvancePosition(
+ Ver4DictConstants::WORD_COUNT_FIELD_SIZE, bigramEntryPos);
+ } else {
+ probability = bigramListBuffer->readUintAndAdvancePosition(
+ Ver4DictConstants::PROBABILITY_SIZE, bigramEntryPos);
+ }
+ const int encodedTargetTerminalId = bigramListBuffer->readUintAndAdvancePosition(
+ Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, bigramEntryPos);
+ const int targetTerminalId =
+ (encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ?
+ Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId;
+ if (mHasHistoricalInfo) {
+ // Hack for better migration.
+ count += level;
+ const HistoricalInfo historicalInfo(timestamp, level, count);
+ return BigramEntry(hasNext, probability, &historicalInfo, targetTerminalId);
+ } else {
+ return BigramEntry(hasNext, probability, targetTerminalId);
+ }
+}
+
+bool BigramDictContent::writeBigramEntryAndAdvancePosition(
+ const BigramEntry *const bigramEntryToWrite, int *const entryWritingPos) {
+ BufferWithExtendableBuffer *const bigramListBuffer = getWritableContentBuffer();
+ const int bigramFlags = createAndGetBigramFlags(bigramEntryToWrite->hasNext());
+ if (!bigramListBuffer->writeUintAndAdvancePosition(bigramFlags,
+ Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, entryWritingPos)) {
+ AKLOGE("Cannot write bigram flags. pos: %d, flags: %x", *entryWritingPos, bigramFlags);
+ return false;
+ }
+ if (mHasHistoricalInfo) {
+ const HistoricalInfo *const historicalInfo = bigramEntryToWrite->getHistoricalInfo();
+ if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getTimestamp(),
+ Ver4DictConstants::TIME_STAMP_FIELD_SIZE, entryWritingPos)) {
+ AKLOGE("Cannot write bigram timestamps. pos: %d, timestamp: %d", *entryWritingPos,
+ historicalInfo->getTimestamp());
+ return false;
+ }
+ if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getLevel(),
+ Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, entryWritingPos)) {
+ AKLOGE("Cannot write bigram level. pos: %d, level: %d", *entryWritingPos,
+ historicalInfo->getLevel());
+ return false;
+ }
+ if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getCount(),
+ Ver4DictConstants::WORD_COUNT_FIELD_SIZE, entryWritingPos)) {
+ AKLOGE("Cannot write bigram count. pos: %d, count: %d", *entryWritingPos,
+ historicalInfo->getCount());
+ return false;
+ }
+ } else {
+ if (!bigramListBuffer->writeUintAndAdvancePosition(bigramEntryToWrite->getProbability(),
+ Ver4DictConstants::PROBABILITY_SIZE, entryWritingPos)) {
+ AKLOGE("Cannot write bigram probability. pos: %d, probability: %d", *entryWritingPos,
+ bigramEntryToWrite->getProbability());
+ return false;
+ }
+ }
+ const int targetTerminalIdToWrite =
+ (bigramEntryToWrite->getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) ?
+ Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID :
+ bigramEntryToWrite->getTargetTerminalId();
+ if (!bigramListBuffer->writeUintAndAdvancePosition(targetTerminalIdToWrite,
+ Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, entryWritingPos)) {
+ AKLOGE("Cannot write bigram target terminal id. pos: %d, target terminal id: %d",
+ *entryWritingPos, bigramEntryToWrite->getTargetTerminalId());
+ return false;
+ }
+ return true;
+}
+
+bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos,
+ int *const outTailEntryPos) {
+ int readingPos = bigramListPos;
+ int writingPos = toPos;
+ bool hasNext = true;
+ while (hasNext) {
+ const BigramEntry bigramEntry = getBigramEntryAndAdvancePosition(&readingPos);
+ hasNext = bigramEntry.hasNext();
+ if (!hasNext) {
+ *outTailEntryPos = writingPos;
+ }
+ if (!writeBigramEntryAndAdvancePosition(&bigramEntry, &writingPos)) {
+ AKLOGE("Cannot write bigram entry to copy. pos: %d", writingPos);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ const BigramDictContent *const originalBigramDictContent,
+ int *const outBigramEntryCount) {
+ for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
+ it != terminalIdMap->end(); ++it) {
+ const int originalBigramListPos =
+ originalBigramDictContent->getBigramListHeadPos(it->first);
+ if (originalBigramListPos == NOT_A_DICT_POS) {
+ // This terminal does not have a bigram list.
+ continue;
+ }
+ const int bigramListPos = getContentBuffer()->getTailPosition();
+ int bigramEntryCount = 0;
+ // Copy bigram list with GC from original content.
+ if (!runGCBigramList(originalBigramListPos, originalBigramDictContent, bigramListPos,
+ terminalIdMap, &bigramEntryCount)) {
+ AKLOGE("Cannot complete GC for the bigram list. original pos: %d, pos: %d",
+ originalBigramListPos, bigramListPos);
+ return false;
+ }
+ if (bigramEntryCount == 0) {
+ // All bigram entries are useless. This terminal does not have a bigram list.
+ continue;
+ }
+ *outBigramEntryCount += bigramEntryCount;
+ // Set bigram list position to the lookup table.
+ if (!getUpdatableAddressLookupTable()->set(it->second, bigramListPos)) {
+ AKLOGE("Cannot set bigram list position. terminal id: %d, pos: %d",
+ it->second, bigramListPos);
+ return false;
+ }
+ }
+ return true;
+}
+
+// Returns whether GC for the bigram list was succeeded or not.
+bool BigramDictContent::runGCBigramList(const int bigramListPos,
+ const BigramDictContent *const sourceBigramDictContent, const int toPos,
+ const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ int *const outEntrycount) {
+ bool hasNext = true;
+ int readingPos = bigramListPos;
+ int writingPos = toPos;
+ int lastEntryPos = NOT_A_DICT_POS;
+ while (hasNext) {
+ const BigramEntry originalBigramEntry =
+ sourceBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+ hasNext = originalBigramEntry.hasNext();
+ if (originalBigramEntry.getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) {
+ continue;
+ }
+ TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
+ terminalIdMap->find(originalBigramEntry.getTargetTerminalId());
+ if (it == terminalIdMap->end()) {
+ // Target word has been removed.
+ continue;
+ }
+ lastEntryPos = hasNext ? writingPos : NOT_A_DICT_POS;
+ const BigramEntry updatedBigramEntry =
+ originalBigramEntry.updateTargetTerminalIdAndGetEntry(it->second);
+ if (!writeBigramEntryAndAdvancePosition(&updatedBigramEntry, &writingPos)) {
+ AKLOGE("Cannot write bigram entry to run GC. pos: %d", writingPos);
+ return false;
+ }
+ *outEntrycount += 1;
+ }
+ if (lastEntryPos != NOT_A_DICT_POS) {
+ // Update has next flag in the last written entry.
+ const BigramEntry bigramEntry = getBigramEntry(lastEntryPos).updateHasNextAndGetEntry(
+ false /* hasNext */);
+ if (!writeBigramEntry(&bigramEntry, lastEntryPos)) {
+ AKLOGE("Cannot write bigram entry to set hasNext flag after GC. pos: %d", writingPos);
+ return false;
+ }
+ }
+ return true;
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h
new file mode 100644
index 000000000..14f334a12
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/bigram_dict_content.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H
+#define LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/content/bigram_entry.h"
+#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+class BigramDictContent : public SparseTableDictContent {
+ public:
+ BigramDictContent(const char *const dictPath, const bool hasHistoricalInfo,
+ const bool isUpdatable)
+ : SparseTableDictContent(dictPath,
+ Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION,
+ Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION,
+ Ver4DictConstants::BIGRAM_FILE_EXTENSION, isUpdatable,
+ Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE,
+ Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE),
+ mHasHistoricalInfo(hasHistoricalInfo) {}
+
+ BigramDictContent(const bool hasHistoricalInfo)
+ : SparseTableDictContent(Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE,
+ Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE),
+ mHasHistoricalInfo(hasHistoricalInfo) {}
+
+ const BigramEntry getBigramEntry(const int bigramEntryPos) const {
+ int readingPos = bigramEntryPos;
+ return getBigramEntryAndAdvancePosition(&readingPos);
+ }
+
+ const BigramEntry getBigramEntryAndAdvancePosition(int *const bigramEntryPos) const;
+
+ // Returns head position of bigram list for a PtNode specified by terminalId.
+ int getBigramListHeadPos(const int terminalId) const {
+ const SparseTable *const addressLookupTable = getAddressLookupTable();
+ if (!addressLookupTable->contains(terminalId)) {
+ return NOT_A_DICT_POS;
+ }
+ return addressLookupTable->get(terminalId);
+ }
+
+ bool writeBigramEntryAtTail(const BigramEntry *const bigramEntryToWrite) {
+ int writingPos = getContentBuffer()->getTailPosition();
+ return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos);
+ }
+
+ bool writeBigramEntry(const BigramEntry *const bigramEntryToWrite, const int entryWritingPos) {
+ int writingPos = entryWritingPos;
+ return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos);
+ }
+
+ bool writeBigramEntryAndAdvancePosition(const BigramEntry *const bigramEntryToWrite,
+ int *const entryWritingPos);
+
+ bool createNewBigramList(const int terminalId) {
+ const int bigramListPos = getContentBuffer()->getTailPosition();
+ return getUpdatableAddressLookupTable()->set(terminalId, bigramListPos);
+ }
+
+ bool copyBigramList(const int bigramListPos, const int toPos, int *const outTailEntryPos);
+
+ bool flushToFile(const char *const dictPath) const {
+ return flush(dictPath, Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION,
+ Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION,
+ Ver4DictConstants::BIGRAM_FILE_EXTENSION);
+ }
+
+ bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ const BigramDictContent *const originalBigramDictContent,
+ int *const outBigramEntryCount);
+
+ bool isContentTailPos(const int pos) const {
+ return pos == getContentBuffer()->getTailPosition();
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(BigramDictContent);
+
+ int createAndGetBigramFlags(const bool hasNext) const {
+ return hasNext ? Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0;
+ }
+
+ int getBigramEntrySize() const {
+ if (mHasHistoricalInfo) {
+ return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE
+ + Ver4DictConstants::TIME_STAMP_FIELD_SIZE
+ + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE
+ + Ver4DictConstants::WORD_COUNT_FIELD_SIZE
+ + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE;
+ } else {
+ return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE
+ + Ver4DictConstants::PROBABILITY_SIZE
+ + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE;
+ }
+ }
+
+ bool runGCBigramList(const int bigramListPos,
+ const BigramDictContent *const sourceBigramDictContent, const int toPos,
+ const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ int *const outEntryCount);
+
+ bool mHasHistoricalInfo;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h b/native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h
new file mode 100644
index 000000000..36ad855ee
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/bigram_entry.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H
+#define LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+class BigramEntry {
+ public:
+ BigramEntry(const BigramEntry& bigramEntry)
+ : mHasNext(bigramEntry.mHasNext), mProbability(bigramEntry.mProbability),
+ mHistoricalInfo(), mTargetTerminalId(bigramEntry.mTargetTerminalId) {}
+
+ // Entry with historical information.
+ BigramEntry(const bool hasNext, const int probability, const int targetTerminalId)
+ : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(),
+ mTargetTerminalId(targetTerminalId) {}
+
+ // Entry with historical information.
+ BigramEntry(const bool hasNext, const int probability,
+ const HistoricalInfo *const historicalInfo, const int targetTerminalId)
+ : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(*historicalInfo),
+ mTargetTerminalId(targetTerminalId) {}
+
+ const BigramEntry getInvalidatedEntry() const {
+ return updateTargetTerminalIdAndGetEntry(Ver4DictConstants::NOT_A_TERMINAL_ID);
+ }
+
+ const BigramEntry updateHasNextAndGetEntry(const bool hasNext) const {
+ return BigramEntry(hasNext, mProbability, &mHistoricalInfo, mTargetTerminalId);
+ }
+
+ const BigramEntry updateTargetTerminalIdAndGetEntry(const int newTargetTerminalId) const {
+ return BigramEntry(mHasNext, mProbability, &mHistoricalInfo, newTargetTerminalId);
+ }
+
+ const BigramEntry updateProbabilityAndGetEntry(const int probability) const {
+ return BigramEntry(mHasNext, probability, &mHistoricalInfo, mTargetTerminalId);
+ }
+
+ const BigramEntry updateHistoricalInfoAndGetEntry(
+ const HistoricalInfo *const historicalInfo) const {
+ return BigramEntry(mHasNext, mProbability, historicalInfo, mTargetTerminalId);
+ }
+
+ bool isValid() const {
+ return mTargetTerminalId != Ver4DictConstants::NOT_A_TERMINAL_ID;
+ }
+
+ bool hasNext() const {
+ return mHasNext;
+ }
+
+ int getProbability() const {
+ return mProbability;
+ }
+
+ bool hasHistoricalInfo() const {
+ return mHistoricalInfo.isValid();
+ }
+
+ const HistoricalInfo *getHistoricalInfo() const {
+ return &mHistoricalInfo;
+ }
+
+ int getTargetTerminalId() const {
+ return mTargetTerminalId;
+ }
+
+ private:
+ // Copy constructor is public to use this class as a type of return value.
+ DISALLOW_DEFAULT_CONSTRUCTOR(BigramEntry);
+ DISALLOW_ASSIGNMENT_OPERATOR(BigramEntry);
+
+ const bool mHasNext;
+ const int mProbability;
+ const HistoricalInfo mHistoricalInfo;
+ const int mTargetTerminalId;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/dict_content.h
new file mode 100644
index 000000000..d3b84fa04
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/dict_content.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/dict_content.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_DICT_CONTENT_H
+#define LATINIME_BACKWARD_V402_DICT_CONTENT_H
+
+#include "defines.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+class DictContent {
+ public:
+ virtual ~DictContent() {}
+ virtual bool isValid() const = 0;
+
+ protected:
+ DictContent() {}
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(DictContent);
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp
new file mode 100644
index 000000000..b167f0ab2
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/probability_dict_content.cpp
+ */
+
+#include "dictionary/structure/backward/v402/content/probability_dict_content.h"
+
+#include "dictionary/structure/backward/v402/content/probability_entry.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+const ProbabilityEntry ProbabilityDictContent::getProbabilityEntry(const int terminalId) const {
+ if (terminalId < 0 || terminalId >= mSize) {
+ // This method can be called with invalid terminal id during GC.
+ return ProbabilityEntry(0 /* flags */, NOT_A_PROBABILITY);
+ }
+ const BufferWithExtendableBuffer *const buffer = getBuffer();
+ int entryPos = getEntryPos(terminalId);
+ const int flags = buffer->readUintAndAdvancePosition(
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &entryPos);
+ const int probability = buffer->readUintAndAdvancePosition(
+ Ver4DictConstants::PROBABILITY_SIZE, &entryPos);
+ if (mHasHistoricalInfo) {
+ const int timestamp = buffer->readUintAndAdvancePosition(
+ Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &entryPos);
+ const int level = buffer->readUintAndAdvancePosition(
+ Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &entryPos);
+ const int count = buffer->readUintAndAdvancePosition(
+ Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &entryPos);
+ // Hack for better migration.
+ const HistoricalInfo historicalInfo(timestamp, level, count + level);
+ return ProbabilityEntry(flags, probability, &historicalInfo);
+ } else {
+ return ProbabilityEntry(flags, probability);
+ }
+}
+
+bool ProbabilityDictContent::setProbabilityEntry(const int terminalId,
+ const ProbabilityEntry *const probabilityEntry) {
+ if (terminalId < 0) {
+ return false;
+ }
+ const int entryPos = getEntryPos(terminalId);
+ if (terminalId >= mSize) {
+ ProbabilityEntry dummyEntry;
+ // Write new entry.
+ int writingPos = getBuffer()->getTailPosition();
+ while (writingPos <= entryPos) {
+ // Fulfilling with dummy entries until writingPos.
+ if (!writeEntry(&dummyEntry, writingPos)) {
+ AKLOGE("Cannot write dummy entry. pos: %d, mSize: %d", writingPos, mSize);
+ return false;
+ }
+ writingPos += getEntrySize();
+ }
+ mSize = terminalId + 1;
+ }
+ return writeEntry(probabilityEntry, entryPos);
+}
+
+bool ProbabilityDictContent::flushToFile(const char *const dictPath) const {
+ if (getEntryPos(mSize) < getBuffer()->getTailPosition()) {
+ ProbabilityDictContent probabilityDictContentToWrite(mHasHistoricalInfo);
+ for (int i = 0; i < mSize; ++i) {
+ const ProbabilityEntry probabilityEntry = getProbabilityEntry(i);
+ if (!probabilityDictContentToWrite.setProbabilityEntry(i, &probabilityEntry)) {
+ AKLOGE("Cannot set probability entry in flushToFile. terminalId: %d", i);
+ return false;
+ }
+ }
+ return probabilityDictContentToWrite.flush(dictPath,
+ Ver4DictConstants::FREQ_FILE_EXTENSION);
+ } else {
+ return flush(dictPath, Ver4DictConstants::FREQ_FILE_EXTENSION);
+ }
+}
+
+bool ProbabilityDictContent::runGC(
+ const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ const ProbabilityDictContent *const originalProbabilityDictContent) {
+ for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
+ it != terminalIdMap->end(); ++it) {
+ const ProbabilityEntry probabilityEntry =
+ originalProbabilityDictContent->getProbabilityEntry(it->first);
+ if (!setProbabilityEntry(it->second, &probabilityEntry)) {
+ AKLOGE("Cannot set probability entry in runGC. terminalId: %d", it->second);
+ return false;
+ }
+ }
+ return true;
+}
+
+int ProbabilityDictContent::getEntrySize() const {
+ if (mHasHistoricalInfo) {
+ return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
+ + Ver4DictConstants::PROBABILITY_SIZE
+ + Ver4DictConstants::TIME_STAMP_FIELD_SIZE
+ + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE
+ + Ver4DictConstants::WORD_COUNT_FIELD_SIZE;
+ } else {
+ return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
+ + Ver4DictConstants::PROBABILITY_SIZE;
+ }
+}
+
+int ProbabilityDictContent::getEntryPos(const int terminalId) const {
+ return terminalId * getEntrySize();
+}
+
+bool ProbabilityDictContent::writeEntry(const ProbabilityEntry *const probabilityEntry,
+ const int entryPos) {
+ BufferWithExtendableBuffer *const bufferToWrite = getWritableBuffer();
+ int writingPos = entryPos;
+ if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getFlags(),
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) {
+ AKLOGE("Cannot write flags in probability dict content. pos: %d", writingPos);
+ return false;
+ }
+ if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getProbability(),
+ Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) {
+ AKLOGE("Cannot write probability in probability dict content. pos: %d", writingPos);
+ return false;
+ }
+ if (mHasHistoricalInfo) {
+ const HistoricalInfo *const historicalInfo = probabilityEntry->getHistoricalInfo();
+ if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getTimestamp(),
+ Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &writingPos)) {
+ AKLOGE("Cannot write timestamp in probability dict content. pos: %d", writingPos);
+ return false;
+ }
+ if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getLevel(),
+ Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &writingPos)) {
+ AKLOGE("Cannot write level in probability dict content. pos: %d", writingPos);
+ return false;
+ }
+ if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getCount(),
+ Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &writingPos)) {
+ AKLOGE("Cannot write count in probability dict content. pos: %d", writingPos);
+ return false;
+ }
+ }
+ return true;
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h
new file mode 100644
index 000000000..464b29f3f
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/probability_dict_content.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H
+#define LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/content/single_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+class ProbabilityEntry;
+
+class ProbabilityDictContent : public SingleDictContent {
+ public:
+ ProbabilityDictContent(const char *const dictPath, const bool hasHistoricalInfo,
+ const bool isUpdatable)
+ : SingleDictContent(dictPath, Ver4DictConstants::FREQ_FILE_EXTENSION, isUpdatable),
+ mHasHistoricalInfo(hasHistoricalInfo),
+ mSize(getBuffer()->getTailPosition() / getEntrySize()) {}
+
+ ProbabilityDictContent(const bool hasHistoricalInfo)
+ : mHasHistoricalInfo(hasHistoricalInfo), mSize(0) {}
+
+ const ProbabilityEntry getProbabilityEntry(const int terminalId) const;
+
+ bool setProbabilityEntry(const int terminalId, const ProbabilityEntry *const probabilityEntry);
+
+ bool flushToFile(const char *const dictPath) const;
+
+ bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ const ProbabilityDictContent *const originalProbabilityDictContent);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent);
+
+ int getEntrySize() const;
+
+ int getEntryPos(const int terminalId) const;
+
+ bool writeEntry(const ProbabilityEntry *const probabilityEntry, const int entryPos);
+
+ bool mHasHistoricalInfo;
+ int mSize;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h b/native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h
new file mode 100644
index 000000000..94e36bf51
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/probability_entry.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H
+#define LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+class ProbabilityEntry {
+ public:
+ ProbabilityEntry(const ProbabilityEntry &probabilityEntry)
+ : mFlags(probabilityEntry.mFlags), mProbability(probabilityEntry.mProbability),
+ mHistoricalInfo(probabilityEntry.mHistoricalInfo) {}
+
+ // Dummy entry
+ ProbabilityEntry()
+ : mFlags(0), mProbability(NOT_A_PROBABILITY), mHistoricalInfo() {}
+
+ // Entry without historical information
+ ProbabilityEntry(const int flags, const int probability)
+ : mFlags(flags), mProbability(probability), mHistoricalInfo() {}
+
+ // Entry with historical information.
+ ProbabilityEntry(const int flags, const int probability,
+ const HistoricalInfo *const historicalInfo)
+ : mFlags(flags), mProbability(probability), mHistoricalInfo(*historicalInfo) {}
+
+ const ProbabilityEntry createEntryWithUpdatedProbability(const int probability) const {
+ return ProbabilityEntry(mFlags, probability, &mHistoricalInfo);
+ }
+
+ const ProbabilityEntry createEntryWithUpdatedHistoricalInfo(
+ const HistoricalInfo *const historicalInfo) const {
+ return ProbabilityEntry(mFlags, mProbability, historicalInfo);
+ }
+
+ bool hasHistoricalInfo() const {
+ return mHistoricalInfo.isValid();
+ }
+
+ int getFlags() const {
+ return mFlags;
+ }
+
+ int getProbability() const {
+ return mProbability;
+ }
+
+ const HistoricalInfo *getHistoricalInfo() const {
+ return &mHistoricalInfo;
+ }
+
+ private:
+ // Copy constructor is public to use this class as a type of return value.
+ DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry);
+
+ const int mFlags;
+ const int mProbability;
+ const HistoricalInfo mHistoricalInfo;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp
new file mode 100644
index 000000000..e538a02a1
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/shortcut_dict_content.cpp
+ */
+
+#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h"
+
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePointCount,
+ int *const outCodePoint, int *const outCodePointCount, int *const outProbability,
+ bool *const outhasNext, int *const shortcutEntryPos) const {
+ const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer();
+ if (*shortcutEntryPos < 0 || *shortcutEntryPos >= shortcutListBuffer->getTailPosition()) {
+ AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d",
+ *shortcutEntryPos, shortcutListBuffer->getTailPosition());
+ ASSERT(false);
+ if (outhasNext) {
+ *outhasNext = false;
+ }
+ if (outCodePointCount) {
+ *outCodePointCount = 0;
+ }
+ return;
+ }
+
+ const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition(
+ Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos);
+ if (outProbability) {
+ *outProbability = shortcutFlags & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK;
+ }
+ if (outhasNext) {
+ *outhasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK;
+ }
+ if (outCodePoint && outCodePointCount) {
+ shortcutListBuffer->readCodePointsAndAdvancePosition(
+ maxCodePointCount, outCodePoint, outCodePointCount, shortcutEntryPos);
+ }
+}
+
+int ShortcutDictContent::getShortcutListHeadPos(const int terminalId) const {
+ const SparseTable *const addressLookupTable = getAddressLookupTable();
+ if (!addressLookupTable->contains(terminalId)) {
+ return NOT_A_DICT_POS;
+ }
+ return addressLookupTable->get(terminalId);
+}
+
+bool ShortcutDictContent::flushToFile(const char *const dictPath) const {
+ return flush(dictPath, Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION,
+ Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION,
+ Ver4DictConstants::SHORTCUT_FILE_EXTENSION);
+}
+
+bool ShortcutDictContent::runGC(
+ const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ const ShortcutDictContent *const originalShortcutDictContent) {
+ for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
+ it != terminalIdMap->end(); ++it) {
+ const int originalShortcutListPos =
+ originalShortcutDictContent->getShortcutListHeadPos(it->first);
+ if (originalShortcutListPos == NOT_A_DICT_POS) {
+ continue;
+ }
+ const int shortcutListPos = getContentBuffer()->getTailPosition();
+ // Copy shortcut list from original content.
+ if (!copyShortcutListFromDictContent(originalShortcutListPos, originalShortcutDictContent,
+ shortcutListPos)) {
+ AKLOGE("Cannot copy shortcut list during GC. original pos: %d, pos: %d",
+ originalShortcutListPos, shortcutListPos);
+ return false;
+ }
+ // Set shortcut list position to the lookup table.
+ if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) {
+ AKLOGE("Cannot set shortcut list position. terminal id: %d, pos: %d",
+ it->second, shortcutListPos);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool ShortcutDictContent::createNewShortcutList(const int terminalId) {
+ const int shortcutListListPos = getContentBuffer()->getTailPosition();
+ return getUpdatableAddressLookupTable()->set(terminalId, shortcutListListPos);
+}
+
+bool ShortcutDictContent::copyShortcutList(const int shortcutListPos, const int toPos) {
+ return copyShortcutListFromDictContent(shortcutListPos, this, toPos);
+}
+
+bool ShortcutDictContent::copyShortcutListFromDictContent(const int shortcutListPos,
+ const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) {
+ bool hasNext = true;
+ int readingPos = shortcutListPos;
+ int writingPos = toPos;
+ int codePoints[MAX_WORD_LENGTH];
+ while (hasNext) {
+ int probability = 0;
+ int codePointCount = 0;
+ sourceShortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH,
+ codePoints, &codePointCount, &probability, &hasNext, &readingPos);
+ if (!writeShortcutEntryAndAdvancePosition(codePoints, codePointCount, probability,
+ hasNext, &writingPos)) {
+ AKLOGE("Cannot write shortcut entry to copy. pos: %d", writingPos);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool ShortcutDictContent::setProbability(const int probability, const int shortcutEntryPos) {
+ BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer();
+ const int shortcutFlags = shortcutListBuffer->readUint(
+ Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos);
+ const bool hasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK;
+ const int shortcutFlagsToWrite = createAndGetShortcutFlags(probability, hasNext);
+ return shortcutListBuffer->writeUint(shortcutFlagsToWrite,
+ Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos);
+}
+
+bool ShortcutDictContent::writeShortcutEntryAndAdvancePosition(const int *const codePoint,
+ const int codePointCount, const int probability, const bool hasNext,
+ int *const shortcutEntryPos) {
+ BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer();
+ const int shortcutFlags = createAndGetShortcutFlags(probability, hasNext);
+ if (!shortcutListBuffer->writeUintAndAdvancePosition(shortcutFlags,
+ Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos)) {
+ AKLOGE("Cannot write shortcut flags. flags; %x, pos: %d", shortcutFlags, *shortcutEntryPos);
+ return false;
+ }
+ if (!shortcutListBuffer->writeCodePointsAndAdvancePosition(codePoint, codePointCount,
+ true /* writesTerminator */, shortcutEntryPos)) {
+ AKLOGE("Cannot write shortcut target code points. pos: %d", *shortcutEntryPos);
+ return false;
+ }
+ return true;
+}
+
+// Find a shortcut entry that has specified target and return its position.
+int ShortcutDictContent::findShortcutEntryAndGetPos(const int shortcutListPos,
+ const int *const targetCodePointsToFind, const int codePointCount) const {
+ bool hasNext = true;
+ int readingPos = shortcutListPos;
+ int targetCodePoints[MAX_WORD_LENGTH];
+ while (hasNext) {
+ const int entryPos = readingPos;
+ int probability = 0;
+ int targetCodePointCount = 0;
+ getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, targetCodePoints, &targetCodePointCount,
+ &probability, &hasNext, &readingPos);
+ if (targetCodePointCount != codePointCount) {
+ continue;
+ }
+ bool matched = true;
+ for (int i = 0; i < codePointCount; ++i) {
+ if (targetCodePointsToFind[i] != targetCodePoints[i]) {
+ matched = false;
+ break;
+ }
+ }
+ if (matched) {
+ return entryPos;
+ }
+ }
+ return NOT_A_DICT_POS;
+}
+
+int ShortcutDictContent::createAndGetShortcutFlags(const int probability,
+ const bool hasNext) const {
+ return (probability & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK)
+ | (hasNext ? Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK : 0);
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h
new file mode 100644
index 000000000..3b725e896
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/shortcut_dict_content.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H
+#define LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+class ShortcutDictContent : public SparseTableDictContent {
+ public:
+ ShortcutDictContent(const char *const dictPath, const bool isUpdatable)
+ : SparseTableDictContent(dictPath,
+ Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION,
+ Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION,
+ Ver4DictConstants::SHORTCUT_FILE_EXTENSION, isUpdatable,
+ Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE,
+ Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {}
+
+ ShortcutDictContent()
+ : SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE,
+ Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {}
+
+ void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint,
+ int *const outCodePointCount, int *const outProbability, bool *const outhasNext,
+ const int shortcutEntryPos) {
+ int readingPos = shortcutEntryPos;
+ return getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint,
+ outCodePointCount, outProbability, outhasNext, &readingPos);
+ }
+
+ void getShortcutEntryAndAdvancePosition(const int maxCodePointCount,
+ int *const outCodePoint, int *const outCodePointCount, int *const outProbability,
+ bool *const outhasNext, int *const shortcutEntryPos) const;
+
+ // Returns head position of shortcut list for a PtNode specified by terminalId.
+ int getShortcutListHeadPos(const int terminalId) const;
+
+ bool flushToFile(const char *const dictPath) const;
+
+ bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ const ShortcutDictContent *const originalShortcutDictContent);
+
+ bool createNewShortcutList(const int terminalId);
+
+ bool copyShortcutList(const int shortcutListPos, const int toPos);
+
+ bool setProbability(const int probability, const int shortcutEntryPos);
+
+ bool writeShortcutEntry(const int *const codePoint, const int codePointCount,
+ const int probability, const bool hasNext, const int shortcutEntryPos) {
+ int writingPos = shortcutEntryPos;
+ return writeShortcutEntryAndAdvancePosition(codePoint, codePointCount, probability,
+ hasNext, &writingPos);
+ }
+
+ bool writeShortcutEntryAndAdvancePosition(const int *const codePoint,
+ const int codePointCount, const int probability, const bool hasNext,
+ int *const shortcutEntryPos);
+
+ int findShortcutEntryAndGetPos(const int shortcutListPos,
+ const int *const targetCodePointsToFind, const int codePointCount) const;
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent);
+
+ bool copyShortcutListFromDictContent(const int shortcutListPos,
+ const ShortcutDictContent *const sourceShortcutDictContent, const int toPos);
+
+ int createAndGetShortcutFlags(const int probability, const bool hasNext) const;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h
new file mode 100644
index 000000000..89df2a1e0
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/single_dict_content.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H
+#define LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/content/dict_content.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+class SingleDictContent : public DictContent {
+ public:
+ SingleDictContent(const char *const dictPath, const char *const contentFileName,
+ const bool isUpdatable)
+ : mMmappedBuffer(MmappedBuffer::openBuffer(dictPath, contentFileName, isUpdatable)),
+ mExpandableContentBuffer(
+ mMmappedBuffer ? mMmappedBuffer->getReadWriteByteArrayView() :
+ ReadWriteByteArrayView(),
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+ mIsValid(mMmappedBuffer) {}
+
+ SingleDictContent()
+ : mMmappedBuffer(nullptr),
+ mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), mIsValid(true) {}
+
+ virtual ~SingleDictContent() {}
+
+ virtual bool isValid() const {
+ return mIsValid;
+ }
+
+ bool isNearSizeLimit() const {
+ return mExpandableContentBuffer.isNearSizeLimit();
+ }
+
+ protected:
+ BufferWithExtendableBuffer *getWritableBuffer() {
+ return &mExpandableContentBuffer;
+ }
+
+ const BufferWithExtendableBuffer *getBuffer() const {
+ return &mExpandableContentBuffer;
+ }
+
+ bool flush(const char *const dictPath, const char *const contentFileNameSuffix) const {
+ return DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath,
+ contentFileNameSuffix, &mExpandableContentBuffer);
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(SingleDictContent);
+
+ const MmappedBuffer::MmappedBufferPtr mMmappedBuffer;
+ BufferWithExtendableBuffer mExpandableContentBuffer;
+ const bool mIsValid;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp
new file mode 100644
index 000000000..280f0f85a
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/sparse_table_dict_content.cpp
+ */
+
+#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+bool SparseTableDictContent::flush(const char *const dictPath,
+ const char *const lookupTableFileNameSuffix, const char *const addressTableFileNameSuffix,
+ const char *const contentFileNameSuffix) const {
+ if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, lookupTableFileNameSuffix,
+ &mExpandableLookupTableBuffer)){
+ return false;
+ }
+ if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, addressTableFileNameSuffix,
+ &mExpandableAddressTableBuffer)) {
+ return false;
+ }
+ if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, contentFileNameSuffix,
+ &mExpandableContentBuffer)) {
+ return false;
+ }
+ return true;
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h
new file mode 100644
index 000000000..4b5af87ad
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/sparse_table_dict_content.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H
+#define LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/content/dict_content.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
+#include "dictionary/utils/sparse_table.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// TODO: Support multiple contents.
+class SparseTableDictContent : public DictContent {
+ public:
+ AK_FORCE_INLINE SparseTableDictContent(const char *const dictPath,
+ const char *const lookupTableFileName, const char *const addressTableFileName,
+ const char *const contentFileName, const bool isUpdatable,
+ const int sparseTableBlockSize, const int sparseTableDataSize)
+ : mLookupTableBuffer(
+ MmappedBuffer::openBuffer(dictPath, lookupTableFileName, isUpdatable)),
+ mAddressTableBuffer(
+ MmappedBuffer::openBuffer(dictPath, addressTableFileName, isUpdatable)),
+ mContentBuffer(
+ MmappedBuffer::openBuffer(dictPath, contentFileName, isUpdatable)),
+ mExpandableLookupTableBuffer(
+ mLookupTableBuffer ? mLookupTableBuffer->getReadWriteByteArrayView() :
+ ReadWriteByteArrayView(),
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+ mExpandableAddressTableBuffer(
+ mAddressTableBuffer ? mAddressTableBuffer->getReadWriteByteArrayView() :
+ ReadWriteByteArrayView(),
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+ mExpandableContentBuffer(
+ mContentBuffer ? mContentBuffer->getReadWriteByteArrayView() :
+ ReadWriteByteArrayView(),
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+ mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer,
+ sparseTableBlockSize, sparseTableDataSize),
+ mIsValid(mLookupTableBuffer && mAddressTableBuffer && mContentBuffer) {}
+
+ SparseTableDictContent(const int sparseTableBlockSize, const int sparseTableDataSize)
+ : mLookupTableBuffer(), mAddressTableBuffer(), mContentBuffer(),
+ mExpandableLookupTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+ mExpandableAddressTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+ mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+ mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer,
+ sparseTableBlockSize, sparseTableDataSize), mIsValid(true) {}
+
+ virtual ~SparseTableDictContent() {}
+
+ virtual bool isValid() const {
+ return mIsValid;
+ }
+
+ bool isNearSizeLimit() const {
+ return mExpandableLookupTableBuffer.isNearSizeLimit()
+ || mExpandableAddressTableBuffer.isNearSizeLimit()
+ || mExpandableContentBuffer.isNearSizeLimit();
+ }
+
+ protected:
+ SparseTable *getUpdatableAddressLookupTable() {
+ return &mAddressLookupTable;
+ }
+
+ const SparseTable *getAddressLookupTable() const {
+ return &mAddressLookupTable;
+ }
+
+ BufferWithExtendableBuffer *getWritableContentBuffer() {
+ return &mExpandableContentBuffer;
+ }
+
+ const BufferWithExtendableBuffer *getContentBuffer() const {
+ return &mExpandableContentBuffer;
+ }
+
+ bool flush(const char *const dictDirPath, const char *const lookupTableFileName,
+ const char *const addressTableFileName, const char *const contentFileName) const;
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTableDictContent);
+
+ const MmappedBuffer::MmappedBufferPtr mLookupTableBuffer;
+ const MmappedBuffer::MmappedBufferPtr mAddressTableBuffer;
+ const MmappedBuffer::MmappedBufferPtr mContentBuffer;
+ BufferWithExtendableBuffer mExpandableLookupTableBuffer;
+ BufferWithExtendableBuffer mExpandableAddressTableBuffer;
+ BufferWithExtendableBuffer mExpandableContentBuffer;
+ SparseTable mAddressLookupTable;
+ const bool mIsValid;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp
new file mode 100644
index 000000000..30b72bbd1
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/terminal_position_lookup_table.cpp
+ */
+
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+int TerminalPositionLookupTable::getTerminalPtNodePosition(const int terminalId) const {
+ if (terminalId < 0 || terminalId >= mSize) {
+ return NOT_A_DICT_POS;
+ }
+ const int terminalPos = getBuffer()->readUint(
+ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId));
+ return (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) ?
+ NOT_A_DICT_POS : terminalPos;
+}
+
+bool TerminalPositionLookupTable::setTerminalPtNodePosition(
+ const int terminalId, const int terminalPtNodePos) {
+ if (terminalId < 0) {
+ return NOT_A_DICT_POS;
+ }
+ while (terminalId >= mSize) {
+ // Write new entry.
+ if (!getWritableBuffer()->writeUint(Ver4DictConstants::NOT_A_TERMINAL_ADDRESS,
+ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(mSize))) {
+ return false;
+ }
+ mSize++;
+ }
+ const int terminalPos = (terminalPtNodePos != NOT_A_DICT_POS) ?
+ terminalPtNodePos : Ver4DictConstants::NOT_A_TERMINAL_ADDRESS;
+ return getWritableBuffer()->writeUint(terminalPos,
+ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId));
+}
+
+bool TerminalPositionLookupTable::flushToFile(const char *const dictPath) const {
+ // If the used buffer size is smaller than the actual buffer size, regenerate the lookup
+ // table and write the new table to the file.
+ if (getEntryPos(mSize) < getBuffer()->getTailPosition()) {
+ TerminalPositionLookupTable lookupTableToWrite;
+ for (int i = 0; i < mSize; ++i) {
+ const int terminalPtNodePosition = getTerminalPtNodePosition(i);
+ if (!lookupTableToWrite.setTerminalPtNodePosition(i, terminalPtNodePosition)) {
+ AKLOGE("Cannot set terminal position to lookupTableToWrite."
+ " terminalId: %d, position: %d", i, terminalPtNodePosition);
+ return false;
+ }
+ }
+ return lookupTableToWrite.flush(dictPath,
+ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
+ } else {
+ // We can simply use this lookup table because the buffer size has not been
+ // changed.
+ return flush(dictPath, Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
+ }
+}
+
+bool TerminalPositionLookupTable::runGCTerminalIds(TerminalIdMap *const terminalIdMap) {
+ int removedEntryCount = 0;
+ int nextNewTerminalId = 0;
+ for (int i = 0; i < mSize; ++i) {
+ const int terminalPos = getBuffer()->readUint(
+ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(i));
+ if (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) {
+ // This entry is a garbage.
+ removedEntryCount++;
+ } else {
+ // Give a new terminal id to the entry.
+ if (!getWritableBuffer()->writeUint(terminalPos,
+ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE,
+ getEntryPos(nextNewTerminalId))) {
+ return false;
+ }
+ // Memorize the mapping to the old terminal id to the new terminal id.
+ terminalIdMap->insert(TerminalIdMap::value_type(i, nextNewTerminalId));
+ nextNewTerminalId++;
+ }
+ }
+ mSize = nextNewTerminalId;
+ return true;
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h
new file mode 100644
index 000000000..641c7496f
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/content/terminal_position_lookup_table.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H
+#define LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H
+
+#include <unordered_map>
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/content/single_dict_content.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+class TerminalPositionLookupTable : public SingleDictContent {
+ public:
+ typedef std::unordered_map<int, int> TerminalIdMap;
+
+ TerminalPositionLookupTable(const char *const dictPath, const bool isUpdatable)
+ : SingleDictContent(dictPath,
+ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION, isUpdatable),
+ mSize(getBuffer()->getTailPosition()
+ / Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE) {}
+
+ TerminalPositionLookupTable() : mSize(0) {}
+
+ int getTerminalPtNodePosition(const int terminalId) const;
+
+ bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos);
+
+ int getNextTerminalId() const {
+ return mSize;
+ }
+
+ bool flushToFile(const char *const dictPath) const;
+
+ bool runGCTerminalIds(TerminalIdMap *const terminalIdMap);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable);
+
+ int getEntryPos(const int terminalId) const {
+ return terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
+ }
+
+ int mSize;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif // LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H
diff --git a/native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h
new file mode 100644
index 000000000..8cda8c5cf
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
+ * Do not edit this file other than updating policy's interface.
+ *
+ * This file was generated from
+ * dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H
+#define LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
+#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
+ public:
+ Ver4ShortcutListPolicy(ShortcutDictContent *const shortcutDictContent,
+ const TerminalPositionLookupTable *const terminalPositionLookupTable)
+ : mShortcutDictContent(shortcutDictContent) {}
+
+ ~Ver4ShortcutListPolicy() {}
+
+ int getStartPos(const int pos) const {
+ // The first shortcut entry is located at the head position of the shortcut list.
+ return pos;
+ }
+
+ void getNextShortcut(const int maxCodePointCount, int *const outCodePoint,
+ int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext,
+ int *const pos) const {
+ int probability = 0;
+ mShortcutDictContent->getShortcutEntryAndAdvancePosition(maxCodePointCount,
+ outCodePoint, outCodePointCount, &probability, outHasNext, pos);
+ if (outIsWhitelist) {
+ *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(probability);
+ }
+ }
+
+ void skipAllShortcuts(int *const pos) const {
+ // Do nothing because we don't need to skip shortcut lists in ver4 dictionaries.
+ }
+
+ bool addNewShortcut(const int terminalId, const int *const codePoints, const int codePointCount,
+ const int probability) {
+ const int shortcutListPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+ if (shortcutListPos == NOT_A_DICT_POS) {
+ // Create shortcut list.
+ if (!mShortcutDictContent->createNewShortcutList(terminalId)) {
+ AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId);
+ return false;
+ }
+ const int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+ return mShortcutDictContent->writeShortcutEntry(codePoints, codePointCount, probability,
+ false /* hasNext */, writingPos);
+ }
+ const int entryPos = mShortcutDictContent->findShortcutEntryAndGetPos(shortcutListPos,
+ codePoints, codePointCount);
+ if (entryPos == NOT_A_DICT_POS) {
+ // Add new entry to the shortcut list.
+ // Create new shortcut list.
+ if (!mShortcutDictContent->createNewShortcutList(terminalId)) {
+ AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId);
+ return false;
+ }
+ int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+ if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints,
+ codePointCount, probability, true /* hasNext */, &writingPos)) {
+ AKLOGE("Cannot write shortcut entry. terminal id: %d, pos: %d", terminalId,
+ writingPos);
+ return false;
+ }
+ return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos);
+ }
+ // Overwrite existing entry.
+ bool hasNext = false;
+ mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, 0 /* outCodePoint */,
+ 0 /* outCodePointCount */ , 0 /* probability */, &hasNext, entryPos);
+ if (!mShortcutDictContent->writeShortcutEntry(codePoints,
+ codePointCount, probability, hasNext, entryPos)) {
+ AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId,
+ entryPos);
+ return false;
+ }
+ return true;
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4ShortcutListPolicy);
+
+ ShortcutDictContent *const mShortcutDictContent;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif // LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp
new file mode 100644
index 000000000..4a9704f4d
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_dict_buffers.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+
+#include <cerrno>
+#include <cstring>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/file_utils.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+/* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers(
+ const char *const dictPath, MmappedBuffer::MmappedBufferPtr headerBuffer,
+ const FormatUtils::FORMAT_VERSION formatVersion) {
+ if (!headerBuffer) {
+ ASSERT(false);
+ AKLOGE("The header buffer must be valid to open ver4 dict buffers.");
+ return Ver4DictBuffersPtr(nullptr);
+ }
+ // TODO: take only dictDirPath, and open both header and trie files in the constructor below
+ const bool isUpdatable = headerBuffer->isUpdatable();
+ return Ver4DictBuffersPtr(new Ver4DictBuffers(dictPath, std::move(headerBuffer), isUpdatable,
+ formatVersion));
+}
+
+bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath,
+ const BufferWithExtendableBuffer *const headerBuffer) const {
+ // Create temporary directory.
+ const int tmpDirPathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictDirPath,
+ DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE);
+ char tmpDirPath[tmpDirPathBufSize];
+ FileUtils::getFilePathWithSuffix(dictDirPath,
+ DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE, tmpDirPathBufSize,
+ tmpDirPath);
+ if (FileUtils::existsDir(tmpDirPath)) {
+ if (!FileUtils::removeDirAndFiles(tmpDirPath)) {
+ AKLOGE("Existing directory %s cannot be removed.", tmpDirPath);
+ ASSERT(false);
+ return false;
+ }
+ }
+ umask(S_IWGRP | S_IWOTH);
+ if (mkdir(tmpDirPath, S_IRWXU) == -1) {
+ AKLOGE("Cannot create directory: %s. errno: %d.", tmpDirPath, errno);
+ return false;
+ }
+ // Get dictionary base path.
+ const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */;
+ char dictName[dictNameBufSize];
+ FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName);
+ const int dictPathBufSize = FileUtils::getFilePathBufSize(tmpDirPath, dictName);
+ char dictPath[dictPathBufSize];
+ FileUtils::getFilePath(tmpDirPath, dictName, dictPathBufSize, dictPath);
+
+ // Write header file.
+ if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath,
+ Ver4DictConstants::HEADER_FILE_EXTENSION, headerBuffer)) {
+ AKLOGE("Dictionary header file %s%s cannot be written.", tmpDirPath,
+ Ver4DictConstants::HEADER_FILE_EXTENSION);
+ return false;
+ }
+ // Write trie file.
+ if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath,
+ Ver4DictConstants::TRIE_FILE_EXTENSION, &mExpandableTrieBuffer)) {
+ AKLOGE("Dictionary trie file %s%s cannot be written.", tmpDirPath,
+ Ver4DictConstants::TRIE_FILE_EXTENSION);
+ return false;
+ }
+ // Write dictionary contents.
+ if (!mTerminalPositionLookupTable.flushToFile(dictPath)) {
+ AKLOGE("Terminal position lookup table cannot be written. %s", tmpDirPath);
+ return false;
+ }
+ if (!mProbabilityDictContent.flushToFile(dictPath)) {
+ AKLOGE("Probability dict content cannot be written. %s", tmpDirPath);
+ return false;
+ }
+ if (!mBigramDictContent.flushToFile(dictPath)) {
+ AKLOGE("Bigram dict content cannot be written. %s", tmpDirPath);
+ return false;
+ }
+ if (!mShortcutDictContent.flushToFile(dictPath)) {
+ AKLOGE("Shortcut dict content cannot be written. %s", tmpDirPath);
+ return false;
+ }
+ // Remove existing dictionary.
+ if (!FileUtils::removeDirAndFiles(dictDirPath)) {
+ AKLOGE("Existing directory %s cannot be removed.", dictDirPath);
+ ASSERT(false);
+ return false;
+ }
+ // Rename temporary directory.
+ if (rename(tmpDirPath, dictDirPath) != 0) {
+ AKLOGE("%s cannot be renamed to %s", tmpDirPath, dictDirPath);
+ ASSERT(false);
+ return false;
+ }
+ return true;
+}
+
+Ver4DictBuffers::Ver4DictBuffers(const char *const dictPath,
+ MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable,
+ const FormatUtils::FORMAT_VERSION formatVersion)
+ : mHeaderBuffer(std::move(headerBuffer)),
+ mDictBuffer(MmappedBuffer::openBuffer(dictPath,
+ Ver4DictConstants::TRIE_FILE_EXTENSION, isUpdatable)),
+ mHeaderPolicy(mHeaderBuffer->getReadOnlyByteArrayView().data(), formatVersion),
+ mExpandableHeaderBuffer(mHeaderBuffer->getReadWriteByteArrayView(),
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+ mExpandableTrieBuffer(
+ mDictBuffer ? mDictBuffer->getReadWriteByteArrayView() :
+ ReadWriteByteArrayView(),
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+ mTerminalPositionLookupTable(dictPath, isUpdatable),
+ mProbabilityDictContent(dictPath, mHeaderPolicy.hasHistoricalInfoOfWords(), isUpdatable),
+ mBigramDictContent(dictPath, mHeaderPolicy.hasHistoricalInfoOfWords(), isUpdatable),
+ mShortcutDictContent(dictPath, isUpdatable),
+ mIsUpdatable(isUpdatable) {}
+
+Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize)
+ : mHeaderBuffer(nullptr), mDictBuffer(nullptr), mHeaderPolicy(headerPolicy),
+ mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+ mExpandableTrieBuffer(maxTrieSize), mTerminalPositionLookupTable(),
+ mProbabilityDictContent(headerPolicy->hasHistoricalInfoOfWords()),
+ mBigramDictContent(headerPolicy->hasHistoricalInfoOfWords()), mShortcutDictContent(),
+ mIsUpdatable(true) {}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h
new file mode 100644
index 000000000..0d09fee9a
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_dict_buffers.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H
+#define LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H
+
+#include <memory>
+
+#include "defines.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/backward/v402/content/bigram_dict_content.h"
+#include "dictionary/structure/backward/v402/content/probability_dict_content.h"
+#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/mmapped_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+class Ver4DictBuffers {
+ public:
+ typedef std::unique_ptr<Ver4DictBuffers> Ver4DictBuffersPtr;
+
+ static Ver4DictBuffersPtr openVer4DictBuffers(const char *const dictDirPath,
+ MmappedBuffer::MmappedBufferPtr headerBuffer,
+ const FormatUtils::FORMAT_VERSION formatVersion);
+
+ static AK_FORCE_INLINE Ver4DictBuffersPtr createVer4DictBuffers(
+ const HeaderPolicy *const headerPolicy, const int maxTrieSize) {
+ return Ver4DictBuffersPtr(new Ver4DictBuffers(headerPolicy, maxTrieSize));
+ }
+
+ AK_FORCE_INLINE bool isValid() const {
+ return mHeaderBuffer && mDictBuffer && mHeaderPolicy.isValid()
+ && mProbabilityDictContent.isValid() && mTerminalPositionLookupTable.isValid()
+ && mBigramDictContent.isValid() && mShortcutDictContent.isValid();
+ }
+
+ AK_FORCE_INLINE bool isNearSizeLimit() const {
+ return mExpandableTrieBuffer.isNearSizeLimit()
+ || mTerminalPositionLookupTable.isNearSizeLimit()
+ || mProbabilityDictContent.isNearSizeLimit()
+ || mBigramDictContent.isNearSizeLimit()
+ || mShortcutDictContent.isNearSizeLimit();
+ }
+
+ AK_FORCE_INLINE const HeaderPolicy *getHeaderPolicy() const {
+ return &mHeaderPolicy;
+ }
+
+ AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableHeaderBuffer() {
+ return &mExpandableHeaderBuffer;
+ }
+
+ AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableTrieBuffer() {
+ return &mExpandableTrieBuffer;
+ }
+
+ AK_FORCE_INLINE const BufferWithExtendableBuffer *getTrieBuffer() const {
+ return &mExpandableTrieBuffer;
+ }
+
+ AK_FORCE_INLINE TerminalPositionLookupTable *getMutableTerminalPositionLookupTable() {
+ return &mTerminalPositionLookupTable;
+ }
+
+ AK_FORCE_INLINE const TerminalPositionLookupTable *getTerminalPositionLookupTable() const {
+ return &mTerminalPositionLookupTable;
+ }
+
+ AK_FORCE_INLINE ProbabilityDictContent *getMutableProbabilityDictContent() {
+ return &mProbabilityDictContent;
+ }
+
+ AK_FORCE_INLINE const ProbabilityDictContent *getProbabilityDictContent() const {
+ return &mProbabilityDictContent;
+ }
+
+ AK_FORCE_INLINE BigramDictContent *getMutableBigramDictContent() {
+ return &mBigramDictContent;
+ }
+
+ AK_FORCE_INLINE const BigramDictContent *getBigramDictContent() const {
+ return &mBigramDictContent;
+ }
+
+ AK_FORCE_INLINE ShortcutDictContent *getMutableShortcutDictContent() {
+ return &mShortcutDictContent;
+ }
+
+ AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const {
+ return &mShortcutDictContent;
+ }
+
+ AK_FORCE_INLINE bool isUpdatable() const {
+ return mIsUpdatable;
+ }
+
+ bool flush(const char *const dictDirPath) const {
+ return flushHeaderAndDictBuffers(dictDirPath, &mExpandableHeaderBuffer);
+ }
+
+ bool flushHeaderAndDictBuffers(const char *const dictDirPath,
+ const BufferWithExtendableBuffer *const headerBuffer) const;
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(Ver4DictBuffers);
+
+ Ver4DictBuffers(const char *const dictDirPath,
+ const MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable,
+ const FormatUtils::FORMAT_VERSION formatVersion);
+
+ Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize);
+
+ const MmappedBuffer::MmappedBufferPtr mHeaderBuffer;
+ const MmappedBuffer::MmappedBufferPtr mDictBuffer;
+ const HeaderPolicy mHeaderPolicy;
+ BufferWithExtendableBuffer mExpandableHeaderBuffer;
+ BufferWithExtendableBuffer mExpandableTrieBuffer;
+ TerminalPositionLookupTable mTerminalPositionLookupTable;
+ ProbabilityDictContent mProbabilityDictContent;
+ BigramDictContent mBigramDictContent;
+ ShortcutDictContent mShortcutDictContent;
+ const int mIsUpdatable;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp
new file mode 100644
index 000000000..2948d0716
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_dict_constants.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// These values MUST match the definitions in FormatSpec.java.
+const char *const Ver4DictConstants::TRIE_FILE_EXTENSION = ".trie";
+const char *const Ver4DictConstants::HEADER_FILE_EXTENSION = ".header";
+const char *const Ver4DictConstants::FREQ_FILE_EXTENSION = ".freq";
+// tat = Terminal Address Table
+const char *const Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
+const char *const Ver4DictConstants::BIGRAM_FILE_EXTENSION = ".bigram_freq";
+const char *const Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION = ".bigram_lookup";
+const char *const Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION = ".bigram_index_freq";
+const char *const Ver4DictConstants::SHORTCUT_FILE_EXTENSION = ".shortcut_shortcut";
+const char *const Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION = ".shortcut_lookup";
+const char *const Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION =
+ ".shortcut_index_shortcut";
+
+// Version 4 dictionary size is implicitly limited to 8MB due to 3-byte offsets.
+const int Ver4DictConstants::MAX_DICTIONARY_SIZE = 8 * 1024 * 1024;
+// Extended region size, which is not GCed region size in dict file + additional buffer size, is
+// limited to 1MB to prevent from inefficient traversing.
+const int Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE = 1 * 1024 * 1024;
+
+const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1;
+const int Ver4DictConstants::PROBABILITY_SIZE = 1;
+const int Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1;
+const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
+const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0;
+const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4;
+const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4;
+const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1;
+const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1;
+
+const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16;
+const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE = 4;
+const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
+const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;
+
+const int Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE = 3;
+// Unsigned int max value of BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE-byte is used for representing
+// invalid terminal ID in bigram lists.
+const int Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID =
+ (1 << (BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE * 8)) - 1;
+const int Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE = 1;
+const int Ver4DictConstants::BIGRAM_PROBABILITY_MASK = 0x0F;
+const int Ver4DictConstants::BIGRAM_HAS_NEXT_MASK = 0x80;
+const int Ver4DictConstants::BIGRAM_LARGE_PROBABILITY_FIELD_SIZE = 1;
+
+const int Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1;
+const int Ver4DictConstants::SHORTCUT_PROBABILITY_MASK = 0x0F;
+const int Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK = 0x80;
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h
new file mode 100644
index 000000000..15581d852
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_dict_constants.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H
+#define LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H
+
+#include "defines.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// TODO: Create PtConstants under the pt_common and move some constant values there.
+// Note that there are corresponding definitions in FormatSpec.java.
+class Ver4DictConstants {
+ public:
+ static const char *const TRIE_FILE_EXTENSION;
+ static const char *const HEADER_FILE_EXTENSION;
+ static const char *const FREQ_FILE_EXTENSION;
+ static const char *const TERMINAL_ADDRESS_TABLE_FILE_EXTENSION;
+ static const char *const BIGRAM_FILE_EXTENSION;
+ static const char *const BIGRAM_LOOKUP_TABLE_FILE_EXTENSION;
+ static const char *const BIGRAM_CONTENT_TABLE_FILE_EXTENSION;
+ static const char *const SHORTCUT_FILE_EXTENSION;
+ static const char *const SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION;
+ static const char *const SHORTCUT_CONTENT_TABLE_FILE_EXTENSION;
+
+ static const int MAX_DICTIONARY_SIZE;
+ static const int MAX_DICT_EXTENDED_REGION_SIZE;
+
+ static const int NOT_A_TERMINAL_ID;
+ static const int PROBABILITY_SIZE;
+ static const int FLAGS_IN_PROBABILITY_FILE_SIZE;
+ static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
+ static const int NOT_A_TERMINAL_ADDRESS;
+ static const int TERMINAL_ID_FIELD_SIZE;
+ static const int TIME_STAMP_FIELD_SIZE;
+ static const int WORD_LEVEL_FIELD_SIZE;
+ static const int WORD_COUNT_FIELD_SIZE;
+
+ static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE;
+ static const int BIGRAM_ADDRESS_TABLE_DATA_SIZE;
+ static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
+ static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;
+
+ static const int BIGRAM_FLAGS_FIELD_SIZE;
+ static const int BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE;
+ static const int INVALID_BIGRAM_TARGET_TERMINAL_ID;
+ static const int BIGRAM_PROBABILITY_MASK;
+ static const int BIGRAM_HAS_NEXT_MASK;
+ // Used when bigram list has time stamp.
+ static const int BIGRAM_LARGE_PROBABILITY_FIELD_SIZE;
+
+ static const int SHORTCUT_FLAGS_FIELD_SIZE;
+ static const int SHORTCUT_PROBABILITY_MASK;
+ static const int SHORTCUT_HAS_NEXT_MASK;
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4DictConstants);
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp
new file mode 100644
index 000000000..871ef7aaf
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/backward/v402/content/probability_dict_content.h"
+#include "dictionary/structure/backward/v402/content/probability_entry.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode(
+ const int ptNodePos, const int siblingNodePos) const {
+ if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) {
+ // Reading invalid position because of bug or broken dictionary.
+ AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d",
+ ptNodePos, mBuffer->getTailPosition());
+ ASSERT(false);
+ return PtNodeParams();
+ }
+ const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos);
+ const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+ int pos = ptNodePos;
+ const int headPos = ptNodePos;
+ if (usesAdditionalBuffer) {
+ pos -= mBuffer->getOriginalBufferSize();
+ }
+ const PatriciaTrieReadingUtils::NodeFlags flags =
+ PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+ const int parentPosOffset =
+ DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition(
+ dictBuf, &pos);
+ const int parentPos =
+ DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
+ int codePoints[MAX_WORD_LENGTH];
+ const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
+ dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
+ int terminalIdFieldPos = NOT_A_DICT_POS;
+ int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
+ int probability = NOT_A_PROBABILITY;
+ if (PatriciaTrieReadingUtils::isTerminal(flags)) {
+ terminalIdFieldPos = pos;
+ if (usesAdditionalBuffer) {
+ terminalIdFieldPos += mBuffer->getOriginalBufferSize();
+ }
+ terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos);
+ const ProbabilityEntry probabilityEntry =
+ mProbabilityDictContent->getProbabilityEntry(terminalId);
+ if (probabilityEntry.hasHistoricalInfo()) {
+ probability = ForgettingCurveUtils::decodeProbability(
+ probabilityEntry.getHistoricalInfo(), mHeaderPolicy);
+ } else {
+ probability = probabilityEntry.getProbability();
+ }
+ }
+ int childrenPosFieldPos = pos;
+ if (usesAdditionalBuffer) {
+ childrenPosFieldPos += mBuffer->getOriginalBufferSize();
+ }
+ int childrenPos = DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition(
+ dictBuf, &pos);
+ if (usesAdditionalBuffer && childrenPos != NOT_A_DICT_POS) {
+ childrenPos += mBuffer->getOriginalBufferSize();
+ }
+ if (usesAdditionalBuffer) {
+ pos += mBuffer->getOriginalBufferSize();
+ }
+ // Sibling position is the tail position of original PtNode.
+ int newSiblingNodePos = (siblingNodePos == NOT_A_DICT_POS) ? pos : siblingNodePos;
+ // Read destination node if the read node is a moved node.
+ if (DynamicPtReadingUtils::isMoved(flags)) {
+ // The destination position is stored at the same place as the parent position.
+ return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos);
+ } else {
+ return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints,
+ terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos,
+ newSiblingNodePos);
+ }
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h
new file mode 100644
index 000000000..367d6f9f8
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_patricia_trie_node_reader.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H
+#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_reader.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+} // namespace v402
+} // namespace backward
+class BufferWithExtendableBuffer;
+namespace backward {
+namespace v402 {
+} // namespace v402
+} // namespace backward
+class HeaderPolicy;
+namespace backward {
+namespace v402 {
+class ProbabilityDictContent;
+
+/*
+ * This class is used for helping to read nodes of ver4 patricia trie. This class handles moved
+ * node and reads node attributes including probability form probabilityBuffer.
+ */
+class Ver4PatriciaTrieNodeReader : public PtNodeReader {
+ public:
+ Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer,
+ const ProbabilityDictContent *const probabilityDictContent,
+ const HeaderPolicy *const headerPolicy)
+ : mBuffer(buffer), mProbabilityDictContent(probabilityDictContent),
+ mHeaderPolicy(headerPolicy) {}
+
+ ~Ver4PatriciaTrieNodeReader() {}
+
+ virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const {
+ return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos,
+ NOT_A_DICT_POS /* siblingNodePos */);
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader);
+
+ const BufferWithExtendableBuffer *const mBuffer;
+ const ProbabilityDictContent *const mProbabilityDictContent;
+ const HeaderPolicy *const mHeaderPolicy;
+
+ const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos,
+ const int siblingNodePos) const;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp
new file mode 100644
index 000000000..e3ab5ec20
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp
@@ -0,0 +1,442 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
+#include "dictionary/structure/backward/v402/content/probability_entry.h"
+#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3;
+
+bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted(
+ const PtNodeParams *const toBeUpdatedPtNodeParams) {
+ int pos = toBeUpdatedPtNodeParams->getHeadPos();
+ const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
+ const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
+ if (usesAdditionalBuffer) {
+ pos -= mTrieBuffer->getOriginalBufferSize();
+ }
+ // Read original flags
+ const PatriciaTrieReadingUtils::NodeFlags originalFlags =
+ PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+ const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
+ DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
+ true /* isDeleted */, false /* willBecomeNonTerminal */);
+ int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
+ // Update flags.
+ if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
+ &writingPos)) {
+ return false;
+ }
+ if (toBeUpdatedPtNodeParams->isTerminal()) {
+ // The PtNode is a terminal. Delete entry from the terminal position lookup table.
+ return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
+ toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */);
+ } else {
+ return true;
+ }
+}
+
+bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
+ const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const int movedPos, const int bigramLinkedNodePos) {
+ int pos = toBeUpdatedPtNodeParams->getHeadPos();
+ const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
+ const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
+ if (usesAdditionalBuffer) {
+ pos -= mTrieBuffer->getOriginalBufferSize();
+ }
+ // Read original flags
+ const PatriciaTrieReadingUtils::NodeFlags originalFlags =
+ PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+ const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
+ DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */,
+ false /* isDeleted */, false /* willBecomeNonTerminal */);
+ int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
+ // Update flags.
+ if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
+ &writingPos)) {
+ return false;
+ }
+ // Update moved position, which is stored in the parent offset field.
+ if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
+ mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
+ return false;
+ }
+ if (toBeUpdatedPtNodeParams->hasChildren()) {
+ // Update children's parent position.
+ mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos());
+ while (!mReadingHelper.isEnd()) {
+ const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams());
+ int parentOffsetFieldPos = childPtNodeParams.getHeadPos()
+ + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
+ if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
+ mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(),
+ &parentOffsetFieldPos)) {
+ // Parent offset cannot be written because of a bug or a broken dictionary; thus,
+ // we give up to update dictionary.
+ return false;
+ }
+ mReadingHelper.readNextSiblingNode(childPtNodeParams);
+ }
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal(
+ const PtNodeParams *const toBeUpdatedPtNodeParams) {
+ int pos = toBeUpdatedPtNodeParams->getHeadPos();
+ const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
+ const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
+ if (usesAdditionalBuffer) {
+ pos -= mTrieBuffer->getOriginalBufferSize();
+ }
+ // Read original flags
+ const PatriciaTrieReadingUtils::NodeFlags originalFlags =
+ PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+ const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
+ DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
+ false /* isDeleted */, true /* willBecomeNonTerminal */);
+ if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
+ toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) {
+ AKLOGE("Cannot update terminal position lookup table. terminal id: %d",
+ toBeUpdatedPtNodeParams->getTerminalId());
+ return false;
+ }
+ // Update flags.
+ int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
+ return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
+ &writingPos);
+}
+
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty(
+ const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const UnigramProperty *const unigramProperty) {
+ // Update probability and historical information.
+ // TODO: Update other information in the unigram property.
+ if (!toBeUpdatedPtNodeParams->isTerminal()) {
+ return false;
+ }
+ const ProbabilityEntry originalProbabilityEntry =
+ mBuffers->getProbabilityDictContent()->getProbabilityEntry(
+ toBeUpdatedPtNodeParams->getTerminalId());
+ const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry,
+ unigramProperty);
+ return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(
+ toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry);
+}
+
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
+ const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) {
+ if (!toBeUpdatedPtNodeParams->isTerminal()) {
+ AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode.");
+ return false;
+ }
+ const ProbabilityEntry originalProbabilityEntry =
+ mBuffers->getProbabilityDictContent()->getProbabilityEntry(
+ toBeUpdatedPtNodeParams->getTerminalId());
+ if (originalProbabilityEntry.hasHistoricalInfo()) {
+ const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
+ originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy);
+ const ProbabilityEntry probabilityEntry =
+ originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo);
+ if (!mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(
+ toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) {
+ AKLOGE("Cannot write updated probability entry. terminalId: %d",
+ toBeUpdatedPtNodeParams->getTerminalId());
+ return false;
+ }
+ const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy);
+ if (!isValid) {
+ if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
+ AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
+ return false;
+ }
+ }
+ *outNeedsToKeepPtNode = isValid;
+ } else {
+ // No need to update probability.
+ *outNeedsToKeepPtNode = true;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition(
+ const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) {
+ int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos();
+ return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
+ newChildrenPosition, &childrenPosFieldPos);
+}
+
+bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const int newTerminalId) {
+ return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE,
+ toBeUpdatedPtNodeParams->getTerminalIdFieldPos());
+}
+
+bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
+ const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) {
+ return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */,
+ ptNodeWritingPos);
+}
+
+
+bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
+ const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty,
+ int *const ptNodeWritingPos) {
+ int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
+ if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId,
+ ptNodeWritingPos)) {
+ return false;
+ }
+ // Write probability.
+ ProbabilityEntry newProbabilityEntry;
+ const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom(
+ &newProbabilityEntry, unigramProperty);
+ return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(terminalId,
+ &probabilityEntryToWrite);
+}
+
+bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) {
+ if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, ngramProperty, outAddedNewEntry)) {
+ AKLOGE("Cannot add new bigram entry. prevWordId: %d, wordId: %d",
+ prevWordIds[0], wordId);
+ return false;
+ }
+ const int ptNodePos =
+ mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(prevWordIds[0]);
+ const PtNodeParams sourcePtNodeParams =
+ mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ if (!sourcePtNodeParams.hasBigrams()) {
+ // Update has bigrams flag.
+ return updatePtNodeFlags(sourcePtNodeParams.getHeadPos(),
+ sourcePtNodeParams.isPossiblyOffensive(), sourcePtNodeParams.isNotAWord(),
+ sourcePtNodeParams.isTerminal(), sourcePtNodeParams.hasShortcutTargets(),
+ true /* hasBigrams */,
+ sourcePtNodeParams.getCodePointCount() > 1 /* hasMultipleChars */);
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds,
+ const int wordId) {
+ return mBigramPolicy->removeEntry(prevWordIds[0], wordId);
+}
+
+bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries(
+ const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) {
+ return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(
+ sourcePtNodeParams->getTerminalId(), outBigramEntryCount);
+}
+
+bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
+ const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const DictPositionRelocationMap *const dictPositionRelocationMap,
+ int *const outBigramEntryCount) {
+ int parentPos = toBeUpdatedPtNodeParams->getParentPos();
+ if (parentPos != NOT_A_DICT_POS) {
+ PtNodeWriter::PtNodePositionRelocationMap::const_iterator it =
+ dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos);
+ if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) {
+ parentPos = it->second;
+ }
+ }
+ int writingPos = toBeUpdatedPtNodeParams->getHeadPos()
+ + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
+ // Write updated parent offset.
+ if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
+ parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
+ return false;
+ }
+
+ // Updates children position.
+ int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos();
+ if (childrenPos != NOT_A_DICT_POS) {
+ PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it =
+ dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos);
+ if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) {
+ childrenPos = it->second;
+ }
+ }
+ if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) {
+ return false;
+ }
+
+ // Counts bigram entries.
+ if (outBigramEntryCount) {
+ *outBigramEntryCount = mBigramPolicy->getBigramEntryConut(
+ toBeUpdatedPtNodeParams->getTerminalId());
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams,
+ const int *const targetCodePoints, const int targetCodePointCount,
+ const int shortcutProbability) {
+ if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(),
+ targetCodePoints, targetCodePointCount, shortcutProbability)) {
+ AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId());
+ return false;
+ }
+ if (!ptNodeParams->hasShortcutTargets()) {
+ // Update has shortcut targets flag.
+ return updatePtNodeFlags(ptNodeParams->getHeadPos(),
+ ptNodeParams->isPossiblyOffensive(), ptNodeParams->isNotAWord(),
+ ptNodeParams->isTerminal(), true /* hasShortcutTargets */,
+ ptNodeParams->hasBigrams(),
+ ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeHasBigramsAndShortcutTargetsFlags(
+ const PtNodeParams *const ptNodeParams) {
+ const bool hasBigrams = mBuffers->getBigramDictContent()->getBigramListHeadPos(
+ ptNodeParams->getTerminalId()) != NOT_A_DICT_POS;
+ const bool hasShortcutTargets = mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
+ ptNodeParams->getTerminalId()) != NOT_A_DICT_POS;
+ return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isPossiblyOffensive(),
+ ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), hasShortcutTargets,
+ hasBigrams, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
+}
+
+bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
+ const PtNodeParams *const ptNodeParams, int *const outTerminalId,
+ int *const ptNodeWritingPos) {
+ const int nodePos = *ptNodeWritingPos;
+ // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the
+ // PtNode writing.
+ if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer,
+ 0 /* nodeFlags */, ptNodeWritingPos)) {
+ return false;
+ }
+ // Calculate a parent offset and write the offset.
+ if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
+ ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) {
+ return false;
+ }
+ // Write code points
+ if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer,
+ ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) {
+ return false;
+ }
+ int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
+ if (!ptNodeParams->willBecomeNonTerminal()) {
+ if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) {
+ terminalId = ptNodeParams->getTerminalId();
+ } else if (ptNodeParams->isTerminal()) {
+ // Write terminal information using a new terminal id.
+ // Get a new unused terminal id.
+ terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId();
+ }
+ }
+ const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID;
+ if (isTerminal) {
+ // Update the lookup table.
+ if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
+ terminalId, nodePos)) {
+ return false;
+ }
+ // Write terminal Id.
+ if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId,
+ Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) {
+ return false;
+ }
+ if (outTerminalId) {
+ *outTerminalId = terminalId;
+ }
+ }
+ // Write children position
+ if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
+ ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
+ return false;
+ }
+ return updatePtNodeFlags(nodePos, ptNodeParams->isPossiblyOffensive(),
+ ptNodeParams->isNotAWord(), isTerminal, ptNodeParams->hasShortcutTargets(),
+ ptNodeParams->hasBigrams(),
+ ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
+}
+
+const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom(
+ const ProbabilityEntry *const originalProbabilityEntry,
+ const UnigramProperty *const unigramProperty) const {
+ // TODO: Consolidate historical info and probability.
+ if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
+ const HistoricalInfo &historicalInfoForUpdate = unigramProperty->getHistoricalInfo();
+ const HistoricalInfo updatedHistoricalInfo =
+ ForgettingCurveUtils::createUpdatedHistoricalInfo(
+ originalProbabilityEntry->getHistoricalInfo(),
+ unigramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy);
+ return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo(
+ &updatedHistoricalInfo);
+ } else {
+ return originalProbabilityEntry->createEntryWithUpdatedProbability(
+ unigramProperty->getProbability());
+ }
+}
+
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos,
+ const bool isBlacklisted, const bool isNotAWord, const bool isTerminal,
+ const bool hasShortcutTargets, const bool hasBigrams, const bool hasMultipleChars) {
+ // Create node flags and write them.
+ PatriciaTrieReadingUtils::NodeFlags nodeFlags =
+ PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal,
+ hasShortcutTargets, hasBigrams, hasMultipleChars,
+ CHILDREN_POSITION_FIELD_SIZE);
+ if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
+ AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
+ return false;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieNodeWriter::suppressUnigramEntry(const PtNodeParams *const ptNodeParams) {
+ if (!mHeaderPolicy->hasHistoricalInfoOfWords()) {
+ // Require historical info to suppress unigram entry.
+ return false;
+ }
+ const HistoricalInfo suppressedHistorycalInfo(0 /* timestamp */, 0 /* level */, 0 /* count */);
+ const ProbabilityEntry probabilityEntryToWrite =
+ ProbabilityEntry().createEntryWithUpdatedHistoricalInfo(&suppressedHistorycalInfo);
+ return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(
+ ptNodeParams->getTerminalId(), &probabilityEntryToWrite);
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h
new file mode 100644
index 000000000..db3cea174
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_patricia_trie_node_writer.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H
+#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+#include "dictionary/structure/backward/v402/content/probability_entry.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+} // namespace v402
+} // namespace backward
+class BufferWithExtendableBuffer;
+namespace backward {
+namespace v402 {
+} // namespace v402
+} // namespace backward
+class HeaderPolicy;
+namespace backward {
+namespace v402 {
+class Ver4BigramListPolicy;
+class Ver4DictBuffers;
+class Ver4PatriciaTrieNodeReader;
+class Ver4PtNodeArrayReader;
+class Ver4ShortcutListPolicy;
+
+/*
+ * This class is used for helping to writes nodes of ver4 patricia trie.
+ */
+class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
+ public:
+ Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer,
+ Ver4DictBuffers *const buffers, const HeaderPolicy *const headerPolicy,
+ const PtNodeReader *const ptNodeReader,
+ const PtNodeArrayReader *const ptNodeArrayReader,
+ Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy)
+ : mTrieBuffer(trieBuffer), mBuffers(buffers), mHeaderPolicy(headerPolicy),
+ mPtNodeReader(ptNodeReader), mReadingHelper(ptNodeReader, ptNodeArrayReader),
+ mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {}
+
+ virtual ~Ver4PatriciaTrieNodeWriter() {}
+
+ virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams);
+
+ virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const int movedPos, const int bigramLinkedNodePos);
+
+ virtual bool markPtNodeAsWillBecomeNonTerminal(
+ const PtNodeParams *const toBeUpdatedPtNodeParams);
+
+ virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const UnigramProperty *const unigramProperty);
+
+ virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
+ const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode);
+
+ virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const int newChildrenPosition);
+
+ bool updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const int newTerminalId);
+
+ virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
+ int *const ptNodeWritingPos);
+
+ virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
+ const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos);
+
+ virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
+
+ virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId);
+
+ virtual bool updateAllBigramEntriesAndDeleteUselessEntries(
+ const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount);
+
+ virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const DictPositionRelocationMap *const dictPositionRelocationMap,
+ int *const outBigramEntryCount);
+
+ virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams,
+ const int *const targetCodePoints, const int targetCodePointCount,
+ const int shortcutProbability);
+
+ bool updatePtNodeHasBigramsAndShortcutTargetsFlags(const PtNodeParams *const ptNodeParams);
+
+ // Suppress unigram not to use the word for generating suggestions. So, this method can be used
+ // only for dictionaries with historical info. Also, suppressed entries are included in unigram
+ // count. They will be removed from the dictionary during GC.
+ bool suppressUnigramEntry(const PtNodeParams *const ptNodeParams);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter);
+
+ bool writePtNodeAndGetTerminalIdAndAdvancePosition(
+ const PtNodeParams *const ptNodeParams, int *const outTerminalId,
+ int *const ptNodeWritingPos);
+
+ // Create updated probability entry using given unigram property. In addition to the
+ // probability, this method updates historical information if needed.
+ // TODO: Update flags belonging to the unigram property.
+ const ProbabilityEntry createUpdatedEntryFrom(
+ const ProbabilityEntry *const originalProbabilityEntry,
+ const UnigramProperty *const unigramProperty) const;
+
+ bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord,
+ const bool isTerminal, const bool hasShortcutTargets, const bool hasBigrams,
+ const bool hasMultipleChars);
+
+ static const int CHILDREN_POSITION_FIELD_SIZE;
+
+ BufferWithExtendableBuffer *const mTrieBuffer;
+ Ver4DictBuffers *const mBuffers;
+ const HeaderPolicy *const mHeaderPolicy;
+ const PtNodeReader *const mPtNodeReader;
+ DynamicPtReadingHelper mReadingHelper;
+ Ver4BigramListPolicy *const mBigramPolicy;
+ Ver4ShortcutListPolicy *const mShortcutPolicy;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
new file mode 100644
index 000000000..6fb9cffb7
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
@@ -0,0 +1,662 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
+ * Do not edit this file other than updating policy's interface.
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h"
+
+#include <vector>
+
+#include "suggest/core/dicnode/dic_node.h"
+#include "suggest/core/dicnode/dic_node_vector.h"
+#include "dictionary/interface/ngram_listener.h"
+#include "dictionary/property/ngram_context.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/property/word_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+#include "dictionary/utils/multi_bigram_map.h"
+#include "dictionary/utils/probability_utils.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and
+// BinaryDictionaryDecayingTests.
+const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
+const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
+const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
+const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
+const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
+const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
+ Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
+const int Ver4PatriciaTriePolicy::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1;
+
+void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
+ DicNodeVector *const childDicNodes) const {
+ if (!dicNode->hasChildren()) {
+ return;
+ }
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos());
+ while (!readingHelper.isEnd()) {
+ const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams();
+ if (!ptNodeParams.isValid()) {
+ break;
+ }
+ bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted();
+ if (isTerminal && mHeaderPolicy->isDecayingDict()) {
+ // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose
+ // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a
+ // valid terminal DicNode.
+ isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
+ }
+ readingHelper.readNextSiblingNode(ptNodeParams);
+ if (ptNodeParams.representsNonWordInfo()) {
+ // Skip PtNodes that represent non-word information.
+ continue;
+ }
+ const int wordId = isTerminal ? ptNodeParams.getHeadPos() : NOT_A_WORD_ID;
+ childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
+ wordId, ptNodeParams.getCodePointArrayView());
+ }
+ if (readingHelper.isError()) {
+ mIsCorrupted = true;
+ AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
+ }
+}
+
+int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
+ const int maxCodePointCount, int *const outCodePoints) const {
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+ readingHelper.initWithPtNodePos(ptNodePos);
+ const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount(
+ maxCodePointCount, outCodePoints);
+ if (readingHelper.isError()) {
+ mIsCorrupted = true;
+ AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount().");
+ }
+ return codePointCount;
+}
+
+int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
+ const bool forceLowerCaseSearch) const {
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
+ wordCodePoints.size(), forceLowerCaseSearch);
+ if (readingHelper.isError()) {
+ mIsCorrupted = true;
+ AKLOGE("Dictionary reading error in getWordId().");
+ }
+ return getWordIdFromTerminalPtNodePos(ptNodePos);
+}
+
+const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
+ const WordIdArrayView prevWordIds, const int wordId,
+ MultiBigramMap *const multiBigramMap) const {
+ if (wordId == NOT_A_WORD_ID) {
+ return WordAttributes();
+ }
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+ const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
+ if (multiBigramMap) {
+ const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */,
+ prevWordIds, wordId, ptNodeParams.getProbability());
+ return getWordAttributes(probability, ptNodeParams);
+ }
+ if (!prevWordIds.empty()) {
+ const int probability = getProbabilityOfWord(prevWordIds, wordId);
+ if (probability != NOT_A_PROBABILITY) {
+ return getWordAttributes(probability, ptNodeParams);
+ }
+ }
+ return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY),
+ ptNodeParams);
+}
+
+const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability,
+ const PtNodeParams &ptNodeParams) const {
+ return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
+ ptNodeParams.getProbability() == 0);
+}
+
+int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability,
+ const int bigramProbability) const {
+ // In the v4 format, bigramProbability is a conditional probability.
+ const int bigramConditionalProbability = bigramProbability;
+ if (unigramProbability == NOT_A_PROBABILITY) {
+ return NOT_A_PROBABILITY;
+ }
+ if (bigramConditionalProbability == NOT_A_PROBABILITY) {
+ return ProbabilityUtils::backoff(unigramProbability);
+ }
+ return bigramConditionalProbability;
+}
+
+int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
+ const int wordId) const {
+ if (wordId == NOT_A_WORD_ID) {
+ return NOT_A_PROBABILITY;
+ }
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+ const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
+ if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) {
+ return NOT_A_PROBABILITY;
+ }
+ if (prevWordIds.empty()) {
+ return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
+ }
+ if (prevWordIds[0] == NOT_A_WORD_ID) {
+ return NOT_A_PROBABILITY;
+ }
+ const PtNodeParams prevWordPtNodeParams =
+ mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]);
+ if (prevWordPtNodeParams.isDeleted()) {
+ return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
+ }
+ const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos(
+ prevWordPtNodeParams.getTerminalId());
+ BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
+ while (bigramsIt.hasNext()) {
+ bigramsIt.next();
+ if (bigramsIt.getBigramPos() == ptNodePos
+ && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
+ const int bigramConditionalProbability = getBigramConditionalProbability(
+ prevWordPtNodeParams.getProbability(),
+ prevWordPtNodeParams.representsBeginningOfSentence(),
+ bigramsIt.getProbability());
+ return getProbability(ptNodeParams.getProbability(), bigramConditionalProbability);
+ }
+ }
+ return NOT_A_PROBABILITY;
+}
+
+void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
+ NgramListener *const listener) const {
+ if (prevWordIds.firstOrDefault(NOT_A_DICT_POS) == NOT_A_DICT_POS) {
+ return;
+ }
+ const PtNodeParams prevWordPtNodeParams =
+ mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]);
+ if (prevWordPtNodeParams.isDeleted()) {
+ return;
+ }
+ const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos(
+ prevWordPtNodeParams.getTerminalId());
+ BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
+ while (bigramsIt.hasNext()) {
+ bigramsIt.next();
+ const int bigramConditionalProbability = getBigramConditionalProbability(
+ prevWordPtNodeParams.getProbability(),
+ prevWordPtNodeParams.representsBeginningOfSentence(), bigramsIt.getProbability());
+ listener->onVisitEntry(bigramConditionalProbability,
+ getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
+ }
+}
+
+int Ver4PatriciaTriePolicy::getBigramConditionalProbability(const int prevWordUnigramProbability,
+ const bool isInBeginningOfSentenceContext, const int bigramProbability) const {
+ if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
+ if (isInBeginningOfSentenceContext) {
+ return bigramProbability;
+ }
+ // Calculate conditional probability.
+ return std::min(MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability,
+ MAX_PROBABILITY);
+ } else {
+ // bigramProbability is a conditional probability.
+ return bigramProbability;
+ }
+}
+
+BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator(
+ const int wordId) const {
+ const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId));
+ return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos);
+}
+
+int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
+ if (ptNodePos == NOT_A_DICT_POS) {
+ return NOT_A_DICT_POS;
+ }
+ const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
+ if (ptNodeParams.isDeleted()) {
+ return NOT_A_DICT_POS;
+ }
+ return mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
+ ptNodeParams.getTerminalId());
+}
+
+int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
+ if (ptNodePos == NOT_A_DICT_POS) {
+ return NOT_A_DICT_POS;
+ }
+ const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
+ if (ptNodeParams.isDeleted()) {
+ return NOT_A_DICT_POS;
+ }
+ return mBuffers->getBigramDictContent()->getBigramListHeadPos(
+ ptNodeParams.getTerminalId());
+}
+
+bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints,
+ const UnigramProperty *const unigramProperty) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
+ mDictBuffer->getTailPosition());
+ return false;
+ }
+ if (wordCodePoints.size() > MAX_WORD_LENGTH) {
+ AKLOGE("The word is too long to insert to the dictionary, length: %zd",
+ wordCodePoints.size());
+ return false;
+ }
+ for (const auto &shortcut : unigramProperty->getShortcuts()) {
+ if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
+ AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd",
+ shortcut.getTargetCodePoints()->size());
+ return false;
+ }
+ }
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ bool addedNewUnigram = false;
+ int codePointsToAdd[MAX_WORD_LENGTH];
+ int codePointCountToAdd = wordCodePoints.size();
+ memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd);
+ if (unigramProperty->representsBeginningOfSentence()) {
+ codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
+ codePointCountToAdd, MAX_WORD_LENGTH);
+ }
+ if (codePointCountToAdd <= 0) {
+ return false;
+ }
+ const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd);
+ if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty,
+ &addedNewUnigram)) {
+ if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
+ mEntryCounters.incrementNgramCount(NgramType::Unigram);
+ }
+ if (unigramProperty->getShortcuts().size() > 0) {
+ // Add shortcut target.
+ const int wordPos = getTerminalPtNodePosFromWordId(
+ getWordId(codePointArrayView, false /* forceLowerCaseSearch */));
+ if (wordPos == NOT_A_DICT_POS) {
+ AKLOGE("Cannot find terminal PtNode position to add shortcut target.");
+ return false;
+ }
+ for (const auto &shortcut : unigramProperty->getShortcuts()) {
+ if (!mUpdatingHelper.addShortcutTarget(wordPos,
+ CodePointArrayView(*shortcut.getTargetCodePoints()),
+ shortcut.getProbability())) {
+ AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, "
+ "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(),
+ shortcut.getProbability());
+ return false;
+ }
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ const int ptNodePos = getTerminalPtNodePosFromWordId(
+ getWordId(wordCodePoints, false /* forceLowerCaseSearch */));
+ if (ptNodePos == NOT_A_DICT_POS) {
+ return false;
+ }
+ const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ return mNodeWriter.suppressUnigramEntry(&ptNodeParams);
+}
+
+bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
+ mDictBuffer->getTailPosition());
+ return false;
+ }
+ const NgramContext *const ngramContext = ngramProperty->getNgramContext();
+ if (!ngramContext->isValid()) {
+ AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
+ return false;
+ }
+ if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
+ AKLOGE("The word is too long to insert the ngram to the dictionary. "
+ "length: %zd", ngramProperty->getTargetCodePoints()->size());
+ return false;
+ }
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
+ false /* tryLowerCaseSearch */);
+ if (prevWordIds.empty()) {
+ return false;
+ }
+ if (prevWordIds[0] == NOT_A_WORD_ID) {
+ if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
+ const UnigramProperty beginningOfSentenceUnigramProperty(
+ true /* representsBeginningOfSentence */, true /* isNotAWord */,
+ false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo());
+ if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
+ &beginningOfSentenceUnigramProperty)) {
+ AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
+ return false;
+ }
+ // Refresh word ids.
+ ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
+ } else {
+ return false;
+ }
+ }
+ const int wordPos = getTerminalPtNodePosFromWordId(getWordId(
+ CodePointArrayView(*ngramProperty->getTargetCodePoints()),
+ false /* forceLowerCaseSearch */));
+ if (wordPos == NOT_A_DICT_POS) {
+ return false;
+ }
+ bool addedNewBigram = false;
+ const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
+ if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos),
+ wordPos, ngramProperty, &addedNewBigram)) {
+ if (addedNewBigram) {
+ mEntryCounters.incrementNgramCount(NgramType::Bigram);
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
+ mDictBuffer->getTailPosition());
+ return false;
+ }
+ if (!ngramContext->isValid()) {
+ AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary.");
+ return false;
+ }
+ if (wordCodePoints.size() > MAX_WORD_LENGTH) {
+ AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
+ wordCodePoints.size());
+ }
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
+ false /* tryLowerCaseSerch */);
+ if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
+ return false;
+ }
+ const int wordPos = getTerminalPtNodePosFromWordId(getWordId(wordCodePoints,
+ false /* forceLowerCaseSearch */));
+ if (wordPos == NOT_A_DICT_POS) {
+ return false;
+ }
+ const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
+ if (mUpdatingHelper.removeNgramEntry(
+ PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) {
+ mEntryCounters.decrementNgramCount(NgramType::Bigram);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+
+bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
+ const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints,
+ const bool isValidWord, const HistoricalInfo historicalInfo) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
+ "dictionary.");
+ return false;
+ }
+ const int probability = isValidWord ? DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY;
+ const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
+ false /* isNotAWord */, false /*isBlacklisted*/, probability, historicalInfo);
+ if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
+ AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext().");
+ return false;
+ }
+ const int probabilityForNgram = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)
+ ? NOT_A_PROBABILITY : probability;
+ const NgramProperty ngramProperty(*ngramContext, wordCodePoints.toVector(), probabilityForNgram,
+ historicalInfo);
+ if (!addNgramEntry(&ngramProperty)) {
+ AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext().");
+ return false;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
+ return false;
+ }
+ if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) {
+ AKLOGE("Cannot flush the dictionary to file.");
+ mIsCorrupted = true;
+ return false;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) {
+ AKLOGE("Cannot flush the dictionary to file with GC.");
+ mIsCorrupted = true;
+ return false;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mBuffers->isNearSizeLimit()) {
+ // Additional buffer size is near the limit.
+ return true;
+ } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize()
+ > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) {
+ // Total extended region size of the trie exceeds the limit.
+ return true;
+ } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
+ && mDictBuffer->getUsedAdditionalBufferSize() > 0) {
+ // Needs to reduce dictionary size.
+ return true;
+ } else if (mHeaderPolicy->isDecayingDict()) {
+ return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(),
+ mHeaderPolicy);
+ }
+ return false;
+}
+
+void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength,
+ char *const outResult, const int maxResultLength) {
+ const int compareLength = queryLength + 1 /* terminator */;
+ if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mEntryCounters.getNgramCount(NgramType::Unigram));
+ } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram));
+ } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mHeaderPolicy->isDecayingDict() ?
+ ForgettingCurveUtils::getEntryCountHardLimit(
+ mHeaderPolicy->getMaxNgramCounts().getNgramCount(
+ NgramType::Unigram)) :
+ static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
+ } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mHeaderPolicy->isDecayingDict() ?
+ ForgettingCurveUtils::getEntryCountHardLimit(
+ mHeaderPolicy->getMaxNgramCounts().getNgramCount(
+ NgramType::Bigram)) :
+ static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
+ }
+}
+
+const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
+ const CodePointArrayView wordCodePoints) const {
+ const int ptNodePos = getTerminalPtNodePosFromWordId(
+ getWordId(wordCodePoints, false /* forceLowerCaseSearch */));
+ if (ptNodePos == NOT_A_DICT_POS) {
+ AKLOGE("getWordProperty is called for invalid word.");
+ return WordProperty();
+ }
+ const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ const ProbabilityEntry probabilityEntry =
+ mBuffers->getProbabilityDictContent()->getProbabilityEntry(
+ ptNodeParams.getTerminalId());
+ const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
+ // Fetch bigram information.
+ std::vector<NgramProperty> ngrams;
+ const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
+ if (bigramListPos != NOT_A_DICT_POS) {
+ int bigramWord1CodePoints[MAX_WORD_LENGTH];
+ const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent();
+ const TerminalPositionLookupTable *const terminalPositionLookupTable =
+ mBuffers->getTerminalPositionLookupTable();
+ bool hasNext = true;
+ int readingPos = bigramListPos;
+ while (hasNext) {
+ const BigramEntry bigramEntry =
+ bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+ hasNext = bigramEntry.hasNext();
+ const int word1TerminalId = bigramEntry.getTargetTerminalId();
+ const int word1TerminalPtNodePos =
+ terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId);
+ if (word1TerminalPtNodePos == NOT_A_DICT_POS) {
+ continue;
+ }
+ const int codePointCount = getCodePointsAndReturnCodePointCount(
+ getWordIdFromTerminalPtNodePos(word1TerminalPtNodePos), MAX_WORD_LENGTH,
+ bigramWord1CodePoints);
+ const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo();
+ const int rawBigramProbability = bigramEntry.hasHistoricalInfo()
+ ? ForgettingCurveUtils::decodeProbability(
+ bigramEntry.getHistoricalInfo(), mHeaderPolicy)
+ : bigramEntry.getProbability();
+ const int probability = getBigramConditionalProbability(ptNodeParams.getProbability(),
+ ptNodeParams.representsBeginningOfSentence(), rawBigramProbability);
+ ngrams.emplace_back(
+ NgramContext(wordCodePoints.data(), wordCodePoints.size(),
+ ptNodeParams.representsBeginningOfSentence()),
+ CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
+ probability, *historicalInfo);
+ }
+ }
+ // Fetch shortcut information.
+ std::vector<UnigramProperty::ShortcutProperty> shortcuts;
+ int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
+ if (shortcutPos != NOT_A_DICT_POS) {
+ int shortcutTarget[MAX_WORD_LENGTH];
+ const ShortcutDictContent *const shortcutDictContent =
+ mBuffers->getShortcutDictContent();
+ bool hasNext = true;
+ while (hasNext) {
+ int shortcutTargetLength = 0;
+ int shortcutProbability = NOT_A_PROBABILITY;
+ shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
+ &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
+ shortcuts.emplace_back(
+ CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(),
+ shortcutProbability);
+ }
+ }
+ const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
+ ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
+ ptNodeParams.getProbability(), *historicalInfo, std::move(shortcuts));
+ return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
+}
+
+int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
+ int *const outCodePointCount) {
+ *outCodePointCount = 0;
+ if (token == 0) {
+ mTerminalPtNodePositionsForIteratingWords.clear();
+ DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
+ &mTerminalPtNodePositionsForIteratingWords);
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
+ }
+ const int terminalPtNodePositionsVectorSize =
+ static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
+ if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
+ AKLOGE("Given token %d is invalid.", token);
+ return 0;
+ }
+ const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
+ *outCodePointCount = getCodePointsAndReturnCodePointCount(
+ getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints);
+ const int nextToken = token + 1;
+ if (nextToken >= terminalPtNodePositionsVectorSize) {
+ // All words have been iterated.
+ mTerminalPtNodePositionsForIteratingWords.clear();
+ return 0;
+ }
+ return nextToken;
+}
+
+int Ver4PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const {
+ return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos;
+}
+
+int Ver4PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const {
+ return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId;
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h
new file mode 100644
index 000000000..bce5f6bea
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
+ * Do not edit this file other than updating policy's interface.
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_patricia_trie_policy.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H
+#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
+#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
+#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
+#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
+#include "dictionary/utils/binary_dictionary_bigrams_iterator.h"
+#include "dictionary/utils/binary_dictionary_shortcut_iterator.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/entry_counters.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+} // namespace v402
+} // namespace backward
+class DicNode;
+namespace backward {
+namespace v402 {
+} // namespace v402
+} // namespace backward
+class DicNodeVector;
+namespace backward {
+namespace v402 {
+
+// Word id = Position of a PtNode that represents the word.
+// Max supported n-gram is bigram.
+class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
+ public:
+ Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
+ : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()),
+ mDictBuffer(mBuffers->getWritableTrieBuffer()),
+ mBigramPolicy(mBuffers->getMutableBigramDictContent(),
+ mBuffers->getTerminalPositionLookupTable(), mHeaderPolicy),
+ mShortcutPolicy(mBuffers->getMutableShortcutDictContent(),
+ mBuffers->getTerminalPositionLookupTable()),
+ mNodeReader(mDictBuffer, mBuffers->getProbabilityDictContent(), mHeaderPolicy),
+ mPtNodeArrayReader(mDictBuffer),
+ mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader,
+ &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy),
+ mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
+ mWritingHelper(mBuffers.get()),
+ mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()),
+ mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {};
+
+ virtual int getRootPosition() const {
+ return 0;
+ }
+
+ void createAndGetAllChildDicNodes(const DicNode *const dicNode,
+ DicNodeVector *const childDicNodes) const;
+
+ int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
+ int *const outCodePoints) const;
+
+ int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
+
+ const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
+ const int wordId, MultiBigramMap *const multiBigramMap) const;
+
+ int getProbability(const int unigramProbability, const int bigramProbability) const;
+
+ int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
+
+ void iterateNgramEntries(const WordIdArrayView prevWordIds,
+ NgramListener *const listener) const;
+
+ BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
+
+ const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
+ return mHeaderPolicy;
+ }
+
+ bool addUnigramEntry(const CodePointArrayView wordCodePoints,
+ const UnigramProperty *const unigramProperty);
+
+ bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
+
+ bool addNgramEntry(const NgramProperty *const ngramProperty);
+
+ bool removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints);
+
+ bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints, const bool isValidWord,
+ const HistoricalInfo historicalInfo);
+
+ bool flush(const char *const filePath);
+
+ bool flushWithGC(const char *const filePath);
+
+ bool needsToRunGC(const bool mindsBlockByGC) const;
+
+ void getProperty(const char *const query, const int queryLength, char *const outResult,
+ const int maxResultLength);
+
+ const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
+
+ int getNextWordAndNextToken(const int token, int *const outCodePoints,
+ int *const outCodePointCount);
+
+ bool isCorrupted() const {
+ return mIsCorrupted;
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy);
+
+ static const char *const UNIGRAM_COUNT_QUERY;
+ static const char *const BIGRAM_COUNT_QUERY;
+ static const char *const MAX_UNIGRAM_COUNT_QUERY;
+ static const char *const MAX_BIGRAM_COUNT_QUERY;
+ // When the dictionary size is near the maximum size, we have to refuse dynamic operations to
+ // prevent the dictionary from overflowing.
+ static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
+ static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
+ static const int DUMMY_PROBABILITY_FOR_VALID_WORDS;
+
+ const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers;
+ const HeaderPolicy *const mHeaderPolicy;
+ BufferWithExtendableBuffer *const mDictBuffer;
+ Ver4BigramListPolicy mBigramPolicy;
+ Ver4ShortcutListPolicy mShortcutPolicy;
+ Ver4PatriciaTrieNodeReader mNodeReader;
+ Ver4PtNodeArrayReader mPtNodeArrayReader;
+ Ver4PatriciaTrieNodeWriter mNodeWriter;
+ DynamicPtUpdatingHelper mUpdatingHelper;
+ Ver4PatriciaTrieWritingHelper mWritingHelper;
+ MutableEntryCounters mEntryCounters;
+ std::vector<int> mTerminalPtNodePositionsForIteratingWords;
+ mutable bool mIsCorrupted;
+
+ int getBigramsPositionOfPtNode(const int ptNodePos) const;
+ int getShortcutPositionOfPtNode(const int ptNodePos) const;
+ int getWordIdFromTerminalPtNodePos(const int ptNodePos) const;
+ int getTerminalPtNodePosFromWordId(const int wordId) const;
+ const WordAttributes getWordAttributes(const int probability,
+ const PtNodeParams &ptNodeParams) const;
+ int getBigramConditionalProbability(const int prevWordUnigramProbability,
+ const bool isInBeginningOfSentenceContext, const int bigramProbability) const;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif // LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp
new file mode 100644
index 000000000..b8a4cf847
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h"
+
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+/* static */ int Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(
+ const uint8_t *const buffer, int *pos) {
+ return ByteArrayUtils::readUint32AndAdvancePosition(buffer, pos);
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h
new file mode 100644
index 000000000..c3e736bdc
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_patricia_trie_reading_utils.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H
+#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+} // namespace v402
+} // namespace backward
+class BufferWithExtendableBuffer;
+namespace backward {
+namespace v402 {
+
+class Ver4PatriciaTrieReadingUtils {
+ public:
+ static int getTerminalIdAndAdvancePosition(const uint8_t *const buffer,
+ int *const pos);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieReadingUtils);
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp
new file mode 100644
index 000000000..c0af9eae6
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
+
+#include <cstring>
+#include <queue>
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
+#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/file_utils.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
+ const EntryCounts &entryCounts) const {
+ const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
+ BufferWithExtendableBuffer headerBuffer(
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+ const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
+ + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
+ if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
+ entryCounts, extendedRegionSize, &headerBuffer)) {
+ AKLOGE("Cannot write header structure to buffer. "
+ "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
+ "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram),
+ entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize);
+ return false;
+ }
+ return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
+}
+
+bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
+ const char *const dictDirPath) {
+ const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
+ Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
+ Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
+ Ver4DictConstants::MAX_DICTIONARY_SIZE));
+ int unigramCount = 0;
+ int bigramCount = 0;
+ if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
+ return false;
+ }
+ BufferWithExtendableBuffer headerBuffer(
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+ MutableEntryCounters entryCounters;
+ entryCounters.setNgramCount(NgramType::Unigram, unigramCount);
+ entryCounters.setNgramCount(NgramType::Bigram, bigramCount);
+ if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
+ entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) {
+ return false;
+ }
+ return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
+}
+
+bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
+ const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
+ int *const outUnigramCount, int *const outBigramCount) {
+ Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
+ mBuffers->getProbabilityDictContent(), headerPolicy);
+ Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
+ Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
+ mBuffers->getTerminalPositionLookupTable(), headerPolicy);
+ Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
+ mBuffers->getTerminalPositionLookupTable());
+ Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
+ mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
+ &shortcutPolicy);
+
+ DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ DynamicPtGcEventListeners
+ ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+ traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
+ &ptNodeWriter);
+ if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+ &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
+ return false;
+ }
+ const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+ .getValidUnigramCount();
+ const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram);
+ if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
+ if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
+ AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
+ maxUnigramCount);
+ return false;
+ }
+ }
+
+ readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
+ traversePolicyToUpdateBigramProbability(&ptNodeWriter);
+ if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+ &traversePolicyToUpdateBigramProbability)) {
+ return false;
+ }
+ const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
+ const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram);
+ if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
+ if (!truncateBigrams(maxBigramCount)) {
+ AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
+ return false;
+ }
+ }
+
+ // Mapping from positions in mBuffer to positions in bufferToWrite.
+ PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
+ readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
+ buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
+ &shortcutPolicy);
+ DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+ traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
+ buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
+ if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+ &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
+ return false;
+ }
+
+ // Create policy instances for the GCed dictionary.
+ Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
+ buffersToWrite->getProbabilityDictContent(), headerPolicy);
+ Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
+ Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
+ buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
+ Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
+ buffersToWrite->getTerminalPositionLookupTable());
+ Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
+ buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
+ &newShortcutPolicy);
+ // Re-assign terminal IDs for valid terminal PtNodes.
+ TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
+ if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
+ &terminalIdMap)) {
+ return false;
+ }
+ // Run GC for probability dict content.
+ if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap,
+ mBuffers->getProbabilityDictContent())) {
+ return false;
+ }
+ // Run GC for bigram dict content.
+ if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap,
+ mBuffers->getBigramDictContent(), outBigramCount)) {
+ return false;
+ }
+ // Run GC for shortcut dict content.
+ if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
+ mBuffers->getShortcutDictContent())) {
+ return false;
+ }
+ DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
+ newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
+ traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
+ if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+ &traversePolicyToUpdateAllPositionFields)) {
+ return false;
+ }
+ newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+ traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
+ if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+ &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
+ return false;
+ }
+ *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
+ return true;
+}
+
+bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
+ const Ver4PatriciaTrieNodeReader *const ptNodeReader,
+ Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
+ const TerminalPositionLookupTable *const terminalPosLookupTable =
+ mBuffers->getTerminalPositionLookupTable();
+ const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
+ std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
+ priorityQueue;
+ for (int i = 0; i < nextTerminalId; ++i) {
+ const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i);
+ if (terminalPos == NOT_A_DICT_POS) {
+ continue;
+ }
+ const ProbabilityEntry probabilityEntry =
+ mBuffers->getProbabilityDictContent()->getProbabilityEntry(i);
+ const int probability = probabilityEntry.hasHistoricalInfo() ?
+ ForgettingCurveUtils::decodeProbability(
+ probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
+ probabilityEntry.getProbability();
+ priorityQueue.push(DictProbability(terminalPos, probability,
+ probabilityEntry.getHistoricalInfo()->getTimestamp()));
+ }
+
+ // Delete unigrams.
+ while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) {
+ const int ptNodePos = priorityQueue.top().getDictPos();
+ priorityQueue.pop();
+ const PtNodeParams ptNodeParams =
+ ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ if (ptNodeParams.representsNonWordInfo()) {
+ continue;
+ }
+ if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) {
+ AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
+ const TerminalPositionLookupTable *const terminalPosLookupTable =
+ mBuffers->getTerminalPositionLookupTable();
+ const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
+ std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
+ priorityQueue;
+ BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
+ for (int i = 0; i < nextTerminalId; ++i) {
+ const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
+ if (bigramListPos == NOT_A_DICT_POS) {
+ continue;
+ }
+ bool hasNext = true;
+ int readingPos = bigramListPos;
+ while (hasNext) {
+ const int entryPos = readingPos;
+ const BigramEntry bigramEntry =
+ bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+ hasNext = bigramEntry.hasNext();
+ if (!bigramEntry.isValid()) {
+ continue;
+ }
+ const int probability = bigramEntry.hasHistoricalInfo() ?
+ ForgettingCurveUtils::decodeProbability(
+ bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
+ bigramEntry.getProbability();
+ priorityQueue.push(DictProbability(entryPos, probability,
+ bigramEntry.getHistoricalInfo()->getTimestamp()));
+ }
+ }
+
+ // Delete bigrams.
+ while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
+ const int entryPos = priorityQueue.top().getDictPos();
+ const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
+ const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
+ if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
+ AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
+ return false;
+ }
+ priorityQueue.pop();
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+ ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+ if (!ptNodeParams->isTerminal()) {
+ return true;
+ }
+ TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
+ mTerminalIdMap->find(ptNodeParams->getTerminalId());
+ if (it == mTerminalIdMap->end()) {
+ AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
+ ptNodeParams->getTerminalId(), mTerminalIdMap->size());
+ return false;
+ }
+ if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
+ AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
+ }
+ return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams);
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h
new file mode 100644
index 000000000..f2b873826
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H
+#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/utils/entry_counters.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+} // namespace v402
+} // namespace backward
+class HeaderPolicy;
+namespace backward {
+namespace v402 {
+class Ver4DictBuffers;
+class Ver4PatriciaTrieNodeReader;
+class Ver4PatriciaTrieNodeWriter;
+
+class Ver4PatriciaTrieWritingHelper {
+ public:
+ Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers)
+ : mBuffers(buffers) {}
+
+ bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const;
+
+ // This method cannot be const because the original dictionary buffer will be updated to detect
+ // useless PtNodes during GC.
+ bool writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const dictDirPath);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper);
+
+ class TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+ : public DynamicPtReadingHelper::TraversingEventListener {
+ public:
+ TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(
+ Ver4PatriciaTrieNodeWriter *const ptNodeWriter,
+ const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap)
+ : mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {}
+
+ bool onAscend() { return true; }
+
+ bool onDescend(const int ptNodeArrayPos) { return true; }
+
+ bool onReadingPtNodeArrayTail() { return true; }
+
+ bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds);
+
+ Ver4PatriciaTrieNodeWriter *const mPtNodeWriter;
+ const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap;
+ };
+
+ // For truncateUnigrams() and truncateBigrams().
+ class DictProbability {
+ public:
+ DictProbability(const int dictPos, const int probability, const int timestamp)
+ : mDictPos(dictPos), mProbability(probability), mTimestamp(timestamp) {}
+
+ int getDictPos() const {
+ return mDictPos;
+ }
+
+ int getProbability() const {
+ return mProbability;
+ }
+
+ int getTimestamp() const {
+ return mTimestamp;
+ }
+
+ private:
+ DISALLOW_DEFAULT_CONSTRUCTOR(DictProbability);
+
+ int mDictPos;
+ int mProbability;
+ int mTimestamp;
+ };
+
+ // For truncateUnigrams() and truncateBigrams().
+ class DictProbabilityComparator {
+ public:
+ bool operator()(const DictProbability &left, const DictProbability &right) {
+ if (left.getProbability() != right.getProbability()) {
+ return left.getProbability() > right.getProbability();
+ }
+ if (left.getTimestamp() != right.getTimestamp()) {
+ return left.getTimestamp() < right.getTimestamp();
+ }
+ return left.getDictPos() > right.getDictPos();
+ }
+
+ private:
+ DISALLOW_ASSIGNMENT_OPERATOR(DictProbabilityComparator);
+ };
+
+ bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy,
+ Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
+ int *const outBigramCount);
+
+ bool truncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader,
+ Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount);
+
+ bool truncateBigrams(const int maxBigramCount);
+
+ Ver4DictBuffers *const mBuffers;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+
+#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp
new file mode 100644
index 000000000..d27d70816
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_pt_node_array_reader.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
+
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+bool Ver4PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
+ int *const outPtNodeCount, int *const outFirstPtNodePos) const {
+ if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mBuffer->getTailPosition()) {
+ // Reading invalid position because of a bug or a broken dictionary.
+ AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d",
+ ptNodeArrayPos, mBuffer->getTailPosition());
+ ASSERT(false);
+ return false;
+ }
+ const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodeArrayPos);
+ const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+ int readingPos = ptNodeArrayPos;
+ if (usesAdditionalBuffer) {
+ readingPos -= mBuffer->getOriginalBufferSize();
+ }
+ const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
+ dictBuf, &readingPos);
+ if (usesAdditionalBuffer) {
+ readingPos += mBuffer->getOriginalBufferSize();
+ }
+ if (ptNodeCountInArray < 0) {
+ AKLOGE("Invalid PtNode count in an array: %d.", ptNodeCountInArray);
+ return false;
+ }
+ *outPtNodeCount = ptNodeCountInArray;
+ *outFirstPtNodePos = readingPos;
+ return true;
+}
+
+bool Ver4PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos,
+ int *const outNextPtNodeArrayPos) const {
+ if (forwordLinkPos < 0 || forwordLinkPos >= mBuffer->getTailPosition()) {
+ // Reading invalid position because of bug or broken dictionary.
+ AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d",
+ forwordLinkPos, mBuffer->getTailPosition());
+ ASSERT(false);
+ return false;
+ }
+ const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(forwordLinkPos);
+ const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+ int readingPos = forwordLinkPos;
+ if (usesAdditionalBuffer) {
+ readingPos -= mBuffer->getOriginalBufferSize();
+ }
+ const int nextPtNodeArrayOffset =
+ DynamicPtReadingUtils::getForwardLinkPosition(dictBuf, readingPos);
+ if (DynamicPtReadingUtils::isValidForwardLinkPosition(nextPtNodeArrayOffset)) {
+ *outNextPtNodeArrayPos = forwordLinkPos + nextPtNodeArrayOffset;
+ } else {
+ *outNextPtNodeArrayPos = NOT_A_DICT_POS;
+ }
+ return true;
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h
new file mode 100644
index 000000000..0039bf8fc
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ * dictionary/structure/v4/ver4_pt_node_array_reader.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H
+#define LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/pt_node_array_reader.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+} // namespace v402
+} // namespace backward
+class BufferWithExtendableBuffer;
+namespace backward {
+namespace v402 {
+
+class Ver4PtNodeArrayReader : public PtNodeArrayReader {
+ public:
+ Ver4PtNodeArrayReader(const BufferWithExtendableBuffer *const buffer) : mBuffer(buffer) {};
+
+ virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
+ int *const outPtNodeCount, int *const outFirstPtNodePos) const;
+ virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos,
+ int *const outNextPtNodeArrayPos) const;
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(Ver4PtNodeArrayReader);
+
+ const BufferWithExtendableBuffer *const mBuffer;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H */
diff --git a/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
new file mode 100644
index 000000000..4470e8568
--- /dev/null
+++ b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/dictionary_structure_with_buffer_policy_factory.h"
+
+#include <climits>
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/v2/patricia_trie_policy.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_policy.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/file_utils.h"
+#include "dictionary/utils/format_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+ DictionaryStructureWithBufferPolicyFactory::newPolicyForExistingDictFile(
+ const char *const path, const int bufOffset, const int size,
+ const bool isUpdatable) {
+ if (FileUtils::existsDir(path)) {
+ // Given path represents a directory.
+ return newPolicyForDirectoryDict(path, isUpdatable);
+ } else {
+ if (isUpdatable) {
+ AKLOGE("One file dictionaries don't support updating. path: %s", path);
+ ASSERT(false);
+ return nullptr;
+ }
+ return newPolicyForFileDict(path, bufOffset, size);
+ }
+}
+
+/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+ DictionaryStructureWithBufferPolicyFactory:: newPolicyForOnMemoryDict(
+ const int formatVersion, const std::vector<int> &locale,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) {
+ FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion);
+ switch (dictFormatVersion) {
+ case FormatUtils::VERSION_402: {
+ return newPolicyForOnMemoryV4Dict<backward::v402::Ver4DictConstants,
+ backward::v402::Ver4DictBuffers,
+ backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr,
+ backward::v402::Ver4PatriciaTriePolicy>(
+ dictFormatVersion, locale, attributeMap);
+ }
+ case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+ case FormatUtils::VERSION_403: {
+ return newPolicyForOnMemoryV4Dict<Ver4DictConstants, Ver4DictBuffers,
+ Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>(
+ dictFormatVersion, locale, attributeMap);
+ }
+ default:
+ AKLOGE("DICT: dictionary format %d is not supported for on memory dictionary",
+ formatVersion);
+ break;
+ }
+ return nullptr;
+}
+
+template<class DictConstants, class DictBuffers, class DictBuffersPtr, class StructurePolicy>
+/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+ DictionaryStructureWithBufferPolicyFactory::newPolicyForOnMemoryV4Dict(
+ const FormatUtils::FORMAT_VERSION formatVersion,
+ const std::vector<int> &locale,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) {
+ HeaderPolicy headerPolicy(formatVersion, locale, attributeMap);
+ DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy,
+ DictConstants::MAX_DICT_EXTENDED_REGION_SIZE);
+ if (!DynamicPtWritingUtils::writeEmptyDictionary(
+ dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) {
+ AKLOGE("Empty ver4 dictionary structure cannot be created on memory.");
+ return nullptr;
+ }
+ return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
+ new StructurePolicy(std::move(dictBuffers)));
+}
+
+/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+ DictionaryStructureWithBufferPolicyFactory::newPolicyForDirectoryDict(
+ const char *const path, const bool isUpdatable) {
+ const int headerFilePathBufSize = PATH_MAX + 1 /* terminator */;
+ char headerFilePath[headerFilePathBufSize];
+ getHeaderFilePathInDictDir(path, headerFilePathBufSize, headerFilePath);
+ // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of
+ // MmappedBufferPtr if the instance has the responsibility.
+ MmappedBuffer::MmappedBufferPtr mmappedBuffer =
+ MmappedBuffer::openBuffer(headerFilePath, isUpdatable);
+ if (!mmappedBuffer) {
+ return nullptr;
+ }
+ const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::detectFormatVersion(
+ mmappedBuffer->getReadOnlyByteArrayView());
+ switch (formatVersion) {
+ case FormatUtils::VERSION_2:
+ case FormatUtils::VERSION_201:
+ case FormatUtils::VERSION_202:
+ AKLOGE("Given path is a directory but the format is version 2xx. path: %s", path);
+ break;
+ case FormatUtils::VERSION_402: {
+ return newPolicyForV4Dict<backward::v402::Ver4DictConstants,
+ backward::v402::Ver4DictBuffers,
+ backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr,
+ backward::v402::Ver4PatriciaTriePolicy>(
+ headerFilePath, formatVersion, std::move(mmappedBuffer));
+ }
+ case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+ case FormatUtils::VERSION_403: {
+ return newPolicyForV4Dict<Ver4DictConstants, Ver4DictBuffers,
+ Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>(
+ headerFilePath, formatVersion, std::move(mmappedBuffer));
+ }
+ default:
+ AKLOGE("DICT: dictionary format is unknown, bad magic number. path: %s", path);
+ break;
+ }
+ ASSERT(false);
+ return nullptr;
+}
+
+template<class DictConstants, class DictBuffers, class DictBuffersPtr, class StructurePolicy>
+/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+ DictionaryStructureWithBufferPolicyFactory::newPolicyForV4Dict(
+ const char *const headerFilePath, const FormatUtils::FORMAT_VERSION formatVersion,
+ MmappedBuffer::MmappedBufferPtr &&mmappedBuffer) {
+ const int dictDirPathBufSize = strlen(headerFilePath) + 1 /* terminator */;
+ char dictPath[dictDirPathBufSize];
+ if (!FileUtils::getFilePathWithoutSuffix(headerFilePath,
+ DictConstants::HEADER_FILE_EXTENSION, dictDirPathBufSize, dictPath)) {
+ AKLOGE("Dictionary file name is not valid as a ver4 dictionary. header path: %s",
+ headerFilePath);
+ ASSERT(false);
+ return nullptr;
+ }
+ DictBuffersPtr dictBuffers =
+ DictBuffers::openVer4DictBuffers(dictPath, std::move(mmappedBuffer), formatVersion);
+ if (!dictBuffers || !dictBuffers->isValid()) {
+ AKLOGE("DICT: The dictionary doesn't satisfy ver4 format requirements. path: %s",
+ dictPath);
+ ASSERT(false);
+ return nullptr;
+ }
+ return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
+ new StructurePolicy(std::move(dictBuffers)));
+}
+
+/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+ DictionaryStructureWithBufferPolicyFactory::newPolicyForFileDict(
+ const char *const path, const int bufOffset, const int size) {
+ // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of
+ // MmappedBufferPtr if the instance has the responsibility.
+ MmappedBuffer::MmappedBufferPtr mmappedBuffer(
+ MmappedBuffer::openBuffer(path, bufOffset, size, false /* isUpdatable */));
+ if (!mmappedBuffer) {
+ return nullptr;
+ }
+ switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) {
+ case FormatUtils::VERSION_2:
+ case FormatUtils::VERSION_201:
+ AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
+ break;
+ case FormatUtils::VERSION_202:
+ return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
+ new PatriciaTriePolicy(std::move(mmappedBuffer)));
+ case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+ case FormatUtils::VERSION_402:
+ case FormatUtils::VERSION_403:
+ AKLOGE("Given path is a file but the format is version 4. path: %s", path);
+ break;
+ default:
+ AKLOGE("DICT: dictionary format is unknown, bad magic number. path: %s", path);
+ break;
+ }
+ ASSERT(false);
+ return nullptr;
+}
+
+/* static */ void DictionaryStructureWithBufferPolicyFactory::getHeaderFilePathInDictDir(
+ const char *const dictDirPath, const int outHeaderFileBufSize,
+ char *const outHeaderFilePath) {
+ const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */;
+ char dictName[dictNameBufSize];
+ FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName);
+ snprintf(outHeaderFilePath, outHeaderFileBufSize, "%s/%s%s", dictDirPath,
+ dictName, Ver4DictConstants::HEADER_FILE_EXTENSION);
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h
new file mode 100644
index 000000000..b0c04c0b1
--- /dev/null
+++ b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H
+#define LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/utils/format_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
+
+namespace latinime {
+
+class DictionaryStructureWithBufferPolicyFactory {
+ public:
+ static DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+ newPolicyForExistingDictFile(const char *const path, const int bufOffset,
+ const int size, const bool isUpdatable);
+
+ static DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+ newPolicyForOnMemoryDict(const int formatVersion, const std::vector<int> &locale,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryStructureWithBufferPolicyFactory);
+
+ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class StructurePolicy>
+ static DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+ newPolicyForOnMemoryV4Dict(const FormatUtils::FORMAT_VERSION formatVersion,
+ const std::vector<int> &locale,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap);
+
+ static DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+ newPolicyForDirectoryDict(const char *const path, const bool isUpdatable);
+
+ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class StructurePolicy>
+ static DictionaryStructureWithBufferPolicy::StructurePolicyPtr newPolicyForV4Dict(
+ const char *const headerFilePath, const FormatUtils::FORMAT_VERSION formatVersion,
+ MmappedBuffer::MmappedBufferPtr &&mmappedBuffer);
+
+ static DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+ newPolicyForFileDict(const char *const path, const int bufOffset, const int size);
+
+ static void getHeaderFilePathInDictDir(const char *const dirPath,
+ const int outHeaderFileBufSize, char *const outHeaderFilePath);
+};
+} // namespace latinime
+#endif // LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H
diff --git a/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp
new file mode 100644
index 000000000..64f9b6663
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
+
+#include "dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::MASK_ATTRIBUTE_ADDRESS_TYPE =
+ 0x30;
+const BigramListReadWriteUtils::BigramFlags
+ BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
+const BigramListReadWriteUtils::BigramFlags
+ BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
+const BigramListReadWriteUtils::BigramFlags
+ BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
+const BigramListReadWriteUtils::BigramFlags
+ BigramListReadWriteUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
+// Flag for presence of more attributes
+const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::FLAG_ATTRIBUTE_HAS_NEXT =
+ 0x80;
+// Mask for attribute probability, stored on 4 bits inside the flags byte.
+const BigramListReadWriteUtils::BigramFlags
+ BigramListReadWriteUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F;
+
+/* static */ bool BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(
+ const ReadOnlyByteArrayView buffer, BigramFlags *const outBigramFlags,
+ int *const outTargetPtNodePos, int *const bigramEntryPos) {
+ if (static_cast<int>(buffer.size()) <= *bigramEntryPos) {
+ AKLOGE("Read invalid pos in getBigramEntryPropertiesAndAdvancePosition(). bufSize: %zd, "
+ "bigramEntryPos: %d.", buffer.size(), *bigramEntryPos);
+ return false;
+ }
+ const BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(),
+ bigramEntryPos);
+ if (outBigramFlags) {
+ *outBigramFlags = bigramFlags;
+ }
+ const int targetPos = getBigramAddressAndAdvancePosition(buffer, bigramFlags, bigramEntryPos);
+ if (outTargetPtNodePos) {
+ *outTargetPtNodePos = targetPos;
+ }
+ return true;
+}
+
+/* static */ bool BigramListReadWriteUtils::skipExistingBigrams(const ReadOnlyByteArrayView buffer,
+ int *const bigramListPos) {
+ BigramFlags flags;
+ do {
+ if (!getBigramEntryPropertiesAndAdvancePosition(buffer, &flags, 0 /* outTargetPtNodePos */,
+ bigramListPos)) {
+ return false;
+ }
+ } while(hasNext(flags));
+ return true;
+}
+
+/* static */ int BigramListReadWriteUtils::getBigramAddressAndAdvancePosition(
+ const ReadOnlyByteArrayView buffer, const BigramFlags flags, int *const pos) {
+ int offset = 0;
+ const int origin = *pos;
+ switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
+ offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), pos);
+ break;
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
+ offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer.data(), pos);
+ break;
+ case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
+ offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer.data(), pos);
+ break;
+ }
+ if (isOffsetNegative(flags)) {
+ return origin - offset;
+ } else {
+ return origin + offset;
+ }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h
new file mode 100644
index 000000000..a0f7d5e83
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H
+#define LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "defines.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+class BigramListReadWriteUtils {
+public:
+ typedef uint8_t BigramFlags;
+
+ static bool getBigramEntryPropertiesAndAdvancePosition(const ReadOnlyByteArrayView buffer,
+ BigramFlags *const outBigramFlags, int *const outTargetPtNodePos,
+ int *const bigramEntryPos);
+
+ static AK_FORCE_INLINE int getProbabilityFromFlags(const BigramFlags flags) {
+ return flags & MASK_ATTRIBUTE_PROBABILITY;
+ }
+
+ static AK_FORCE_INLINE bool hasNext(const BigramFlags flags) {
+ return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0;
+ }
+
+ // Bigrams reading methods
+ static bool skipExistingBigrams(const ReadOnlyByteArrayView buffer, int *const bigramListPos);
+
+private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadWriteUtils);
+
+ static const BigramFlags MASK_ATTRIBUTE_ADDRESS_TYPE;
+ static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
+ static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
+ static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
+ static const BigramFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE;
+ static const BigramFlags FLAG_ATTRIBUTE_HAS_NEXT;
+ static const BigramFlags MASK_ATTRIBUTE_PROBABILITY;
+
+ static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) {
+ return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0;
+ }
+
+ static int getBigramAddressAndAdvancePosition(const ReadOnlyByteArrayView buffer,
+ const BigramFlags flags, int *const pos);
+};
+} // namespace latinime
+#endif // LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp
new file mode 100644
index 000000000..b5e2e9dae
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h"
+
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+
+namespace latinime {
+
+bool DynamicPtGcEventListeners
+ ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+ ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+ // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless
+ // children.
+ bool isUselessPtNode = !ptNodeParams->isTerminal();
+ if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) {
+ bool needsToKeepPtNode = true;
+ if (!mPtNodeWriter->updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
+ ptNodeParams, &needsToKeepPtNode)) {
+ AKLOGE("Cannot update PtNode probability or get needs to keep PtNode after GC.");
+ return false;
+ }
+ if (!needsToKeepPtNode) {
+ isUselessPtNode = true;
+ }
+ }
+ if (mChildrenValue > 0) {
+ isUselessPtNode = false;
+ } else if (ptNodeParams->isTerminal()) {
+ // Remove children as all children are useless.
+ if (!mPtNodeWriter->updateChildrenPosition(ptNodeParams,
+ NOT_A_DICT_POS /* newChildrenPosition */)) {
+ return false;
+ }
+ }
+ if (isUselessPtNode) {
+ // Current PtNode is no longer needed. Mark it as deleted.
+ if (!mPtNodeWriter->markPtNodeAsDeleted(ptNodeParams)) {
+ return false;
+ }
+ } else {
+ mValueStack.back() += 1;
+ if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) {
+ mValidUnigramCount += 1;
+ }
+ }
+ return true;
+}
+
+bool DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
+ ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+ if (!ptNodeParams->isDeleted()) {
+ int bigramEntryCount = 0;
+ if (!mPtNodeWriter->updateAllBigramEntriesAndDeleteUselessEntries(ptNodeParams,
+ &bigramEntryCount)) {
+ return false;
+ }
+ mValidBigramEntryCount += bigramEntryCount;
+ }
+ return true;
+}
+
+// Writes dummy PtNode array size when the head of PtNode array is read.
+bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+ ::onDescend(const int ptNodeArrayPos) {
+ mValidPtNodeCount = 0;
+ int writingPos = mBufferToWrite->getTailPosition();
+ mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.insert(
+ PtNodeWriter::PtNodeArrayPositionRelocationMap::value_type(ptNodeArrayPos, writingPos));
+ // Writes dummy PtNode array size because arrays can have a forward link or needles PtNodes.
+ // This field will be updated later in onReadingPtNodeArrayTail() with actual PtNode count.
+ mPtNodeArraySizeFieldPos = writingPos;
+ return DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(
+ mBufferToWrite, 0 /* arraySize */, &writingPos);
+}
+
+// Write PtNode array terminal and actual PtNode array size.
+bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+ ::onReadingPtNodeArrayTail() {
+ int writingPos = mBufferToWrite->getTailPosition();
+ // Write PtNode array terminal.
+ if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(
+ mBufferToWrite, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
+ return false;
+ }
+ // Write actual PtNode array size.
+ if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(
+ mBufferToWrite, mValidPtNodeCount, &mPtNodeArraySizeFieldPos)) {
+ return false;
+ }
+ return true;
+}
+
+// Write valid PtNode to buffer and memorize mapping from the old position to the new position.
+bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+ ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+ if (ptNodeParams->isDeleted()) {
+ // Current PtNode is not written in new buffer because it has been deleted.
+ mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert(
+ PtNodeWriter::PtNodePositionRelocationMap::value_type(
+ ptNodeParams->getHeadPos(), NOT_A_DICT_POS));
+ return true;
+ }
+ int writingPos = mBufferToWrite->getTailPosition();
+ mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert(
+ PtNodeWriter::PtNodePositionRelocationMap::value_type(
+ ptNodeParams->getHeadPos(), writingPos));
+ mValidPtNodeCount++;
+ // Writes current PtNode.
+ return mPtNodeWriter->writePtNodeAndAdvancePosition(ptNodeParams, &writingPos);
+}
+
+bool DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
+ ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+ // Updates parent position.
+ int bigramCount = 0;
+ if (!mPtNodeWriter->updateAllPositionFields(ptNodeParams, mDictPositionRelocationMap,
+ &bigramCount)) {
+ return false;
+ }
+ mBigramCount += bigramCount;
+ if (ptNodeParams->isTerminal()) {
+ mUnigramCount++;
+ }
+ return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h
new file mode 100644
index 000000000..8c7ad965b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H
+#define LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+class PtNodeParams;
+
+class DynamicPtGcEventListeners {
+ public:
+ // Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or
+ // not and marks useless PtNodes as deleted. Such deleted PtNodes will be discarded in the GC.
+ // TODO: Concatenate non-terminal PtNodes.
+ class TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+ : public DynamicPtReadingHelper::TraversingEventListener {
+ public:
+ TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
+ PtNodeWriter *const ptNodeWriter)
+ : mPtNodeWriter(ptNodeWriter), mValueStack(), mChildrenValue(0),
+ mValidUnigramCount(0) {}
+
+ ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
+
+ bool onAscend() {
+ if (mValueStack.empty()) {
+ return false;
+ }
+ mChildrenValue = mValueStack.back();
+ mValueStack.pop_back();
+ return true;
+ }
+
+ bool onDescend(const int ptNodeArrayPos) {
+ mValueStack.push_back(0);
+ mChildrenValue = 0;
+ return true;
+ }
+
+ bool onReadingPtNodeArrayTail() { return true; }
+
+ bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+ int getValidUnigramCount() const {
+ return mValidUnigramCount;
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(
+ TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
+
+ PtNodeWriter *const mPtNodeWriter;
+ std::vector<int> mValueStack;
+ int mChildrenValue;
+ int mValidUnigramCount;
+ };
+
+ // TODO: Remove when we stop supporting v402 format.
+ // Updates all bigram entries that are held by valid PtNodes. This removes useless bigram
+ // entries.
+ class TraversePolicyToUpdateBigramProbability
+ : public DynamicPtReadingHelper::TraversingEventListener {
+ public:
+ TraversePolicyToUpdateBigramProbability(PtNodeWriter *const ptNodeWriter)
+ : mPtNodeWriter(ptNodeWriter), mValidBigramEntryCount(0) {}
+
+ bool onAscend() { return true; }
+
+ bool onDescend(const int ptNodeArrayPos) { return true; }
+
+ bool onReadingPtNodeArrayTail() { return true; }
+
+ bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+ int getValidBigramEntryCount() const {
+ return mValidBigramEntryCount;
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability);
+
+ PtNodeWriter *const mPtNodeWriter;
+ int mValidBigramEntryCount;
+ };
+
+ class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+ : public DynamicPtReadingHelper::TraversingEventListener {
+ public:
+ TraversePolicyToPlaceAndWriteValidPtNodesToBuffer(
+ PtNodeWriter *const ptNodeWriter, BufferWithExtendableBuffer *const bufferToWrite,
+ PtNodeWriter::DictPositionRelocationMap *const dictPositionRelocationMap)
+ : mPtNodeWriter(ptNodeWriter), mBufferToWrite(bufferToWrite),
+ mDictPositionRelocationMap(dictPositionRelocationMap), mValidPtNodeCount(0),
+ mPtNodeArraySizeFieldPos(NOT_A_DICT_POS) {};
+
+ bool onAscend() { return true; }
+
+ bool onDescend(const int ptNodeArrayPos);
+
+ bool onReadingPtNodeArrayTail();
+
+ bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToPlaceAndWriteValidPtNodesToBuffer);
+
+ PtNodeWriter *const mPtNodeWriter;
+ BufferWithExtendableBuffer *const mBufferToWrite;
+ PtNodeWriter::DictPositionRelocationMap *const mDictPositionRelocationMap;
+ int mValidPtNodeCount;
+ int mPtNodeArraySizeFieldPos;
+ };
+
+ class TraversePolicyToUpdateAllPositionFields
+ : public DynamicPtReadingHelper::TraversingEventListener {
+ public:
+ TraversePolicyToUpdateAllPositionFields(PtNodeWriter *const ptNodeWriter,
+ const PtNodeWriter::DictPositionRelocationMap *const dictPositionRelocationMap)
+ : mPtNodeWriter(ptNodeWriter),
+ mDictPositionRelocationMap(dictPositionRelocationMap), mUnigramCount(0),
+ mBigramCount(0) {};
+
+ bool onAscend() { return true; }
+
+ bool onDescend(const int ptNodeArrayPos) { return true; }
+
+ bool onReadingPtNodeArrayTail() { return true; }
+
+ bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+ int getUnigramCount() const {
+ return mUnigramCount;
+ }
+
+ int getBigramCount() const {
+ return mBigramCount;
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields);
+
+ PtNodeWriter *const mPtNodeWriter;
+ const PtNodeWriter::DictPositionRelocationMap *const mDictPositionRelocationMap;
+ int mUnigramCount;
+ int mBigramCount;
+ };
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtGcEventListeners);
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp
new file mode 100644
index 000000000..294bc6ea9
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp
@@ -0,0 +1,321 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+
+#include "dictionary/structure/pt_common/pt_node_array_reader.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+// To avoid infinite loop caused by invalid or malicious forward links.
+const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
+const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
+const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH;
+
+bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode(
+ const PtNodeParams *const ptNodeParams) {
+ if (ptNodeParams->isTerminal() && !ptNodeParams->isDeleted()) {
+ mTerminalPositions->push_back(ptNodeParams->getHeadPos());
+ }
+ return true;
+}
+
+// Visits all PtNodes in post-order depth first manner.
+// For example, visits c -> b -> y -> x -> a for the following dictionary:
+// a _ b _ c
+// \ x _ y
+bool DynamicPtReadingHelper::traverseAllPtNodesInPostorderDepthFirstManner(
+ TraversingEventListener *const listener) {
+ bool alreadyVisitedChildren = false;
+ // Descend from the root to the root PtNode array.
+ if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
+ return false;
+ }
+ while (!isEnd()) {
+ const PtNodeParams ptNodeParams(getPtNodeParams());
+ if (!ptNodeParams.isValid()) {
+ break;
+ }
+ if (!alreadyVisitedChildren) {
+ if (ptNodeParams.hasChildren()) {
+ // Move to the first child.
+ if (!listener->onDescend(ptNodeParams.getChildrenPos())) {
+ return false;
+ }
+ pushReadingStateToStack();
+ readChildNode(ptNodeParams);
+ } else {
+ alreadyVisitedChildren = true;
+ }
+ } else {
+ if (!listener->onVisitingPtNode(&ptNodeParams)) {
+ return false;
+ }
+ readNextSiblingNode(ptNodeParams);
+ if (isEnd()) {
+ // All PtNodes in current linked PtNode arrays have been visited.
+ // Return to the parent.
+ if (!listener->onReadingPtNodeArrayTail()) {
+ return false;
+ }
+ if (mReadingStateStack.size() <= 0) {
+ break;
+ }
+ if (!listener->onAscend()) {
+ return false;
+ }
+ popReadingStateFromStack();
+ alreadyVisitedChildren = true;
+ } else {
+ // Process sibling PtNode.
+ alreadyVisitedChildren = false;
+ }
+ }
+ }
+ // Ascend from the root PtNode array to the root.
+ if (!listener->onAscend()) {
+ return false;
+ }
+ return !isError();
+}
+
+// Visits all PtNodes in PtNode array level pre-order depth first manner, which is the same order
+// that PtNodes are written in the dictionary buffer.
+// For example, visits a -> b -> x -> c -> y for the following dictionary:
+// a _ b _ c
+// \ x _ y
+bool DynamicPtReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+ TraversingEventListener *const listener) {
+ bool alreadyVisitedAllPtNodesInArray = false;
+ bool alreadyVisitedChildren = false;
+ // Descend from the root to the root PtNode array.
+ if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
+ return false;
+ }
+ if (isEnd()) {
+ // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array.
+ if (!listener->onReadingPtNodeArrayTail()) {
+ return false;
+ }
+ }
+ pushReadingStateToStack();
+ while (!isEnd()) {
+ const PtNodeParams ptNodeParams(getPtNodeParams());
+ if (!ptNodeParams.isValid()) {
+ break;
+ }
+ if (alreadyVisitedAllPtNodesInArray) {
+ if (alreadyVisitedChildren) {
+ // Move to next sibling PtNode's children.
+ readNextSiblingNode(ptNodeParams);
+ if (isEnd()) {
+ // Return to the parent PTNode.
+ if (!listener->onAscend()) {
+ return false;
+ }
+ if (mReadingStateStack.size() <= 0) {
+ break;
+ }
+ popReadingStateFromStack();
+ alreadyVisitedChildren = true;
+ alreadyVisitedAllPtNodesInArray = true;
+ } else {
+ alreadyVisitedChildren = false;
+ }
+ } else {
+ if (ptNodeParams.hasChildren()) {
+ // Move to the first child.
+ if (!listener->onDescend(ptNodeParams.getChildrenPos())) {
+ return false;
+ }
+ pushReadingStateToStack();
+ readChildNode(ptNodeParams);
+ // Push state to return the head of PtNode array.
+ pushReadingStateToStack();
+ alreadyVisitedAllPtNodesInArray = false;
+ alreadyVisitedChildren = false;
+ } else {
+ alreadyVisitedChildren = true;
+ }
+ }
+ } else {
+ if (!listener->onVisitingPtNode(&ptNodeParams)) {
+ return false;
+ }
+ readNextSiblingNode(ptNodeParams);
+ if (isEnd()) {
+ if (!listener->onReadingPtNodeArrayTail()) {
+ return false;
+ }
+ // Return to the head of current PtNode array.
+ popReadingStateFromStack();
+ alreadyVisitedAllPtNodesInArray = true;
+ }
+ }
+ }
+ popReadingStateFromStack();
+ // Ascend from the root PtNode array to the root.
+ if (!listener->onAscend()) {
+ return false;
+ }
+ return !isError();
+}
+
+int DynamicPtReadingHelper::getCodePointsAndReturnCodePointCount(const int maxCodePointCount,
+ int *const outCodePoints) {
+ // This method traverses parent nodes from the terminal by following parent pointers; thus,
+ // node code points are stored in the buffer in the reverse order.
+ int reverseCodePoints[maxCodePointCount];
+ const PtNodeParams terminalPtNodeParams(getPtNodeParams());
+ // First, read the terminal node and get its probability.
+ if (!isValidTerminalNode(terminalPtNodeParams)) {
+ // Node at the ptNodePos is not a valid terminal node.
+ return 0;
+ }
+ // Then, following parent node link to the dictionary root and fetch node code points.
+ int totalCodePointCount = 0;
+ while (!isEnd()) {
+ const PtNodeParams ptNodeParams(getPtNodeParams());
+ totalCodePointCount = getTotalCodePointCount(ptNodeParams);
+ if (!ptNodeParams.isValid() || totalCodePointCount > maxCodePointCount) {
+ // The ptNodePos is not a valid terminal node position in the dictionary.
+ return 0;
+ }
+ // Store node code points to buffer in the reverse order.
+ fetchMergedNodeCodePointsInReverseOrder(ptNodeParams, getPrevTotalCodePointCount(),
+ reverseCodePoints);
+ // Follow parent node toward the root node.
+ readParentNode(ptNodeParams);
+ }
+ if (isError()) {
+ // The node position or the dictionary is invalid.
+ return 0;
+ }
+ // Reverse the stored code points to output them.
+ for (int i = 0; i < totalCodePointCount; ++i) {
+ outCodePoints[i] = reverseCodePoints[totalCodePointCount - i - 1];
+ }
+ return totalCodePointCount;
+}
+
+int DynamicPtReadingHelper::getTerminalPtNodePositionOfWord(const int *const inWord,
+ const size_t length, const bool forceLowerCaseSearch) {
+ int searchCodePoints[length];
+ for (size_t i = 0; i < length; ++i) {
+ searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i];
+ }
+ while (!isEnd()) {
+ const PtNodeParams ptNodeParams(getPtNodeParams());
+ const int matchedCodePointCount = getPrevTotalCodePointCount();
+ if (getTotalCodePointCount(ptNodeParams) > length
+ || !isMatchedCodePoint(ptNodeParams, 0 /* index */,
+ searchCodePoints[matchedCodePointCount])) {
+ // Current node has too many code points or its first code point is different from
+ // target code point. Skip this node and read the next sibling node.
+ readNextSiblingNode(ptNodeParams);
+ continue;
+ }
+ // Check following merged node code points.
+ const int nodeCodePointCount = ptNodeParams.getCodePointCount();
+ for (int j = 1; j < nodeCodePointCount; ++j) {
+ if (!isMatchedCodePoint(ptNodeParams, j, searchCodePoints[matchedCodePointCount + j])) {
+ // Different code point is found. The given word is not included in the dictionary.
+ return NOT_A_DICT_POS;
+ }
+ }
+ // All characters are matched.
+ if (length == getTotalCodePointCount(ptNodeParams)) {
+ if (!ptNodeParams.isTerminal()) {
+ return NOT_A_DICT_POS;
+ }
+ // Terminal position is found.
+ return ptNodeParams.getHeadPos();
+ }
+ if (!ptNodeParams.hasChildren()) {
+ return NOT_A_DICT_POS;
+ }
+ // Advance to the children nodes.
+ readChildNode(ptNodeParams);
+ }
+ // If we already traversed the tree further than the word is long, there means
+ // there was no match (or we would have found it).
+ return NOT_A_DICT_POS;
+}
+
+// Read node array size and process empty node arrays. Nodes and arrays are counted up in this
+// method to avoid an infinite loop.
+void DynamicPtReadingHelper::nextPtNodeArray() {
+ int ptNodeCountInArray = 0;
+ int firstPtNodePos = NOT_A_DICT_POS;
+ if (!mPtNodeArrayReader->readPtNodeArrayInfoAndReturnIfValid(
+ mReadingState.mPos, &ptNodeCountInArray, &firstPtNodePos)) {
+ mIsError = true;
+ mReadingState.mPos = NOT_A_DICT_POS;
+ return;
+ }
+ mReadingState.mPosOfThisPtNodeArrayHead = mReadingState.mPos;
+ mReadingState.mRemainingPtNodeCountInThisArray = ptNodeCountInArray;
+ mReadingState.mPos = firstPtNodePos;
+ // Count up nodes and node arrays to avoid infinite loop.
+ mReadingState.mTotalPtNodeIndexInThisArrayChain +=
+ mReadingState.mRemainingPtNodeCountInThisArray;
+ mReadingState.mPtNodeArrayIndexInThisArrayChain++;
+ if (mReadingState.mRemainingPtNodeCountInThisArray < 0
+ || mReadingState.mTotalPtNodeIndexInThisArrayChain
+ > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP
+ || mReadingState.mPtNodeArrayIndexInThisArrayChain
+ > MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) {
+ // Invalid dictionary.
+ AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d"
+ "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d",
+ mReadingState.mRemainingPtNodeCountInThisArray,
+ mReadingState.mTotalPtNodeIndexInThisArrayChain,
+ MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP,
+ mReadingState.mPtNodeArrayIndexInThisArrayChain,
+ MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP);
+ ASSERT(false);
+ mIsError = true;
+ mReadingState.mPos = NOT_A_DICT_POS;
+ return;
+ }
+ if (mReadingState.mRemainingPtNodeCountInThisArray == 0) {
+ // Empty node array. Try following forward link.
+ followForwardLink();
+ }
+}
+
+// Follow the forward link and read the next node array if exists.
+void DynamicPtReadingHelper::followForwardLink() {
+ int nextPtNodeArrayPos = NOT_A_DICT_POS;
+ if (!mPtNodeArrayReader->readForwardLinkAndReturnIfValid(
+ mReadingState.mPos, &nextPtNodeArrayPos)) {
+ mIsError = true;
+ mReadingState.mPos = NOT_A_DICT_POS;
+ return;
+ }
+ mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos;
+ if (nextPtNodeArrayPos != NOT_A_DICT_POS) {
+ // Follow the forward link.
+ mReadingState.mPos = nextPtNodeArrayPos;
+ nextPtNodeArray();
+ } else {
+ // All node arrays have been read.
+ mReadingState.mPos = NOT_A_DICT_POS;
+ }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h
new file mode 100644
index 000000000..d8ddc7c2b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PT_READING_HELPER_H
+#define LATINIME_DYNAMIC_PT_READING_HELPER_H
+
+#include <cstddef>
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_reader.h"
+
+namespace latinime {
+
+class DictionaryShortcutsStructurePolicy;
+class PtNodeArrayReader;
+
+/*
+ * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and
+ * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop.
+ */
+class DynamicPtReadingHelper {
+ public:
+ class TraversingEventListener {
+ public:
+ virtual ~TraversingEventListener() {};
+
+ // Returns whether the event handling was succeeded or not.
+ virtual bool onAscend() = 0;
+
+ // Returns whether the event handling was succeeded or not.
+ virtual bool onDescend(const int ptNodeArrayPos) = 0;
+
+ // Returns whether the event handling was succeeded or not.
+ virtual bool onReadingPtNodeArrayTail() = 0;
+
+ // Returns whether the event handling was succeeded or not.
+ virtual bool onVisitingPtNode(const PtNodeParams *const node) = 0;
+
+ protected:
+ TraversingEventListener() {};
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(TraversingEventListener);
+ };
+
+ class TraversePolicyToGetAllTerminalPtNodePositions : public TraversingEventListener {
+ public:
+ TraversePolicyToGetAllTerminalPtNodePositions(std::vector<int> *const terminalPositions)
+ : mTerminalPositions(terminalPositions) {}
+ bool onAscend() { return true; }
+ bool onDescend(const int ptNodeArrayPos) { return true; }
+ bool onReadingPtNodeArrayTail() { return true; }
+ bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToGetAllTerminalPtNodePositions);
+
+ std::vector<int> *const mTerminalPositions;
+ };
+
+ DynamicPtReadingHelper(const PtNodeReader *const ptNodeReader,
+ const PtNodeArrayReader *const ptNodeArrayReader)
+ : mIsError(false), mReadingState(), mPtNodeReader(ptNodeReader),
+ mPtNodeArrayReader(ptNodeArrayReader), mReadingStateStack() {}
+
+ ~DynamicPtReadingHelper() {}
+
+ AK_FORCE_INLINE bool isError() const {
+ return mIsError;
+ }
+
+ AK_FORCE_INLINE bool isEnd() const {
+ return mReadingState.mPos == NOT_A_DICT_POS;
+ }
+
+ // Initialize reading state with the head position of a PtNode array.
+ AK_FORCE_INLINE void initWithPtNodeArrayPos(const int ptNodeArrayPos) {
+ if (ptNodeArrayPos == NOT_A_DICT_POS) {
+ mReadingState.mPos = NOT_A_DICT_POS;
+ } else {
+ mIsError = false;
+ mReadingState.mPos = ptNodeArrayPos;
+ mReadingState.mTotalCodePointCountSinceInitialization = 0;
+ mReadingState.mTotalPtNodeIndexInThisArrayChain = 0;
+ mReadingState.mPtNodeArrayIndexInThisArrayChain = 0;
+ mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS;
+ mReadingStateStack.clear();
+ nextPtNodeArray();
+ }
+ }
+
+ // Initialize reading state with the head position of a node.
+ AK_FORCE_INLINE void initWithPtNodePos(const int ptNodePos) {
+ if (ptNodePos == NOT_A_DICT_POS) {
+ mReadingState.mPos = NOT_A_DICT_POS;
+ } else {
+ mIsError = false;
+ mReadingState.mPos = ptNodePos;
+ mReadingState.mRemainingPtNodeCountInThisArray = 1;
+ mReadingState.mTotalCodePointCountSinceInitialization = 0;
+ mReadingState.mTotalPtNodeIndexInThisArrayChain = 1;
+ mReadingState.mPtNodeArrayIndexInThisArrayChain = 1;
+ mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS;
+ mReadingState.mPosOfThisPtNodeArrayHead = NOT_A_DICT_POS;
+ mReadingStateStack.clear();
+ }
+ }
+
+ AK_FORCE_INLINE const PtNodeParams getPtNodeParams() const {
+ if (isEnd()) {
+ return PtNodeParams();
+ }
+ return mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(mReadingState.mPos);
+ }
+
+ AK_FORCE_INLINE bool isValidTerminalNode(const PtNodeParams &ptNodeParams) const {
+ return !isEnd() && !ptNodeParams.isDeleted() && ptNodeParams.isTerminal();
+ }
+
+ AK_FORCE_INLINE bool isMatchedCodePoint(const PtNodeParams &ptNodeParams, const int index,
+ const int codePoint) const {
+ return ptNodeParams.getCodePoints()[index] == codePoint;
+ }
+
+ // Return code point count exclude the last read node's code points.
+ AK_FORCE_INLINE size_t getPrevTotalCodePointCount() const {
+ return mReadingState.mTotalCodePointCountSinceInitialization;
+ }
+
+ // Return code point count include the last read node's code points.
+ AK_FORCE_INLINE size_t getTotalCodePointCount(const PtNodeParams &ptNodeParams) const {
+ return mReadingState.mTotalCodePointCountSinceInitialization
+ + ptNodeParams.getCodePointCount();
+ }
+
+ AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder(const PtNodeParams &ptNodeParams,
+ const int index, int *const outCodePoints) const {
+ const int nodeCodePointCount = ptNodeParams.getCodePointCount();
+ const int *const nodeCodePoints = ptNodeParams.getCodePoints();
+ for (int i = 0; i < nodeCodePointCount; ++i) {
+ outCodePoints[index + i] = nodeCodePoints[nodeCodePointCount - 1 - i];
+ }
+ }
+
+ AK_FORCE_INLINE void readNextSiblingNode(const PtNodeParams &ptNodeParams) {
+ mReadingState.mRemainingPtNodeCountInThisArray -= 1;
+ mReadingState.mPos = ptNodeParams.getSiblingNodePos();
+ if (mReadingState.mRemainingPtNodeCountInThisArray <= 0) {
+ // All nodes in the current node array have been read.
+ followForwardLink();
+ }
+ }
+
+ // Read the first child node of the current node.
+ AK_FORCE_INLINE void readChildNode(const PtNodeParams &ptNodeParams) {
+ if (ptNodeParams.hasChildren()) {
+ mReadingState.mTotalCodePointCountSinceInitialization +=
+ ptNodeParams.getCodePointCount();
+ mReadingState.mTotalPtNodeIndexInThisArrayChain = 0;
+ mReadingState.mPtNodeArrayIndexInThisArrayChain = 0;
+ mReadingState.mPos = ptNodeParams.getChildrenPos();
+ mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS;
+ // Read children node array.
+ nextPtNodeArray();
+ } else {
+ mReadingState.mPos = NOT_A_DICT_POS;
+ }
+ }
+
+ // Read the parent node of the current node.
+ AK_FORCE_INLINE void readParentNode(const PtNodeParams &ptNodeParams) {
+ if (ptNodeParams.getParentPos() != NOT_A_DICT_POS) {
+ mReadingState.mTotalCodePointCountSinceInitialization +=
+ ptNodeParams.getCodePointCount();
+ mReadingState.mTotalPtNodeIndexInThisArrayChain = 1;
+ mReadingState.mPtNodeArrayIndexInThisArrayChain = 1;
+ mReadingState.mRemainingPtNodeCountInThisArray = 1;
+ mReadingState.mPos = ptNodeParams.getParentPos();
+ mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS;
+ mReadingState.mPosOfThisPtNodeArrayHead = NOT_A_DICT_POS;
+ } else {
+ mReadingState.mPos = NOT_A_DICT_POS;
+ }
+ }
+
+ AK_FORCE_INLINE int getPosOfLastForwardLinkField() const {
+ return mReadingState.mPosOfLastForwardLinkField;
+ }
+
+ AK_FORCE_INLINE int getPosOfLastPtNodeArrayHead() const {
+ return mReadingState.mPosOfThisPtNodeArrayHead;
+ }
+
+ bool traverseAllPtNodesInPostorderDepthFirstManner(TraversingEventListener *const listener);
+
+ bool traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+ TraversingEventListener *const listener);
+
+ int getCodePointsAndReturnCodePointCount(const int maxCodePointCount, int *const outCodePoints);
+
+ int getTerminalPtNodePositionOfWord(const int *const inWord, const size_t length,
+ const bool forceLowerCaseSearch);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(DynamicPtReadingHelper);
+
+ // This class encapsulates the reading state of a position in the dictionary. It points at a
+ // specific PtNode in the dictionary.
+ class PtNodeReadingState {
+ public:
+ // Note that copy constructor and assignment operator are used for this class to use
+ // std::vector.
+ PtNodeReadingState() : mPos(NOT_A_DICT_POS), mRemainingPtNodeCountInThisArray(0),
+ mTotalCodePointCountSinceInitialization(0), mTotalPtNodeIndexInThisArrayChain(0),
+ mPtNodeArrayIndexInThisArrayChain(0), mPosOfLastForwardLinkField(NOT_A_DICT_POS),
+ mPosOfThisPtNodeArrayHead(NOT_A_DICT_POS) {}
+
+ int mPos;
+ // Remaining node count in the current array.
+ int mRemainingPtNodeCountInThisArray;
+ size_t mTotalCodePointCountSinceInitialization;
+ // Counter of PtNodes used to avoid infinite loops caused by broken or malicious links.
+ int mTotalPtNodeIndexInThisArrayChain;
+ // Counter of PtNode arrays used to avoid infinite loops caused by cyclic links of empty
+ // PtNode arrays.
+ int mPtNodeArrayIndexInThisArrayChain;
+ int mPosOfLastForwardLinkField;
+ int mPosOfThisPtNodeArrayHead;
+ };
+
+ static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP;
+ static const int MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP;
+ static const size_t MAX_READING_STATE_STACK_SIZE;
+
+ // TODO: Introduce error code to track what caused the error.
+ bool mIsError;
+ PtNodeReadingState mReadingState;
+ const PtNodeReader *const mPtNodeReader;
+ const PtNodeArrayReader *const mPtNodeArrayReader;
+ std::vector<PtNodeReadingState> mReadingStateStack;
+
+ void nextPtNodeArray();
+
+ void followForwardLink();
+
+ AK_FORCE_INLINE void pushReadingStateToStack() {
+ if (mReadingStateStack.size() > MAX_READING_STATE_STACK_SIZE) {
+ AKLOGI("Reading state stack overflow. Max size: %zd", MAX_READING_STATE_STACK_SIZE);
+ ASSERT(false);
+ mIsError = true;
+ mReadingState.mPos = NOT_A_DICT_POS;
+ } else {
+ mReadingStateStack.push_back(mReadingState);
+ }
+ }
+
+ AK_FORCE_INLINE void popReadingStateFromStack() {
+ if (mReadingStateStack.empty()) {
+ mReadingState.mPos = NOT_A_DICT_POS;
+ } else {
+ mReadingState = mReadingStateStack.back();
+ mReadingStateStack.pop_back();
+ }
+ }
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PT_READING_HELPER_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp
new file mode 100644
index 000000000..3eb55ed9b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+
+#include "defines.h"
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::MASK_MOVED = 0xC0;
+const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_NOT_MOVED = 0xC0;
+const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_MOVED = 0x40;
+const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_DELETED = 0x80;
+const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_WILL_BECOME_NON_TERMINAL = 0x00;
+
+// TODO: Make DICT_OFFSET_ZERO_OFFSET = 0.
+// Currently, DICT_OFFSET_INVALID is 0 in Java side but offset can be 0 during GC. So, the maximum
+// value of offsets, which is 0x7FFFFF is used to represent 0 offset.
+const int DynamicPtReadingUtils::DICT_OFFSET_INVALID = 0;
+const int DynamicPtReadingUtils::DICT_OFFSET_ZERO_OFFSET = 0x7FFFFF;
+
+/* static */ int DynamicPtReadingUtils::getForwardLinkPosition(const uint8_t *const buffer,
+ const int pos) {
+ int linkAddressPos = pos;
+ return ByteArrayUtils::readSint24AndAdvancePosition(buffer, &linkAddressPos);
+}
+
+/* static */ int DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition(
+ const uint8_t *const buffer, int *const pos) {
+ return ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos);
+}
+
+/* static */ int DynamicPtReadingUtils::getParentPtNodePos(const int parentOffset,
+ const int ptNodePos) {
+ if (parentOffset == DICT_OFFSET_INVALID) {
+ return NOT_A_DICT_POS;
+ } else if (parentOffset == DICT_OFFSET_ZERO_OFFSET) {
+ return ptNodePos;
+ } else {
+ return parentOffset + ptNodePos;
+ }
+}
+
+/* static */ int DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition(
+ const uint8_t *const buffer, int *const pos) {
+ const int base = *pos;
+ const int offset = ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos);
+ if (offset == DICT_OFFSET_INVALID) {
+ // The PtNode does not have children.
+ return NOT_A_DICT_POS;
+ } else if (offset == DICT_OFFSET_ZERO_OFFSET) {
+ return base;
+ } else {
+ return base + offset;
+ }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h
new file mode 100644
index 000000000..b13a075d5
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PT_READING_UTILS_H
+#define LATINIME_DYNAMIC_PT_READING_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+
+namespace latinime {
+
+class DynamicPtReadingUtils {
+ public:
+ typedef uint8_t NodeFlags;
+
+ static const int DICT_OFFSET_INVALID;
+ static const int DICT_OFFSET_ZERO_OFFSET;
+
+ static int getForwardLinkPosition(const uint8_t *const buffer, const int pos);
+
+ static AK_FORCE_INLINE bool isValidForwardLinkPosition(const int forwardLinkAddress) {
+ return forwardLinkAddress != 0;
+ }
+
+ static int getParentPtNodePosOffsetAndAdvancePosition(const uint8_t *const buffer,
+ int *const pos);
+
+ static int getParentPtNodePos(const int parentOffset, const int ptNodePos);
+
+ static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+ /**
+ * Node Flags
+ */
+ static AK_FORCE_INLINE bool isMoved(const NodeFlags flags) {
+ return FLAG_IS_MOVED == (MASK_MOVED & flags);
+ }
+
+ static AK_FORCE_INLINE bool isDeleted(const NodeFlags flags) {
+ return FLAG_IS_DELETED == (MASK_MOVED & flags);
+ }
+
+ static AK_FORCE_INLINE bool willBecomeNonTerminal(const NodeFlags flags) {
+ return FLAG_WILL_BECOME_NON_TERMINAL == (MASK_MOVED & flags);
+ }
+
+ static AK_FORCE_INLINE NodeFlags updateAndGetFlags(const NodeFlags originalFlags,
+ const bool isMoved, const bool isDeleted, const bool willBecomeNonTerminal) {
+ NodeFlags flags = originalFlags;
+ flags = willBecomeNonTerminal ?
+ ((flags & (~MASK_MOVED)) | FLAG_WILL_BECOME_NON_TERMINAL) : flags;
+ flags = isMoved ? ((flags & (~MASK_MOVED)) | FLAG_IS_MOVED) : flags;
+ flags = isDeleted ? ((flags & (~MASK_MOVED)) | FLAG_IS_DELETED) : flags;
+ flags = (!isMoved && !isDeleted && !willBecomeNonTerminal) ?
+ ((flags & (~MASK_MOVED)) | FLAG_IS_NOT_MOVED) : flags;
+ return flags;
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtReadingUtils);
+
+ static const NodeFlags MASK_MOVED;
+ static const NodeFlags FLAG_IS_NOT_MOVED;
+ static const NodeFlags FLAG_IS_MOVED;
+ static const NodeFlags FLAG_IS_DELETED;
+ static const NodeFlags FLAG_WILL_BECOME_NON_TERMINAL;
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PT_READING_UTILS_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp
new file mode 100644
index 000000000..ccad345c8
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp
@@ -0,0 +1,299 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
+
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/pt_common/pt_node_reader.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3;
+
+bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readingHelper,
+ const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty,
+ bool *const outAddedNewUnigram) {
+ int parentPos = NOT_A_DICT_POS;
+ while (!readingHelper->isEnd()) {
+ const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams());
+ if (!ptNodeParams.isValid()) {
+ break;
+ }
+ const size_t matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
+ if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */,
+ wordCodePoints[matchedCodePointCount])) {
+ // The first code point is different from target code point. Skip this node and read
+ // the next sibling node.
+ readingHelper->readNextSiblingNode(ptNodeParams);
+ continue;
+ }
+ // Check following merged node code points.
+ const size_t nodeCodePointCount = ptNodeParams.getCodePointArrayView().size();
+ for (size_t j = 1; j < nodeCodePointCount; ++j) {
+ const size_t nextIndex = matchedCodePointCount + j;
+ if (nextIndex >= wordCodePoints.size()
+ || !readingHelper->isMatchedCodePoint(ptNodeParams, j,
+ wordCodePoints[matchedCodePointCount + j])) {
+ *outAddedNewUnigram = true;
+ return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty,
+ wordCodePoints.skip(matchedCodePointCount));
+ }
+ }
+ // All characters are matched.
+ if (wordCodePoints.size() == readingHelper->getTotalCodePointCount(ptNodeParams)) {
+ return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram);
+ }
+ if (!ptNodeParams.hasChildren()) {
+ *outAddedNewUnigram = true;
+ return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty,
+ wordCodePoints.skip(readingHelper->getTotalCodePointCount(ptNodeParams)));
+ }
+ // Advance to the children nodes.
+ parentPos = ptNodeParams.getHeadPos();
+ readingHelper->readChildNode(ptNodeParams);
+ }
+ if (readingHelper->isError()) {
+ // The dictionary is invalid.
+ return false;
+ }
+ int pos = readingHelper->getPosOfLastForwardLinkField();
+ *outAddedNewUnigram = true;
+ return createAndInsertNodeIntoPtNodeArray(parentPos,
+ wordCodePoints.skip(readingHelper->getPrevTotalCodePointCount()), unigramProperty,
+ &pos);
+}
+
+bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
+ const int wordPos, const NgramProperty *const ngramProperty,
+ bool *const outAddedNewEntry) {
+ if (prevWordsPtNodePos.empty()) {
+ return false;
+ }
+ ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
+ int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
+ prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
+ prevWordsPtNodePos[i]).getTerminalId();
+ }
+ const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
+ const int wordId =
+ mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
+ return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry);
+}
+
+bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
+ const int wordPos) {
+ if (prevWordsPtNodePos.empty()) {
+ return false;
+ }
+ ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
+ int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
+ prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
+ prevWordsPtNodePos[i]).getTerminalId();
+ }
+ const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
+ const int wordId =
+ mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
+ return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId);
+}
+
+bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos,
+ const CodePointArrayView targetCodePoints, const int shortcutProbability) {
+ const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos));
+ return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints.data(),
+ targetCodePoints.size(), shortcutProbability);
+}
+
+bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos,
+ const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty,
+ int *const forwardLinkFieldPos) {
+ const int newPtNodeArrayPos = mBuffer->getTailPosition();
+ if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
+ newPtNodeArrayPos, forwardLinkFieldPos)) {
+ return false;
+ }
+ return createNewPtNodeArrayWithAChildPtNode(parentPos, ptNodeCodePoints, unigramProperty);
+}
+
+bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams,
+ const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) {
+ if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) {
+ // Overwrites the probability.
+ *outAddedNewUnigram = false;
+ return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty);
+ } else {
+ // Make the node terminal and write the probability.
+ *outAddedNewUnigram = true;
+ const int movedPos = mBuffer->getTailPosition();
+ int writingPos = movedPos;
+ const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams,
+ unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
+ true /* isTerminal */, originalPtNodeParams->getParentPos(),
+ originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability()));
+ if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
+ unigramProperty, &writingPos)) {
+ return false;
+ }
+ if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode(
+ const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty,
+ const CodePointArrayView codePoints) {
+ const int newPtNodeArrayPos = mBuffer->getTailPosition();
+ if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) {
+ return false;
+ }
+ return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints,
+ unigramProperty);
+}
+
+bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
+ const int parentPtNodePos, const CodePointArrayView ptNodeCodePoints,
+ const UnigramProperty *const unigramProperty) {
+ int writingPos = mBuffer->getTailPosition();
+ if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
+ 1 /* arraySize */, &writingPos)) {
+ return false;
+ }
+ const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
+ unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
+ true /* isTerminal */, parentPtNodePos, ptNodeCodePoints,
+ unigramProperty->getProbability()));
+ if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
+ unigramProperty, &writingPos)) {
+ return false;
+ }
+ if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
+ NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
+ return false;
+ }
+ return true;
+}
+
+// Returns whether the dictionary updating was succeeded or not.
+bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
+ const PtNodeParams *const reallocatingPtNodeParams, const size_t overlappingCodePointCount,
+ const UnigramProperty *const unigramProperty,
+ const CodePointArrayView newPtNodeCodePoints) {
+ // When addsExtraChild is true, split the reallocating PtNode and add new child.
+ // Reallocating PtNode: abcde, newNode: abcxy.
+ // abc (1st, not terminal) __ de (2nd)
+ // \_ xy (extra child, terminal)
+ // Otherwise, this method makes 1st part terminal and write information in unigramProperty.
+ // Reallocating PtNode: abcde, newNode: abc.
+ // abc (1st, terminal) __ de (2nd)
+ const bool addsExtraChild = newPtNodeCodePoints.size() > overlappingCodePointCount;
+ const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition();
+ int writingPos = firstPartOfReallocatedPtNodePos;
+ // Write the 1st part of the reallocating node. The children position will be updated later
+ // with actual children position.
+ const CodePointArrayView firstPtNodeCodePoints =
+ reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount);
+ if (addsExtraChild) {
+ const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
+ false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */,
+ reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints,
+ NOT_A_PROBABILITY));
+ if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) {
+ return false;
+ }
+ } else {
+ const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
+ unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
+ true /* isTerminal */, reallocatingPtNodeParams->getParentPos(),
+ firstPtNodeCodePoints, unigramProperty->getProbability()));
+ if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
+ unigramProperty, &writingPos)) {
+ return false;
+ }
+ }
+ const int actualChildrenPos = writingPos;
+ // Create new children PtNode array.
+ const size_t newPtNodeCount = addsExtraChild ? 2 : 1;
+ if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
+ newPtNodeCount, &writingPos)) {
+ return false;
+ }
+ // Write the 2nd part of the reallocating node.
+ const int secondPartOfReallocatedPtNodePos = writingPos;
+ const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams,
+ reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(),
+ reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos,
+ reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount),
+ reallocatingPtNodeParams->getProbability()));
+ if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) {
+ return false;
+ }
+ if (addsExtraChild) {
+ const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode(
+ unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
+ true /* isTerminal */, firstPartOfReallocatedPtNodePos,
+ newPtNodeCodePoints.skip(overlappingCodePointCount),
+ unigramProperty->getProbability()));
+ if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams,
+ unigramProperty, &writingPos)) {
+ return false;
+ }
+ }
+ if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
+ NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
+ return false;
+ }
+ // Update original reallocating PtNode as moved.
+ if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos,
+ secondPartOfReallocatedPtNodePos)) {
+ return false;
+ }
+ // Load node info. Information of the 1st part will be fetched.
+ const PtNodeParams ptNodeParams(
+ mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos));
+ // Update children position.
+ return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos);
+}
+
+const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams(
+ const PtNodeParams *const originalPtNodeParams, const bool isNotAWord,
+ const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
+ const CodePointArrayView codePoints, const int probability) const {
+ const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
+ isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
+ false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
+ CHILDREN_POSITION_FIELD_SIZE);
+ return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability);
+}
+
+const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord,
+ const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
+ const CodePointArrayView codePoints, const int probability) const {
+ const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
+ isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
+ false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
+ CHILDREN_POSITION_FIELD_SIZE);
+ return PtNodeParams(flags, parentPos, codePoints, probability);
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h
new file mode 100644
index 000000000..e8cf98c39
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PT_UPDATING_HELPER_H
+#define LATINIME_DYNAMIC_PT_UPDATING_HELPER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class NgramProperty;
+class BufferWithExtendableBuffer;
+class DynamicPtReadingHelper;
+class PtNodeReader;
+class PtNodeWriter;
+class UnigramProperty;
+
+class DynamicPtUpdatingHelper {
+ public:
+ DynamicPtUpdatingHelper(BufferWithExtendableBuffer *const buffer,
+ const PtNodeReader *const ptNodeReader, PtNodeWriter *const ptNodeWriter)
+ : mBuffer(buffer), mPtNodeReader(ptNodeReader), mPtNodeWriter(ptNodeWriter) {}
+
+ ~DynamicPtUpdatingHelper() {}
+
+ // Add a word to the dictionary. If the word already exists, update the probability.
+ bool addUnigramWord(DynamicPtReadingHelper *const readingHelper,
+ const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty,
+ bool *const outAddedNewUnigram);
+
+ // TODO: Remove after stopping supporting v402.
+ // Add an n-gram entry.
+ bool addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos,
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
+
+ // TODO: Remove after stopping supporting v402.
+ // Remove an n-gram entry.
+ bool removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos);
+
+ // Add a shortcut target.
+ bool addShortcutTarget(const int wordPos, const CodePointArrayView targetCodePoints,
+ const int shortcutProbability);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtUpdatingHelper);
+
+ static const int CHILDREN_POSITION_FIELD_SIZE;
+
+ BufferWithExtendableBuffer *const mBuffer;
+ const PtNodeReader *const mPtNodeReader;
+ PtNodeWriter *const mPtNodeWriter;
+
+ bool createAndInsertNodeIntoPtNodeArray(const int parentPos,
+ const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty,
+ int *const forwardLinkFieldPos);
+
+ bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams,
+ const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram);
+
+ bool createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams *const parentPtNodeParams,
+ const UnigramProperty *const unigramProperty,
+ const CodePointArrayView remainingCodePoints);
+
+ bool createNewPtNodeArrayWithAChildPtNode(const int parentPos,
+ const CodePointArrayView ptNodeCodePoints,
+ const UnigramProperty *const unigramProperty);
+
+ bool reallocatePtNodeAndAddNewPtNodes(const PtNodeParams *const reallocatingPtNodeParams,
+ const size_t overlappingCodePointCount, const UnigramProperty *const unigramProperty,
+ const CodePointArrayView newPtNodeCodePoints);
+
+ const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams,
+ const bool isNotAWord, const bool isPossiblyOffensive, const bool isTerminal,
+ const int parentPos, const CodePointArrayView codePoints, const int probability) const;
+
+ const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord,
+ const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
+ const CodePointArrayView codePoints, const int probability) const;
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_UPDATING_HELPER_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp
new file mode 100644
index 000000000..ea760a538
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+const size_t DynamicPtWritingUtils::MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD = 0x7F;
+const size_t DynamicPtWritingUtils::MAX_PTNODE_ARRAY_SIZE = 0x7FFF;
+const int DynamicPtWritingUtils::SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE = 1;
+const int DynamicPtWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE = 2;
+const int DynamicPtWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG = 0x8000;
+const int DynamicPtWritingUtils::DICT_OFFSET_FIELD_SIZE = 3;
+const int DynamicPtWritingUtils::MAX_DICT_OFFSET_VALUE = 0x7FFFFF;
+const int DynamicPtWritingUtils::MIN_DICT_OFFSET_VALUE = -0x7FFFFF;
+const int DynamicPtWritingUtils::DICT_OFFSET_NEGATIVE_FLAG = 0x800000;
+const int DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE = 1;
+
+/* static */ bool DynamicPtWritingUtils::writeEmptyDictionary(
+ BufferWithExtendableBuffer *const buffer, const int rootPos) {
+ int writingPos = rootPos;
+ if (!writePtNodeArraySizeAndAdvancePosition(buffer, 0 /* arraySize */, &writingPos)) {
+ return false;
+ }
+ return writeForwardLinkPositionAndAdvancePosition(buffer, NOT_A_DICT_POS /* forwardLinkPos */,
+ &writingPos);
+}
+
+/* static */ bool DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(
+ BufferWithExtendableBuffer *const buffer, const int forwardLinkPos,
+ int *const forwardLinkFieldPos) {
+ return writeDictOffset(buffer, forwardLinkPos, (*forwardLinkFieldPos), forwardLinkFieldPos);
+}
+
+/* static */ bool DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(
+ BufferWithExtendableBuffer *const buffer, const size_t arraySize,
+ int *const arraySizeFieldPos) {
+ // Currently, all array size field to be created has LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE to
+ // simplify updating process.
+ // TODO: Use SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE for small arrays.
+ /*if (arraySize <= MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD) {
+ return buffer->writeUintAndAdvancePosition(arraySize, SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE,
+ arraySizeFieldPos);
+ } else */
+ if (arraySize <= MAX_PTNODE_ARRAY_SIZE) {
+ uint32_t data = arraySize | LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG;
+ return buffer->writeUintAndAdvancePosition(data, LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE,
+ arraySizeFieldPos);
+ } else {
+ AKLOGI("PtNode array size cannot be written because arraySize is too large: %zd",
+ arraySize);
+ ASSERT(false);
+ return false;
+ }
+}
+
+/* static */ bool DynamicPtWritingUtils::writeFlagsAndAdvancePosition(
+ BufferWithExtendableBuffer *const buffer,
+ const DynamicPtReadingUtils::NodeFlags nodeFlags, int *const nodeFlagsFieldPos) {
+ return buffer->writeUintAndAdvancePosition(nodeFlags, NODE_FLAG_FIELD_SIZE, nodeFlagsFieldPos);
+}
+
+// Note that parentOffset is offset from node's head position.
+/* static */ bool DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
+ BufferWithExtendableBuffer *const buffer, const int parentPos, const int basePos,
+ int *const parentPosFieldPos) {
+ return writeDictOffset(buffer, parentPos, basePos, parentPosFieldPos);
+}
+
+/* static */ bool DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(
+ BufferWithExtendableBuffer *const buffer, const int *const codePoints,
+ const int codePointCount, int *const codePointFieldPos) {
+ if (codePointCount <= 0) {
+ AKLOGI("code points cannot be written because codePointCount is invalid: %d",
+ codePointCount);
+ ASSERT(false);
+ return false;
+ }
+ const bool hasMultipleCodePoints = codePointCount > 1;
+ return buffer->writeCodePointsAndAdvancePosition(codePoints, codePointCount,
+ hasMultipleCodePoints, codePointFieldPos);
+}
+
+/* static */ bool DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(
+ BufferWithExtendableBuffer *const buffer, const int childrenPosition,
+ int *const childrenPositionFieldPos) {
+ return writeDictOffset(buffer, childrenPosition, (*childrenPositionFieldPos),
+ childrenPositionFieldPos);
+}
+
+/* static */ bool DynamicPtWritingUtils::writeDictOffset(BufferWithExtendableBuffer *const buffer,
+ const int targetPos, const int basePos, int *const offsetFieldPos) {
+ int offset = targetPos - basePos;
+ if (targetPos == NOT_A_DICT_POS) {
+ offset = DynamicPtReadingUtils::DICT_OFFSET_INVALID;
+ } else if (offset == 0) {
+ offset = DynamicPtReadingUtils::DICT_OFFSET_ZERO_OFFSET;
+ }
+ if (offset > MAX_DICT_OFFSET_VALUE || offset < MIN_DICT_OFFSET_VALUE) {
+ AKLOGI("offset cannot be written because the offset is too large or too small: %d",
+ offset);
+ ASSERT(false);
+ return false;
+ }
+ uint32_t data = 0;
+ if (offset >= 0) {
+ data = offset;
+ } else {
+ data = abs(offset) | DICT_OFFSET_NEGATIVE_FLAG;
+ }
+ return buffer->writeUintAndAdvancePosition(data, DICT_OFFSET_FIELD_SIZE, offsetFieldPos);
+}
+}
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h
new file mode 100644
index 000000000..b4817af41
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PT_WRITING_UTILS_H
+#define LATINIME_DYNAMIC_PT_WRITING_UTILS_H
+
+#include <cstddef>
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+class DynamicPtWritingUtils {
+ public:
+ static const int NODE_FLAG_FIELD_SIZE;
+
+ static bool writeEmptyDictionary(BufferWithExtendableBuffer *const buffer, const int rootPos);
+
+ static bool writeForwardLinkPositionAndAdvancePosition(
+ BufferWithExtendableBuffer *const buffer, const int forwardLinkPos,
+ int *const forwardLinkFieldPos);
+
+ static bool writePtNodeArraySizeAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+ const size_t arraySize, int *const arraySizeFieldPos);
+
+ static bool writeFlags(BufferWithExtendableBuffer *const buffer,
+ const DynamicPtReadingUtils::NodeFlags nodeFlags,
+ const int nodeFlagsFieldPos) {
+ int writingPos = nodeFlagsFieldPos;
+ return writeFlagsAndAdvancePosition(buffer, nodeFlags, &writingPos);
+ }
+
+ static bool writeFlagsAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+ const DynamicPtReadingUtils::NodeFlags nodeFlags,
+ int *const nodeFlagsFieldPos);
+
+ static bool writeParentPosOffsetAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+ const int parentPosition, const int basePos, int *const parentPosFieldPos);
+
+ static bool writeCodePointsAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+ const int *const codePoints, const int codePointCount, int *const codePointFieldPos);
+
+ static bool writeChildrenPositionAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+ const int childrenPosition, int *const childrenPositionFieldPos);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtWritingUtils);
+
+ static const size_t MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD;
+ static const size_t MAX_PTNODE_ARRAY_SIZE;
+ static const int SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE;
+ static const int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE;
+ static const int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG;
+ static const int DICT_OFFSET_FIELD_SIZE;
+ static const int MAX_DICT_OFFSET_VALUE;
+ static const int MIN_DICT_OFFSET_VALUE;
+ static const int DICT_OFFSET_NEGATIVE_FLAG;
+
+ static bool writeDictOffset(BufferWithExtendableBuffer *const buffer, const int targetPos,
+ const int basePos, int *const offsetFieldPos);
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PT_WRITING_UTILS_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp
new file mode 100644
index 000000000..e2807c492
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_bigrams_structure_policy.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+typedef PatriciaTrieReadingUtils PtReadingUtils;
+
+const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0;
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00;
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40;
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80;
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0;
+
+// Flag for single/multiple char group
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20;
+// Flag for terminal PtNodes
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10;
+// Flag for shortcut targets presence
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08;
+// Flag for bigram presence
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
+// Flag for non-words (typically, shortcut only entries)
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
+// Flag for possibly offensive words
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01;
+
+/* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition(
+ const uint8_t *const buffer, int *const pos) {
+ const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
+ if (firstByte < 0x80) {
+ return firstByte;
+ } else {
+ return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition(
+ buffer, pos);
+ }
+}
+
+/* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition(
+ const uint8_t *const buffer, int *const pos) {
+ return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
+}
+
+/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
+ const int *const codePointTable, int *const pos) {
+ return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos);
+}
+
+// Returns the number of read characters.
+/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
+ const NodeFlags flags, const int maxLength, const int *const codePointTable,
+ int *const outBuffer, int *const pos) {
+ int length = 0;
+ if (hasMultipleChars(flags)) {
+ length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable,
+ outBuffer, pos);
+ } else {
+ const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos);
+ if (codePoint == NOT_A_CODE_POINT) {
+ // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
+ // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
+ // when the PtNode has a single code point.
+ length = 0;
+ AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x",
+ *pos - 1, codePoint, buffer[*pos - 1]);
+ ASSERT(false);
+ } else if (maxLength > 0) {
+ outBuffer[0] = codePoint;
+ length = 1;
+ }
+ }
+ return length;
+}
+
+// Returns the number of skipped characters.
+/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
+ const int maxLength, const int *const codePointTable, int *const pos) {
+ if (hasMultipleChars(flags)) {
+ return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
+ } else {
+ if (maxLength > 0) {
+ getCodePointAndAdvancePosition(buffer, codePointTable, pos);
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+}
+
+/* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer,
+ int *const pos) {
+ return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
+}
+
+/* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition(
+ const uint8_t *const buffer, const NodeFlags flags, int *const pos) {
+ const int base = *pos;
+ int offset = 0;
+ switch (MASK_CHILDREN_POSITION_TYPE & flags) {
+ case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE:
+ offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
+ break;
+ case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES:
+ offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos);
+ break;
+ case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES:
+ offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos);
+ break;
+ default:
+ // If we come here, it means we asked for the children of a word with
+ // no children.
+ return NOT_A_DICT_POS;
+ }
+ return base + offset;
+}
+
+/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
+ const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
+ const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable,
+ NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
+ int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
+ int *const outBigramPos, int *const outSiblingPos) {
+ int readingPos = ptNodePos;
+ const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
+ *outFlags = flags;
+ *outCodePointCount = getCharsAndAdvancePosition(
+ dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos);
+ *outProbability = isTerminal(flags) ?
+ readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
+ *outChildrenPos = hasChildrenInFlags(flags) ?
+ readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS;
+ *outShortcutPos = NOT_A_DICT_POS;
+ if (hasShortcutTargets(flags)) {
+ *outShortcutPos = readingPos;
+ shortcutPolicy->skipAllShortcuts(&readingPos);
+ }
+ *outBigramPos = NOT_A_DICT_POS;
+ if (hasBigrams(flags)) {
+ *outBigramPos = readingPos;
+ bigramPolicy->skipAllBigrams(&readingPos);
+ }
+ *outSiblingPos = readingPos;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h
new file mode 100644
index 000000000..6a2bf5d3c
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PATRICIA_TRIE_READING_UTILS_H
+#define LATINIME_PATRICIA_TRIE_READING_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+
+namespace latinime {
+
+class DictionaryShortcutsStructurePolicy;
+class DictionaryBigramsStructurePolicy;
+
+class PatriciaTrieReadingUtils {
+ public:
+ typedef uint8_t NodeFlags;
+
+ static int getPtNodeArraySizeAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+ static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+ static int getCodePointAndAdvancePosition(const uint8_t *const buffer,
+ const int *const codePointTable, int *const pos);
+
+ // Returns the number of read characters.
+ static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags,
+ const int maxLength, const int *const codePointTable, int *const outBuffer,
+ int *const pos);
+
+ // Returns the number of skipped characters.
+ static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
+ const int maxLength, const int *const codePointTable, int *const pos);
+
+ static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+ static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer,
+ const NodeFlags flags, int *const pos);
+
+ /**
+ * Node Flags
+ */
+ static AK_FORCE_INLINE bool isPossiblyOffensive(const NodeFlags flags) {
+ return (flags & FLAG_IS_POSSIBLY_OFFENSIVE) != 0;
+ }
+
+ static AK_FORCE_INLINE bool isNotAWord(const NodeFlags flags) {
+ return (flags & FLAG_IS_NOT_A_WORD) != 0;
+ }
+
+ static AK_FORCE_INLINE bool isTerminal(const NodeFlags flags) {
+ return (flags & FLAG_IS_TERMINAL) != 0;
+ }
+
+ static AK_FORCE_INLINE bool hasShortcutTargets(const NodeFlags flags) {
+ return (flags & FLAG_HAS_SHORTCUT_TARGETS) != 0;
+ }
+
+ static AK_FORCE_INLINE bool hasBigrams(const NodeFlags flags) {
+ return (flags & FLAG_HAS_BIGRAMS) != 0;
+ }
+
+ static AK_FORCE_INLINE bool hasMultipleChars(const NodeFlags flags) {
+ return (flags & FLAG_HAS_MULTIPLE_CHARS) != 0;
+ }
+
+ static AK_FORCE_INLINE bool hasChildrenInFlags(const NodeFlags flags) {
+ return FLAG_CHILDREN_POSITION_TYPE_NOPOSITION != (MASK_CHILDREN_POSITION_TYPE & flags);
+ }
+
+ static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isPossiblyOffensive,
+ const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets,
+ const bool hasBigrams, const bool hasMultipleChars,
+ const int childrenPositionFieldSize) {
+ NodeFlags nodeFlags = 0;
+ nodeFlags = isPossiblyOffensive ? (nodeFlags | FLAG_IS_POSSIBLY_OFFENSIVE) : nodeFlags;
+ nodeFlags = isNotAWord ? (nodeFlags | FLAG_IS_NOT_A_WORD) : nodeFlags;
+ nodeFlags = isTerminal ? (nodeFlags | FLAG_IS_TERMINAL) : nodeFlags;
+ nodeFlags = hasShortcutTargets ? (nodeFlags | FLAG_HAS_SHORTCUT_TARGETS) : nodeFlags;
+ nodeFlags = hasBigrams ? (nodeFlags | FLAG_HAS_BIGRAMS) : nodeFlags;
+ nodeFlags = hasMultipleChars ? (nodeFlags | FLAG_HAS_MULTIPLE_CHARS) : nodeFlags;
+ if (childrenPositionFieldSize == 1) {
+ nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_ONEBYTE;
+ } else if (childrenPositionFieldSize == 2) {
+ nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_TWOBYTES;
+ } else if (childrenPositionFieldSize == 3) {
+ nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_THREEBYTES;
+ } else {
+ nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_NOPOSITION;
+ }
+ return nodeFlags;
+ }
+
+ static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
+ const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
+ const DictionaryBigramsStructurePolicy *const bigramPolicy,
+ const int *const codePointTable, NodeFlags *const outFlags,
+ int *const outCodePointCount, int *const outCodePoint, int *const outProbability,
+ int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos,
+ int *const outSiblingPos);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils);
+
+ static const NodeFlags MASK_CHILDREN_POSITION_TYPE;
+ static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_NOPOSITION;
+ static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_ONEBYTE;
+ static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_TWOBYTES;
+ static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_THREEBYTES;
+
+ static const NodeFlags FLAG_HAS_MULTIPLE_CHARS;
+ static const NodeFlags FLAG_IS_TERMINAL;
+ static const NodeFlags FLAG_HAS_SHORTCUT_TARGETS;
+ static const NodeFlags FLAG_HAS_BIGRAMS;
+ static const NodeFlags FLAG_IS_NOT_A_WORD;
+ static const NodeFlags FLAG_IS_POSSIBLY_OFFENSIVE;
+};
+} // namespace latinime
+#endif /* LATINIME_PATRICIA_TRIE_NODE_READING_UTILS_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h b/native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h
new file mode 100644
index 000000000..6078d8285
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PT_NODE_ARRAY_READER_H
+#define LATINIME_PT_NODE_ARRAY_READER_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// Interface class used to read PtNode array information.
+class PtNodeArrayReader {
+ public:
+ virtual ~PtNodeArrayReader() {}
+
+ // Returns if the position is valid or not.
+ virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
+ int *const outPtNodeCount, int *const outFirstPtNodePos) const = 0;
+
+ // Returns if the position is valid or not. NOT_A_DICT_POS is set to outNextPtNodeArrayPos when
+ // the next array doesn't exist.
+ virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos,
+ int *const outNextPtNodeArrayPos) const = 0;
+
+ protected:
+ PtNodeArrayReader() {};
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(PtNodeArrayReader);
+};
+} // namespace latinime
+#endif /* LATINIME_PT_NODE_READER_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/dictionary/structure/pt_common/pt_node_params.h
new file mode 100644
index 000000000..905deb1bc
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/pt_node_params.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PT_NODE_PARAMS_H
+#define LATINIME_PT_NODE_PARAMS_H
+
+#include <cstring>
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "utils/char_utils.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+// This class has information of a PtNode. This class is immutable.
+class PtNodeParams {
+ public:
+ // Invalid PtNode.
+ PtNodeParams() : mHeadPos(NOT_A_DICT_POS), mFlags(0), mHasMovedFlag(false),
+ mParentPos(NOT_A_DICT_POS), mCodePointCount(0), mCodePoints(),
+ mTerminalIdFieldPos(NOT_A_DICT_POS), mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID),
+ mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY),
+ mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS),
+ mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS),
+ mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) {}
+
+ PtNodeParams(const PtNodeParams& ptNodeParams)
+ : mHeadPos(ptNodeParams.mHeadPos), mFlags(ptNodeParams.mFlags),
+ mHasMovedFlag(ptNodeParams.mHasMovedFlag), mParentPos(ptNodeParams.mParentPos),
+ mCodePointCount(ptNodeParams.mCodePointCount), mCodePoints(),
+ mTerminalIdFieldPos(ptNodeParams.mTerminalIdFieldPos),
+ mTerminalId(ptNodeParams.mTerminalId),
+ mProbabilityFieldPos(ptNodeParams.mProbabilityFieldPos),
+ mProbability(ptNodeParams.mProbability),
+ mChildrenPosFieldPos(ptNodeParams.mChildrenPosFieldPos),
+ mChildrenPos(ptNodeParams.mChildrenPos),
+ mBigramLinkedNodePos(ptNodeParams.mBigramLinkedNodePos),
+ mShortcutPos(ptNodeParams.mShortcutPos), mBigramPos(ptNodeParams.mBigramPos),
+ mSiblingPos(ptNodeParams.mSiblingPos) {
+ memcpy(mCodePoints, ptNodeParams.getCodePoints(), sizeof(int) * mCodePointCount);
+ }
+
+ // PtNode read from version 2 dictionary.
+ PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags,
+ const int codePointCount, const int *const codePoints, const int probability,
+ const int childrenPos, const int shortcutPos, const int bigramPos,
+ const int siblingPos)
+ : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(false), mParentPos(NOT_A_DICT_POS),
+ mCodePointCount(codePointCount), mCodePoints(), mTerminalIdFieldPos(NOT_A_DICT_POS),
+ mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID),
+ mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability),
+ mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(childrenPos),
+ mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(shortcutPos),
+ mBigramPos(bigramPos), mSiblingPos(siblingPos) {
+ memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount);
+ }
+
+ // PtNode with a terminal id.
+ PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags,
+ const int parentPos, const int codePointCount, const int *const codePoints,
+ const int terminalIdFieldPos, const int terminalId, const int probability,
+ const int childrenPosFieldPos, const int childrenPos, const int siblingPos)
+ : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos),
+ mCodePointCount(codePointCount), mCodePoints(),
+ mTerminalIdFieldPos(terminalIdFieldPos), mTerminalId(terminalId),
+ mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability),
+ mChildrenPosFieldPos(childrenPosFieldPos), mChildrenPos(childrenPos),
+ mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(terminalId),
+ mBigramPos(terminalId), mSiblingPos(siblingPos) {
+ memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount);
+ }
+
+ // Construct new params by updating existing PtNode params.
+ PtNodeParams(const PtNodeParams *const ptNodeParams,
+ const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos,
+ const CodePointArrayView codePoints, const int probability)
+ : mHeadPos(ptNodeParams->getHeadPos()), mFlags(flags), mHasMovedFlag(true),
+ mParentPos(parentPos), mCodePointCount(codePoints.size()), mCodePoints(),
+ mTerminalIdFieldPos(ptNodeParams->getTerminalIdFieldPos()),
+ mTerminalId(ptNodeParams->getTerminalId()),
+ mProbabilityFieldPos(ptNodeParams->getProbabilityFieldPos()),
+ mProbability(probability),
+ mChildrenPosFieldPos(ptNodeParams->getChildrenPosFieldPos()),
+ mChildrenPos(ptNodeParams->getChildrenPos()),
+ mBigramLinkedNodePos(ptNodeParams->getBigramLinkedNodePos()),
+ mShortcutPos(ptNodeParams->getShortcutPos()),
+ mBigramPos(ptNodeParams->getBigramsPos()),
+ mSiblingPos(ptNodeParams->getSiblingNodePos()) {
+ memcpy(mCodePoints, codePoints.data(), sizeof(int) * mCodePointCount);
+ }
+
+ PtNodeParams(const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos,
+ const CodePointArrayView codePoints, const int probability)
+ : mHeadPos(NOT_A_DICT_POS), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos),
+ mCodePointCount(codePoints.size()), mCodePoints(),
+ mTerminalIdFieldPos(NOT_A_DICT_POS),
+ mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID),
+ mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability),
+ mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS),
+ mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS),
+ mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) {
+ memcpy(mCodePoints, codePoints.data(), sizeof(int) * mCodePointCount);
+ }
+
+ AK_FORCE_INLINE bool isValid() const {
+ return mCodePointCount > 0;
+ }
+
+ // Head position of the PtNode
+ AK_FORCE_INLINE int getHeadPos() const {
+ return mHeadPos;
+ }
+
+ // Flags
+ AK_FORCE_INLINE bool isDeleted() const {
+ return mHasMovedFlag && DynamicPtReadingUtils::isDeleted(mFlags);
+ }
+
+ AK_FORCE_INLINE bool willBecomeNonTerminal() const {
+ return mHasMovedFlag && DynamicPtReadingUtils::willBecomeNonTerminal(mFlags);
+ }
+
+ AK_FORCE_INLINE bool hasChildren() const {
+ return mChildrenPos != NOT_A_DICT_POS;
+ }
+
+ AK_FORCE_INLINE bool isTerminal() const {
+ return PatriciaTrieReadingUtils::isTerminal(mFlags);
+ }
+
+ AK_FORCE_INLINE bool isPossiblyOffensive() const {
+ return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags);
+ }
+
+ AK_FORCE_INLINE bool isNotAWord() const {
+ return PatriciaTrieReadingUtils::isNotAWord(mFlags);
+ }
+
+ AK_FORCE_INLINE bool hasBigrams() const {
+ return PatriciaTrieReadingUtils::hasBigrams(mFlags);
+ }
+
+ AK_FORCE_INLINE bool hasShortcutTargets() const {
+ return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags);
+ }
+
+ AK_FORCE_INLINE bool representsNonWordInfo() const {
+ return getCodePointCount() > 0 && !CharUtils::isInUnicodeSpace(getCodePoints()[0])
+ && isNotAWord();
+ }
+
+ AK_FORCE_INLINE int representsBeginningOfSentence() const {
+ return getCodePointCount() > 0 && getCodePoints()[0] == CODE_POINT_BEGINNING_OF_SENTENCE
+ && isNotAWord();
+ }
+
+ // Parent node position
+ AK_FORCE_INLINE int getParentPos() const {
+ return mParentPos;
+ }
+
+ AK_FORCE_INLINE const CodePointArrayView getCodePointArrayView() const {
+ return CodePointArrayView(mCodePoints, mCodePointCount);
+ }
+
+ // TODO: Remove
+ // Number of code points
+ AK_FORCE_INLINE uint8_t getCodePointCount() const {
+ return mCodePointCount;
+ }
+
+ // TODO: Remove
+ AK_FORCE_INLINE const int *getCodePoints() const {
+ return mCodePoints;
+ }
+
+ // Probability
+ AK_FORCE_INLINE int getTerminalIdFieldPos() const {
+ return mTerminalIdFieldPos;
+ }
+
+ AK_FORCE_INLINE int getTerminalId() const {
+ return mTerminalId;
+ }
+
+ // Probability
+ AK_FORCE_INLINE int getProbabilityFieldPos() const {
+ return mProbabilityFieldPos;
+ }
+
+ AK_FORCE_INLINE int getProbability() const {
+ return mProbability;
+ }
+
+ // Children PtNode array position
+ AK_FORCE_INLINE int getChildrenPosFieldPos() const {
+ return mChildrenPosFieldPos;
+ }
+
+ AK_FORCE_INLINE int getChildrenPos() const {
+ return mChildrenPos;
+ }
+
+ // Bigram linked node position.
+ AK_FORCE_INLINE int getBigramLinkedNodePos() const {
+ return mBigramLinkedNodePos;
+ }
+
+ // Shortcutlist position
+ AK_FORCE_INLINE int getShortcutPos() const {
+ return mShortcutPos;
+ }
+
+ // Bigrams position
+ AK_FORCE_INLINE int getBigramsPos() const {
+ return mBigramPos;
+ }
+
+ // Sibling node position
+ AK_FORCE_INLINE int getSiblingNodePos() const {
+ return mSiblingPos;
+ }
+
+ private:
+ // This class have a public copy constructor to be used as a return value.
+ DISALLOW_ASSIGNMENT_OPERATOR(PtNodeParams);
+
+ const int mHeadPos;
+ const PatriciaTrieReadingUtils::NodeFlags mFlags;
+ const bool mHasMovedFlag;
+ const int mParentPos;
+ const uint8_t mCodePointCount;
+ int mCodePoints[MAX_WORD_LENGTH];
+ const int mTerminalIdFieldPos;
+ const int mTerminalId;
+ const int mProbabilityFieldPos;
+ const int mProbability;
+ const int mChildrenPosFieldPos;
+ const int mChildrenPos;
+ const int mBigramLinkedNodePos;
+ const int mShortcutPos;
+ const int mBigramPos;
+ const int mSiblingPos;
+};
+} // namespace latinime
+#endif /* LATINIME_PT_NODE_PARAMS_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/pt_node_reader.h b/native/jni/src/dictionary/structure/pt_common/pt_node_reader.h
new file mode 100644
index 000000000..15da19e0b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/pt_node_reader.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PT_NODE_READER_H
+#define LATINIME_PT_NODE_READER_H
+
+#include "defines.h"
+
+#include "dictionary/structure/pt_common/pt_node_params.h"
+
+namespace latinime {
+
+// Interface class used to read PtNode information.
+class PtNodeReader {
+ public:
+ virtual ~PtNodeReader() {}
+ virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(
+ const int ptNodePos) const = 0;
+
+ protected:
+ PtNodeReader() {};
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(PtNodeReader);
+};
+} // namespace latinime
+#endif /* LATINIME_PT_NODE_READER_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/pt_node_writer.h b/native/jni/src/dictionary/structure/pt_common/pt_node_writer.h
new file mode 100644
index 000000000..e6cad25aa
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/pt_node_writer.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PT_NODE_WRITER_H
+#define LATINIME_PT_NODE_WRITER_H
+
+#include <unordered_map>
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class NgramProperty;
+class UnigramProperty;
+
+// Interface class used to write PtNode information.
+class PtNodeWriter {
+ public:
+ typedef std::unordered_map<int, int> PtNodeArrayPositionRelocationMap;
+ typedef std::unordered_map<int, int> PtNodePositionRelocationMap;
+ struct DictPositionRelocationMap {
+ public:
+ DictPositionRelocationMap()
+ : mPtNodeArrayPositionRelocationMap(), mPtNodePositionRelocationMap() {}
+
+ PtNodeArrayPositionRelocationMap mPtNodeArrayPositionRelocationMap;
+ PtNodePositionRelocationMap mPtNodePositionRelocationMap;
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(DictPositionRelocationMap);
+ };
+
+ virtual ~PtNodeWriter() {}
+
+ virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams) = 0;
+
+ virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const int movedPos, const int bigramLinkedNodePos) = 0;
+
+ virtual bool markPtNodeAsWillBecomeNonTerminal(
+ const PtNodeParams *const toBeUpdatedPtNodeParams) = 0;
+
+ virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const UnigramProperty *const unigramProperty) = 0;
+
+ virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
+ const PtNodeParams *const toBeUpdatedPtNodeParams,
+ bool *const outNeedsToKeepPtNode) = 0;
+
+ virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const int newChildrenPosition) = 0;
+
+ virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
+ int *const ptNodeWritingPos) = 0;
+
+ virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
+ const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) = 0;
+
+ virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) = 0;
+
+ virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId) = 0;
+
+ virtual bool updateAllBigramEntriesAndDeleteUselessEntries(
+ const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) = 0;
+
+ virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const DictPositionRelocationMap *const dictPositionRelocationMap,
+ int *const outBigramEntryCount) = 0;
+
+ virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams,
+ const int *const targetCodePoints, const int targetCodePointCount,
+ const int shortcutProbability) = 0;
+
+ protected:
+ PtNodeWriter() {};
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(PtNodeWriter);
+};
+} // namespace latinime
+#endif /* LATINIME_PT_NODE_WRITER_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp
new file mode 100644
index 000000000..14428edd4
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
+
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+// Flag for presence of more attributes
+const ShortcutListReadingUtils::ShortcutFlags
+ ShortcutListReadingUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
+// Mask for attribute probability, stored on 4 bits inside the flags byte.
+const ShortcutListReadingUtils::ShortcutFlags
+ ShortcutListReadingUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F;
+const int ShortcutListReadingUtils::SHORTCUT_LIST_SIZE_FIELD_SIZE = 2;
+// The numeric value of the shortcut probability that means 'whitelist'.
+const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15;
+
+/* static */ ShortcutListReadingUtils::ShortcutFlags
+ ShortcutListReadingUtils::getFlagsAndForwardPointer(const ReadOnlyByteArrayView buffer,
+ int *const pos) {
+ return ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), pos);
+}
+
+/* static */ int ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(
+ const ReadOnlyByteArrayView buffer, int *const pos) {
+ // readUint16andAdvancePosition() returns an offset *including* the uint16 field itself.
+ return ByteArrayUtils::readUint16AndAdvancePosition(buffer.data(), pos)
+ - SHORTCUT_LIST_SIZE_FIELD_SIZE;
+}
+
+/* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer,
+ const int maxLength, int *const outWord, int *const pos) {
+ // TODO: Use codePointTable for shortcuts.
+ return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength,
+ nullptr /* codePointTable */, outWord, pos);
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h
new file mode 100644
index 000000000..71cb8cc2c
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_SHORTCUT_LIST_READING_UTILS_H
+#define LATINIME_SHORTCUT_LIST_READING_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+class ShortcutListReadingUtils {
+ public:
+ typedef uint8_t ShortcutFlags;
+
+ static ShortcutFlags getFlagsAndForwardPointer(const ReadOnlyByteArrayView buffer,
+ int *const pos);
+
+ static AK_FORCE_INLINE int getProbabilityFromFlags(const ShortcutFlags flags) {
+ return flags & MASK_ATTRIBUTE_PROBABILITY;
+ }
+
+ static AK_FORCE_INLINE bool hasNext(const ShortcutFlags flags) {
+ return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0;
+ }
+
+ // This method returns the size of the shortcut list region excluding the shortcut list size
+ // field at the beginning.
+ static int getShortcutListSizeAndForwardPointer(const ReadOnlyByteArrayView buffer,
+ int *const pos);
+
+ static AK_FORCE_INLINE int getShortcutListSizeFieldSize() {
+ return SHORTCUT_LIST_SIZE_FIELD_SIZE;
+ }
+
+ static AK_FORCE_INLINE void skipShortcuts(const ReadOnlyByteArrayView buffer, int *const pos) {
+ const int shortcutListSize = getShortcutListSizeAndForwardPointer(buffer, pos);
+ *pos += shortcutListSize;
+ }
+
+ static AK_FORCE_INLINE bool isWhitelist(const ShortcutFlags flags) {
+ return getProbabilityFromFlags(flags) == WHITELIST_SHORTCUT_PROBABILITY;
+ }
+
+ static int readShortcutTarget(const ReadOnlyByteArrayView buffer, const int maxLength,
+ int *const outWord, int *const pos);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListReadingUtils);
+
+ static const ShortcutFlags FLAG_ATTRIBUTE_HAS_NEXT;
+ static const ShortcutFlags MASK_ATTRIBUTE_PROBABILITY;
+ static const int SHORTCUT_LIST_SIZE_FIELD_SIZE;
+ static const int WHITELIST_SHORTCUT_PROBABILITY;
+};
+} // namespace latinime
+#endif // LATINIME_SHORTCUT_LIST_READING_UTILS_H
diff --git a/native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h b/native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h
new file mode 100644
index 000000000..25081fa04
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BIGRAM_LIST_POLICY_H
+#define LATINIME_BIGRAM_LIST_POLICY_H
+
+#include <cstdint>
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_bigrams_structure_policy.h"
+#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+class BigramListPolicy : public DictionaryBigramsStructurePolicy {
+ public:
+ BigramListPolicy(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {}
+
+ ~BigramListPolicy() {}
+
+ void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext,
+ int *const pos) const {
+ BigramListReadWriteUtils::BigramFlags flags;
+ if (!BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(mBuffer, &flags,
+ outBigramPos, pos)) {
+ AKLOGE("Cannot read bigram entry. bufSize: %zd, pos: %d. ", mBuffer.size(), *pos);
+ *outProbability = NOT_A_PROBABILITY;
+ *outHasNext = false;
+ return;
+ }
+ *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(flags);
+ *outHasNext = BigramListReadWriteUtils::hasNext(flags);
+ }
+
+ bool skipAllBigrams(int *const pos) const {
+ return BigramListReadWriteUtils::skipExistingBigrams(mBuffer, pos);
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListPolicy);
+
+ const ReadOnlyByteArrayView mBuffer;
+};
+} // namespace latinime
+#endif // LATINIME_BIGRAM_LIST_POLICY_H
diff --git a/native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp
new file mode 100644
index 000000000..4e8b96b08
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp
@@ -0,0 +1,526 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v2/patricia_trie_policy.h"
+
+#include "defines.h"
+#include "suggest/core/dicnode/dic_node.h"
+#include "suggest/core/dicnode/dic_node_vector.h"
+#include "dictionary/interface/ngram_listener.h"
+#include "dictionary/property/ngram_context.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/utils/binary_dictionary_bigrams_iterator.h"
+#include "dictionary/utils/multi_bigram_map.h"
+#include "dictionary/utils/probability_utils.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
+ DicNodeVector *const childDicNodes) const {
+ if (!dicNode->hasChildren()) {
+ return;
+ }
+ int nextPos = dicNode->getChildrenPtNodeArrayPos();
+ if (!isValidPos(nextPos)) {
+ AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %zd",
+ nextPos, mBuffer.size());
+ mIsCorrupted = true;
+ ASSERT(false);
+ return;
+ }
+ const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
+ mBuffer.data(), &nextPos);
+ for (int i = 0; i < childCount; i++) {
+ if (!isValidPos(nextPos)) {
+ AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %zd, childCount: %d / %d",
+ nextPos, mBuffer.size(), i, childCount);
+ mIsCorrupted = true;
+ ASSERT(false);
+ return;
+ }
+ nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes);
+ }
+}
+
+int PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
+ const int maxCodePointCount, int *const outCodePoints) const {
+ return getCodePointsAndProbabilityAndReturnCodePointCount(wordId, maxCodePointCount,
+ outCodePoints, nullptr /* outUnigramProbability */);
+}
+// This retrieves code points and the probability of the word by its id.
+// Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
+// it is possible to check for this with advantageous complexity. For each PtNode array, we search
+// for PtNodes with children and compare the children position with the position we look for.
+// When we shoot the position we look for, it means the word we look for is in the children
+// of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a
+// PtNode array with the last PtNode's children position still less than what we are searching for,
+// we must descend the last PtNode's children (for example, if the word we are searching for starts
+// with a z, it's the last PtNode of the root array, so all children addresses will be smaller
+// than the position we look for, and we have to descend the z PtNode).
+/* Parameters :
+ * wordId: Id of the word we are searching for.
+ * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size.
+ * outUnigramProbability: a pointer to an int to write the probability into.
+ * Return value : the code point count, of 0 if the word was not found.
+ */
+// TODO: Split this function to be more readable
+int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
+ const int wordId, const int maxCodePointCount, int *const outCodePoints,
+ int *const outUnigramProbability) const {
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+ int pos = getRootPosition();
+ int wordPos = 0;
+ const int *const codePointTable = mHeaderPolicy.getCodePointTable();
+ if (outUnigramProbability) {
+ *outUnigramProbability = NOT_A_PROBABILITY;
+ }
+ // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
+ // only traverse PtNodes that are actually a part of the terminal we are searching, so each
+ // time we enter this loop we are one depth level further than last time.
+ // The only reason we count PtNodes is because we want to reduce the probability of infinite
+ // looping in case there is a bug. Since we know there is an upper bound to the depth we are
+ // supposed to traverse, it does not hurt to count iterations.
+ for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) {
+ int lastCandidatePtNodePos = 0;
+ // Let's loop through PtNodes in this PtNode array searching for either the terminal
+ // or one of its ascendants.
+ if (!isValidPos(pos)) {
+ AKLOGE("PtNode array position is invalid. pos: %d, dict size: %zd",
+ pos, mBuffer.size());
+ mIsCorrupted = true;
+ ASSERT(false);
+ return 0;
+ }
+ for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
+ mBuffer.data(), &pos); ptNodeCount > 0; --ptNodeCount) {
+ const int startPos = pos;
+ if (!isValidPos(pos)) {
+ AKLOGE("PtNode position is invalid. pos: %d, dict size: %zd", pos, mBuffer.size());
+ mIsCorrupted = true;
+ ASSERT(false);
+ return 0;
+ }
+ const PatriciaTrieReadingUtils::NodeFlags flags =
+ PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
+ const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
+ mBuffer.data(), codePointTable, &pos);
+ if (ptNodePos == startPos) {
+ // We found the position. Copy the rest of the code points in the buffer and return
+ // the length.
+ outCodePoints[wordPos] = character;
+ if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
+ int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
+ mBuffer.data(), codePointTable, &pos);
+ // We count code points in order to avoid infinite loops if the file is broken
+ // or if there is some other bug
+ int charCount = maxCodePointCount;
+ while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
+ outCodePoints[++wordPos] = nextChar;
+ nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
+ mBuffer.data(), codePointTable, &pos);
+ }
+ }
+ if (outUnigramProbability) {
+ *outUnigramProbability =
+ PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
+ mBuffer.data(), &pos);
+ }
+ return ++wordPos;
+ }
+ // We need to skip past this PtNode, so skip any remaining code points after the
+ // first and possibly the probability.
+ if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
+ PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
+ codePointTable, &pos);
+ }
+ if (PatriciaTrieReadingUtils::isTerminal(flags)) {
+ PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
+ }
+ // The fact that this PtNode has children is very important. Since we already know
+ // that this PtNode does not match, if it has no children we know it is irrelevant
+ // to what we are searching for.
+ const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags);
+ // We will write in `found' whether we have passed the children position we are
+ // searching for. For example if we search for "beer", the children of b are less
+ // than the address we are searching for and the children of c are greater. When we
+ // come here for c, we realize this is too big, and that we should descend b.
+ bool found;
+ if (hasChildren) {
+ int currentPos = pos;
+ // Here comes the tricky part. First, read the children position.
+ const int childrenPos = PatriciaTrieReadingUtils
+ ::readChildrenPositionAndAdvancePosition(mBuffer.data(), flags,
+ &currentPos);
+ if (childrenPos > ptNodePos) {
+ // If the children pos is greater than the position, it means the previous
+ // PtNode, which position is stored in lastCandidatePtNodePos, was the right
+ // one.
+ found = true;
+ } else if (1 >= ptNodeCount) {
+ // However if we are on the LAST PtNode of this array, and we have NOT shot the
+ // position we should descend THIS PtNode. So we trick the
+ // lastCandidatePtNodePos so that we will descend this PtNode, not the previous
+ // one.
+ lastCandidatePtNodePos = startPos;
+ found = true;
+ } else {
+ // Else, we should continue looking.
+ found = false;
+ }
+ } else {
+ // Even if we don't have children here, we could still be on the last PtNode of
+ // this array. If this is the case, we should descend the last PtNode that had
+ // children, and their position is already in lastCandidatePtNodePos.
+ found = (1 >= ptNodeCount);
+ }
+
+ if (found) {
+ // Okay, we found the PtNode we should descend. Its position is in
+ // the lastCandidatePtNodePos variable, so we just re-read it.
+ if (0 != lastCandidatePtNodePos) {
+ const PatriciaTrieReadingUtils::NodeFlags lastFlags =
+ PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
+ mBuffer.data(), &lastCandidatePtNodePos);
+ const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
+ mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
+ // We copy all the characters in this PtNode to the buffer
+ outCodePoints[wordPos] = lastChar;
+ if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
+ int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
+ mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
+ int charCount = maxCodePointCount;
+ while (-1 != nextChar && --charCount > 0) {
+ outCodePoints[++wordPos] = nextChar;
+ nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
+ mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
+ }
+ }
+ ++wordPos;
+ // Now we only need to branch to the children address. Skip the probability if
+ // it's there, read pos, and break to resume the search at pos.
+ if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) {
+ PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(),
+ &lastCandidatePtNodePos);
+ }
+ pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
+ mBuffer.data(), lastFlags, &lastCandidatePtNodePos);
+ break;
+ } else {
+ // Here is a little tricky part: we come here if we found out that all children
+ // addresses in this PtNode are bigger than the address we are searching for.
+ // Should we conclude the word is not in the dictionary? No! It could still be
+ // one of the remaining PtNodes in this array, so we have to keep looking in
+ // this array until we find it (or we realize it's not there either, in which
+ // case it's actually not in the dictionary). Pass the end of this PtNode,
+ // ready to start the next one.
+ if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
+ PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
+ mBuffer.data(), flags, &pos);
+ }
+ if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
+ mShortcutListPolicy.skipAllShortcuts(&pos);
+ }
+ if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
+ if (!mBigramListPolicy.skipAllBigrams(&pos)) {
+ AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(),
+ pos);
+ mIsCorrupted = true;
+ ASSERT(false);
+ return 0;
+ }
+ }
+ }
+ } else {
+ // If we did not find it, we should record the last children address for the next
+ // iteration.
+ if (hasChildren) lastCandidatePtNodePos = startPos;
+ // Now skip the end of this PtNode (children pos and the attributes if any) so that
+ // our pos is after the end of this PtNode, at the start of the next one.
+ if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
+ PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
+ mBuffer.data(), flags, &pos);
+ }
+ if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
+ mShortcutListPolicy.skipAllShortcuts(&pos);
+ }
+ if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
+ if (!mBigramListPolicy.skipAllBigrams(&pos)) {
+ AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos);
+ mIsCorrupted = true;
+ ASSERT(false);
+ return 0;
+ }
+ }
+ }
+
+ }
+ }
+ // If we have looked through all the PtNodes and found no match, the ptNodePos is
+ // not the position of a terminal in this dictionary.
+ return 0;
+}
+
+// This function gets the position of the terminal PtNode of the exact matching word in the
+// dictionary. If no match is found, it returns NOT_A_WORD_ID.
+int PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
+ const bool forceLowerCaseSearch) const {
+ DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
+ wordCodePoints.size(), forceLowerCaseSearch);
+ if (readingHelper.isError()) {
+ mIsCorrupted = true;
+ AKLOGE("Dictionary reading error in getWordId().");
+ }
+ return getWordIdFromTerminalPtNodePos(ptNodePos);
+}
+
+const WordAttributes PatriciaTriePolicy::getWordAttributesInContext(
+ const WordIdArrayView prevWordIds, const int wordId,
+ MultiBigramMap *const multiBigramMap) const {
+ if (wordId == NOT_A_WORD_ID) {
+ return WordAttributes();
+ }
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+ const PtNodeParams ptNodeParams =
+ mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ if (multiBigramMap) {
+ const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */,
+ prevWordIds, wordId, ptNodeParams.getProbability());
+ return getWordAttributes(probability, ptNodeParams);
+ }
+ if (!prevWordIds.empty()) {
+ const int bigramProbability = getProbabilityOfWord(prevWordIds, wordId);
+ if (bigramProbability != NOT_A_PROBABILITY) {
+ return getWordAttributes(bigramProbability, ptNodeParams);
+ }
+ }
+ return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY),
+ ptNodeParams);
+}
+
+const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability,
+ const PtNodeParams &ptNodeParams) const {
+ return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
+ ptNodeParams.isPossiblyOffensive());
+}
+
+int PatriciaTriePolicy::getProbability(const int unigramProbability,
+ const int bigramProbability) const {
+ // Due to space constraints, the probability for bigrams is approximate - the lower the unigram
+ // probability, the worse the precision. The theoritical maximum error in resulting probability
+ // is 8 - although in the practice it's never bigger than 3 or 4 in very bad cases. This means
+ // that sometimes, we'll see some bigrams interverted here, but it can't get too bad.
+ if (unigramProbability == NOT_A_PROBABILITY) {
+ return NOT_A_PROBABILITY;
+ } else if (bigramProbability == NOT_A_PROBABILITY) {
+ return ProbabilityUtils::backoff(unigramProbability);
+ } else {
+ return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
+ bigramProbability);
+ }
+}
+
+int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
+ const int wordId) const {
+ if (wordId == NOT_A_WORD_ID) {
+ return NOT_A_PROBABILITY;
+ }
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+ const PtNodeParams ptNodeParams =
+ mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ if (ptNodeParams.isNotAWord()) {
+ // If this is not a word, it should behave as having no probability outside of the
+ // suggestion process (where it should be used for shortcuts).
+ return NOT_A_PROBABILITY;
+ }
+ if (!prevWordIds.empty()) {
+ const int bigramsPosition = getBigramsPositionOfPtNode(
+ getTerminalPtNodePosFromWordId(prevWordIds[0]));
+ BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
+ while (bigramsIt.hasNext()) {
+ bigramsIt.next();
+ if (bigramsIt.getBigramPos() == ptNodePos
+ && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
+ return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability());
+ }
+ }
+ return NOT_A_PROBABILITY;
+ }
+ return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
+}
+
+void PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
+ NgramListener *const listener) const {
+ if (prevWordIds.empty()) {
+ return;
+ }
+ const int bigramsPosition = getBigramsPositionOfPtNode(
+ getTerminalPtNodePosFromWordId(prevWordIds[0]));
+ BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
+ while (bigramsIt.hasNext()) {
+ bigramsIt.next();
+ listener->onVisitEntry(bigramsIt.getProbability(),
+ getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
+ }
+}
+
+BinaryDictionaryShortcutIterator PatriciaTriePolicy::getShortcutIterator(const int wordId) const {
+ const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId));
+ return BinaryDictionaryShortcutIterator(&mShortcutListPolicy, shortcutPos);
+}
+
+int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
+ if (ptNodePos == NOT_A_DICT_POS) {
+ return NOT_A_DICT_POS;
+ }
+ return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos();
+}
+
+int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
+ if (ptNodePos == NOT_A_DICT_POS) {
+ return NOT_A_DICT_POS;
+ }
+ return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getBigramsPos();
+}
+
+int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode,
+ const int ptNodePos, DicNodeVector *childDicNodes) const {
+ PatriciaTrieReadingUtils::NodeFlags flags;
+ int mergedNodeCodePointCount = 0;
+ int mergedNodeCodePoints[MAX_WORD_LENGTH];
+ int probability = NOT_A_PROBABILITY;
+ int childrenPos = NOT_A_DICT_POS;
+ int shortcutPos = NOT_A_DICT_POS;
+ int bigramPos = NOT_A_DICT_POS;
+ int siblingPos = NOT_A_DICT_POS;
+ const int *const codePointTable = mHeaderPolicy.getCodePointTable();
+ PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
+ &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount,
+ mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos,
+ &siblingPos);
+ // Skip PtNodes don't start with Unicode code point because they represent non-word information.
+ if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
+ const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
+ childDicNodes->pushLeavingChild(dicNode, childrenPos, wordId,
+ CodePointArrayView(mergedNodeCodePoints, mergedNodeCodePointCount));
+ }
+ return siblingPos;
+}
+
+const WordProperty PatriciaTriePolicy::getWordProperty(
+ const CodePointArrayView wordCodePoints) const {
+ const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ AKLOGE("getWordProperty was called for invalid word.");
+ return WordProperty();
+ }
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+ const PtNodeParams ptNodeParams =
+ mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ // Fetch bigram information.
+ std::vector<NgramProperty> ngrams;
+ const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
+ int bigramWord1CodePoints[MAX_WORD_LENGTH];
+ BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);
+ while (bigramsIt.hasNext()) {
+ // Fetch the next bigram information and forward the iterator.
+ bigramsIt.next();
+ // Skip the entry if the entry has been deleted. This never happens for ver2 dicts.
+ if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {
+ int word1Probability = NOT_A_PROBABILITY;
+ const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
+ getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH,
+ bigramWord1CodePoints, &word1Probability);
+ const int probability = getProbability(word1Probability, bigramsIt.getProbability());
+ ngrams.emplace_back(
+ NgramContext(wordCodePoints.data(), wordCodePoints.size(),
+ ptNodeParams.representsBeginningOfSentence()),
+ CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(),
+ probability, HistoricalInfo());
+ }
+ }
+ // Fetch shortcut information.
+ std::vector<UnigramProperty::ShortcutProperty> shortcuts;
+ int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
+ if (shortcutPos != NOT_A_DICT_POS) {
+ int shortcutTargetCodePoints[MAX_WORD_LENGTH];
+ ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &shortcutPos);
+ bool hasNext = true;
+ while (hasNext) {
+ const ShortcutListReadingUtils::ShortcutFlags shortcutFlags =
+ ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, &shortcutPos);
+ hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags);
+ const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget(
+ mBuffer, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos);
+ const int shortcutProbability =
+ ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags);
+ shortcuts.emplace_back(
+ CodePointArrayView(shortcutTargetCodePoints, shortcutTargetLength).toVector(),
+ shortcutProbability);
+ }
+ }
+ const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
+ ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
+ ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts));
+ return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
+}
+
+int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
+ int *const outCodePointCount) {
+ *outCodePointCount = 0;
+ if (token == 0) {
+ // Start iterating the dictionary.
+ mTerminalPtNodePositionsForIteratingWords.clear();
+ DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
+ &mTerminalPtNodePositionsForIteratingWords);
+ DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
+ }
+ const int terminalPtNodePositionsVectorSize =
+ static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
+ if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
+ AKLOGE("Given token %d is invalid.", token);
+ return 0;
+ }
+ const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
+ *outCodePointCount = getCodePointsAndReturnCodePointCount(
+ getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints);
+ const int nextToken = token + 1;
+ if (nextToken >= terminalPtNodePositionsVectorSize) {
+ // All words have been iterated.
+ mTerminalPtNodePositionsForIteratingWords.clear();
+ return 0;
+ }
+ return nextToken;
+}
+
+int PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const {
+ return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos;
+}
+
+int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const {
+ return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId;
+}
+
+bool PatriciaTriePolicy::isValidPos(const int pos) const {
+ return pos >= 0 && pos < static_cast<int>(mBuffer.size());
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.h
new file mode 100644
index 000000000..8edfa7d10
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PATRICIA_TRIE_POLICY_H
+#define LATINIME_PATRICIA_TRIE_POLICY_H
+
+#include <cstdint>
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/structure/v2/bigram/bigram_list_policy.h"
+#include "dictionary/structure/v2/shortcut/shortcut_list_policy.h"
+#include "dictionary/structure/v2/ver2_patricia_trie_node_reader.h"
+#include "dictionary/structure/v2/ver2_pt_node_array_reader.h"
+#include "dictionary/utils/format_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
+#include "utils/byte_array_view.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class DicNode;
+class DicNodeVector;
+
+// Word id = Position of a PtNode that represents the word.
+// Max supported n-gram is bigram.
+class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
+ public:
+ PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
+ : mMmappedBuffer(std::move(mmappedBuffer)),
+ mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(),
+ FormatUtils::detectFormatVersion(mMmappedBuffer->getReadOnlyByteArrayView())),
+ mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())),
+ mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer),
+ mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy,
+ mHeaderPolicy.getCodePointTable()),
+ mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(),
+ mIsCorrupted(false) {}
+
+ AK_FORCE_INLINE int getRootPosition() const {
+ return 0;
+ }
+
+ void createAndGetAllChildDicNodes(const DicNode *const dicNode,
+ DicNodeVector *const childDicNodes) const;
+
+ int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
+ int *const outCodePoints) const;
+
+ int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
+
+ const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
+ const int wordId, MultiBigramMap *const multiBigramMap) const;
+
+ int getProbability(const int unigramProbability, const int bigramProbability) const;
+
+ int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
+
+ void iterateNgramEntries(const WordIdArrayView prevWordIds,
+ NgramListener *const listener) const;
+
+ BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
+
+ const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
+ return &mHeaderPolicy;
+ }
+
+ bool addUnigramEntry(const CodePointArrayView wordCodePoints,
+ const UnigramProperty *const unigramProperty) {
+ // This method should not be called for non-updatable dictionary.
+ AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+
+ bool removeUnigramEntry(const CodePointArrayView wordCodePoints) {
+ // This method should not be called for non-updatable dictionary.
+ AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+
+ bool addNgramEntry(const NgramProperty *const ngramProperty) {
+ // This method should not be called for non-updatable dictionary.
+ AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+
+ bool removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints) {
+ // This method should not be called for non-updatable dictionary.
+ AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+
+ bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints, const bool isValidWord,
+ const HistoricalInfo historicalInfo) {
+ // This method should not be called for non-updatable dictionary.
+ AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
+ "dictionary.");
+ return false;
+ }
+
+ bool flush(const char *const filePath) {
+ // This method should not be called for non-updatable dictionary.
+ AKLOGI("Warning: flush() is called for non-updatable dictionary.");
+ return false;
+ }
+
+ bool flushWithGC(const char *const filePath) {
+ // This method should not be called for non-updatable dictionary.
+ AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
+ return false;
+ }
+
+ bool needsToRunGC(const bool mindsBlockByGC) const {
+ // This method should not be called for non-updatable dictionary.
+ AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
+ return false;
+ }
+
+ void getProperty(const char *const query, const int queryLength, char *const outResult,
+ const int maxResultLength) {
+ // getProperty is not supported for this class.
+ if (maxResultLength > 0) {
+ outResult[0] = '\0';
+ }
+ }
+
+ const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
+
+ int getNextWordAndNextToken(const int token, int *const outCodePoints,
+ int *const outCodePointCount);
+
+ bool isCorrupted() const {
+ return mIsCorrupted;
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy);
+
+ const MmappedBuffer::MmappedBufferPtr mMmappedBuffer;
+ const HeaderPolicy mHeaderPolicy;
+ const ReadOnlyByteArrayView mBuffer;
+ const BigramListPolicy mBigramListPolicy;
+ const ShortcutListPolicy mShortcutListPolicy;
+ const Ver2ParticiaTrieNodeReader mPtNodeReader;
+ const Ver2PtNodeArrayReader mPtNodeArrayReader;
+ std::vector<int> mTerminalPtNodePositionsForIteratingWords;
+ mutable bool mIsCorrupted;
+
+ int getCodePointsAndProbabilityAndReturnCodePointCount(const int wordId,
+ const int maxCodePointCount, int *const outCodePoints,
+ int *const outUnigramProbability) const;
+ int getShortcutPositionOfPtNode(const int ptNodePos) const;
+ int getBigramsPositionOfPtNode(const int ptNodePos) const;
+ int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos,
+ DicNodeVector *const childDicNodes) const;
+ int getWordIdFromTerminalPtNodePos(const int ptNodePos) const;
+ int getTerminalPtNodePosFromWordId(const int wordId) const;
+ const WordAttributes getWordAttributes(const int probability,
+ const PtNodeParams &ptNodeParams) const;
+ bool isValidPos(const int pos) const;
+};
+} // namespace latinime
+#endif // LATINIME_PATRICIA_TRIE_POLICY_H
diff --git a/native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h b/native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h
new file mode 100644
index 000000000..995b1ed01
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_SHORTCUT_LIST_POLICY_H
+#define LATINIME_SHORTCUT_LIST_POLICY_H
+
+#include <cstdint>
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
+ public:
+ explicit ShortcutListPolicy(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {}
+
+ ~ShortcutListPolicy() {}
+
+ int getStartPos(const int pos) const {
+ if (pos == NOT_A_DICT_POS) {
+ return NOT_A_DICT_POS;
+ }
+ int listPos = pos;
+ ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &listPos);
+ return listPos;
+ }
+
+ void getNextShortcut(const int maxCodePointCount, int *const outCodePoint,
+ int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext,
+ int *const pos) const {
+ const ShortcutListReadingUtils::ShortcutFlags flags =
+ ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, pos);
+ if (outHasNext) {
+ *outHasNext = ShortcutListReadingUtils::hasNext(flags);
+ }
+ if (outIsWhitelist) {
+ *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(flags);
+ }
+ if (outCodePoint) {
+ *outCodePointCount = ShortcutListReadingUtils::readShortcutTarget(
+ mBuffer, maxCodePointCount, outCodePoint, pos);
+ }
+ }
+
+ void skipAllShortcuts(int *const pos) const {
+ const int shortcutListSize = ShortcutListReadingUtils
+ ::getShortcutListSizeAndForwardPointer(mBuffer, pos);
+ *pos += shortcutListSize;
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListPolicy);
+
+ const ReadOnlyByteArrayView mBuffer;
+};
+} // namespace latinime
+#endif // LATINIME_SHORTCUT_LIST_POLICY_H
diff --git a/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp
new file mode 100644
index 000000000..cbb8ead81
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v2/ver2_patricia_trie_node_reader.h"
+
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+
+namespace latinime {
+
+const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNodePos(
+ const int ptNodePos) const {
+ if (ptNodePos < 0 || ptNodePos >= static_cast<int>(mBuffer.size())) {
+ // Reading invalid position because of bug or broken dictionary.
+ AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %zd",
+ ptNodePos, mBuffer.size());
+ ASSERT(false);
+ return PtNodeParams();
+ }
+ PatriciaTrieReadingUtils::NodeFlags flags;
+ int mergedNodeCodePointCount = 0;
+ int mergedNodeCodePoints[MAX_WORD_LENGTH];
+ int probability = NOT_A_PROBABILITY;
+ int childrenPos = NOT_A_DICT_POS;
+ int shortcutPos = NOT_A_DICT_POS;
+ int bigramPos = NOT_A_DICT_POS;
+ int siblingPos = NOT_A_DICT_POS;
+ PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortcutPolicy,
+ mBigramPolicy, mCodePointTable, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
+ &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
+ if (mergedNodeCodePointCount <= 0) {
+ AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount);
+ ASSERT(false);
+ return PtNodeParams();
+ }
+ return PtNodeParams(ptNodePos, flags, mergedNodeCodePointCount, mergedNodeCodePoints,
+ probability, childrenPos, shortcutPos, bigramPos, siblingPos);
+}
+
+}
diff --git a/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h
new file mode 100644
index 000000000..dc87c7c68
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H
+#define LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H
+
+#include <cstdint>
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_reader.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+class DictionaryBigramsStructurePolicy;
+class DictionaryShortcutsStructurePolicy;
+
+class Ver2ParticiaTrieNodeReader : public PtNodeReader {
+ public:
+ Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer,
+ const DictionaryBigramsStructurePolicy *const bigramPolicy,
+ const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
+ const int *const codePointTable)
+ : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy),
+ mCodePointTable(codePointTable) {}
+
+ virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const;
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Ver2ParticiaTrieNodeReader);
+
+ const ReadOnlyByteArrayView mBuffer;
+ const DictionaryBigramsStructurePolicy *const mBigramPolicy;
+ const DictionaryShortcutsStructurePolicy *const mShortcutPolicy;
+ const int *const mCodePointTable;
+};
+} // namespace latinime
+#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */
diff --git a/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp
new file mode 100644
index 000000000..8b9b02df1
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v2/ver2_pt_node_array_reader.h"
+
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+
+namespace latinime {
+
+bool Ver2PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
+ int *const outPtNodeCount, int *const outFirstPtNodePos) const {
+ if (ptNodeArrayPos < 0 || ptNodeArrayPos >= static_cast<int>(mBuffer.size())) {
+ // Reading invalid position because of a bug or a broken dictionary.
+ AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %zd",
+ ptNodeArrayPos, mBuffer.size());
+ ASSERT(false);
+ return false;
+ }
+ int readingPos = ptNodeArrayPos;
+ const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
+ mBuffer.data(), &readingPos);
+ *outPtNodeCount = ptNodeCountInArray;
+ *outFirstPtNodePos = readingPos;
+ return true;
+}
+
+bool Ver2PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos,
+ int *const outNextPtNodeArrayPos) const {
+ if (forwordLinkPos < 0 || forwordLinkPos >= static_cast<int>(mBuffer.size())) {
+ // Reading invalid position because of bug or broken dictionary.
+ AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %zd",
+ forwordLinkPos, mBuffer.size());
+ ASSERT(false);
+ return false;
+ }
+ // Ver2 dicts don't have forward links.
+ *outNextPtNodeArrayPos = NOT_A_DICT_POS;
+ return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h
new file mode 100644
index 000000000..32fa96d15
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER2_PT_NODE_ARRAY_READER_H
+#define LATINIME_VER2_PT_NODE_ARRAY_READER_H
+
+#include <cstdint>
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/pt_node_array_reader.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+class Ver2PtNodeArrayReader : public PtNodeArrayReader {
+ public:
+ Ver2PtNodeArrayReader(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {};
+
+ virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
+ int *const outPtNodeCount, int *const outFirstPtNodePos) const;
+ virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos,
+ int *const outNextPtNodeArrayPos) const;
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(Ver2PtNodeArrayReader);
+
+ const ReadOnlyByteArrayView mBuffer;
+};
+} // namespace latinime
+#endif /* LATINIME_VER2_PT_NODE_ARRAY_READER_H */
diff --git a/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp
new file mode 100644
index 000000000..165947f87
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/content/dynamic_language_model_probability_utils.h"
+
+namespace latinime {
+
+// Used to provide stable probabilities even if the user's input count is small.
+const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNTS[] = {8192, 2, 2, 1};
+
+// Encoded backoff weights.
+// Note that we give positive values for trigrams and quadgrams that means the weight is more than
+// 1.
+// TODO: Apply backoff for main dictionaries and quit giving a positive backoff weight.
+const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHTS[] = {-32, -4, 2, 8};
+
+// This value is used to remove too old entries from the dictionary.
+const int DynamicLanguageModelProbabilityUtils::DURATION_TO_DISCARD_ENTRY_IN_SECONDS =
+ 300 * 24 * 60 * 60; // 300 days
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h
new file mode 100644
index 000000000..71824c954
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H
+#define LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H
+
+#include <algorithm>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "utils/ngram_utils.h"
+#include "utils/time_keeper.h"
+
+namespace latinime {
+
+class DynamicLanguageModelProbabilityUtils {
+ public:
+ static float computeRawProbabilityFromCounts(const int count, const int contextCount,
+ const NgramType ngramType) {
+ const int minCount = ASSUMED_MIN_COUNTS[static_cast<int>(ngramType)];
+ return static_cast<float>(count) / static_cast<float>(std::max(contextCount, minCount));
+ }
+
+ static float backoff(const int ngramProbability, const NgramType ngramType) {
+ const int probability =
+ ngramProbability + ENCODED_BACKOFF_WEIGHTS[static_cast<int>(ngramType)];
+ return std::min(std::max(probability, NOT_A_PROBABILITY), MAX_PROBABILITY);
+ }
+
+ static int getDecayedProbability(const int probability, const HistoricalInfo historicalInfo) {
+ const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp();
+ if (elapsedTime < 0) {
+ AKLOGE("The elapsed time is negatime value. Timestamp overflow?");
+ return NOT_A_PROBABILITY;
+ }
+ // TODO: Improve this logic.
+ // We don't modify probability depending on the elapsed time.
+ return probability;
+ }
+
+ static int shouldRemoveEntryDuringGC(const HistoricalInfo historicalInfo) {
+ // TODO: Improve this logic.
+ const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp();
+ return elapsedTime > DURATION_TO_DISCARD_ENTRY_IN_SECONDS;
+ }
+
+ static int getPriorityToPreventFromEviction(const HistoricalInfo historicalInfo) {
+ // TODO: Improve this logic.
+ // More recently input entries get higher priority.
+ return historicalInfo.getTimestamp();
+ }
+
+private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicLanguageModelProbabilityUtils);
+
+ static_assert(MAX_PREV_WORD_COUNT_FOR_N_GRAM <= 3, "Max supported Ngram is Quadgram.");
+
+ static const int ASSUMED_MIN_COUNTS[];
+ static const int ENCODED_BACKOFF_WEIGHTS[];
+ static const int DURATION_TO_DISCARD_ENTRY_IN_SECONDS;
+};
+
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H */
diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp
new file mode 100644
index 000000000..c10e4906b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp
@@ -0,0 +1,478 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/content/language_model_dict_content.h"
+
+#include <algorithm>
+#include <cstring>
+
+#include "dictionary/structure/v4/content/dynamic_language_model_probability_utils.h"
+#include "dictionary/utils/probability_utils.h"
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+const int LanguageModelDictContent::TRIE_MAP_BUFFER_INDEX = 0;
+const int LanguageModelDictContent::GLOBAL_COUNTERS_BUFFER_INDEX = 1;
+
+bool LanguageModelDictContent::save(FILE *const file) const {
+ return mTrieMap.save(file) && mGlobalCounters.save(file);
+}
+
+bool LanguageModelDictContent::runGC(
+ const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ const LanguageModelDictContent *const originalContent) {
+ return runGCInner(terminalIdMap, originalContent->mTrieMap.getEntriesInRootLevel(),
+ 0 /* nextLevelBitmapEntryIndex */);
+}
+
+const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds,
+ const int wordId, const bool mustMatchAllPrevWords,
+ const HeaderPolicy *const headerPolicy) const {
+ int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1];
+ bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex();
+ int maxPrevWordCount = 0;
+ for (size_t i = 0; i < prevWordIds.size(); ++i) {
+ const int nextBitmapEntryIndex =
+ mTrieMap.get(prevWordIds[i], bitmapEntryIndices[i]).mNextLevelBitmapEntryIndex;
+ if (nextBitmapEntryIndex == TrieMap::INVALID_INDEX) {
+ break;
+ }
+ maxPrevWordCount = i + 1;
+ bitmapEntryIndices[i + 1] = nextBitmapEntryIndex;
+ }
+
+ const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId);
+ if (mHasHistoricalInfo && unigramProbabilityEntry.getHistoricalInfo()->getCount() == 0) {
+ // The word should be treated as a invalid word.
+ return WordAttributes();
+ }
+ for (int i = maxPrevWordCount; i >= 0; --i) {
+ if (mustMatchAllPrevWords && prevWordIds.size() > static_cast<size_t>(i)) {
+ break;
+ }
+ const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndices[i]);
+ if (!result.mIsValid) {
+ continue;
+ }
+ const ProbabilityEntry probabilityEntry =
+ ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo);
+ int probability = NOT_A_PROBABILITY;
+ if (mHasHistoricalInfo) {
+ const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
+ int contextCount = 0;
+ if (i == 0) {
+ // unigram
+ contextCount = mGlobalCounters.getTotalCount();
+ } else {
+ const ProbabilityEntry prevWordProbabilityEntry = getNgramProbabilityEntry(
+ prevWordIds.skip(1 /* n */).limit(i - 1), prevWordIds[0]);
+ if (!prevWordProbabilityEntry.isValid()) {
+ continue;
+ }
+ if (prevWordProbabilityEntry.representsBeginningOfSentence()
+ && historicalInfo->getCount() == 1) {
+ // BoS ngram requires multiple contextCount.
+ continue;
+ }
+ contextCount = prevWordProbabilityEntry.getHistoricalInfo()->getCount();
+ }
+ const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(i + 1);
+ const float rawProbability =
+ DynamicLanguageModelProbabilityUtils::computeRawProbabilityFromCounts(
+ historicalInfo->getCount(), contextCount, ngramType);
+ const int encodedRawProbability =
+ ProbabilityUtils::encodeRawProbability(rawProbability);
+ const int decayedProbability =
+ DynamicLanguageModelProbabilityUtils::getDecayedProbability(
+ encodedRawProbability, *historicalInfo);
+ probability = DynamicLanguageModelProbabilityUtils::backoff(
+ decayedProbability, ngramType);
+ } else {
+ probability = probabilityEntry.getProbability();
+ }
+ // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in
+ // probabilityEntry.
+ return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(),
+ unigramProbabilityEntry.isNotAWord(),
+ unigramProbabilityEntry.isPossiblyOffensive());
+ }
+ // Cannot find the word.
+ return WordAttributes();
+}
+
+ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry(
+ const WordIdArrayView prevWordIds, const int wordId) const {
+ const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds);
+ if (bitmapEntryIndex == TrieMap::INVALID_INDEX) {
+ return ProbabilityEntry();
+ }
+ const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex);
+ if (!result.mIsValid) {
+ // Not found.
+ return ProbabilityEntry();
+ }
+ return ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo);
+}
+
+bool LanguageModelDictContent::setNgramProbabilityEntry(const WordIdArrayView prevWordIds,
+ const int wordId, const ProbabilityEntry *const probabilityEntry) {
+ if (wordId == Ver4DictConstants::NOT_A_TERMINAL_ID) {
+ return false;
+ }
+ const int bitmapEntryIndex = createAndGetBitmapEntryIndex(prevWordIds);
+ if (bitmapEntryIndex == TrieMap::INVALID_INDEX) {
+ return false;
+ }
+ return mTrieMap.put(wordId, probabilityEntry->encode(mHasHistoricalInfo), bitmapEntryIndex);
+}
+
+bool LanguageModelDictContent::removeNgramProbabilityEntry(const WordIdArrayView prevWordIds,
+ const int wordId) {
+ const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds);
+ if (bitmapEntryIndex == TrieMap::INVALID_INDEX) {
+ // Cannot find bitmap entry for the probability entry. The entry doesn't exist.
+ return false;
+ }
+ return mTrieMap.remove(wordId, bitmapEntryIndex);
+}
+
+LanguageModelDictContent::EntryRange LanguageModelDictContent::getProbabilityEntries(
+ const WordIdArrayView prevWordIds) const {
+ const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds);
+ return EntryRange(mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex), mHasHistoricalInfo);
+}
+
+std::vector<LanguageModelDictContent::DumppedFullEntryInfo>
+ LanguageModelDictContent::exportAllNgramEntriesRelatedToWord(
+ const HeaderPolicy *const headerPolicy, const int wordId) const {
+ const TrieMap::Result result = mTrieMap.getRoot(wordId);
+ if (!result.mIsValid || result.mNextLevelBitmapEntryIndex == TrieMap::INVALID_INDEX) {
+ // The word doesn't have any related ngram entries.
+ return std::vector<DumppedFullEntryInfo>();
+ }
+ std::vector<int> prevWordIds = { wordId };
+ std::vector<DumppedFullEntryInfo> entries;
+ exportAllNgramEntriesRelatedToWordInner(headerPolicy, result.mNextLevelBitmapEntryIndex,
+ &prevWordIds, &entries);
+ return entries;
+}
+
+void LanguageModelDictContent::exportAllNgramEntriesRelatedToWordInner(
+ const HeaderPolicy *const headerPolicy, const int bitmapEntryIndex,
+ std::vector<int> *const prevWordIds,
+ std::vector<DumppedFullEntryInfo> *const outBummpedFullEntryInfo) const {
+ for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
+ const int wordId = entry.key();
+ const ProbabilityEntry probabilityEntry =
+ ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo);
+ if (probabilityEntry.isValid()) {
+ const WordAttributes wordAttributes = getWordAttributes(
+ WordIdArrayView(*prevWordIds), wordId, true /* mustMatchAllPrevWords */,
+ headerPolicy);
+ outBummpedFullEntryInfo->emplace_back(*prevWordIds, wordId,
+ wordAttributes, probabilityEntry);
+ }
+ if (entry.hasNextLevelMap()) {
+ prevWordIds->push_back(wordId);
+ exportAllNgramEntriesRelatedToWordInner(headerPolicy,
+ entry.getNextLevelBitmapEntryIndex(), prevWordIds, outBummpedFullEntryInfo);
+ prevWordIds->pop_back();
+ }
+ }
+}
+
+bool LanguageModelDictContent::truncateEntries(const EntryCounts &currentEntryCounts,
+ const EntryCounts &maxEntryCounts, const HeaderPolicy *const headerPolicy,
+ MutableEntryCounters *const outEntryCounters) {
+ for (int prevWordCount = 0; prevWordCount <= MAX_PREV_WORD_COUNT_FOR_N_GRAM; ++prevWordCount) {
+ const int totalWordCount = prevWordCount + 1;
+ const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(totalWordCount);
+ if (currentEntryCounts.getNgramCount(ngramType)
+ <= maxEntryCounts.getNgramCount(ngramType)) {
+ outEntryCounters->setNgramCount(ngramType,
+ currentEntryCounts.getNgramCount(ngramType));
+ continue;
+ }
+ int entryCount = 0;
+ if (!turncateEntriesInSpecifiedLevel(headerPolicy,
+ maxEntryCounts.getNgramCount(ngramType), prevWordCount, &entryCount)) {
+ return false;
+ }
+ outEntryCounters->setNgramCount(ngramType, entryCount);
+ }
+ return true;
+}
+
+bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds,
+ const int wordId, const bool isValid, const HistoricalInfo historicalInfo,
+ const HeaderPolicy *const headerPolicy, MutableEntryCounters *const entryCountersToUpdate) {
+ if (!mHasHistoricalInfo) {
+ AKLOGE("updateAllEntriesOnInputWord is called for dictionary without historical info.");
+ return false;
+ }
+ const ProbabilityEntry originalUnigramProbabilityEntry = getProbabilityEntry(wordId);
+ const ProbabilityEntry updatedUnigramProbabilityEntry = createUpdatedEntryFrom(
+ originalUnigramProbabilityEntry, isValid, historicalInfo, headerPolicy);
+ if (!setProbabilityEntry(wordId, &updatedUnigramProbabilityEntry)) {
+ return false;
+ }
+ mGlobalCounters.incrementTotalCount();
+ mGlobalCounters.updateMaxValueOfCounters(
+ updatedUnigramProbabilityEntry.getHistoricalInfo()->getCount());
+ for (size_t i = 0; i < prevWordIds.size(); ++i) {
+ if (prevWordIds[i] == NOT_A_WORD_ID) {
+ break;
+ }
+ // TODO: Optimize this code.
+ const WordIdArrayView limitedPrevWordIds = prevWordIds.limit(i + 1);
+ const ProbabilityEntry originalNgramProbabilityEntry = getNgramProbabilityEntry(
+ limitedPrevWordIds, wordId);
+ const ProbabilityEntry updatedNgramProbabilityEntry = createUpdatedEntryFrom(
+ originalNgramProbabilityEntry, isValid, historicalInfo, headerPolicy);
+ if (!setNgramProbabilityEntry(limitedPrevWordIds, wordId, &updatedNgramProbabilityEntry)) {
+ return false;
+ }
+ mGlobalCounters.updateMaxValueOfCounters(
+ updatedNgramProbabilityEntry.getHistoricalInfo()->getCount());
+ if (!originalNgramProbabilityEntry.isValid()) {
+ // (i + 2) words are used in total because the prevWords consists of (i + 1) words when
+ // looking at its i-th element.
+ entryCountersToUpdate->incrementNgramCount(
+ NgramUtils::getNgramTypeFromWordCount(i + 2));
+ }
+ }
+ return true;
+}
+
+const ProbabilityEntry LanguageModelDictContent::createUpdatedEntryFrom(
+ const ProbabilityEntry &originalProbabilityEntry, const bool isValid,
+ const HistoricalInfo historicalInfo, const HeaderPolicy *const headerPolicy) const {
+ const HistoricalInfo updatedHistoricalInfo = HistoricalInfo(historicalInfo.getTimestamp(),
+ 0 /* level */, originalProbabilityEntry.getHistoricalInfo()->getCount()
+ + historicalInfo.getCount());
+ if (originalProbabilityEntry.isValid()) {
+ return ProbabilityEntry(originalProbabilityEntry.getFlags(), &updatedHistoricalInfo);
+ } else {
+ return ProbabilityEntry(0 /* flags */, &updatedHistoricalInfo);
+ }
+}
+
+bool LanguageModelDictContent::runGCInner(
+ const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex) {
+ for (auto &entry : trieMapRange) {
+ const auto it = terminalIdMap->find(entry.key());
+ if (it == terminalIdMap->end() || it->second == Ver4DictConstants::NOT_A_TERMINAL_ID) {
+ // The word has been removed.
+ continue;
+ }
+ if (!mTrieMap.put(it->second, entry.value(), nextLevelBitmapEntryIndex)) {
+ return false;
+ }
+ if (entry.hasNextLevelMap()) {
+ if (!runGCInner(terminalIdMap, entry.getEntriesInNextLevel(),
+ mTrieMap.getNextLevelBitmapEntryIndex(it->second, nextLevelBitmapEntryIndex))) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+int LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds) {
+ int lastBitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex();
+ for (const int wordId : prevWordIds) {
+ const TrieMap::Result result = mTrieMap.get(wordId, lastBitmapEntryIndex);
+ if (result.mIsValid && result.mNextLevelBitmapEntryIndex != TrieMap::INVALID_INDEX) {
+ lastBitmapEntryIndex = result.mNextLevelBitmapEntryIndex;
+ continue;
+ }
+ if (!result.mIsValid) {
+ if (!mTrieMap.put(wordId, ProbabilityEntry().encode(mHasHistoricalInfo),
+ lastBitmapEntryIndex)) {
+ AKLOGE("Failed to update trie map. wordId: %d, lastBitmapEntryIndex %d", wordId,
+ lastBitmapEntryIndex);
+ return TrieMap::INVALID_INDEX;
+ }
+ }
+ lastBitmapEntryIndex = mTrieMap.getNextLevelBitmapEntryIndex(wordId,
+ lastBitmapEntryIndex);
+ }
+ return lastBitmapEntryIndex;
+}
+
+int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWordIds) const {
+ int bitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex();
+ for (const int wordId : prevWordIds) {
+ const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex);
+ if (!result.mIsValid) {
+ return TrieMap::INVALID_INDEX;
+ }
+ bitmapEntryIndex = result.mNextLevelBitmapEntryIndex;
+ }
+ return bitmapEntryIndex;
+}
+
+bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex,
+ const int prevWordCount, const HeaderPolicy *const headerPolicy,
+ const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) {
+ for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
+ if (prevWordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
+ AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.",
+ prevWordCount, MAX_PREV_WORD_COUNT_FOR_N_GRAM);
+ return false;
+ }
+ const ProbabilityEntry probabilityEntry =
+ ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo);
+ if (prevWordCount > 0 && probabilityEntry.isValid()
+ && !mTrieMap.getRoot(entry.key()).mIsValid) {
+ // The entry is related to a word that has been removed. Remove the entry.
+ if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
+ return false;
+ }
+ continue;
+ }
+ if (mHasHistoricalInfo && probabilityEntry.isValid()) {
+ const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo();
+ if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC(
+ *originalHistoricalInfo)) {
+ // Remove the entry.
+ if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
+ return false;
+ }
+ continue;
+ }
+ if (needsToHalveCounters) {
+ const int updatedCount = originalHistoricalInfo->getCount() / 2;
+ if (updatedCount == 0) {
+ // Remove the entry.
+ if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
+ return false;
+ }
+ continue;
+ }
+ const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(),
+ originalHistoricalInfo->getLevel(), updatedCount);
+ const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(),
+ &historicalInfoToSave);
+ if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo),
+ bitmapEntryIndex)) {
+ return false;
+ }
+ }
+ }
+ outEntryCounters->incrementNgramCount(
+ NgramUtils::getNgramTypeFromWordCount(prevWordCount + 1));
+ if (!entry.hasNextLevelMap()) {
+ continue;
+ }
+ if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(),
+ prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool LanguageModelDictContent::turncateEntriesInSpecifiedLevel(
+ const HeaderPolicy *const headerPolicy, const int maxEntryCount, const int targetLevel,
+ int *const outEntryCount) {
+ std::vector<int> prevWordIds;
+ std::vector<EntryInfoToTurncate> entryInfoVector;
+ if (!getEntryInfo(headerPolicy, targetLevel, mTrieMap.getRootBitmapEntryIndex(),
+ &prevWordIds, &entryInfoVector)) {
+ return false;
+ }
+ if (static_cast<int>(entryInfoVector.size()) <= maxEntryCount) {
+ *outEntryCount = static_cast<int>(entryInfoVector.size());
+ return true;
+ }
+ *outEntryCount = maxEntryCount;
+ const int entryCountToRemove = static_cast<int>(entryInfoVector.size()) - maxEntryCount;
+ std::partial_sort(entryInfoVector.begin(), entryInfoVector.begin() + entryCountToRemove,
+ entryInfoVector.end(),
+ EntryInfoToTurncate::Comparator());
+ for (int i = 0; i < entryCountToRemove; ++i) {
+ const EntryInfoToTurncate &entryInfo = entryInfoVector[i];
+ if (!removeNgramProbabilityEntry(
+ WordIdArrayView(entryInfo.mPrevWordIds, entryInfo.mPrevWordCount),
+ entryInfo.mKey)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPolicy,
+ const int targetLevel, const int bitmapEntryIndex, std::vector<int> *const prevWordIds,
+ std::vector<EntryInfoToTurncate> *const outEntryInfo) const {
+ const int prevWordCount = prevWordIds->size();
+ for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
+ if (prevWordCount < targetLevel) {
+ if (!entry.hasNextLevelMap()) {
+ continue;
+ }
+ prevWordIds->push_back(entry.key());
+ if (!getEntryInfo(headerPolicy, targetLevel, entry.getNextLevelBitmapEntryIndex(),
+ prevWordIds, outEntryInfo)) {
+ return false;
+ }
+ prevWordIds->pop_back();
+ continue;
+ }
+ const ProbabilityEntry probabilityEntry =
+ ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo);
+ const int priority = mHasHistoricalInfo
+ ? DynamicLanguageModelProbabilityUtils::getPriorityToPreventFromEviction(
+ *probabilityEntry.getHistoricalInfo())
+ : probabilityEntry.getProbability();
+ outEntryInfo->emplace_back(priority, probabilityEntry.getHistoricalInfo()->getCount(),
+ entry.key(), targetLevel, prevWordIds->data());
+ }
+ return true;
+}
+
+bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()(
+ const EntryInfoToTurncate &left, const EntryInfoToTurncate &right) const {
+ if (left.mPriority != right.mPriority) {
+ return left.mPriority < right.mPriority;
+ }
+ if (left.mCount != right.mCount) {
+ return left.mCount < right.mCount;
+ }
+ if (left.mKey != right.mKey) {
+ return left.mKey < right.mKey;
+ }
+ if (left.mPrevWordCount != right.mPrevWordCount) {
+ return left.mPrevWordCount > right.mPrevWordCount;
+ }
+ for (int i = 0; i < left.mPrevWordCount; ++i) {
+ if (left.mPrevWordIds[i] != right.mPrevWordIds[i]) {
+ return left.mPrevWordIds[i] < right.mPrevWordIds[i];
+ }
+ }
+ // left and rigth represent the same entry.
+ return false;
+}
+
+LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int priority,
+ const int count, const int key, const int prevWordCount, const int *const prevWordIds)
+ : mPriority(priority), mCount(count), mKey(key), mPrevWordCount(prevWordCount) {
+ memmove(mPrevWordIds, prevWordIds, mPrevWordCount * sizeof(mPrevWordIds[0]));
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h
new file mode 100644
index 000000000..db8c6e12b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h
@@ -0,0 +1,258 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H
+#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H
+
+#include <cstdio>
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/word_attributes.h"
+#include "dictionary/structure/v4/content/language_model_dict_content_global_counters.h"
+#include "dictionary/structure/v4/content/probability_entry.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/utils/entry_counters.h"
+#include "dictionary/utils/trie_map.h"
+#include "utils/byte_array_view.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class HeaderPolicy;
+
+/**
+ * Class representing language model.
+ *
+ * This class provides methods to get and store unigram/n-gram probability information and flags.
+ */
+class LanguageModelDictContent {
+ public:
+ // Pair of word id and probability entry used for iteration.
+ class WordIdAndProbabilityEntry {
+ public:
+ WordIdAndProbabilityEntry(const int wordId, const ProbabilityEntry &probabilityEntry)
+ : mWordId(wordId), mProbabilityEntry(probabilityEntry) {}
+
+ int getWordId() const { return mWordId; }
+ const ProbabilityEntry getProbabilityEntry() const { return mProbabilityEntry; }
+
+ private:
+ DISALLOW_DEFAULT_CONSTRUCTOR(WordIdAndProbabilityEntry);
+ DISALLOW_ASSIGNMENT_OPERATOR(WordIdAndProbabilityEntry);
+
+ const int mWordId;
+ const ProbabilityEntry mProbabilityEntry;
+ };
+
+ // Iterator.
+ class EntryIterator {
+ public:
+ EntryIterator(const TrieMap::TrieMapIterator &trieMapIterator,
+ const bool hasHistoricalInfo)
+ : mTrieMapIterator(trieMapIterator), mHasHistoricalInfo(hasHistoricalInfo) {}
+
+ const WordIdAndProbabilityEntry operator*() const {
+ const TrieMap::TrieMapIterator::IterationResult &result = *mTrieMapIterator;
+ return WordIdAndProbabilityEntry(
+ result.key(), ProbabilityEntry::decode(result.value(), mHasHistoricalInfo));
+ }
+
+ bool operator!=(const EntryIterator &other) const {
+ return mTrieMapIterator != other.mTrieMapIterator;
+ }
+
+ const EntryIterator &operator++() {
+ ++mTrieMapIterator;
+ return *this;
+ }
+
+ private:
+ DISALLOW_DEFAULT_CONSTRUCTOR(EntryIterator);
+ DISALLOW_ASSIGNMENT_OPERATOR(EntryIterator);
+
+ TrieMap::TrieMapIterator mTrieMapIterator;
+ const bool mHasHistoricalInfo;
+ };
+
+ // Class represents range to use range base for loops.
+ class EntryRange {
+ public:
+ EntryRange(const TrieMap::TrieMapRange trieMapRange, const bool hasHistoricalInfo)
+ : mTrieMapRange(trieMapRange), mHasHistoricalInfo(hasHistoricalInfo) {}
+
+ EntryIterator begin() const {
+ return EntryIterator(mTrieMapRange.begin(), mHasHistoricalInfo);
+ }
+
+ EntryIterator end() const {
+ return EntryIterator(mTrieMapRange.end(), mHasHistoricalInfo);
+ }
+
+ private:
+ DISALLOW_DEFAULT_CONSTRUCTOR(EntryRange);
+ DISALLOW_ASSIGNMENT_OPERATOR(EntryRange);
+
+ const TrieMap::TrieMapRange mTrieMapRange;
+ const bool mHasHistoricalInfo;
+ };
+
+ class DumppedFullEntryInfo {
+ public:
+ DumppedFullEntryInfo(std::vector<int> &prevWordIds, const int targetWordId,
+ const WordAttributes &wordAttributes, const ProbabilityEntry &probabilityEntry)
+ : mPrevWordIds(prevWordIds), mTargetWordId(targetWordId),
+ mWordAttributes(wordAttributes), mProbabilityEntry(probabilityEntry) {}
+
+ const WordIdArrayView getPrevWordIds() const { return WordIdArrayView(mPrevWordIds); }
+ int getTargetWordId() const { return mTargetWordId; }
+ const WordAttributes &getWordAttributes() const { return mWordAttributes; }
+ const ProbabilityEntry &getProbabilityEntry() const { return mProbabilityEntry; }
+
+ private:
+ DISALLOW_ASSIGNMENT_OPERATOR(DumppedFullEntryInfo);
+
+ const std::vector<int> mPrevWordIds;
+ const int mTargetWordId;
+ const WordAttributes mWordAttributes;
+ const ProbabilityEntry mProbabilityEntry;
+ };
+
+ LanguageModelDictContent(const ReadWriteByteArrayView *const buffers,
+ const bool hasHistoricalInfo)
+ : mTrieMap(buffers[TRIE_MAP_BUFFER_INDEX]),
+ mGlobalCounters(buffers[GLOBAL_COUNTERS_BUFFER_INDEX]),
+ mHasHistoricalInfo(hasHistoricalInfo) {}
+
+ explicit LanguageModelDictContent(const bool hasHistoricalInfo)
+ : mTrieMap(), mGlobalCounters(), mHasHistoricalInfo(hasHistoricalInfo) {}
+
+ bool isNearSizeLimit() const {
+ return mTrieMap.isNearSizeLimit() || mGlobalCounters.needsToHalveCounters();
+ }
+
+ bool save(FILE *const file) const;
+
+ bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ const LanguageModelDictContent *const originalContent);
+
+ const WordAttributes getWordAttributes(const WordIdArrayView prevWordIds, const int wordId,
+ const bool mustMatchAllPrevWords, const HeaderPolicy *const headerPolicy) const;
+
+ ProbabilityEntry getProbabilityEntry(const int wordId) const {
+ return getNgramProbabilityEntry(WordIdArrayView(), wordId);
+ }
+
+ bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) {
+ mGlobalCounters.addToTotalCount(probabilityEntry->getHistoricalInfo()->getCount());
+ return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry);
+ }
+
+ bool removeProbabilityEntry(const int wordId) {
+ return removeNgramProbabilityEntry(WordIdArrayView(), wordId);
+ }
+
+ ProbabilityEntry getNgramProbabilityEntry(const WordIdArrayView prevWordIds,
+ const int wordId) const;
+
+ bool setNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId,
+ const ProbabilityEntry *const probabilityEntry);
+
+ bool removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId);
+
+ EntryRange getProbabilityEntries(const WordIdArrayView prevWordIds) const;
+
+ std::vector<DumppedFullEntryInfo> exportAllNgramEntriesRelatedToWord(
+ const HeaderPolicy *const headerPolicy, const int wordId) const;
+
+ bool updateAllProbabilityEntriesForGC(const HeaderPolicy *const headerPolicy,
+ MutableEntryCounters *const outEntryCounters) {
+ if (!updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(),
+ 0 /* prevWordCount */, headerPolicy, mGlobalCounters.needsToHalveCounters(),
+ outEntryCounters)) {
+ return false;
+ }
+ if (mGlobalCounters.needsToHalveCounters()) {
+ mGlobalCounters.halveCounters();
+ }
+ return true;
+ }
+
+ // entryCounts should be created by updateAllProbabilityEntries.
+ bool truncateEntries(const EntryCounts &currentEntryCounts, const EntryCounts &maxEntryCounts,
+ const HeaderPolicy *const headerPolicy, MutableEntryCounters *const outEntryCounters);
+
+ bool updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, const int wordId,
+ const bool isValid, const HistoricalInfo historicalInfo,
+ const HeaderPolicy *const headerPolicy,
+ MutableEntryCounters *const entryCountersToUpdate);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContent);
+
+ class EntryInfoToTurncate {
+ public:
+ class Comparator {
+ public:
+ bool operator()(const EntryInfoToTurncate &left,
+ const EntryInfoToTurncate &right) const;
+ private:
+ DISALLOW_ASSIGNMENT_OPERATOR(Comparator);
+ };
+
+ EntryInfoToTurncate(const int priority, const int count, const int key,
+ const int prevWordCount, const int *const prevWordIds);
+
+ int mPriority;
+ // TODO: Remove.
+ int mCount;
+ int mKey;
+ int mPrevWordCount;
+ int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1];
+
+ private:
+ DISALLOW_DEFAULT_CONSTRUCTOR(EntryInfoToTurncate);
+ };
+
+ static const int TRIE_MAP_BUFFER_INDEX;
+ static const int GLOBAL_COUNTERS_BUFFER_INDEX;
+
+ TrieMap mTrieMap;
+ LanguageModelDictContentGlobalCounters mGlobalCounters;
+ const bool mHasHistoricalInfo;
+
+ bool runGCInner(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex);
+ int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds);
+ int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const;
+ bool updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount,
+ const HeaderPolicy *const headerPolicy, const bool needsToHalveCounters,
+ MutableEntryCounters *const outEntryCounters);
+ bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy,
+ const int maxEntryCount, const int targetLevel, int *const outEntryCount);
+ bool getEntryInfo(const HeaderPolicy *const headerPolicy, const int targetLevel,
+ const int bitmapEntryIndex, std::vector<int> *const prevWordIds,
+ std::vector<EntryInfoToTurncate> *const outEntryInfo) const;
+ const ProbabilityEntry createUpdatedEntryFrom(const ProbabilityEntry &originalProbabilityEntry,
+ const bool isValid, const HistoricalInfo historicalInfo,
+ const HeaderPolicy *const headerPolicy) const;
+ void exportAllNgramEntriesRelatedToWordInner(const HeaderPolicy *const headerPolicy,
+ const int bitmapEntryIndex, std::vector<int> *const prevWordIds,
+ std::vector<DumppedFullEntryInfo> *const outBummpedFullEntryInfo) const;
+};
+} // namespace latinime
+#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp
new file mode 100644
index 000000000..89cf0e306
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/content/language_model_dict_content_global_counters.h"
+
+#include <climits>
+
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+
+namespace latinime {
+
+const int LanguageModelDictContentGlobalCounters::COUNTER_VALUE_NEAR_LIMIT_THRESHOLD =
+ (1 << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) - 64;
+const int LanguageModelDictContentGlobalCounters::TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD = 1 << 30;
+const int LanguageModelDictContentGlobalCounters::COUNTER_SIZE_IN_BYTES = 4;
+const int LanguageModelDictContentGlobalCounters::TOTAL_COUNT_INDEX = 0;
+const int LanguageModelDictContentGlobalCounters::MAX_VALUE_OF_COUNTERS_INDEX = 1;
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h
new file mode 100644
index 000000000..3f87c0ea0
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H
+#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H
+
+#include <cstdio>
+
+#include "defines.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+class LanguageModelDictContentGlobalCounters {
+ public:
+ explicit LanguageModelDictContentGlobalCounters(const ReadWriteByteArrayView buffer)
+ : mBuffer(buffer, 0 /* maxAdditionalBufferSize */),
+ mTotalCount(readValue(mBuffer, TOTAL_COUNT_INDEX)),
+ mMaxValueOfCounters(readValue(mBuffer, MAX_VALUE_OF_COUNTERS_INDEX)) {}
+
+ LanguageModelDictContentGlobalCounters()
+ : mBuffer(0 /* maxAdditionalBufferSize */), mTotalCount(0), mMaxValueOfCounters(0) {}
+
+ bool needsToHalveCounters() const {
+ return mMaxValueOfCounters >= COUNTER_VALUE_NEAR_LIMIT_THRESHOLD
+ || mTotalCount >= TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD;
+ }
+
+ int getTotalCount() const {
+ return mTotalCount;
+ }
+
+ bool save(FILE *const file) const {
+ BufferWithExtendableBuffer bufferToWrite(
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+ if (!bufferToWrite.writeUint(mTotalCount, COUNTER_SIZE_IN_BYTES,
+ TOTAL_COUNT_INDEX * COUNTER_SIZE_IN_BYTES)) {
+ return false;
+ }
+ if (!bufferToWrite.writeUint(mMaxValueOfCounters, COUNTER_SIZE_IN_BYTES,
+ MAX_VALUE_OF_COUNTERS_INDEX * COUNTER_SIZE_IN_BYTES)) {
+ return false;
+ }
+ return DictFileWritingUtils::writeBufferToFileTail(file, &bufferToWrite);
+ }
+
+ void incrementTotalCount() {
+ mTotalCount += 1;
+ }
+
+ void addToTotalCount(const int count) {
+ mTotalCount += count;
+ }
+
+ void updateMaxValueOfCounters(const int count) {
+ mMaxValueOfCounters = std::max(count, mMaxValueOfCounters);
+ }
+
+ void halveCounters() {
+ mMaxValueOfCounters /= 2;
+ mTotalCount /= 2;
+ }
+
+private:
+ DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContentGlobalCounters);
+
+ const static int COUNTER_VALUE_NEAR_LIMIT_THRESHOLD;
+ const static int TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD;
+ const static int COUNTER_SIZE_IN_BYTES;
+ const static int TOTAL_COUNT_INDEX;
+ const static int MAX_VALUE_OF_COUNTERS_INDEX;
+
+ BufferWithExtendableBuffer mBuffer;
+ int mTotalCount;
+ int mMaxValueOfCounters;
+
+ static int readValue(const BufferWithExtendableBuffer &buffer, const int index) {
+ const int pos = COUNTER_SIZE_IN_BYTES * index;
+ if (pos + COUNTER_SIZE_IN_BYTES > buffer.getTailPosition()) {
+ return 0;
+ }
+ return buffer.readUint(COUNTER_SIZE_IN_BYTES, pos);
+ }
+};
+} // namespace latinime
+#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H */
diff --git a/native/jni/src/dictionary/structure/v4/content/probability_entry.h b/native/jni/src/dictionary/structure/v4/content/probability_entry.h
new file mode 100644
index 000000000..473354b90
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/probability_entry.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PROBABILITY_ENTRY_H
+#define LATINIME_PROBABILITY_ENTRY_H
+
+#include <climits>
+#include <cstdint>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+
+namespace latinime {
+
+class ProbabilityEntry {
+ public:
+ ProbabilityEntry(const ProbabilityEntry &probabilityEntry)
+ : mFlags(probabilityEntry.mFlags), mProbability(probabilityEntry.mProbability),
+ mHistoricalInfo(probabilityEntry.mHistoricalInfo) {}
+
+ // Dummy entry
+ ProbabilityEntry()
+ : mFlags(Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY), mProbability(NOT_A_PROBABILITY),
+ mHistoricalInfo() {}
+
+ // Entry without historical information
+ ProbabilityEntry(const int flags, const int probability)
+ : mFlags(flags), mProbability(probability), mHistoricalInfo() {}
+
+ // Entry with historical information.
+ ProbabilityEntry(const int flags, const HistoricalInfo *const historicalInfo)
+ : mFlags(flags), mProbability(NOT_A_PROBABILITY), mHistoricalInfo(*historicalInfo) {}
+
+ // Create from unigram property.
+ ProbabilityEntry(const UnigramProperty *const unigramProperty)
+ : mFlags(createFlags(unigramProperty->representsBeginningOfSentence(),
+ unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
+ unigramProperty->isPossiblyOffensive())),
+ mProbability(unigramProperty->getProbability()),
+ mHistoricalInfo(unigramProperty->getHistoricalInfo()) {}
+
+ // Create from ngram property.
+ // TODO: Set flags.
+ ProbabilityEntry(const NgramProperty *const ngramProperty)
+ : mFlags(0), mProbability(ngramProperty->getProbability()),
+ mHistoricalInfo(ngramProperty->getHistoricalInfo()) {}
+
+ bool isValid() const {
+ return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0;
+ }
+
+ bool hasHistoricalInfo() const {
+ return mHistoricalInfo.isValid();
+ }
+
+ uint8_t getFlags() const {
+ return mFlags;
+ }
+
+ int getProbability() const {
+ return mProbability;
+ }
+
+ const HistoricalInfo *getHistoricalInfo() const {
+ return &mHistoricalInfo;
+ }
+
+ bool representsBeginningOfSentence() const {
+ return (mFlags & Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE) != 0;
+ }
+
+ bool isNotAWord() const {
+ return (mFlags & Ver4DictConstants::FLAG_NOT_A_WORD) != 0;
+ }
+
+ bool isBlacklisted() const {
+ return (mFlags & Ver4DictConstants::FLAG_BLACKLISTED) != 0;
+ }
+
+ bool isPossiblyOffensive() const {
+ return (mFlags & Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE) != 0;
+ }
+
+ uint64_t encode(const bool hasHistoricalInfo) const {
+ uint64_t encodedEntry = static_cast<uint8_t>(mFlags);
+ if (hasHistoricalInfo) {
+ encodedEntry = (encodedEntry << (Ver4DictConstants::TIME_STAMP_FIELD_SIZE * CHAR_BIT))
+ | static_cast<uint32_t>(mHistoricalInfo.getTimestamp());
+ encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_LEVEL_FIELD_SIZE * CHAR_BIT))
+ | static_cast<uint8_t>(mHistoricalInfo.getLevel());
+ encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT))
+ | static_cast<uint16_t>(mHistoricalInfo.getCount());
+ } else {
+ encodedEntry = (encodedEntry << (Ver4DictConstants::PROBABILITY_SIZE * CHAR_BIT))
+ | static_cast<uint8_t>(mProbability);
+ }
+ return encodedEntry;
+ }
+
+ static ProbabilityEntry decode(const uint64_t encodedEntry, const bool hasHistoricalInfo) {
+ if (hasHistoricalInfo) {
+ const int flags = readFromEncodedEntry(encodedEntry,
+ Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE,
+ Ver4DictConstants::TIME_STAMP_FIELD_SIZE
+ + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE
+ + Ver4DictConstants::WORD_COUNT_FIELD_SIZE);
+ const int timestamp = readFromEncodedEntry(encodedEntry,
+ Ver4DictConstants::TIME_STAMP_FIELD_SIZE,
+ Ver4DictConstants::WORD_LEVEL_FIELD_SIZE
+ + Ver4DictConstants::WORD_COUNT_FIELD_SIZE);
+ const int level = readFromEncodedEntry(encodedEntry,
+ Ver4DictConstants::WORD_LEVEL_FIELD_SIZE,
+ Ver4DictConstants::WORD_COUNT_FIELD_SIZE);
+ const int count = readFromEncodedEntry(encodedEntry,
+ Ver4DictConstants::WORD_COUNT_FIELD_SIZE, 0 /* pos */);
+ const HistoricalInfo historicalInfo(timestamp, level, count);
+ return ProbabilityEntry(flags, &historicalInfo);
+ } else {
+ const int flags = readFromEncodedEntry(encodedEntry,
+ Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE,
+ Ver4DictConstants::PROBABILITY_SIZE);
+ const int probability = readFromEncodedEntry(encodedEntry,
+ Ver4DictConstants::PROBABILITY_SIZE, 0 /* pos */);
+ return ProbabilityEntry(flags, probability);
+ }
+ }
+
+ private:
+ // Copy constructor is public to use this class as a type of return value.
+ DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry);
+
+ const uint8_t mFlags;
+ const int mProbability;
+ const HistoricalInfo mHistoricalInfo;
+
+ static int readFromEncodedEntry(const uint64_t encodedEntry, const int size, const int pos) {
+ return static_cast<int>(
+ (encodedEntry >> (pos * CHAR_BIT)) & ((1ull << (size * CHAR_BIT)) - 1));
+ }
+
+ static uint8_t createFlags(const bool representsBeginningOfSentence,
+ const bool isNotAWord, const bool isBlacklisted, const bool isPossiblyOffensive) {
+ uint8_t flags = 0;
+ if (representsBeginningOfSentence) {
+ flags |= Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
+ }
+ if (isNotAWord) {
+ flags |= Ver4DictConstants::FLAG_NOT_A_WORD;
+ }
+ if (isBlacklisted) {
+ flags |= Ver4DictConstants::FLAG_BLACKLISTED;
+ }
+ if (isPossiblyOffensive) {
+ flags |= Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE;
+ }
+ return flags;
+ }
+};
+} // namespace latinime
+#endif /* LATINIME_PROBABILITY_ENTRY_H */
diff --git a/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp
new file mode 100644
index 000000000..e3b419449
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/content/shortcut_dict_content.h"
+
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePointCount,
+ int *const outCodePoint, int *const outCodePointCount, int *const outProbability,
+ bool *const outhasNext, int *const shortcutEntryPos) const {
+ const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer();
+ if (*shortcutEntryPos < 0 || *shortcutEntryPos >= shortcutListBuffer->getTailPosition()) {
+ AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d",
+ *shortcutEntryPos, shortcutListBuffer->getTailPosition());
+ ASSERT(false);
+ if (outhasNext) {
+ *outhasNext = false;
+ }
+ if (outCodePointCount) {
+ *outCodePointCount = 0;
+ }
+ return;
+ }
+
+ const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition(
+ Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos);
+ if (outProbability) {
+ *outProbability = shortcutFlags & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK;
+ }
+ if (outhasNext) {
+ *outhasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK;
+ }
+ if (outCodePoint && outCodePointCount) {
+ shortcutListBuffer->readCodePointsAndAdvancePosition(
+ maxCodePointCount, outCodePoint, outCodePointCount, shortcutEntryPos);
+ }
+}
+
+int ShortcutDictContent::getShortcutListHeadPos(const int terminalId) const {
+ const SparseTable *const addressLookupTable = getAddressLookupTable();
+ if (!addressLookupTable->contains(terminalId)) {
+ return NOT_A_DICT_POS;
+ }
+ return addressLookupTable->get(terminalId);
+}
+
+bool ShortcutDictContent::runGC(
+ const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ const ShortcutDictContent *const originalShortcutDictContent) {
+ for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
+ it != terminalIdMap->end(); ++it) {
+ const int originalShortcutListPos =
+ originalShortcutDictContent->getShortcutListHeadPos(it->first);
+ if (originalShortcutListPos == NOT_A_DICT_POS) {
+ continue;
+ }
+ const int shortcutListPos = getContentBuffer()->getTailPosition();
+ // Copy shortcut list from original content.
+ if (!copyShortcutListFromDictContent(originalShortcutListPos, originalShortcutDictContent,
+ shortcutListPos)) {
+ AKLOGE("Cannot copy shortcut list during GC. original pos: %d, pos: %d",
+ originalShortcutListPos, shortcutListPos);
+ return false;
+ }
+ // Set shortcut list position to the lookup table.
+ if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) {
+ AKLOGE("Cannot set shortcut list position. terminal id: %d, pos: %d",
+ it->second, shortcutListPos);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool ShortcutDictContent::createNewShortcutList(const int terminalId) {
+ const int shortcutListListPos = getContentBuffer()->getTailPosition();
+ return getUpdatableAddressLookupTable()->set(terminalId, shortcutListListPos);
+}
+
+bool ShortcutDictContent::copyShortcutList(const int shortcutListPos, const int toPos) {
+ return copyShortcutListFromDictContent(shortcutListPos, this, toPos);
+}
+
+bool ShortcutDictContent::copyShortcutListFromDictContent(const int shortcutListPos,
+ const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) {
+ bool hasNext = true;
+ int readingPos = shortcutListPos;
+ int writingPos = toPos;
+ int codePoints[MAX_WORD_LENGTH];
+ while (hasNext) {
+ int probability = 0;
+ int codePointCount = 0;
+ sourceShortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH,
+ codePoints, &codePointCount, &probability, &hasNext, &readingPos);
+ if (!writeShortcutEntryAndAdvancePosition(codePoints, codePointCount, probability,
+ hasNext, &writingPos)) {
+ AKLOGE("Cannot write shortcut entry to copy. pos: %d", writingPos);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool ShortcutDictContent::setProbability(const int probability, const int shortcutEntryPos) {
+ BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer();
+ const int shortcutFlags = shortcutListBuffer->readUint(
+ Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos);
+ const bool hasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK;
+ const int shortcutFlagsToWrite = createAndGetShortcutFlags(probability, hasNext);
+ return shortcutListBuffer->writeUint(shortcutFlagsToWrite,
+ Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos);
+}
+
+bool ShortcutDictContent::writeShortcutEntryAndAdvancePosition(const int *const codePoint,
+ const int codePointCount, const int probability, const bool hasNext,
+ int *const shortcutEntryPos) {
+ BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer();
+ const int shortcutFlags = createAndGetShortcutFlags(probability, hasNext);
+ if (!shortcutListBuffer->writeUintAndAdvancePosition(shortcutFlags,
+ Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos)) {
+ AKLOGE("Cannot write shortcut flags. flags; %x, pos: %d", shortcutFlags, *shortcutEntryPos);
+ return false;
+ }
+ if (!shortcutListBuffer->writeCodePointsAndAdvancePosition(codePoint, codePointCount,
+ true /* writesTerminator */, shortcutEntryPos)) {
+ AKLOGE("Cannot write shortcut target code points. pos: %d", *shortcutEntryPos);
+ return false;
+ }
+ return true;
+}
+
+// Find a shortcut entry that has specified target and return its position.
+int ShortcutDictContent::findShortcutEntryAndGetPos(const int shortcutListPos,
+ const int *const targetCodePointsToFind, const int codePointCount) const {
+ bool hasNext = true;
+ int readingPos = shortcutListPos;
+ int targetCodePoints[MAX_WORD_LENGTH];
+ while (hasNext) {
+ const int entryPos = readingPos;
+ int probability = 0;
+ int targetCodePointCount = 0;
+ getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, targetCodePoints, &targetCodePointCount,
+ &probability, &hasNext, &readingPos);
+ if (targetCodePointCount != codePointCount) {
+ continue;
+ }
+ bool matched = true;
+ for (int i = 0; i < codePointCount; ++i) {
+ if (targetCodePointsToFind[i] != targetCodePoints[i]) {
+ matched = false;
+ break;
+ }
+ }
+ if (matched) {
+ return entryPos;
+ }
+ }
+ return NOT_A_DICT_POS;
+}
+
+int ShortcutDictContent::createAndGetShortcutFlags(const int probability,
+ const bool hasNext) const {
+ return (probability & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK)
+ | (hasNext ? Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK : 0);
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h
new file mode 100644
index 000000000..27de4e79e
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_SHORTCUT_DICT_CONTENT_H
+#define LATINIME_SHORTCUT_DICT_CONTENT_H
+
+#include <cstdio>
+
+#include "defines.h"
+#include "dictionary/structure/v4/content/sparse_table_dict_content.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+
+namespace latinime {
+
+class ReadWriteByteArrayView;
+
+class ShortcutDictContent : public SparseTableDictContent {
+ public:
+ ShortcutDictContent(const ReadWriteByteArrayView *const buffers)
+ : SparseTableDictContent(buffers, Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE,
+ Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {}
+
+ ShortcutDictContent()
+ : SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE,
+ Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {}
+
+ void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint,
+ int *const outCodePointCount, int *const outProbability, bool *const outhasNext,
+ const int shortcutEntryPos) {
+ int readingPos = shortcutEntryPos;
+ return getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint,
+ outCodePointCount, outProbability, outhasNext, &readingPos);
+ }
+
+ void getShortcutEntryAndAdvancePosition(const int maxCodePointCount,
+ int *const outCodePoint, int *const outCodePointCount, int *const outProbability,
+ bool *const outhasNext, int *const shortcutEntryPos) const;
+
+ // Returns head position of shortcut list for a PtNode specified by terminalId.
+ int getShortcutListHeadPos(const int terminalId) const;
+
+ bool flushToFile(FILE *const file) const {
+ return flush(file);
+ }
+
+ bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+ const ShortcutDictContent *const originalShortcutDictContent);
+
+ bool createNewShortcutList(const int terminalId);
+
+ bool copyShortcutList(const int shortcutListPos, const int toPos);
+
+ bool setProbability(const int probability, const int shortcutEntryPos);
+
+ bool writeShortcutEntry(const int *const codePoint, const int codePointCount,
+ const int probability, const bool hasNext, const int shortcutEntryPos) {
+ int writingPos = shortcutEntryPos;
+ return writeShortcutEntryAndAdvancePosition(codePoint, codePointCount, probability,
+ hasNext, &writingPos);
+ }
+
+ bool writeShortcutEntryAndAdvancePosition(const int *const codePoint,
+ const int codePointCount, const int probability, const bool hasNext,
+ int *const shortcutEntryPos);
+
+ int findShortcutEntryAndGetPos(const int shortcutListPos,
+ const int *const targetCodePointsToFind, const int codePointCount) const;
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent);
+
+ bool copyShortcutListFromDictContent(const int shortcutListPos,
+ const ShortcutDictContent *const sourceShortcutDictContent, const int toPos);
+
+ int createAndGetShortcutFlags(const int probability, const bool hasNext) const;
+};
+} // namespace latinime
+#endif /* LATINIME_SHORTCUT_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/v4/content/single_dict_content.h b/native/jni/src/dictionary/structure/v4/content/single_dict_content.h
new file mode 100644
index 000000000..6faa9a28b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/single_dict_content.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_SINGLE_DICT_CONTENT_H
+#define LATINIME_SINGLE_DICT_CONTENT_H
+
+#include <cstdio>
+
+#include "defines.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+class SingleDictContent {
+ public:
+ SingleDictContent(const ReadWriteByteArrayView buffer)
+ : mExpandableContentBuffer(buffer,
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {}
+
+ SingleDictContent()
+ : mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE) {}
+
+ virtual ~SingleDictContent() {}
+
+ bool isNearSizeLimit() const {
+ return mExpandableContentBuffer.isNearSizeLimit();
+ }
+
+ protected:
+ BufferWithExtendableBuffer *getWritableBuffer() {
+ return &mExpandableContentBuffer;
+ }
+
+ const BufferWithExtendableBuffer *getBuffer() const {
+ return &mExpandableContentBuffer;
+ }
+
+ bool flush(FILE *const file) const {
+ return DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableContentBuffer);
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(SingleDictContent);
+
+ BufferWithExtendableBuffer mExpandableContentBuffer;
+};
+} // namespace latinime
+#endif /* LATINIME_SINGLE_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp
new file mode 100644
index 000000000..685365f36
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/content/sparse_table_dict_content.h"
+
+#include "dictionary/utils/dict_file_writing_utils.h"
+
+namespace latinime {
+
+const int SparseTableDictContent::LOOKUP_TABLE_BUFFER_INDEX = 0;
+const int SparseTableDictContent::ADDRESS_TABLE_BUFFER_INDEX = 1;
+const int SparseTableDictContent::CONTENT_BUFFER_INDEX = 2;
+
+bool SparseTableDictContent::flush(FILE *const file) const {
+ if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableLookupTableBuffer)) {
+ return false;
+ }
+ if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableAddressTableBuffer)) {
+ return false;
+ }
+ if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableContentBuffer)) {
+ return false;
+ }
+ return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h
new file mode 100644
index 000000000..6245abc8e
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_SPARSE_TABLE_DICT_CONTENT_H
+#define LATINIME_SPARSE_TABLE_DICT_CONTENT_H
+
+#include <cstdio>
+
+#include "defines.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/sparse_table.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+// TODO: Support multiple contents.
+class SparseTableDictContent {
+ public:
+ AK_FORCE_INLINE SparseTableDictContent(const ReadWriteByteArrayView *const buffers,
+ const int sparseTableBlockSize, const int sparseTableDataSize)
+ : mExpandableLookupTableBuffer(buffers[LOOKUP_TABLE_BUFFER_INDEX],
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+ mExpandableAddressTableBuffer(buffers[ADDRESS_TABLE_BUFFER_INDEX],
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+ mExpandableContentBuffer(buffers[CONTENT_BUFFER_INDEX],
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+ mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer,
+ sparseTableBlockSize, sparseTableDataSize) {}
+
+ SparseTableDictContent(const int sparseTableBlockSize, const int sparseTableDataSize)
+ : mExpandableLookupTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+ mExpandableAddressTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+ mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+ mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer,
+ sparseTableBlockSize, sparseTableDataSize) {}
+
+ virtual ~SparseTableDictContent() {}
+
+ bool isNearSizeLimit() const {
+ return mExpandableLookupTableBuffer.isNearSizeLimit()
+ || mExpandableAddressTableBuffer.isNearSizeLimit()
+ || mExpandableContentBuffer.isNearSizeLimit();
+ }
+
+ protected:
+ SparseTable *getUpdatableAddressLookupTable() {
+ return &mAddressLookupTable;
+ }
+
+ const SparseTable *getAddressLookupTable() const {
+ return &mAddressLookupTable;
+ }
+
+ BufferWithExtendableBuffer *getWritableContentBuffer() {
+ return &mExpandableContentBuffer;
+ }
+
+ const BufferWithExtendableBuffer *getContentBuffer() const {
+ return &mExpandableContentBuffer;
+ }
+
+ bool flush(FILE *const file) const;
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTableDictContent);
+
+ static const int LOOKUP_TABLE_BUFFER_INDEX;
+ static const int ADDRESS_TABLE_BUFFER_INDEX;
+ static const int CONTENT_BUFFER_INDEX;
+
+ BufferWithExtendableBuffer mExpandableLookupTableBuffer;
+ BufferWithExtendableBuffer mExpandableAddressTableBuffer;
+ BufferWithExtendableBuffer mExpandableContentBuffer;
+ SparseTable mAddressLookupTable;
+};
+} // namespace latinime
+#endif /* LATINIME_SPARSE_TABLE_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp
new file mode 100644
index 000000000..5503151fd
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+int TerminalPositionLookupTable::getTerminalPtNodePosition(const int terminalId) const {
+ if (terminalId < 0 || terminalId >= mSize) {
+ return NOT_A_DICT_POS;
+ }
+ const int terminalPos = getBuffer()->readUint(
+ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId));
+ return (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) ?
+ NOT_A_DICT_POS : terminalPos;
+}
+
+bool TerminalPositionLookupTable::setTerminalPtNodePosition(
+ const int terminalId, const int terminalPtNodePos) {
+ if (terminalId < 0) {
+ return false;
+ }
+ while (terminalId >= mSize) {
+ // Write new entry.
+ if (!getWritableBuffer()->writeUint(Ver4DictConstants::NOT_A_TERMINAL_ADDRESS,
+ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(mSize))) {
+ return false;
+ }
+ mSize++;
+ }
+ const int terminalPos = (terminalPtNodePos != NOT_A_DICT_POS) ?
+ terminalPtNodePos : Ver4DictConstants::NOT_A_TERMINAL_ADDRESS;
+ return getWritableBuffer()->writeUint(terminalPos,
+ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId));
+}
+
+bool TerminalPositionLookupTable::flushToFile(FILE *const file) const {
+ // If the used buffer size is smaller than the actual buffer size, regenerate the lookup
+ // table and write the new table to the file.
+ if (getEntryPos(mSize) < getBuffer()->getTailPosition()) {
+ TerminalPositionLookupTable lookupTableToWrite;
+ for (int i = 0; i < mSize; ++i) {
+ const int terminalPtNodePosition = getTerminalPtNodePosition(i);
+ if (!lookupTableToWrite.setTerminalPtNodePosition(i, terminalPtNodePosition)) {
+ AKLOGE("Cannot set terminal position to lookupTableToWrite."
+ " terminalId: %d, position: %d", i, terminalPtNodePosition);
+ return false;
+ }
+ }
+ return lookupTableToWrite.flush(file);
+ } else {
+ // We can simply use this lookup table because the buffer size has not been
+ // changed.
+ return flush(file);
+ }
+}
+
+bool TerminalPositionLookupTable::runGCTerminalIds(TerminalIdMap *const terminalIdMap) {
+ int removedEntryCount = 0;
+ int nextNewTerminalId = 0;
+ for (int i = 0; i < mSize; ++i) {
+ const int terminalPos = getBuffer()->readUint(
+ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(i));
+ if (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) {
+ // This entry is a garbage.
+ removedEntryCount++;
+ } else {
+ // Give a new terminal id to the entry.
+ if (!getWritableBuffer()->writeUint(terminalPos,
+ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE,
+ getEntryPos(nextNewTerminalId))) {
+ return false;
+ }
+ // Memorize the mapping to the old terminal id to the new terminal id.
+ terminalIdMap->insert(TerminalIdMap::value_type(i, nextNewTerminalId));
+ nextNewTerminalId++;
+ }
+ }
+ mSize = nextNewTerminalId;
+ return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h
new file mode 100644
index 000000000..f45ceb52d
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H
+#define LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H
+
+#include <cstdio>
+#include <unordered_map>
+
+#include "defines.h"
+#include "dictionary/structure/v4/content/single_dict_content.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+class TerminalPositionLookupTable : public SingleDictContent {
+ public:
+ typedef std::unordered_map<int, int> TerminalIdMap;
+
+ TerminalPositionLookupTable(const ReadWriteByteArrayView buffer)
+ : SingleDictContent(buffer),
+ mSize(getBuffer()->getTailPosition()
+ / Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE) {}
+
+ TerminalPositionLookupTable() : mSize(0) {}
+
+ int getTerminalPtNodePosition(const int terminalId) const;
+
+ bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos);
+
+ int getNextTerminalId() const {
+ return mSize;
+ }
+
+ bool flushToFile(FILE *const file) const;
+
+ bool runGCTerminalIds(TerminalIdMap *const terminalIdMap);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable);
+
+ int getEntryPos(const int terminalId) const {
+ return terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
+ }
+
+ int mSize;
+};
+} // namespace latinime
+#endif // LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H
diff --git a/native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h
new file mode 100644
index 000000000..25ab22543
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_SHORTCUT_LIST_POLICY_H
+#define LATINIME_VER4_SHORTCUT_LIST_POLICY_H
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
+#include "dictionary/structure/v4/content/shortcut_dict_content.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+
+namespace latinime {
+
+class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
+ public:
+ Ver4ShortcutListPolicy(ShortcutDictContent *const shortcutDictContent,
+ const TerminalPositionLookupTable *const terminalPositionLookupTable)
+ : mShortcutDictContent(shortcutDictContent) {}
+
+ ~Ver4ShortcutListPolicy() {}
+
+ int getStartPos(const int pos) const {
+ // The first shortcut entry is located at the head position of the shortcut list.
+ return pos;
+ }
+
+ void getNextShortcut(const int maxCodePointCount, int *const outCodePoint,
+ int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext,
+ int *const pos) const {
+ int probability = 0;
+ mShortcutDictContent->getShortcutEntryAndAdvancePosition(maxCodePointCount,
+ outCodePoint, outCodePointCount, &probability, outHasNext, pos);
+ if (outIsWhitelist) {
+ *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(probability);
+ }
+ }
+
+ void skipAllShortcuts(int *const pos) const {
+ // Do nothing because we don't need to skip shortcut lists in ver4 dictionaries.
+ }
+
+ bool addNewShortcut(const int terminalId, const int *const codePoints, const int codePointCount,
+ const int probability) {
+ const int shortcutListPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+ if (shortcutListPos == NOT_A_DICT_POS) {
+ // Create shortcut list.
+ if (!mShortcutDictContent->createNewShortcutList(terminalId)) {
+ AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId);
+ return false;
+ }
+ const int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+ return mShortcutDictContent->writeShortcutEntry(codePoints, codePointCount, probability,
+ false /* hasNext */, writingPos);
+ }
+ const int entryPos = mShortcutDictContent->findShortcutEntryAndGetPos(shortcutListPos,
+ codePoints, codePointCount);
+ if (entryPos == NOT_A_DICT_POS) {
+ // Add new entry to the shortcut list.
+ // Create new shortcut list.
+ if (!mShortcutDictContent->createNewShortcutList(terminalId)) {
+ AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId);
+ return false;
+ }
+ int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+ if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints,
+ codePointCount, probability, true /* hasNext */, &writingPos)) {
+ AKLOGE("Cannot write shortcut entry. terminal id: %d, pos: %d", terminalId,
+ writingPos);
+ return false;
+ }
+ return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos);
+ }
+ // Overwrite existing entry.
+ bool hasNext = false;
+ mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, 0 /* outCodePoint */,
+ 0 /* outCodePointCount */ , 0 /* probability */, &hasNext, entryPos);
+ if (!mShortcutDictContent->writeShortcutEntry(codePoints,
+ codePointCount, probability, hasNext, entryPos)) {
+ AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId,
+ entryPos);
+ return false;
+ }
+ return true;
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4ShortcutListPolicy);
+
+ ShortcutDictContent *const mShortcutDictContent;
+};
+} // namespace latinime
+#endif // LATINIME_VER4_SHORTCUT_LIST_POLICY_H
diff --git a/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp
new file mode 100644
index 000000000..b0a82839b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp
@@ -0,0 +1,194 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+
+#include <cerrno>
+#include <cstring>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <vector>
+
+#include "dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/file_utils.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+/* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers(
+ const char *const dictPath, MmappedBuffer::MmappedBufferPtr &&headerBuffer,
+ const FormatUtils::FORMAT_VERSION formatVersion) {
+ if (!headerBuffer) {
+ ASSERT(false);
+ AKLOGE("The header buffer must be valid to open ver4 dict buffers.");
+ return Ver4DictBuffersPtr(nullptr);
+ }
+ // TODO: take only dictDirPath, and open both header and trie files in the constructor below
+ const bool isUpdatable = headerBuffer->isUpdatable();
+ MmappedBuffer::MmappedBufferPtr bodyBuffer = MmappedBuffer::openBuffer(dictPath,
+ Ver4DictConstants::BODY_FILE_EXTENSION, isUpdatable);
+ if (!bodyBuffer) {
+ return Ver4DictBuffersPtr(nullptr);
+ }
+ std::vector<ReadWriteByteArrayView> buffers;
+ const ReadWriteByteArrayView buffer = bodyBuffer->getReadWriteByteArrayView();
+ int position = 0;
+ while (position < static_cast<int>(buffer.size())) {
+ const int bufferSize = ByteArrayUtils::readUint32AndAdvancePosition(
+ buffer.data(), &position);
+ buffers.push_back(buffer.subView(position, bufferSize));
+ position += bufferSize;
+ if (bufferSize < 0 || position < 0 || position > static_cast<int>(buffer.size())) {
+ AKLOGE("The dict body file is corrupted.");
+ return Ver4DictBuffersPtr(nullptr);
+ }
+ }
+ if (buffers.size() != Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE) {
+ AKLOGE("The dict body file is corrupted.");
+ return Ver4DictBuffersPtr(nullptr);
+ }
+ return Ver4DictBuffersPtr(new Ver4DictBuffers(std::move(headerBuffer), std::move(bodyBuffer),
+ formatVersion, buffers));
+}
+
+bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath,
+ const BufferWithExtendableBuffer *const headerBuffer) const {
+ // Create temporary directory.
+ const int tmpDirPathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictDirPath,
+ DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE);
+ char tmpDirPath[tmpDirPathBufSize];
+ FileUtils::getFilePathWithSuffix(dictDirPath,
+ DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE, tmpDirPathBufSize,
+ tmpDirPath);
+ if (FileUtils::existsDir(tmpDirPath)) {
+ if (!FileUtils::removeDirAndFiles(tmpDirPath)) {
+ AKLOGE("Existing directory %s cannot be removed.", tmpDirPath);
+ ASSERT(false);
+ return false;
+ }
+ }
+ umask(S_IWGRP | S_IWOTH);
+ if (mkdir(tmpDirPath, S_IRWXU) == -1) {
+ AKLOGE("Cannot create directory: %s. errno: %d.", tmpDirPath, errno);
+ return false;
+ }
+ // Get dictionary base path.
+ const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */;
+ char dictName[dictNameBufSize];
+ FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName);
+ const int dictPathBufSize = FileUtils::getFilePathBufSize(tmpDirPath, dictName);
+ char dictPath[dictPathBufSize];
+ FileUtils::getFilePath(tmpDirPath, dictName, dictPathBufSize, dictPath);
+
+ // Write header file.
+ if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath,
+ Ver4DictConstants::HEADER_FILE_EXTENSION, headerBuffer)) {
+ AKLOGE("Dictionary header file %s%s cannot be written.", tmpDirPath,
+ Ver4DictConstants::HEADER_FILE_EXTENSION);
+ return false;
+ }
+
+ // Write body file.
+ const int bodyFilePathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictPath,
+ Ver4DictConstants::BODY_FILE_EXTENSION);
+ char bodyFilePath[bodyFilePathBufSize];
+ FileUtils::getFilePathWithSuffix(dictPath, Ver4DictConstants::BODY_FILE_EXTENSION,
+ bodyFilePathBufSize, bodyFilePath);
+
+ const int fd = open(bodyFilePath, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+ if (fd == -1) {
+ AKLOGE("File %s cannot be opened. errno: %d", bodyFilePath, errno);
+ ASSERT(false);
+ return false;
+ }
+ FILE *const file = fdopen(fd, "wb");
+ if (!file) {
+ AKLOGE("fdopen failed for the file %s. errno: %d", bodyFilePath, errno);
+ ASSERT(false);
+ return false;
+ }
+
+ if (!flushDictBuffers(file)) {
+ fclose(file);
+ return false;
+ }
+ fclose(file);
+ // Remove existing dictionary.
+ if (!FileUtils::removeDirAndFiles(dictDirPath)) {
+ AKLOGE("Existing directory %s cannot be removed.", dictDirPath);
+ ASSERT(false);
+ return false;
+ }
+ // Rename temporary directory.
+ if (rename(tmpDirPath, dictDirPath) != 0) {
+ AKLOGE("%s cannot be renamed to %s", tmpDirPath, dictDirPath);
+ ASSERT(false);
+ return false;
+ }
+ return true;
+}
+
+bool Ver4DictBuffers::flushDictBuffers(FILE *const file) const {
+ // Write trie.
+ if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableTrieBuffer)) {
+ AKLOGE("Trie cannot be written.");
+ return false;
+ }
+ // Write terminal position lookup table.
+ if (!mTerminalPositionLookupTable.flushToFile(file)) {
+ AKLOGE("Terminal position lookup table cannot be written.");
+ return false;
+ }
+ // Write language model content.
+ if (!mLanguageModelDictContent.save(file)) {
+ AKLOGE("Language model dict content cannot be written.");
+ return false;
+ }
+ // Write shortcut dict content.
+ if (!mShortcutDictContent.flushToFile(file)) {
+ AKLOGE("Shortcut dict content cannot be written.");
+ return false;
+ }
+ return true;
+}
+
+Ver4DictBuffers::Ver4DictBuffers(MmappedBuffer::MmappedBufferPtr &&headerBuffer,
+ MmappedBuffer::MmappedBufferPtr &&bodyBuffer,
+ const FormatUtils::FORMAT_VERSION formatVersion,
+ const std::vector<ReadWriteByteArrayView> &contentBuffers)
+ : mHeaderBuffer(std::move(headerBuffer)), mDictBuffer(std::move(bodyBuffer)),
+ mHeaderPolicy(mHeaderBuffer->getReadOnlyByteArrayView().data(), formatVersion),
+ mExpandableHeaderBuffer(mHeaderBuffer->getReadWriteByteArrayView(),
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+ mExpandableTrieBuffer(contentBuffers[Ver4DictConstants::TRIE_BUFFER_INDEX],
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+ mTerminalPositionLookupTable(
+ contentBuffers[Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX]),
+ mLanguageModelDictContent(&contentBuffers[Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX],
+ mHeaderPolicy.hasHistoricalInfoOfWords()),
+ mShortcutDictContent(&contentBuffers[Ver4DictConstants::SHORTCUT_BUFFERS_INDEX]),
+ mIsUpdatable(mDictBuffer->isUpdatable()) {}
+
+Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize)
+ : mHeaderBuffer(nullptr), mDictBuffer(nullptr), mHeaderPolicy(headerPolicy),
+ mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+ mExpandableTrieBuffer(maxTrieSize), mTerminalPositionLookupTable(),
+ mLanguageModelDictContent(headerPolicy->hasHistoricalInfoOfWords()),
+ mShortcutDictContent(), mIsUpdatable(true) {}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h
new file mode 100644
index 000000000..c8270c93c
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_DICT_BUFFER_H
+#define LATINIME_VER4_DICT_BUFFER_H
+
+#include <cstdio>
+#include <memory>
+
+#include "defines.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/v4/content/language_model_dict_content.h"
+#include "dictionary/structure/v4/content/shortcut_dict_content.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/mmapped_buffer.h"
+
+namespace latinime {
+
+class Ver4DictBuffers {
+ public:
+ typedef std::unique_ptr<Ver4DictBuffers> Ver4DictBuffersPtr;
+
+ static Ver4DictBuffersPtr openVer4DictBuffers(const char *const dictDirPath,
+ MmappedBuffer::MmappedBufferPtr &&headerBuffer,
+ const FormatUtils::FORMAT_VERSION formatVersion);
+
+ static AK_FORCE_INLINE Ver4DictBuffersPtr createVer4DictBuffers(
+ const HeaderPolicy *const headerPolicy, const int maxTrieSize) {
+ return Ver4DictBuffersPtr(new Ver4DictBuffers(headerPolicy, maxTrieSize));
+ }
+
+ AK_FORCE_INLINE bool isValid() const {
+ return mHeaderBuffer && mDictBuffer && mHeaderPolicy.isValid();
+ }
+
+ AK_FORCE_INLINE bool isNearSizeLimit() const {
+ return mExpandableTrieBuffer.isNearSizeLimit()
+ || mTerminalPositionLookupTable.isNearSizeLimit()
+ || mLanguageModelDictContent.isNearSizeLimit()
+ || mShortcutDictContent.isNearSizeLimit();
+ }
+
+ AK_FORCE_INLINE const HeaderPolicy *getHeaderPolicy() const {
+ return &mHeaderPolicy;
+ }
+
+ AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableHeaderBuffer() {
+ return &mExpandableHeaderBuffer;
+ }
+
+ AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableTrieBuffer() {
+ return &mExpandableTrieBuffer;
+ }
+
+ AK_FORCE_INLINE const BufferWithExtendableBuffer *getTrieBuffer() const {
+ return &mExpandableTrieBuffer;
+ }
+
+ AK_FORCE_INLINE TerminalPositionLookupTable *getMutableTerminalPositionLookupTable() {
+ return &mTerminalPositionLookupTable;
+ }
+
+ AK_FORCE_INLINE const TerminalPositionLookupTable *getTerminalPositionLookupTable() const {
+ return &mTerminalPositionLookupTable;
+ }
+
+ AK_FORCE_INLINE LanguageModelDictContent *getMutableLanguageModelDictContent() {
+ return &mLanguageModelDictContent;
+ }
+
+ AK_FORCE_INLINE const LanguageModelDictContent *getLanguageModelDictContent() const {
+ return &mLanguageModelDictContent;
+ }
+
+ AK_FORCE_INLINE ShortcutDictContent *getMutableShortcutDictContent() {
+ return &mShortcutDictContent;
+ }
+
+ AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const {
+ return &mShortcutDictContent;
+ }
+
+ AK_FORCE_INLINE bool isUpdatable() const {
+ return mIsUpdatable;
+ }
+
+ bool flush(const char *const dictDirPath) const {
+ return flushHeaderAndDictBuffers(dictDirPath, &mExpandableHeaderBuffer);
+ }
+
+ bool flushHeaderAndDictBuffers(const char *const dictDirPath,
+ const BufferWithExtendableBuffer *const headerBuffer) const;
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(Ver4DictBuffers);
+
+ Ver4DictBuffers(MmappedBuffer::MmappedBufferPtr &&headerBuffer,
+ MmappedBuffer::MmappedBufferPtr &&bodyBuffer,
+ const FormatUtils::FORMAT_VERSION formatVersion,
+ const std::vector<ReadWriteByteArrayView> &contentBuffers);
+
+ Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize);
+
+ bool flushDictBuffers(FILE *const file) const;
+
+ const MmappedBuffer::MmappedBufferPtr mHeaderBuffer;
+ const MmappedBuffer::MmappedBufferPtr mDictBuffer;
+ const HeaderPolicy mHeaderPolicy;
+ BufferWithExtendableBuffer mExpandableHeaderBuffer;
+ BufferWithExtendableBuffer mExpandableTrieBuffer;
+ TerminalPositionLookupTable mTerminalPositionLookupTable;
+ LanguageModelDictContent mLanguageModelDictContent;
+ ShortcutDictContent mShortcutDictContent;
+ const int mIsUpdatable;
+};
+} // namespace latinime
+#endif /* LATINIME_VER4_DICT_BUFFER_H */
diff --git a/native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp
new file mode 100644
index 000000000..fd6907824
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+
+namespace latinime {
+
+const char *const Ver4DictConstants::BODY_FILE_EXTENSION = ".body";
+const char *const Ver4DictConstants::HEADER_FILE_EXTENSION = ".header";
+
+// Version 4 dictionary size is implicitly limited to 8MB due to 3-byte offsets.
+const int Ver4DictConstants::MAX_DICTIONARY_SIZE = 8 * 1024 * 1024;
+// Extended region size, which is not GCed region size in dict file + additional buffer size, is
+// limited to 1MB to prevent from inefficient traversing.
+const int Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE = 1 * 1024 * 1024;
+
+// NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT for Trie and TerminalAddressLookupTable.
+// NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT for language model.
+// NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT for shortcut.
+const size_t Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE =
+ NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT * 2
+ + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT
+ + NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT;
+const int Ver4DictConstants::TRIE_BUFFER_INDEX = 0;
+const int Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX =
+ TRIE_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT;
+const int Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX =
+ TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT;
+const int Ver4DictConstants::SHORTCUT_BUFFERS_INDEX =
+ LANGUAGE_MODEL_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT;
+
+const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1;
+const int Ver4DictConstants::PROBABILITY_SIZE = 1;
+const int Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE = 1;
+const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
+const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0;
+const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4;
+const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4;
+const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 0;
+const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 2;
+
+const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1;
+const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2;
+const uint8_t Ver4DictConstants::FLAG_NOT_A_WORD = 0x4;
+const uint8_t Ver4DictConstants::FLAG_BLACKLISTED = 0x8;
+const uint8_t Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE = 0x10;
+
+const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
+const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;
+
+const int Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1;
+const int Ver4DictConstants::SHORTCUT_PROBABILITY_MASK = 0x0F;
+const int Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK = 0x80;
+
+const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT = 1;
+const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT = 3;
+const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT = 2;
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/ver4_dict_constants.h b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.h
new file mode 100644
index 000000000..13d7a5714
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_DICT_CONSTANTS_H
+#define LATINIME_VER4_DICT_CONSTANTS_H
+
+#include "defines.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace latinime {
+
+// TODO: Create PtConstants under the pt_common and move some constant values there.
+// Note that there are corresponding definitions in FormatSpec.java.
+class Ver4DictConstants {
+ public:
+ static const char *const BODY_FILE_EXTENSION;
+ static const char *const HEADER_FILE_EXTENSION;
+ static const int MAX_DICTIONARY_SIZE;
+ static const int MAX_DICT_EXTENDED_REGION_SIZE;
+
+ static const size_t NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE;
+ static const int TRIE_BUFFER_INDEX;
+ static const int TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX;
+ static const int LANGUAGE_MODEL_BUFFER_INDEX;
+ static const int BIGRAM_BUFFERS_INDEX;
+ static const int SHORTCUT_BUFFERS_INDEX;
+
+ static const int NOT_A_TERMINAL_ID;
+ static const int PROBABILITY_SIZE;
+ static const int FLAGS_IN_LANGUAGE_MODEL_SIZE;
+ static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
+ static const int NOT_A_TERMINAL_ADDRESS;
+ static const int TERMINAL_ID_FIELD_SIZE;
+ static const int TIME_STAMP_FIELD_SIZE;
+ // TODO: Remove
+ static const int WORD_LEVEL_FIELD_SIZE;
+ static const int WORD_COUNT_FIELD_SIZE;
+ // Flags in probability entry.
+ static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
+ static const uint8_t FLAG_NOT_A_VALID_ENTRY;
+ static const uint8_t FLAG_NOT_A_WORD;
+ static const uint8_t FLAG_BLACKLISTED;
+ static const uint8_t FLAG_POSSIBLY_OFFENSIVE;
+
+ static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
+ static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;
+
+ static const int SHORTCUT_FLAGS_FIELD_SIZE;
+ static const int SHORTCUT_PROBABILITY_MASK;
+ static const int SHORTCUT_HAS_NEXT_MASK;
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4DictConstants);
+
+ static const size_t NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT;
+ static const size_t NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT;
+ static const size_t NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT;
+};
+} // namespace latinime
+#endif /* LATINIME_VER4_DICT_CONSTANTS_H */
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
new file mode 100644
index 000000000..b38b03dcb
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/v4/content/language_model_dict_content.h"
+#include "dictionary/structure/v4/content/probability_entry.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+
+namespace latinime {
+
+const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode(
+ const int ptNodePos, const int siblingNodePos) const {
+ if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) {
+ // Reading invalid position because of bug or broken dictionary.
+ AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d",
+ ptNodePos, mBuffer->getTailPosition());
+ ASSERT(false);
+ return PtNodeParams();
+ }
+ const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos);
+ const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+ int pos = ptNodePos;
+ const int headPos = ptNodePos;
+ if (usesAdditionalBuffer) {
+ pos -= mBuffer->getOriginalBufferSize();
+ }
+ const PatriciaTrieReadingUtils::NodeFlags flags =
+ PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+ const int parentPosOffset =
+ DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition(
+ dictBuf, &pos);
+ const int parentPos =
+ DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
+ int codePoints[MAX_WORD_LENGTH];
+ // Code point table is not used for ver4 dictionaries.
+ const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
+ dictBuf, flags, MAX_WORD_LENGTH, nullptr /* codePointTable */, codePoints, &pos);
+ int terminalIdFieldPos = NOT_A_DICT_POS;
+ int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
+ if (PatriciaTrieReadingUtils::isTerminal(flags)) {
+ terminalIdFieldPos = pos;
+ if (usesAdditionalBuffer) {
+ terminalIdFieldPos += mBuffer->getOriginalBufferSize();
+ }
+ terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos);
+ }
+ int childrenPosFieldPos = pos;
+ if (usesAdditionalBuffer) {
+ childrenPosFieldPos += mBuffer->getOriginalBufferSize();
+ }
+ int childrenPos = DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition(
+ dictBuf, &pos);
+ if (usesAdditionalBuffer && childrenPos != NOT_A_DICT_POS) {
+ childrenPos += mBuffer->getOriginalBufferSize();
+ }
+ if (usesAdditionalBuffer) {
+ pos += mBuffer->getOriginalBufferSize();
+ }
+ // Sibling position is the tail position of original PtNode.
+ int newSiblingNodePos = (siblingNodePos == NOT_A_DICT_POS) ? pos : siblingNodePos;
+ // Read destination node if the read node is a moved node.
+ if (DynamicPtReadingUtils::isMoved(flags)) {
+ // The destination position is stored at the same place as the parent position.
+ return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos);
+ } else {
+ return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints,
+ terminalIdFieldPos, terminalId, NOT_A_PROBABILITY, childrenPosFieldPos, childrenPos,
+ newSiblingNodePos);
+ }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h
new file mode 100644
index 000000000..4e5ae3a89
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H
+#define LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_reader.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+class HeaderPolicy;
+class LanguageModelDictContent;
+
+/*
+ * This class is used for helping to read nodes of ver4 patricia trie. This class handles moved
+ * node and reads node attributes.
+ */
+class Ver4PatriciaTrieNodeReader : public PtNodeReader {
+ public:
+ explicit Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer)
+ : mBuffer(buffer) {}
+
+ ~Ver4PatriciaTrieNodeReader() {}
+
+ virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const {
+ return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos,
+ NOT_A_DICT_POS /* siblingNodePos */);
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader);
+
+ const BufferWithExtendableBuffer *const mBuffer;
+
+ const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos,
+ const int siblingNodePos) const;
+};
+} // namespace latinime
+#endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H */
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
new file mode 100644
index 000000000..d974b50f4
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
@@ -0,0 +1,354 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/v4/content/probability_entry.h"
+#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+
+namespace latinime {
+
+const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3;
+
+bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted(
+ const PtNodeParams *const toBeUpdatedPtNodeParams) {
+ int pos = toBeUpdatedPtNodeParams->getHeadPos();
+ const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
+ const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
+ if (usesAdditionalBuffer) {
+ pos -= mTrieBuffer->getOriginalBufferSize();
+ }
+ // Read original flags
+ const PatriciaTrieReadingUtils::NodeFlags originalFlags =
+ PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+ const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
+ DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
+ true /* isDeleted */, false /* willBecomeNonTerminal */);
+ int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
+ // Update flags.
+ if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
+ &writingPos)) {
+ return false;
+ }
+ if (toBeUpdatedPtNodeParams->isTerminal()) {
+ // The PtNode is a terminal. Delete entry from the terminal position lookup table.
+ return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
+ toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */);
+ } else {
+ return true;
+ }
+}
+
+// TODO: Quit using bigramLinkedNodePos.
+bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
+ const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const int movedPos, const int bigramLinkedNodePos) {
+ int pos = toBeUpdatedPtNodeParams->getHeadPos();
+ const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
+ const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
+ if (usesAdditionalBuffer) {
+ pos -= mTrieBuffer->getOriginalBufferSize();
+ }
+ // Read original flags
+ const PatriciaTrieReadingUtils::NodeFlags originalFlags =
+ PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+ const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
+ DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */,
+ false /* isDeleted */, false /* willBecomeNonTerminal */);
+ int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
+ // Update flags.
+ if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
+ &writingPos)) {
+ return false;
+ }
+ // Update moved position, which is stored in the parent offset field.
+ if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
+ mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
+ return false;
+ }
+ if (toBeUpdatedPtNodeParams->hasChildren()) {
+ // Update children's parent position.
+ mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos());
+ while (!mReadingHelper.isEnd()) {
+ const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams());
+ int parentOffsetFieldPos = childPtNodeParams.getHeadPos()
+ + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
+ if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
+ mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(),
+ &parentOffsetFieldPos)) {
+ // Parent offset cannot be written because of a bug or a broken dictionary; thus,
+ // we give up to update dictionary.
+ return false;
+ }
+ mReadingHelper.readNextSiblingNode(childPtNodeParams);
+ }
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal(
+ const PtNodeParams *const toBeUpdatedPtNodeParams) {
+ int pos = toBeUpdatedPtNodeParams->getHeadPos();
+ const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
+ const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
+ if (usesAdditionalBuffer) {
+ pos -= mTrieBuffer->getOriginalBufferSize();
+ }
+ // Read original flags
+ const PatriciaTrieReadingUtils::NodeFlags originalFlags =
+ PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+ const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
+ DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
+ false /* isDeleted */, true /* willBecomeNonTerminal */);
+ if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
+ toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) {
+ AKLOGE("Cannot update terminal position lookup table. terminal id: %d",
+ toBeUpdatedPtNodeParams->getTerminalId());
+ return false;
+ }
+ // Update flags.
+ int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
+ return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
+ &writingPos);
+}
+
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty(
+ const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const UnigramProperty *const unigramProperty) {
+ // Update probability and historical information.
+ // TODO: Update other information in the unigram property.
+ if (!toBeUpdatedPtNodeParams->isTerminal()) {
+ return false;
+ }
+ const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty);
+ return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
+ toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntryOfUnigramProperty);
+}
+
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
+ const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) {
+ if (!toBeUpdatedPtNodeParams->isTerminal()) {
+ AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode.");
+ return false;
+ }
+ const ProbabilityEntry originalProbabilityEntry =
+ mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
+ toBeUpdatedPtNodeParams->getTerminalId());
+ if (originalProbabilityEntry.isValid()) {
+ *outNeedsToKeepPtNode = true;
+ return true;
+ }
+ if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
+ AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
+ return false;
+ }
+ *outNeedsToKeepPtNode = false;
+ return true;
+}
+
+bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition(
+ const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) {
+ int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos();
+ return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
+ newChildrenPosition, &childrenPosFieldPos);
+}
+
+bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const int newTerminalId) {
+ return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE,
+ toBeUpdatedPtNodeParams->getTerminalIdFieldPos());
+}
+
+bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
+ const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) {
+ return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */,
+ ptNodeWritingPos);
+}
+
+bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
+ const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty,
+ int *const ptNodeWritingPos) {
+ int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
+ if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId,
+ ptNodeWritingPos)) {
+ return false;
+ }
+ // Write probability.
+ ProbabilityEntry newProbabilityEntry;
+ const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty);
+ return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
+ terminalId, &probabilityEntryOfUnigramProperty);
+}
+
+// TODO: Support counting ngram entries.
+bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
+ const NgramProperty *const ngramProperty, bool *const outAddedNewBigram) {
+ LanguageModelDictContent *const languageModelDictContent =
+ mBuffers->getMutableLanguageModelDictContent();
+ const ProbabilityEntry probabilityEntry =
+ languageModelDictContent->getNgramProbabilityEntry(prevWordIds, wordId);
+ const ProbabilityEntry probabilityEntryOfNgramProperty(ngramProperty);
+ if (!languageModelDictContent->setNgramProbabilityEntry(
+ prevWordIds, wordId, &probabilityEntryOfNgramProperty)) {
+ AKLOGE("Cannot add new ngram entry. prevWordId[0]: %d, prevWordId.size(): %zd, wordId: %d",
+ prevWordIds[0], prevWordIds.size(), wordId);
+ return false;
+ }
+ if (!probabilityEntry.isValid() && outAddedNewBigram) {
+ *outAddedNewBigram = true;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds,
+ const int wordId) {
+ LanguageModelDictContent *const languageModelDictContent =
+ mBuffers->getMutableLanguageModelDictContent();
+ return languageModelDictContent->removeNgramProbabilityEntry(prevWordIds, wordId);
+}
+
+// TODO: Remove when we stop supporting v402 format.
+bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries(
+ const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) {
+ // Do nothing.
+ return true;
+}
+
+bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
+ const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const DictPositionRelocationMap *const dictPositionRelocationMap,
+ int *const outBigramEntryCount) {
+ int parentPos = toBeUpdatedPtNodeParams->getParentPos();
+ if (parentPos != NOT_A_DICT_POS) {
+ PtNodeWriter::PtNodePositionRelocationMap::const_iterator it =
+ dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos);
+ if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) {
+ parentPos = it->second;
+ }
+ }
+ int writingPos = toBeUpdatedPtNodeParams->getHeadPos()
+ + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
+ // Write updated parent offset.
+ if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
+ parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
+ return false;
+ }
+
+ // Updates children position.
+ int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos();
+ if (childrenPos != NOT_A_DICT_POS) {
+ PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it =
+ dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos);
+ if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) {
+ childrenPos = it->second;
+ }
+ }
+ if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) {
+ return false;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams,
+ const int *const targetCodePoints, const int targetCodePointCount,
+ const int shortcutProbability) {
+ if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(),
+ targetCodePoints, targetCodePointCount, shortcutProbability)) {
+ AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId());
+ return false;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
+ const PtNodeParams *const ptNodeParams, int *const outTerminalId,
+ int *const ptNodeWritingPos) {
+ const int nodePos = *ptNodeWritingPos;
+ // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the
+ // PtNode writing.
+ if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer,
+ 0 /* nodeFlags */, ptNodeWritingPos)) {
+ return false;
+ }
+ // Calculate a parent offset and write the offset.
+ if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
+ ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) {
+ return false;
+ }
+ // Write code points
+ if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer,
+ ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) {
+ return false;
+ }
+ int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
+ if (!ptNodeParams->willBecomeNonTerminal()) {
+ if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) {
+ terminalId = ptNodeParams->getTerminalId();
+ } else if (ptNodeParams->isTerminal()) {
+ // Write terminal information using a new terminal id.
+ // Get a new unused terminal id.
+ terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId();
+ }
+ }
+ const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID;
+ if (isTerminal) {
+ // Update the lookup table.
+ if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
+ terminalId, nodePos)) {
+ return false;
+ }
+ // Write terminal Id.
+ if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId,
+ Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) {
+ return false;
+ }
+ if (outTerminalId) {
+ *outTerminalId = terminalId;
+ }
+ }
+ // Write children position
+ if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
+ ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
+ return false;
+ }
+ return updatePtNodeFlags(nodePos, isTerminal,
+ ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
+}
+
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, const bool isTerminal,
+ const bool hasMultipleChars) {
+ // Create node flags and write them.
+ PatriciaTrieReadingUtils::NodeFlags nodeFlags =
+ PatriciaTrieReadingUtils::createAndGetFlags(false /* isNotAWord */,
+ false /* isPossiblyOffensive */, isTerminal, false /* hasShortcutTargets */,
+ false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE);
+ if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
+ AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
+ return false;
+ }
+ return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
new file mode 100644
index 000000000..55856110b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H
+#define LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+#include "dictionary/structure/v4/content/probability_entry.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+class HeaderPolicy;
+class Ver4DictBuffers;
+class Ver4PatriciaTrieNodeReader;
+class Ver4PtNodeArrayReader;
+class Ver4ShortcutListPolicy;
+
+/*
+ * This class is used for helping to writes nodes of ver4 patricia trie.
+ */
+class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
+ public:
+ Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer,
+ Ver4DictBuffers *const buffers, const PtNodeReader *const ptNodeReader,
+ const PtNodeArrayReader *const ptNodeArrayReader,
+ Ver4ShortcutListPolicy *const shortcutPolicy)
+ : mTrieBuffer(trieBuffer), mBuffers(buffers),
+ mReadingHelper(ptNodeReader, ptNodeArrayReader), mShortcutPolicy(shortcutPolicy) {}
+
+ virtual ~Ver4PatriciaTrieNodeWriter() {}
+
+ virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams);
+
+ virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const int movedPos, const int bigramLinkedNodePos);
+
+ virtual bool markPtNodeAsWillBecomeNonTerminal(
+ const PtNodeParams *const toBeUpdatedPtNodeParams);
+
+ virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const UnigramProperty *const unigramProperty);
+
+ virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
+ const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode);
+
+ virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const int newChildrenPosition);
+
+ bool updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const int newTerminalId);
+
+ virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
+ int *const ptNodeWritingPos);
+
+ virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
+ const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos);
+
+ virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
+
+ virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId);
+
+ virtual bool updateAllBigramEntriesAndDeleteUselessEntries(
+ const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount);
+
+ virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams,
+ const DictPositionRelocationMap *const dictPositionRelocationMap,
+ int *const outBigramEntryCount);
+
+ virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams,
+ const int *const targetCodePoints, const int targetCodePointCount,
+ const int shortcutProbability);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter);
+
+ bool writePtNodeAndGetTerminalIdAndAdvancePosition(
+ const PtNodeParams *const ptNodeParams, int *const outTerminalId,
+ int *const ptNodeWritingPos);
+
+ bool updatePtNodeFlags(const int ptNodePos, const bool isTerminal, const bool hasMultipleChars);
+
+ static const int CHILDREN_POSITION_FIELD_SIZE;
+
+ BufferWithExtendableBuffer *const mTrieBuffer;
+ Ver4DictBuffers *const mBuffers;
+ DynamicPtReadingHelper mReadingHelper;
+ Ver4ShortcutListPolicy *const mShortcutPolicy;
+};
+} // namespace latinime
+#endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H */
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
new file mode 100644
index 000000000..1dbec5545
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
@@ -0,0 +1,603 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_patricia_trie_policy.h"
+
+#include <array>
+#include <vector>
+
+#include "suggest/core/dicnode/dic_node.h"
+#include "suggest/core/dicnode/dic_node_vector.h"
+#include "dictionary/interface/ngram_listener.h"
+#include "dictionary/property/ngram_context.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/property/word_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+#include "dictionary/utils/multi_bigram_map.h"
+#include "dictionary/utils/probability_utils.h"
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and
+// BinaryDictionaryDecayingTests.
+const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
+const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
+const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
+const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
+const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
+const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
+ Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
+
+void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
+ DicNodeVector *const childDicNodes) const {
+ if (!dicNode->hasChildren()) {
+ return;
+ }
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos());
+ while (!readingHelper.isEnd()) {
+ const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams();
+ if (!ptNodeParams.isValid()) {
+ break;
+ }
+ const bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted();
+ const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID;
+ childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
+ wordId, ptNodeParams.getCodePointArrayView());
+ readingHelper.readNextSiblingNode(ptNodeParams);
+ }
+ if (readingHelper.isError()) {
+ mIsCorrupted = true;
+ AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
+ }
+}
+
+int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
+ const int maxCodePointCount, int *const outCodePoints) const {
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ const int ptNodePos =
+ mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
+ readingHelper.initWithPtNodePos(ptNodePos);
+ const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount(
+ maxCodePointCount, outCodePoints);
+ if (readingHelper.isError()) {
+ mIsCorrupted = true;
+ AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount().");
+ }
+ return codePointCount;
+}
+
+int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
+ const bool forceLowerCaseSearch) const {
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
+ wordCodePoints.size(), forceLowerCaseSearch);
+ if (readingHelper.isError()) {
+ mIsCorrupted = true;
+ AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
+ }
+ if (ptNodePos == NOT_A_DICT_POS) {
+ return NOT_A_WORD_ID;
+ }
+ const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ if (ptNodeParams.isDeleted()) {
+ return NOT_A_WORD_ID;
+ }
+ return ptNodeParams.getTerminalId();
+}
+
+const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
+ const WordIdArrayView prevWordIds, const int wordId,
+ MultiBigramMap *const multiBigramMap) const {
+ if (wordId == NOT_A_WORD_ID) {
+ return WordAttributes();
+ }
+ return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId,
+ false /* mustMatchAllPrevWords */, mHeaderPolicy);
+}
+
+int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
+ const int wordId) const {
+ if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) {
+ return NOT_A_PROBABILITY;
+ }
+ const WordAttributes wordAttributes =
+ mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId,
+ true /* mustMatchAllPrevWords */, mHeaderPolicy);
+ if (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()) {
+ return NOT_A_PROBABILITY;
+ }
+ return wordAttributes.getProbability();
+}
+
+BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator(
+ const int wordId) const {
+ const int shortcutPos = getShortcutPositionOfWord(wordId);
+ return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos);
+}
+
+void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
+ NgramListener *const listener) const {
+ if (prevWordIds.empty()) {
+ return;
+ }
+ const auto languageModelDictContent = mBuffers->getLanguageModelDictContent();
+ for (size_t i = 1; i <= prevWordIds.size(); ++i) {
+ for (const auto entry : languageModelDictContent->getProbabilityEntries(
+ prevWordIds.limit(i))) {
+ const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry();
+ if (!probabilityEntry.isValid()) {
+ continue;
+ }
+ int probability = NOT_A_PROBABILITY;
+ if (probabilityEntry.hasHistoricalInfo()) {
+ // TODO: Quit checking count here.
+ // If count <= 1, the word can be an invaild word. The actual probability should
+ // be checked using getWordAttributesInContext() in onVisitEntry().
+ probability = probabilityEntry.getHistoricalInfo()->getCount() <= 1 ?
+ NOT_A_PROBABILITY : 0;
+ } else {
+ probability = probabilityEntry.getProbability();
+ }
+ listener->onVisitEntry(probability, entry.getWordId());
+ }
+ }
+}
+
+int Ver4PatriciaTriePolicy::getShortcutPositionOfWord(const int wordId) const {
+ if (wordId == NOT_A_WORD_ID) {
+ return NOT_A_DICT_POS;
+ }
+ const int ptNodePos =
+ mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
+ const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
+ if (ptNodeParams.isDeleted()) {
+ return NOT_A_DICT_POS;
+ }
+ return mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
+ ptNodeParams.getTerminalId());
+}
+
+bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints,
+ const UnigramProperty *const unigramProperty) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
+ mDictBuffer->getTailPosition());
+ return false;
+ }
+ if (wordCodePoints.size() > MAX_WORD_LENGTH) {
+ AKLOGE("The word is too long to insert to the dictionary, length: %zd",
+ wordCodePoints.size());
+ return false;
+ }
+ for (const auto &shortcut : unigramProperty->getShortcuts()) {
+ if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
+ AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd",
+ shortcut.getTargetCodePoints()->size());
+ return false;
+ }
+ }
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ bool addedNewUnigram = false;
+ int codePointsToAdd[MAX_WORD_LENGTH];
+ int codePointCountToAdd = wordCodePoints.size();
+ memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd);
+ if (unigramProperty->representsBeginningOfSentence()) {
+ codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
+ codePointCountToAdd, MAX_WORD_LENGTH);
+ }
+ if (codePointCountToAdd <= 0) {
+ return false;
+ }
+ const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd);
+ if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty,
+ &addedNewUnigram)) {
+ if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
+ mEntryCounters.incrementNgramCount(NgramType::Unigram);
+ }
+ if (unigramProperty->getShortcuts().size() > 0) {
+ // Add shortcut target.
+ const int wordId = getWordId(codePointArrayView, false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ AKLOGE("Cannot find word id to add shortcut target.");
+ return false;
+ }
+ const int wordPos =
+ mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
+ for (const auto &shortcut : unigramProperty->getShortcuts()) {
+ if (!mUpdatingHelper.addShortcutTarget(wordPos,
+ CodePointArrayView(*shortcut.getTargetCodePoints()),
+ shortcut.getProbability())) {
+ AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, "
+ "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(),
+ shortcut.getProbability());
+ return false;
+ }
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ return false;
+ }
+ const int ptNodePos =
+ mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
+ const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) {
+ AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos);
+ return false;
+ }
+ if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry(wordId)) {
+ return false;
+ }
+ if (!ptNodeParams.representsNonWordInfo()) {
+ mEntryCounters.decrementNgramCount(NgramType::Unigram);
+ }
+ return true;
+}
+
+bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
+ mDictBuffer->getTailPosition());
+ return false;
+ }
+ const NgramContext *const ngramContext = ngramProperty->getNgramContext();
+ if (!ngramContext->isValid()) {
+ AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
+ return false;
+ }
+ if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
+ AKLOGE("The word is too long to insert the ngram to the dictionary. "
+ "length: %zd", ngramProperty->getTargetCodePoints()->size());
+ return false;
+ }
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
+ false /* tryLowerCaseSearch */);
+ if (prevWordIds.empty()) {
+ return false;
+ }
+ for (size_t i = 0; i < prevWordIds.size(); ++i) {
+ if (prevWordIds[i] != NOT_A_WORD_ID) {
+ continue;
+ }
+ if (!ngramContext->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) {
+ return false;
+ }
+ const UnigramProperty beginningOfSentenceUnigramProperty(
+ true /* representsBeginningOfSentence */, true /* isNotAWord */,
+ false /* isBlacklisted */, false /* isPossiblyOffensive */,
+ MAX_PROBABILITY /* probability */, HistoricalInfo());
+ if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
+ &beginningOfSentenceUnigramProperty)) {
+ AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
+ return false;
+ }
+ // Refresh word ids.
+ ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
+ }
+ const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()),
+ false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ return false;
+ }
+ bool addedNewEntry = false;
+ if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) {
+ if (addedNewEntry) {
+ mEntryCounters.incrementNgramCount(
+ NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1));
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
+ mDictBuffer->getTailPosition());
+ return false;
+ }
+ if (!ngramContext->isValid()) {
+ AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary.");
+ return false;
+ }
+ if (wordCodePoints.size() > MAX_WORD_LENGTH) {
+ AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
+ wordCodePoints.size());
+ }
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
+ false /* tryLowerCaseSerch */);
+ if (prevWordIds.empty() || prevWordIds.contains(NOT_A_WORD_ID)) {
+ return false;
+ }
+ const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ return false;
+ }
+ if (mNodeWriter.removeNgramEntry(prevWordIds, wordId)) {
+ mEntryCounters.decrementNgramCount(
+ NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1));
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
+ const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints,
+ const bool isValidWord, const HistoricalInfo historicalInfo) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
+ "dictionary.");
+ return false;
+ }
+ const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ?
+ false : isValidWord;
+ int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ // The word is not in the dictionary.
+ const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
+ false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */,
+ NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */,
+ 0 /* count */));
+ if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
+ AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext().");
+ return false;
+ }
+ if (!isValidWord) {
+ return true;
+ }
+ wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
+ }
+
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
+ false /* tryLowerCaseSearch */);
+ if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
+ if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
+ const UnigramProperty beginningOfSentenceUnigramProperty(
+ true /* representsBeginningOfSentence */,
+ true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY,
+ HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
+ if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
+ &beginningOfSentenceUnigramProperty)) {
+ AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext().");
+ return false;
+ }
+ // Refresh word ids.
+ ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
+ }
+ // Update entries for beginning of sentence.
+ if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(
+ prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isVaild */, historicalInfo,
+ mHeaderPolicy, &mEntryCounters)) {
+ return false;
+ }
+ }
+ if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds,
+ wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) {
+ return false;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
+ return false;
+ }
+ if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) {
+ AKLOGE("Cannot flush the dictionary to file.");
+ mIsCorrupted = true;
+ return false;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) {
+ AKLOGE("Cannot flush the dictionary to file with GC.");
+ mIsCorrupted = true;
+ return false;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mBuffers->isNearSizeLimit()) {
+ // Additional buffer size is near the limit.
+ return true;
+ } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize()
+ > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) {
+ // Total extended region size of the trie exceeds the limit.
+ return true;
+ } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
+ && mDictBuffer->getUsedAdditionalBufferSize() > 0) {
+ // Needs to reduce dictionary size.
+ return true;
+ } else if (mHeaderPolicy->isDecayingDict()) {
+ return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(),
+ mHeaderPolicy);
+ }
+ return false;
+}
+
+void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength,
+ char *const outResult, const int maxResultLength) {
+ const int compareLength = queryLength + 1 /* terminator */;
+ if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mEntryCounters.getNgramCount(NgramType::Unigram));
+ } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram));
+ } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mHeaderPolicy->isDecayingDict() ?
+ ForgettingCurveUtils::getEntryCountHardLimit(
+ mHeaderPolicy->getMaxNgramCounts().getNgramCount(
+ NgramType::Unigram)) :
+ static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
+ } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mHeaderPolicy->isDecayingDict() ?
+ ForgettingCurveUtils::getEntryCountHardLimit(
+ mHeaderPolicy->getMaxNgramCounts().getNgramCount(
+ NgramType::Bigram)) :
+ static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
+ }
+}
+
+const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
+ const CodePointArrayView wordCodePoints) const {
+ const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ AKLOGE("getWordProperty is called for invalid word.");
+ return WordProperty();
+ }
+ const LanguageModelDictContent *const languageModelDictContent =
+ mBuffers->getLanguageModelDictContent();
+ // Fetch ngram information.
+ std::vector<NgramProperty> ngrams;
+ int ngramTargetCodePoints[MAX_WORD_LENGTH];
+ int ngramPrevWordsCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
+ int ngramPrevWordsCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ bool ngramPrevWordIsBeginningOfSentense[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ for (const auto entry : languageModelDictContent->exportAllNgramEntriesRelatedToWord(
+ mHeaderPolicy, wordId)) {
+ const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getTargetWordId(),
+ MAX_WORD_LENGTH, ngramTargetCodePoints);
+ const WordIdArrayView prevWordIds = entry.getPrevWordIds();
+ for (size_t i = 0; i < prevWordIds.size(); ++i) {
+ ngramPrevWordsCodePointCount[i] = getCodePointsAndReturnCodePointCount(prevWordIds[i],
+ MAX_WORD_LENGTH, ngramPrevWordsCodePoints[i]);
+ ngramPrevWordIsBeginningOfSentense[i] = languageModelDictContent->getProbabilityEntry(
+ prevWordIds[i]).representsBeginningOfSentence();
+ if (ngramPrevWordIsBeginningOfSentense[i]) {
+ ngramPrevWordsCodePointCount[i] = CharUtils::removeBeginningOfSentenceMarker(
+ ngramPrevWordsCodePoints[i], ngramPrevWordsCodePointCount[i]);
+ }
+ }
+ const NgramContext ngramContext(ngramPrevWordsCodePoints, ngramPrevWordsCodePointCount,
+ ngramPrevWordIsBeginningOfSentense, prevWordIds.size());
+ const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry();
+ const HistoricalInfo *const historicalInfo = ngramProbabilityEntry.getHistoricalInfo();
+ // TODO: Output flags in WordAttributes.
+ ngrams.emplace_back(ngramContext,
+ CodePointArrayView(ngramTargetCodePoints, codePointCount).toVector(),
+ entry.getWordAttributes().getProbability(), *historicalInfo);
+ }
+ // Fetch shortcut information.
+ std::vector<UnigramProperty::ShortcutProperty> shortcuts;
+ int shortcutPos = getShortcutPositionOfWord(wordId);
+ if (shortcutPos != NOT_A_DICT_POS) {
+ int shortcutTarget[MAX_WORD_LENGTH];
+ const ShortcutDictContent *const shortcutDictContent =
+ mBuffers->getShortcutDictContent();
+ bool hasNext = true;
+ while (hasNext) {
+ int shortcutTargetLength = 0;
+ int shortcutProbability = NOT_A_PROBABILITY;
+ shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
+ &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
+ shortcuts.emplace_back(
+ CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(),
+ shortcutProbability);
+ }
+ }
+ const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes(
+ WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy);
+ const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId);
+ const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
+ const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
+ wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(),
+ wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(),
+ *historicalInfo, std::move(shortcuts));
+ return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
+}
+
+int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
+ int *const outCodePointCount) {
+ *outCodePointCount = 0;
+ if (token == 0) {
+ mTerminalPtNodePositionsForIteratingWords.clear();
+ DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
+ &mTerminalPtNodePositionsForIteratingWords);
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
+ }
+ const int terminalPtNodePositionsVectorSize =
+ static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
+ if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
+ AKLOGE("Given token %d is invalid.", token);
+ return 0;
+ }
+ const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
+ const PtNodeParams ptNodeParams =
+ mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos);
+ *outCodePointCount = getCodePointsAndReturnCodePointCount(ptNodeParams.getTerminalId(),
+ MAX_WORD_LENGTH, outCodePoints);
+ const int nextToken = token + 1;
+ if (nextToken >= terminalPtNodePositionsVectorSize) {
+ // All words have been iterated.
+ mTerminalPtNodePositionsForIteratingWords.clear();
+ return 0;
+ }
+ return nextToken;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h
new file mode 100644
index 000000000..d130a4e78
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H
+#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
+#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
+#include "dictionary/structure/v4/ver4_pt_node_array_reader.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/entry_counters.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class DicNode;
+class DicNodeVector;
+
+// Word id = Artificial id that is stored in the PtNode looked up by the word.
+class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
+ public:
+ Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
+ : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()),
+ mDictBuffer(mBuffers->getWritableTrieBuffer()),
+ mShortcutPolicy(mBuffers->getMutableShortcutDictContent(),
+ mBuffers->getTerminalPositionLookupTable()),
+ mNodeReader(mDictBuffer), mPtNodeArrayReader(mDictBuffer),
+ mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mPtNodeArrayReader,
+ &mShortcutPolicy),
+ mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
+ mWritingHelper(mBuffers.get()),
+ mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()),
+ mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {};
+
+ AK_FORCE_INLINE int getRootPosition() const {
+ return 0;
+ }
+
+ void createAndGetAllChildDicNodes(const DicNode *const dicNode,
+ DicNodeVector *const childDicNodes) const;
+
+ int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
+ int *const outCodePoints) const;
+
+ int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
+
+ const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
+ const int wordId, MultiBigramMap *const multiBigramMap) const;
+
+ // TODO: Remove
+ int getProbability(const int unigramProbability, const int bigramProbability) const {
+ // Not used.
+ return NOT_A_PROBABILITY;
+ }
+
+ int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
+
+ void iterateNgramEntries(const WordIdArrayView prevWordIds,
+ NgramListener *const listener) const;
+
+ BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
+
+ const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
+ return mHeaderPolicy;
+ }
+
+ bool addUnigramEntry(const CodePointArrayView wordCodePoints,
+ const UnigramProperty *const unigramProperty);
+
+ bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
+
+ bool addNgramEntry(const NgramProperty *const ngramProperty);
+
+ bool removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints);
+
+ bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints, const bool isValidWord,
+ const HistoricalInfo historicalInfo);
+
+ bool flush(const char *const filePath);
+
+ bool flushWithGC(const char *const filePath);
+
+ bool needsToRunGC(const bool mindsBlockByGC) const;
+
+ void getProperty(const char *const query, const int queryLength, char *const outResult,
+ const int maxResultLength);
+
+ const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
+
+ int getNextWordAndNextToken(const int token, int *const outCodePoints,
+ int *const outCodePointCount);
+
+ bool isCorrupted() const {
+ return mIsCorrupted;
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy);
+
+ static const char *const UNIGRAM_COUNT_QUERY;
+ static const char *const BIGRAM_COUNT_QUERY;
+ static const char *const MAX_UNIGRAM_COUNT_QUERY;
+ static const char *const MAX_BIGRAM_COUNT_QUERY;
+ // When the dictionary size is near the maximum size, we have to refuse dynamic operations to
+ // prevent the dictionary from overflowing.
+ static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
+ static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
+
+ const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers;
+ const HeaderPolicy *const mHeaderPolicy;
+ BufferWithExtendableBuffer *const mDictBuffer;
+ Ver4ShortcutListPolicy mShortcutPolicy;
+ Ver4PatriciaTrieNodeReader mNodeReader;
+ Ver4PtNodeArrayReader mPtNodeArrayReader;
+ Ver4PatriciaTrieNodeWriter mNodeWriter;
+ DynamicPtUpdatingHelper mUpdatingHelper;
+ Ver4PatriciaTrieWritingHelper mWritingHelper;
+ MutableEntryCounters mEntryCounters;
+ std::vector<int> mTerminalPtNodePositionsForIteratingWords;
+ mutable bool mIsCorrupted;
+
+ int getShortcutPositionOfWord(const int wordId) const;
+};
+} // namespace latinime
+#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp
new file mode 100644
index 000000000..ccb70cdd3
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
+
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+/* static */ int Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(
+ const uint8_t *const buffer, int *pos) {
+ return ByteArrayUtils::readUint32AndAdvancePosition(buffer, pos);
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h
new file mode 100644
index 000000000..466ff55d5
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H
+#define LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+class Ver4PatriciaTrieReadingUtils {
+ public:
+ static int getTerminalIdAndAdvancePosition(const uint8_t *const buffer,
+ int *const pos);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieReadingUtils);
+};
+} // namespace latinime
+#endif /* LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H */
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
new file mode 100644
index 000000000..6dfdf4d31
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
+
+#include <cstring>
+#include <queue>
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/v4/ver4_pt_node_array_reader.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/file_utils.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
+ const EntryCounts &entryCounts) const {
+ const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
+ BufferWithExtendableBuffer headerBuffer(
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+ const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
+ + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
+ if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
+ entryCounts, extendedRegionSize, &headerBuffer)) {
+ AKLOGE("Cannot write header structure to buffer. "
+ "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, trigramCount: %d,"
+ "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram),
+ entryCounts.getNgramCount(NgramType::Bigram),
+ entryCounts.getNgramCount(NgramType::Trigram),
+ extendedRegionSize);
+ return false;
+ }
+ return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
+}
+
+bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
+ const char *const dictDirPath) {
+ const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
+ Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
+ Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
+ Ver4DictConstants::MAX_DICTIONARY_SIZE));
+ MutableEntryCounters entryCounters;
+ if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &entryCounters)) {
+ return false;
+ }
+ BufferWithExtendableBuffer headerBuffer(
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+ if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
+ entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) {
+ return false;
+ }
+ return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
+}
+
+bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
+ const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
+ MutableEntryCounters *const outEntryCounters) {
+ Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer());
+ Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
+ Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
+ mBuffers->getTerminalPositionLookupTable());
+ Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
+ mBuffers, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy);
+
+ if (!mBuffers->getMutableLanguageModelDictContent()->updateAllProbabilityEntriesForGC(
+ headerPolicy, outEntryCounters)) {
+ AKLOGE("Failed to update probabilities in language model dict content.");
+ return false;
+ }
+ if (headerPolicy->isDecayingDict()) {
+ const EntryCounts &maxEntryCounts = headerPolicy->getMaxNgramCounts();
+ if (!mBuffers->getMutableLanguageModelDictContent()->truncateEntries(
+ outEntryCounters->getEntryCounts(), maxEntryCounts, headerPolicy,
+ outEntryCounters)) {
+ AKLOGE("Failed to truncate entries in language model dict content.");
+ return false;
+ }
+ }
+
+ DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ DynamicPtGcEventListeners
+ ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+ traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
+ &ptNodeWriter);
+ if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+ &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
+ return false;
+ }
+
+ // Mapping from positions in mBuffer to positions in bufferToWrite.
+ PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
+ readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
+ buffersToWrite, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy);
+ DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+ traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
+ buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
+ if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+ &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
+ return false;
+ }
+
+ // Create policy instances for the GCed dictionary.
+ Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer());
+ Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
+ Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
+ buffersToWrite->getTerminalPositionLookupTable());
+ Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
+ buffersToWrite, &newPtNodeReader, &newPtNodeArrayreader,
+ &newShortcutPolicy);
+ // Re-assign terminal IDs for valid terminal PtNodes.
+ TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
+ if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
+ &terminalIdMap)) {
+ return false;
+ }
+ // Run GC for language model dict content.
+ if (!buffersToWrite->getMutableLanguageModelDictContent()->runGC(&terminalIdMap,
+ mBuffers->getLanguageModelDictContent())) {
+ return false;
+ }
+ // Run GC for shortcut dict content.
+ if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
+ mBuffers->getShortcutDictContent())) {
+ return false;
+ }
+ DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
+ newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
+ traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
+ if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+ &traversePolicyToUpdateAllPositionFields)) {
+ return false;
+ }
+ newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+ traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
+ if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+ &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
+ return false;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+ ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+ if (!ptNodeParams->isTerminal()) {
+ return true;
+ }
+ TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
+ mTerminalIdMap->find(ptNodeParams->getTerminalId());
+ if (it == mTerminalIdMap->end()) {
+ AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
+ ptNodeParams->getTerminalId(), mTerminalIdMap->size());
+ return false;
+ }
+ if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
+ AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
+ return false;
+ }
+ return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
new file mode 100644
index 000000000..68dd1caa2
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H
+#define LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/utils/entry_counters.h"
+
+namespace latinime {
+
+class HeaderPolicy;
+class Ver4DictBuffers;
+class Ver4PatriciaTrieNodeReader;
+class Ver4PatriciaTrieNodeWriter;
+
+class Ver4PatriciaTrieWritingHelper {
+ public:
+ Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers)
+ : mBuffers(buffers) {}
+
+ bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const;
+
+ // This method cannot be const because the original dictionary buffer will be updated to detect
+ // useless PtNodes during GC.
+ bool writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const dictDirPath);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper);
+
+ class TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+ : public DynamicPtReadingHelper::TraversingEventListener {
+ public:
+ TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(
+ Ver4PatriciaTrieNodeWriter *const ptNodeWriter,
+ const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap)
+ : mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {}
+
+ bool onAscend() { return true; }
+
+ bool onDescend(const int ptNodeArrayPos) { return true; }
+
+ bool onReadingPtNodeArrayTail() { return true; }
+
+ bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds);
+
+ Ver4PatriciaTrieNodeWriter *const mPtNodeWriter;
+ const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap;
+ };
+
+ bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy,
+ Ver4DictBuffers *const buffersToWrite, MutableEntryCounters *const outEntryCounters);
+
+ Ver4DictBuffers *const mBuffers;
+};
+} // namespace latinime
+
+#endif /* LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H */
diff --git a/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp
new file mode 100644
index 000000000..63d0b4ad5
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_pt_node_array_reader.h"
+
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+bool Ver4PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
+ int *const outPtNodeCount, int *const outFirstPtNodePos) const {
+ if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mBuffer->getTailPosition()) {
+ // Reading invalid position because of a bug or a broken dictionary.
+ AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d",
+ ptNodeArrayPos, mBuffer->getTailPosition());
+ ASSERT(false);
+ return false;
+ }
+ const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodeArrayPos);
+ const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+ int readingPos = ptNodeArrayPos;
+ if (usesAdditionalBuffer) {
+ readingPos -= mBuffer->getOriginalBufferSize();
+ }
+ const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
+ dictBuf, &readingPos);
+ if (usesAdditionalBuffer) {
+ readingPos += mBuffer->getOriginalBufferSize();
+ }
+ if (ptNodeCountInArray < 0) {
+ AKLOGE("Invalid PtNode count in an array: %d.", ptNodeCountInArray);
+ return false;
+ }
+ *outPtNodeCount = ptNodeCountInArray;
+ *outFirstPtNodePos = readingPos;
+ return true;
+}
+
+bool Ver4PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos,
+ int *const outNextPtNodeArrayPos) const {
+ if (forwordLinkPos < 0 || forwordLinkPos >= mBuffer->getTailPosition()) {
+ // Reading invalid position because of bug or broken dictionary.
+ AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d",
+ forwordLinkPos, mBuffer->getTailPosition());
+ ASSERT(false);
+ return false;
+ }
+ const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(forwordLinkPos);
+ const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+ int readingPos = forwordLinkPos;
+ if (usesAdditionalBuffer) {
+ readingPos -= mBuffer->getOriginalBufferSize();
+ }
+ const int nextPtNodeArrayOffset =
+ DynamicPtReadingUtils::getForwardLinkPosition(dictBuf, readingPos);
+ if (DynamicPtReadingUtils::isValidForwardLinkPosition(nextPtNodeArrayOffset)) {
+ *outNextPtNodeArrayPos = forwordLinkPos + nextPtNodeArrayOffset;
+ } else {
+ *outNextPtNodeArrayPos = NOT_A_DICT_POS;
+ }
+ return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h
new file mode 100644
index 000000000..ccb760bc1
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_PT_NODE_ARRAY_READER_H
+#define LATINIME_VER4_PT_NODE_ARRAY_READER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/pt_node_array_reader.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+class Ver4PtNodeArrayReader : public PtNodeArrayReader {
+ public:
+ Ver4PtNodeArrayReader(const BufferWithExtendableBuffer *const buffer) : mBuffer(buffer) {};
+
+ virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
+ int *const outPtNodeCount, int *const outFirstPtNodePos) const;
+ virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos,
+ int *const outNextPtNodeArrayPos) const;
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(Ver4PtNodeArrayReader);
+
+ const BufferWithExtendableBuffer *const mBuffer;
+};
+} // namespace latinime
+#endif /* LATINIME_VER4_PT_NODE_ARRAY_READER_H */
diff --git a/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h b/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h
new file mode 100644
index 000000000..8a614730b
--- /dev/null
+++ b/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
+#define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_bigrams_structure_policy.h"
+
+namespace latinime {
+
+class BinaryDictionaryBigramsIterator {
+ public:
+ // Empty iterator.
+ BinaryDictionaryBigramsIterator()
+ : mBigramsStructurePolicy(nullptr), mPos(NOT_A_DICT_POS),
+ mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), mHasNext(false) {}
+
+ BinaryDictionaryBigramsIterator(
+ const DictionaryBigramsStructurePolicy *const bigramsStructurePolicy, const int pos)
+ : mBigramsStructurePolicy(bigramsStructurePolicy), mPos(pos),
+ mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY),
+ mHasNext(pos != NOT_A_DICT_POS) {}
+
+ BinaryDictionaryBigramsIterator(BinaryDictionaryBigramsIterator &&bigramsIterator)
+ : mBigramsStructurePolicy(bigramsIterator.mBigramsStructurePolicy),
+ mPos(bigramsIterator.mPos), mBigramPos(bigramsIterator.mBigramPos),
+ mProbability(bigramsIterator.mProbability), mHasNext(bigramsIterator.mHasNext) {}
+
+ AK_FORCE_INLINE bool hasNext() const {
+ return mHasNext;
+ }
+
+ AK_FORCE_INLINE void next() {
+ mBigramsStructurePolicy->getNextBigram(&mBigramPos, &mProbability, &mHasNext, &mPos);
+ }
+
+ AK_FORCE_INLINE int getProbability() const {
+ return mProbability;
+ }
+
+ AK_FORCE_INLINE int getBigramPos() const {
+ return mBigramPos;
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryBigramsIterator);
+
+ const DictionaryBigramsStructurePolicy *const mBigramsStructurePolicy;
+ int mPos;
+ int mBigramPos;
+ int mProbability;
+ bool mHasNext;
+};
+} // namespace latinime
+#endif // LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
diff --git a/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h b/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h
new file mode 100644
index 000000000..a4ddd58c2
--- /dev/null
+++ b/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H
+#define LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+
+namespace latinime {
+
+class BinaryDictionaryShortcutIterator {
+ public:
+ BinaryDictionaryShortcutIterator(
+ const DictionaryShortcutsStructurePolicy *const shortcutStructurePolicy,
+ const int shortcutPos)
+ : mShortcutStructurePolicy(shortcutStructurePolicy),
+ mPos(shortcutStructurePolicy->getStartPos(shortcutPos)),
+ mHasNextShortcutTarget(shortcutPos != NOT_A_DICT_POS) {}
+
+ BinaryDictionaryShortcutIterator(const BinaryDictionaryShortcutIterator &&shortcutIterator)
+ : mShortcutStructurePolicy(shortcutIterator.mShortcutStructurePolicy),
+ mPos(shortcutIterator.mPos),
+ mHasNextShortcutTarget(shortcutIterator.mHasNextShortcutTarget) {}
+
+ AK_FORCE_INLINE bool hasNextShortcutTarget() const {
+ return mHasNextShortcutTarget;
+ }
+
+ // Gets the shortcut target itself as an int string and put it to outTarget, put its length
+ // to outTargetLength, put whether it is whitelist to outIsWhitelist.
+ AK_FORCE_INLINE void nextShortcutTarget(
+ const int maxDepth, int *const outTarget, int *const outTargetLength,
+ bool *const outIsWhitelist) {
+ mShortcutStructurePolicy->getNextShortcut(maxDepth, outTarget, outTargetLength,
+ outIsWhitelist, &mHasNextShortcutTarget, &mPos);
+ }
+
+ private:
+ DISALLOW_DEFAULT_CONSTRUCTOR(BinaryDictionaryShortcutIterator);
+ DISALLOW_ASSIGNMENT_OPERATOR(BinaryDictionaryShortcutIterator);
+
+ const DictionaryShortcutsStructurePolicy *const mShortcutStructurePolicy;
+ int mPos;
+ bool mHasNextShortcutTarget;
+};
+} // namespace latinime
+#endif // LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H
diff --git a/native/jni/src/dictionary/utils/bloom_filter.h b/native/jni/src/dictionary/utils/bloom_filter.h
new file mode 100644
index 000000000..1e60f49ed
--- /dev/null
+++ b/native/jni/src/dictionary/utils/bloom_filter.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BLOOM_FILTER_H
+#define LATINIME_BLOOM_FILTER_H
+
+#include <bitset>
+
+#include "defines.h"
+
+namespace latinime {
+
+// This bloom filter is used for optimizing bigram retrieval.
+// Execution times with previous word "this" are as follows:
+// without bloom filter (use only hash_map):
+// Total 147792.34 (sum of others 147771.57)
+// with bloom filter:
+// Total 145900.64 (sum of others 145874.30)
+// always read binary dictionary:
+// Total 148603.14 (sum of others 148579.90)
+class BloomFilter {
+ public:
+ BloomFilter() : mFilter() {}
+
+ AK_FORCE_INLINE void setInFilter(const int position) {
+ mFilter.set(getIndex(position));
+ }
+
+ AK_FORCE_INLINE bool isInFilter(const int position) const {
+ return mFilter.test(getIndex(position));
+ }
+
+ private:
+ DISALLOW_ASSIGNMENT_OPERATOR(BloomFilter);
+
+ AK_FORCE_INLINE size_t getIndex(const int position) const {
+ return static_cast<size_t>(position) % BIGRAM_FILTER_MODULO;
+ }
+
+ // Size, in bits, of the bloom filter index for bigrams
+ // The probability of false positive is (1 - e ** (-kn/m))**k,
+ // where k is the number of hash functions, n the number of bigrams, and m the number of
+ // bits we can test.
+ // At the moment 100 is the maximum number of bigrams for a word with the current main
+ // dictionaries, so n = 100. 1024 buckets give us m = 1024.
+ // With 1 hash function, our false positive rate is about 9.3%, which should be enough for
+ // our uses since we are only using this to increase average performance. For the record,
+ // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
+ // and m = 4096 gives 2.4%.
+ // This is assigned here because it is used for bitset size.
+ // 1021 is the largest prime under 1024.
+ static const size_t BIGRAM_FILTER_MODULO = 1021;
+ std::bitset<BIGRAM_FILTER_MODULO> mFilter;
+};
+} // namespace latinime
+#endif // LATINIME_BLOOM_FILTER_H
diff --git a/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp
new file mode 100644
index 000000000..217569651
--- /dev/null
+++ b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+const size_t BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE = 1024 * 1024;
+const int BufferWithExtendableBuffer::NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE = 90;
+// TODO: Needs to allocate larger memory corresponding to the current vector size.
+const size_t BufferWithExtendableBuffer::EXTEND_ADDITIONAL_BUFFER_SIZE_STEP = 128 * 1024;
+
+uint32_t BufferWithExtendableBuffer::readUint(const int size, const int pos) const {
+ const bool readingPosIsInAdditionalBuffer = isInAdditionalBuffer(pos);
+ const int posInBuffer = readingPosIsInAdditionalBuffer ? pos - mOriginalBuffer.size() : pos;
+ return ByteArrayUtils::readUint(getBuffer(readingPosIsInAdditionalBuffer), size, posInBuffer);
+}
+
+uint32_t BufferWithExtendableBuffer::readUintAndAdvancePosition(const int size,
+ int *const pos) const {
+ const uint32_t value = readUint(size, *pos);
+ *pos += size;
+ return value;
+}
+
+void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxCodePointCount,
+ int *const outCodePoints, int *outCodePointCount, int *const pos) const {
+ const bool readingPosIsInAdditionalBuffer = isInAdditionalBuffer(*pos);
+ if (readingPosIsInAdditionalBuffer) {
+ *pos -= mOriginalBuffer.size();
+ }
+ // Code point table is not used for dynamic format.
+ *outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition(
+ getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount,
+ nullptr /* codePointTable */, outCodePoints, pos);
+ if (readingPosIsInAdditionalBuffer) {
+ *pos += mOriginalBuffer.size();
+ }
+}
+
+bool BufferWithExtendableBuffer::extend(const int size) {
+ return checkAndPrepareWriting(getTailPosition(), size);
+}
+
+bool BufferWithExtendableBuffer::writeUint(const uint32_t data, const int size, const int pos) {
+ int writingPos = pos;
+ return writeUintAndAdvancePosition(data, size, &writingPos);
+}
+
+bool BufferWithExtendableBuffer::writeUintAndAdvancePosition(const uint32_t data, const int size,
+ int *const pos) {
+ if (!(size >= 1 && size <= 4)) {
+ AKLOGI("writeUintAndAdvancePosition() is called with invalid size: %d", size);
+ ASSERT(false);
+ return false;
+ }
+ if (!checkAndPrepareWriting(*pos, size)) {
+ return false;
+ }
+ const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos);
+ uint8_t *const buffer =
+ usesAdditionalBuffer ? mAdditionalBuffer.data() : mOriginalBuffer.data();
+ if (usesAdditionalBuffer) {
+ *pos -= mOriginalBuffer.size();
+ }
+ ByteArrayUtils::writeUintAndAdvancePosition(buffer, data, size, pos);
+ if (usesAdditionalBuffer) {
+ *pos += mOriginalBuffer.size();
+ }
+ return true;
+}
+
+bool BufferWithExtendableBuffer::writeCodePointsAndAdvancePosition(const int *const codePoints,
+ const int codePointCount, const bool writesTerminator, int *const pos) {
+ const size_t size = ByteArrayUtils::calculateRequiredByteCountToStoreCodePoints(
+ codePoints, codePointCount, writesTerminator);
+ if (!checkAndPrepareWriting(*pos, size)) {
+ return false;
+ }
+ const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos);
+ uint8_t *const buffer =
+ usesAdditionalBuffer ? mAdditionalBuffer.data() : mOriginalBuffer.data();
+ if (usesAdditionalBuffer) {
+ *pos -= mOriginalBuffer.size();
+ }
+ ByteArrayUtils::writeCodePointsAndAdvancePosition(buffer, codePoints, codePointCount,
+ writesTerminator, pos);
+ if (usesAdditionalBuffer) {
+ *pos += mOriginalBuffer.size();
+ }
+ return true;
+}
+
+bool BufferWithExtendableBuffer::extendBuffer(const size_t size) {
+ const size_t extendSize = std::max(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP, size);
+ const size_t sizeAfterExtending =
+ std::min(mAdditionalBuffer.size() + extendSize, mMaxAdditionalBufferSize);
+ if (sizeAfterExtending < mAdditionalBuffer.size() + size) {
+ return false;
+ }
+ mAdditionalBuffer.resize(sizeAfterExtending);
+ return true;
+}
+
+bool BufferWithExtendableBuffer::checkAndPrepareWriting(const int pos, const int size) {
+ if (pos < 0 || size < 0) {
+ // Invalid position or size.
+ return false;
+ }
+ const size_t totalRequiredSize = static_cast<size_t>(pos + size);
+ if (!isInAdditionalBuffer(pos)) {
+ // Here don't need to care about the additional buffer.
+ if (mOriginalBuffer.size() < totalRequiredSize) {
+ // Violate the boundary.
+ return false;
+ }
+ // The buffer has sufficient capacity.
+ return true;
+ }
+ // Hereafter, pos is in the additional buffer.
+ const size_t tailPosition = static_cast<size_t>(getTailPosition());
+ if (totalRequiredSize <= tailPosition) {
+ // The buffer has sufficient capacity.
+ return true;
+ }
+ if (static_cast<size_t>(pos) != tailPosition) {
+ // The additional buffer must be extended from the tail position.
+ return false;
+ }
+ const size_t extendSize = totalRequiredSize -
+ std::min(mAdditionalBuffer.size() + mOriginalBuffer.size(), totalRequiredSize);
+ if (extendSize > 0 && !extendBuffer(extendSize)) {
+ // Failed to extend the buffer.
+ return false;
+ }
+ mUsedAdditionalBufferSize += size;
+ return true;
+}
+
+bool BufferWithExtendableBuffer::copy(const BufferWithExtendableBuffer *const sourceBuffer) {
+ int copyingPos = 0;
+ const int tailPos = sourceBuffer->getTailPosition();
+ const int maxDataChunkSize = sizeof(uint32_t);
+ while (copyingPos < tailPos) {
+ const int remainingSize = tailPos - copyingPos;
+ const int copyingSize = (remainingSize >= maxDataChunkSize) ?
+ maxDataChunkSize : remainingSize;
+ const uint32_t data = sourceBuffer->readUint(copyingSize, copyingPos);
+ if (!writeUint(data, copyingSize, copyingPos)) {
+ return false;
+ }
+ copyingPos += copyingSize;
+ }
+ return true;
+}
+
+}
diff --git a/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h
new file mode 100644
index 000000000..0a141d4db
--- /dev/null
+++ b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H
+#define LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/utils/byte_array_utils.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+// This is used as a buffer that can be extended for updatable dictionaries.
+// To optimize performance, raw pointer is directly used for reading buffer. The position has to be
+// adjusted to access additional buffer. On the other hand, this class does not provide writable
+// raw pointer but provides several methods that handle boundary checking for writing data.
+class BufferWithExtendableBuffer {
+ public:
+ static const size_t DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE;
+
+ BufferWithExtendableBuffer(const ReadWriteByteArrayView originalBuffer,
+ const int maxAdditionalBufferSize)
+ : mOriginalBuffer(originalBuffer), mAdditionalBuffer(), mUsedAdditionalBufferSize(0),
+ mMaxAdditionalBufferSize(maxAdditionalBufferSize) {}
+
+ // Without original buffer.
+ BufferWithExtendableBuffer(const int maxAdditionalBufferSize)
+ : mOriginalBuffer(), mAdditionalBuffer(), mUsedAdditionalBufferSize(0),
+ mMaxAdditionalBufferSize(maxAdditionalBufferSize) {}
+
+ AK_FORCE_INLINE int getTailPosition() const {
+ return mOriginalBuffer.size() + mUsedAdditionalBufferSize;
+ }
+
+ AK_FORCE_INLINE int getUsedAdditionalBufferSize() const {
+ return mUsedAdditionalBufferSize;
+ }
+
+ /**
+ * For reading.
+ */
+ AK_FORCE_INLINE bool isInAdditionalBuffer(const int position) const {
+ return position >= static_cast<int>(mOriginalBuffer.size());
+ }
+
+ // TODO: Resolve the issue that the address can be changed when the vector is resized.
+ // CAVEAT!: Be careful about array out of bound access with buffers
+ AK_FORCE_INLINE const uint8_t *getBuffer(const bool usesAdditionalBuffer) const {
+ if (usesAdditionalBuffer) {
+ return mAdditionalBuffer.data();
+ } else {
+ return mOriginalBuffer.data();
+ }
+ }
+
+ uint32_t readUint(const int size, const int pos) const;
+
+ uint32_t readUintAndAdvancePosition(const int size, int *const pos) const;
+
+ void readCodePointsAndAdvancePosition(const int maxCodePointCount,
+ int *const outCodePoints, int *outCodePointCount, int *const pos) const;
+
+ AK_FORCE_INLINE int getOriginalBufferSize() const {
+ return mOriginalBuffer.size();
+ }
+
+ AK_FORCE_INLINE bool isNearSizeLimit() const {
+ return mAdditionalBuffer.size() >= ((mMaxAdditionalBufferSize
+ * NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE) / 100);
+ }
+
+ bool extend(const int size);
+
+ /**
+ * For writing.
+ *
+ * Writing is allowed for original buffer, already written region of additional buffer and the
+ * tail of additional buffer.
+ */
+ bool writeUint(const uint32_t data, const int size, const int pos);
+
+ bool writeUintAndAdvancePosition(const uint32_t data, const int size, int *const pos);
+
+ bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount,
+ const bool writesTerminator, int *const pos);
+
+ bool copy(const BufferWithExtendableBuffer *const sourceBuffer);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer);
+
+ static const int NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE;
+ static const size_t EXTEND_ADDITIONAL_BUFFER_SIZE_STEP;
+
+ const ReadWriteByteArrayView mOriginalBuffer;
+ std::vector<uint8_t> mAdditionalBuffer;
+ int mUsedAdditionalBufferSize;
+ const size_t mMaxAdditionalBufferSize;
+
+ // Return if the buffer is successfully extended or not.
+ bool extendBuffer(const size_t size);
+
+ // Returns if it is possible to write size-bytes from pos. When pos is at the tail position of
+ // the additional buffer, try extending the buffer.
+ bool checkAndPrepareWriting(const int pos, const int size);
+};
+}
+#endif /* LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H */
diff --git a/native/jni/src/dictionary/utils/byte_array_utils.cpp b/native/jni/src/dictionary/utils/byte_array_utils.cpp
new file mode 100644
index 000000000..d38f08217
--- /dev/null
+++ b/native/jni/src/dictionary/utils/byte_array_utils.cpp
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20;
+const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF;
+const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F;
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/byte_array_utils.h b/native/jni/src/dictionary/utils/byte_array_utils.h
new file mode 100644
index 000000000..abb979050
--- /dev/null
+++ b/native/jni/src/dictionary/utils/byte_array_utils.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BYTE_ARRAY_UTILS_H
+#define LATINIME_BYTE_ARRAY_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Utility methods for reading byte arrays.
+ */
+class ByteArrayUtils {
+ public:
+ /**
+ * Integer writing
+ *
+ * Each method write a corresponding size integer in a big endian manner.
+ */
+ static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer,
+ const uint32_t data, const int size, int *const pos) {
+ // size must be in 1 to 4.
+ ASSERT(size >= 1 && size <= 4);
+ switch (size) {
+ case 1:
+ ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos);
+ return;
+ case 2:
+ ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos);
+ return;
+ case 3:
+ ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos);
+ return;
+ case 4:
+ ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos);
+ return;
+ default:
+ break;
+ }
+ }
+
+ /**
+ * Integer reading
+ *
+ * Each method read a corresponding size integer in a big endian manner.
+ */
+ static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) {
+ return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16)
+ ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3];
+ }
+
+ static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) {
+ return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2];
+ }
+
+ static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) {
+ return (buffer[pos] << 8) ^ buffer[pos + 1];
+ }
+
+ static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) {
+ return buffer[pos];
+ }
+
+ static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition(
+ const uint8_t *const buffer, int *const pos) {
+ const uint32_t value = readUint32(buffer, *pos);
+ *pos += 4;
+ return value;
+ }
+
+ static AK_FORCE_INLINE int readSint24AndAdvancePosition(
+ const uint8_t *const buffer, int *const pos) {
+ const uint8_t value = readUint8(buffer, *pos);
+ if (value < 0x80) {
+ return readUint24AndAdvancePosition(buffer, pos);
+ } else {
+ (*pos)++;
+ return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos));
+ }
+ }
+
+ static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition(
+ const uint8_t *const buffer, int *const pos) {
+ const uint32_t value = readUint24(buffer, *pos);
+ *pos += 3;
+ return value;
+ }
+
+ static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition(
+ const uint8_t *const buffer, int *const pos) {
+ const uint16_t value = readUint16(buffer, *pos);
+ *pos += 2;
+ return value;
+ }
+
+ static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition(
+ const uint8_t *const buffer, int *const pos) {
+ return buffer[(*pos)++];
+ }
+
+ static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer,
+ const int size, const int pos) {
+ // size must be in 1 to 4.
+ ASSERT(size >= 1 && size <= 4);
+ switch (size) {
+ case 1:
+ return ByteArrayUtils::readUint8(buffer, pos);
+ case 2:
+ return ByteArrayUtils::readUint16(buffer, pos);
+ case 3:
+ return ByteArrayUtils::readUint24(buffer, pos);
+ case 4:
+ return ByteArrayUtils::readUint32(buffer, pos);
+ default:
+ return 0;
+ }
+ }
+
+ /**
+ * Code Point Reading
+ *
+ * 1 byte = bbbbbbbb match
+ * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
+ * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
+ * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
+ * 00011111 would be outside unicode.
+ * else: iso-latin-1 code
+ * This allows for the whole unicode range to be encoded, including chars outside of
+ * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
+ * characters which should never happen anyway (and still work, but take 3 bytes).
+ */
+ static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
+ int p = pos;
+ return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p);
+ }
+
+ static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
+ const uint8_t *const buffer, const int *const codePointTable, int *const pos) {
+ /*
+ * codePointTable is an array to convert the most frequent characters in this dictionary to
+ * 1 byte code points. It is only made of the original code points of the most frequent
+ * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters.
+ * The original code points are restored by picking the code points at the indices of the
+ * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte.
+ */
+ const uint8_t firstByte = readUint8(buffer, *pos);
+ if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
+ if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
+ *pos += 1;
+ return NOT_A_CODE_POINT;
+ } else {
+ return readUint24AndAdvancePosition(buffer, pos);
+ }
+ } else {
+ *pos += 1;
+ if (codePointTable) {
+ return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE];
+ }
+ return firstByte;
+ }
+ }
+
+ /**
+ * String (array of code points) Reading
+ *
+ * Reads code points until the terminator is found.
+ */
+ // Returns the length of the string.
+ static int readStringAndAdvancePosition(const uint8_t *const buffer,
+ const int maxLength, const int *const codePointTable, int *const outBuffer,
+ int *const pos) {
+ int length = 0;
+ int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
+ while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
+ outBuffer[length++] = codePoint;
+ codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
+ }
+ return length;
+ }
+
+ // Advances the position and returns the length of the string.
+ static int advancePositionToBehindString(
+ const uint8_t *const buffer, const int maxLength, int *const pos) {
+ int length = 0;
+ int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
+ while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
+ codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
+ length++;
+ }
+ return length;
+ }
+
+ /**
+ * String (array of code points) Writing
+ */
+ static void writeCodePointsAndAdvancePosition(uint8_t *const buffer,
+ const int *const codePoints, const int codePointCount, const bool writesTerminator,
+ int *const pos) {
+ for (int i = 0; i < codePointCount; ++i) {
+ const int codePoint = codePoints[i];
+ if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
+ break;
+ } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
+ || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
+ // three bytes character.
+ writeUint24AndAdvancePosition(buffer, codePoint, pos);
+ } else {
+ // one byte character.
+ writeUint8AndAdvancePosition(buffer, codePoint, pos);
+ }
+ }
+ if (writesTerminator) {
+ writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos);
+ }
+ }
+
+ static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints,
+ const int codePointCount, const bool writesTerminator) {
+ int byteCount = 0;
+ for (int i = 0; i < codePointCount; ++i) {
+ const int codePoint = codePoints[i];
+ if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
+ break;
+ } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
+ || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
+ // three bytes character.
+ byteCount += 3;
+ } else {
+ // one byte character.
+ byteCount += 1;
+ }
+ }
+ if (writesTerminator) {
+ // The terminator is one byte.
+ byteCount += 1;
+ }
+ return byteCount;
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils);
+
+ static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE;
+ static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE;
+ static const uint8_t CHARACTER_ARRAY_TERMINATOR;
+
+ static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer,
+ const uint32_t data, int *const pos) {
+ buffer[(*pos)++] = (data >> 24) & 0xFF;
+ buffer[(*pos)++] = (data >> 16) & 0xFF;
+ buffer[(*pos)++] = (data >> 8) & 0xFF;
+ buffer[(*pos)++] = data & 0xFF;
+ }
+
+ static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer,
+ const uint32_t data, int *const pos) {
+ buffer[(*pos)++] = (data >> 16) & 0xFF;
+ buffer[(*pos)++] = (data >> 8) & 0xFF;
+ buffer[(*pos)++] = data & 0xFF;
+ }
+
+ static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer,
+ const uint16_t data, int *const pos) {
+ buffer[(*pos)++] = (data >> 8) & 0xFF;
+ buffer[(*pos)++] = data & 0xFF;
+ }
+
+ static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer,
+ const uint8_t data, int *const pos) {
+ buffer[(*pos)++] = data & 0xFF;
+ }
+};
+} // namespace latinime
+#endif /* LATINIME_BYTE_ARRAY_UTILS_H */
diff --git a/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp
new file mode 100644
index 000000000..033a758ba
--- /dev/null
+++ b/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/dict_file_writing_utils.h"
+
+#include <cstdio>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/entry_counters.h"
+#include "dictionary/utils/file_utils.h"
+#include "dictionary/utils/format_utils.h"
+#include "utils/time_keeper.h"
+
+namespace latinime {
+
+const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = ".tmp";
+// Enough size to describe buffer size.
+const int DictFileWritingUtils::SIZE_OF_BUFFER_SIZE_FIELD = 4;
+
+/* static */ bool DictFileWritingUtils::createEmptyDictFile(const char *const filePath,
+ const int dictVersion, const std::vector<int> localeAsCodePointVector,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) {
+ TimeKeeper::setCurrentTime();
+ const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion);
+ switch (formatVersion) {
+ case FormatUtils::VERSION_402:
+ return createEmptyV4DictFile<backward::v402::Ver4DictConstants,
+ backward::v402::Ver4DictBuffers,
+ backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr>(
+ filePath, localeAsCodePointVector, attributeMap, formatVersion);
+ case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+ case FormatUtils::VERSION_403:
+ return createEmptyV4DictFile<Ver4DictConstants, Ver4DictBuffers,
+ Ver4DictBuffers::Ver4DictBuffersPtr>(
+ filePath, localeAsCodePointVector, attributeMap, formatVersion);
+ default:
+ AKLOGE("Cannot create dictionary %s because format version %d is not supported.",
+ filePath, dictVersion);
+ return false;
+ }
+}
+
+template<class DictConstants, class DictBuffers, class DictBuffersPtr>
+/* static */ bool DictFileWritingUtils::createEmptyV4DictFile(const char *const dirPath,
+ const std::vector<int> localeAsCodePointVector,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap,
+ const FormatUtils::FORMAT_VERSION formatVersion) {
+ HeaderPolicy headerPolicy(formatVersion, localeAsCodePointVector, attributeMap);
+ DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy,
+ DictConstants::MAX_DICT_EXTENDED_REGION_SIZE);
+ headerPolicy.fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
+ EntryCounts(), 0 /* extendedRegionSize */, dictBuffers->getWritableHeaderBuffer());
+ if (!DynamicPtWritingUtils::writeEmptyDictionary(
+ dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) {
+ AKLOGE("Empty ver4 dictionary structure cannot be created on memory.");
+ return false;
+ }
+ return dictBuffers->flush(dirPath);
+}
+
+/* static */ bool DictFileWritingUtils::flushBufferToFileWithSuffix(const char *const basePath,
+ const char *const suffix, const BufferWithExtendableBuffer *const buffer) {
+ const int filePathBufSize = FileUtils::getFilePathWithSuffixBufSize(basePath, suffix);
+ char filePath[filePathBufSize];
+ FileUtils::getFilePathWithSuffix(basePath, suffix, filePathBufSize, filePath);
+ return flushBufferToFile(filePath, buffer);
+}
+
+/* static */ bool DictFileWritingUtils::writeBufferToFileTail(FILE *const file,
+ const BufferWithExtendableBuffer *const buffer) {
+ uint8_t bufferSize[SIZE_OF_BUFFER_SIZE_FIELD];
+ int writingPos = 0;
+ ByteArrayUtils::writeUintAndAdvancePosition(bufferSize, buffer->getTailPosition(),
+ SIZE_OF_BUFFER_SIZE_FIELD, &writingPos);
+ if (fwrite(bufferSize, SIZE_OF_BUFFER_SIZE_FIELD, 1 /* count */, file) < 1) {
+ return false;
+ }
+ return writeBufferToFile(file, buffer);
+}
+
+/* static */ bool DictFileWritingUtils::flushBufferToFile(const char *const filePath,
+ const BufferWithExtendableBuffer *const buffer) {
+ const int fd = open(filePath, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+ if (fd == -1) {
+ AKLOGE("File %s cannot be opened. errno: %d", filePath, errno);
+ ASSERT(false);
+ return false;
+ }
+ FILE *const file = fdopen(fd, "wb");
+ if (!file) {
+ AKLOGE("fdopen failed for the file %s. errno: %d", filePath, errno);
+ ASSERT(false);
+ return false;
+ }
+ if (!writeBufferToFile(file, buffer)) {
+ fclose(file);
+ remove(filePath);
+ AKLOGE("Buffer cannot be written to the file %s. size: %d", filePath,
+ buffer->getTailPosition());
+ ASSERT(false);
+ return false;
+ }
+ fclose(file);
+ return true;
+}
+
+// Returns whether the writing was succeeded or not.
+/* static */ bool DictFileWritingUtils::writeBufferToFile(FILE *const file,
+ const BufferWithExtendableBuffer *const buffer) {
+ const int originalBufSize = buffer->getOriginalBufferSize();
+ if (originalBufSize > 0 && fwrite(buffer->getBuffer(false /* usesAdditionalBuffer */),
+ originalBufSize, 1, file) < 1) {
+ return false;
+ }
+ const int additionalBufSize = buffer->getUsedAdditionalBufferSize();
+ if (additionalBufSize > 0 && fwrite(buffer->getBuffer(true /* usesAdditionalBuffer */),
+ additionalBufSize, 1, file) < 1) {
+ return false;
+ }
+ return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/dict_file_writing_utils.h b/native/jni/src/dictionary/utils/dict_file_writing_utils.h
new file mode 100644
index 000000000..102a89da4
--- /dev/null
+++ b/native/jni/src/dictionary/utils/dict_file_writing_utils.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICT_FILE_WRITING_UTILS_H
+#define LATINIME_DICT_FILE_WRITING_UTILS_H
+
+#include <cstdio>
+
+#include "defines.h"
+#include "dictionary/header/header_read_write_utils.h"
+#include "dictionary/utils/format_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+class DictFileWritingUtils {
+ public:
+ static const char *const TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE;
+
+ static bool createEmptyDictFile(const char *const filePath, const int dictVersion,
+ const std::vector<int> localeAsCodePointVector,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap);
+
+ static bool flushBufferToFileWithSuffix(const char *const basePath, const char *const suffix,
+ const BufferWithExtendableBuffer *const buffer);
+
+ static bool writeBufferToFileTail(FILE *const file,
+ const BufferWithExtendableBuffer *const buffer);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(DictFileWritingUtils);
+
+ static const int SIZE_OF_BUFFER_SIZE_FIELD;
+
+ static bool createEmptyV401DictFile(const char *const filePath,
+ const std::vector<int> localeAsCodePointVector,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap,
+ const FormatUtils::FORMAT_VERSION formatVersion);
+
+ template<class DictConstants, class DictBuffers, class DictBuffersPtr>
+ static bool createEmptyV4DictFile(const char *const filePath,
+ const std::vector<int> localeAsCodePointVector,
+ const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap,
+ const FormatUtils::FORMAT_VERSION formatVersion);
+
+ static bool flushBufferToFile(const char *const filePath,
+ const BufferWithExtendableBuffer *const buffer);
+
+ static bool writeBufferToFile(FILE *const file,
+ const BufferWithExtendableBuffer *const buffer);
+};
+} // namespace latinime
+#endif /* LATINIME_DICT_FILE_WRITING_UTILS_H */
diff --git a/native/jni/src/dictionary/utils/entry_counters.h b/native/jni/src/dictionary/utils/entry_counters.h
new file mode 100644
index 000000000..5e443026e
--- /dev/null
+++ b/native/jni/src/dictionary/utils/entry_counters.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_ENTRY_COUNTERS_H
+#define LATINIME_ENTRY_COUNTERS_H
+
+#include <array>
+
+#include "defines.h"
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+// Copyable but immutable
+class EntryCounts final {
+ public:
+ EntryCounts() : mEntryCounts({{0, 0, 0, 0}}) {}
+
+ explicit EntryCounts(const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &counters)
+ : mEntryCounts(counters) {}
+
+ int getNgramCount(const NgramType ngramType) const {
+ return mEntryCounts[static_cast<int>(ngramType)];
+ }
+
+ const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &getCountArray() const {
+ return mEntryCounts;
+ }
+
+ private:
+ DISALLOW_ASSIGNMENT_OPERATOR(EntryCounts);
+
+ // Counts from Unigram (0-th element) to (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram
+ // (MAX_PREV_WORD_COUNT_FOR_N_GRAM-th element)
+ const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> mEntryCounts;
+};
+
+class MutableEntryCounters final {
+ public:
+ MutableEntryCounters() {
+ mEntryCounters.fill(0);
+ }
+
+ explicit MutableEntryCounters(
+ const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &counters)
+ : mEntryCounters(counters) {}
+
+ const EntryCounts getEntryCounts() const {
+ return EntryCounts(mEntryCounters);
+ }
+
+ void incrementNgramCount(const NgramType ngramType) {
+ ++mEntryCounters[static_cast<int>(ngramType)];
+ }
+
+ void decrementNgramCount(const NgramType ngramType) {
+ --mEntryCounters[static_cast<int>(ngramType)];
+ }
+
+ int getNgramCount(const NgramType ngramType) const {
+ return mEntryCounters[static_cast<int>(ngramType)];
+ }
+
+ void setNgramCount(const NgramType ngramType, const int count) {
+ mEntryCounters[static_cast<int>(ngramType)] = count;
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(MutableEntryCounters);
+
+ // Counters from Unigram (0-th element) to (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram
+ // (MAX_PREV_WORD_COUNT_FOR_N_GRAM-th element)
+ std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> mEntryCounters;
+};
+} // namespace latinime
+#endif /* LATINIME_ENTRY_COUNTERS_H */
diff --git a/native/jni/src/dictionary/utils/file_utils.cpp b/native/jni/src/dictionary/utils/file_utils.cpp
new file mode 100644
index 000000000..bb392fb32
--- /dev/null
+++ b/native/jni/src/dictionary/utils/file_utils.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/file_utils.h"
+
+#include <cstdio>
+#include <cstring>
+#include <dirent.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+namespace latinime {
+
+// Returns -1 on error.
+/* static */ int FileUtils::getFileSize(const char *const filePath) {
+ const int fd = open(filePath, O_RDONLY);
+ if (fd == -1) {
+ return -1;
+ }
+ struct stat statBuf;
+ if (fstat(fd, &statBuf) != 0) {
+ close(fd);
+ return -1;
+ }
+ close(fd);
+ return static_cast<int>(statBuf.st_size);
+}
+
+/* static */ bool FileUtils::existsDir(const char *const dirPath) {
+ DIR *const dir = opendir(dirPath);
+ if (dir == NULL) {
+ return false;
+ }
+ closedir(dir);
+ return true;
+}
+
+// Remove a directory and all files in the directory.
+/* static */ bool FileUtils::removeDirAndFiles(const char *const dirPath) {
+ return removeDirAndFiles(dirPath, 5 /* maxTries */);
+}
+
+// Remove a directory and all files in the directory, trying up to maxTimes.
+/* static */ bool FileUtils::removeDirAndFiles(const char *const dirPath, const int maxTries) {
+ DIR *const dir = opendir(dirPath);
+ if (dir == NULL) {
+ AKLOGE("Cannot open dir %s.", dirPath);
+ return true;
+ }
+ struct dirent *dirent;
+ while ((dirent = readdir(dir)) != NULL) {
+ if (dirent->d_type == DT_DIR) {
+ continue;
+ }
+ if (strcmp(dirent->d_name, ".") == 0 || strcmp(dirent->d_name, "..") == 0) {
+ continue;
+ }
+ const int filePathBufSize = getFilePathBufSize(dirPath, dirent->d_name);
+ char filePath[filePathBufSize];
+ getFilePath(dirPath, dirent->d_name, filePathBufSize, filePath);
+ if (remove(filePath) != 0) {
+ AKLOGE("Cannot remove file %s.", filePath);
+ closedir(dir);
+ return false;
+ }
+ }
+ closedir(dir);
+ if (remove(dirPath) != 0) {
+ if (maxTries > 0) {
+ // On NFS, deleting files sometimes creates new files. I'm not sure what the
+ // correct way of dealing with this is, but for the time being, this seems to work.
+ removeDirAndFiles(dirPath, maxTries - 1);
+ } else {
+ AKLOGE("Cannot remove directory %s.", dirPath);
+ return false;
+ }
+ }
+ return true;
+}
+
+/* static */ int FileUtils::getFilePathWithSuffixBufSize(const char *const filePath,
+ const char *const suffix) {
+ return strlen(filePath) + strlen(suffix) + 1 /* terminator */;
+}
+
+/* static */ void FileUtils::getFilePathWithSuffix(const char *const filePath,
+ const char *const suffix, const int filePathBufSize, char *const outFilePath) {
+ snprintf(outFilePath, filePathBufSize, "%s%s", filePath, suffix);
+}
+
+/* static */ int FileUtils::getFilePathBufSize(const char *const dirPath,
+ const char *const fileName) {
+ return strlen(dirPath) + 1 /* '/' */ + strlen(fileName) + 1 /* terminator */;
+}
+
+/* static */ void FileUtils::getFilePath(const char *const dirPath, const char *const fileName,
+ const int filePathBufSize, char *const outFilePath) {
+ snprintf(outFilePath, filePathBufSize, "%s/%s", dirPath, fileName);
+}
+
+/* static */ bool FileUtils::getFilePathWithoutSuffix(const char *const filePath,
+ const char *const suffix, const int outDirPathBufSize, char *const outDirPath) {
+ const int filePathLength = strlen(filePath);
+ const int suffixLength = strlen(suffix);
+ if (filePathLength <= suffixLength) {
+ AKLOGE("File path length (%s:%d) is shorter that suffix length (%s:%d).",
+ filePath, filePathLength, suffix, suffixLength);
+ return false;
+ }
+ const int resultFilePathLength = filePathLength - suffixLength;
+ if (outDirPathBufSize <= resultFilePathLength) {
+ AKLOGE("outDirPathBufSize is too small. filePath: %s, suffix: %s, outDirPathBufSize: %d",
+ filePath, suffix, outDirPathBufSize);
+ return false;
+ }
+ if (strncmp(filePath + resultFilePathLength, suffix, suffixLength) != 0) {
+ AKLOGE("File Path %s does not have %s as a suffix", filePath, suffix);
+ return false;
+ }
+ snprintf(outDirPath, resultFilePathLength + 1 /* terminator */, "%s", filePath);
+ return true;
+}
+
+/* static */ void FileUtils::getDirPath(const char *const filePath, const int outDirPathBufSize,
+ char *const outDirPath) {
+ for (int i = strlen(filePath) - 1; i >= 0; --i) {
+ if (filePath[i] == '/') {
+ if (i >= outDirPathBufSize) {
+ AKLOGE("outDirPathBufSize is too small. filePath: %s, outDirPathBufSize: %d",
+ filePath, outDirPathBufSize);
+ ASSERT(false);
+ return;
+ }
+ snprintf(outDirPath, i + 1 /* terminator */, "%s", filePath);
+ return;
+ }
+ }
+}
+
+/* static */ void FileUtils::getBasename(const char *const filePath,
+ const int outNameBufSize, char *const outName) {
+ const int filePathBufSize = strlen(filePath) + 1 /* terminator */;
+ char filePathBuf[filePathBufSize];
+ snprintf(filePathBuf, filePathBufSize, "%s", filePath);
+ const char *const baseName = basename(filePathBuf);
+ const int baseNameLength = strlen(baseName);
+ if (baseNameLength >= outNameBufSize) {
+ AKLOGE("outNameBufSize is too small. filePath: %s, outNameBufSize: %d",
+ filePath, outNameBufSize);
+ return;
+ }
+ snprintf(outName, baseNameLength + 1 /* terminator */, "%s", baseName);
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/file_utils.h b/native/jni/src/dictionary/utils/file_utils.h
new file mode 100644
index 000000000..4f1b93a6a
--- /dev/null
+++ b/native/jni/src/dictionary/utils/file_utils.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_FILE_UTILS_H
+#define LATINIME_FILE_UTILS_H
+
+#include "defines.h"
+
+namespace latinime {
+
+class FileUtils {
+ public:
+ // Returns -1 on error.
+ static int getFileSize(const char *const filePath);
+
+ static bool existsDir(const char *const dirPath);
+
+ // Remove a directory and all files in the directory.
+ static bool removeDirAndFiles(const char *const dirPath);
+
+ static int getFilePathWithSuffixBufSize(const char *const filePath, const char *const suffix);
+
+ static void getFilePathWithSuffix(const char *const filePath, const char *const suffix,
+ const int filePathBufSize, char *const outFilePath);
+
+ static int getFilePathBufSize(const char *const dirPath, const char *const fileName);
+
+ static void getFilePath(const char *const dirPath, const char *const fileName,
+ const int filePathBufSize, char *const outFilePath);
+
+ // Returns whether the filePath have the suffix.
+ static bool getFilePathWithoutSuffix(const char *const filePath, const char *const suffix,
+ const int dirPathBufSize, char *const outDirPath);
+
+ static void getDirPath(const char *const filePath, const int dirPathBufSize,
+ char *const outDirPath);
+
+ static void getBasename(const char *const filePath, const int outNameBufSize,
+ char *const outName);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(FileUtils);
+
+ static bool removeDirAndFiles(const char *const dirPath, const int maxTries);
+};
+} // namespace latinime
+#endif /* LATINIME_FILE_UTILS_H */
diff --git a/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp
new file mode 100644
index 000000000..d79ed911b
--- /dev/null
+++ b/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/forgetting_curve_utils.h"
+
+#include <algorithm>
+#include <cmath>
+#include <stdlib.h>
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/utils/probability_utils.h"
+#include "utils/time_keeper.h"
+
+namespace latinime {
+
+const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8;
+const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
+
+const int ForgettingCurveUtils::MAX_LEVEL = 15;
+const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 2;
+const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 31;
+const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 30;
+const int ForgettingCurveUtils::OCCURRENCES_TO_RAISE_THE_LEVEL = 1;
+// TODO: Evaluate whether this should be 7.5 days.
+// 15 days
+const int ForgettingCurveUtils::DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS = 15 * 24 * 60 * 60;
+
+const float ForgettingCurveUtils::ENTRY_COUNT_HARD_LIMIT_WEIGHT = 1.2;
+
+const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable;
+
+// TODO: Revise the logic to decide the initial probability depending on the given probability.
+/* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo(
+ const HistoricalInfo *const originalHistoricalInfo, const int newProbability,
+ const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy) {
+ const int timestamp = newHistoricalInfo->getTimestamp();
+ if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) {
+ // Add entry as a valid word.
+ const int level = clampToVisibleEntryLevelRange(newHistoricalInfo->getLevel());
+ const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy);
+ return HistoricalInfo(timestamp, level, count);
+ } else if (!originalHistoricalInfo->isValid()
+ || originalHistoricalInfo->getLevel() < newHistoricalInfo->getLevel()
+ || (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel()
+ && originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) {
+ // Initial information.
+ int count = newHistoricalInfo->getCount();
+ if (count >= OCCURRENCES_TO_RAISE_THE_LEVEL) {
+ const int level = clampToValidLevelRange(newHistoricalInfo->getLevel() + 1);
+ return HistoricalInfo(timestamp, level, 0 /* count */);
+ }
+ const int level = clampToValidLevelRange(newHistoricalInfo->getLevel());
+ return HistoricalInfo(timestamp, level, clampToValidCountRange(count, headerPolicy));
+ } else {
+ const int updatedCount = originalHistoricalInfo->getCount() + 1;
+ if (updatedCount >= OCCURRENCES_TO_RAISE_THE_LEVEL) {
+ // The count exceeds the max value the level can be incremented.
+ if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) {
+ // The level is already max.
+ return HistoricalInfo(timestamp,
+ originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount());
+ } else {
+ // Raise the level.
+ return HistoricalInfo(timestamp,
+ originalHistoricalInfo->getLevel() + 1, 0 /* count */);
+ }
+ } else {
+ return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), updatedCount);
+ }
+ }
+}
+
+/* static */ int ForgettingCurveUtils::decodeProbability(
+ const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) {
+ const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimestamp(),
+ DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS);
+ return sProbabilityTable.getProbability(
+ headerPolicy->getForgettingCurveProbabilityValuesTableId(),
+ clampToValidLevelRange(historicalInfo->getLevel()),
+ clampToValidTimeStepCountRange(elapsedTimeStepCount));
+}
+
+/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo,
+ const HeaderPolicy *const headerPolicy) {
+ return historicalInfo->getLevel() > 0
+ || getElapsedTimeStepCount(historicalInfo->getTimestamp(),
+ DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS)
+ < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
+}
+
+/* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave(
+ const HistoricalInfo *const originalHistoricalInfo,
+ const HeaderPolicy *const headerPolicy) {
+ if (originalHistoricalInfo->getTimestamp() == NOT_A_TIMESTAMP) {
+ return HistoricalInfo();
+ }
+ const int durationToLevelDownInSeconds = DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS;
+ const int elapsedTimeStep = getElapsedTimeStepCount(
+ originalHistoricalInfo->getTimestamp(), durationToLevelDownInSeconds);
+ if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) {
+ // No need to update historical info.
+ return *originalHistoricalInfo;
+ }
+ // Lower the level.
+ const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
+ const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ?
+ originalHistoricalInfo->getLevel() : maxLevelDownAmonut;
+ const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimestamp() +
+ levelDownAmount * durationToLevelDownInSeconds;
+ return HistoricalInfo(adjustedTimestampInSeconds,
+ originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */);
+}
+
+/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay,
+ const EntryCounts &entryCounts, const HeaderPolicy *const headerPolicy) {
+ const EntryCounts &maxNgramCounts = headerPolicy->getMaxNgramCounts();
+ for (const auto ngramType : AllNgramTypes::ASCENDING) {
+ if (entryCounts.getNgramCount(ngramType)
+ >= getEntryCountHardLimit(maxNgramCounts.getNgramCount(ngramType))) {
+ // Unigram count exceeds the limit.
+ return true;
+ }
+ }
+ if (mindsBlockByDecay) {
+ return false;
+ }
+ if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS
+ < TimeKeeper::peekCurrentTime()) {
+ // Time to decay.
+ return true;
+ }
+ return false;
+}
+
+// See comments in ProbabilityUtils::backoff().
+/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) {
+ // See TODO comments in ForgettingCurveUtils::getProbability().
+ return unigramProbability;
+}
+
+/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp,
+ const int durationToLevelDownInSeconds) {
+ const int elapsedTimeInSeconds = TimeKeeper::peekCurrentTime() - timestamp;
+ const int timeStepDurationInSeconds =
+ durationToLevelDownInSeconds / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
+ return elapsedTimeInSeconds / timeStepDurationInSeconds;
+}
+
+/* static */ int ForgettingCurveUtils::clampToVisibleEntryLevelRange(const int level) {
+ return std::min(std::max(level, MIN_VISIBLE_LEVEL), MAX_LEVEL);
+}
+
+/* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count,
+ const HeaderPolicy *const headerPolicy) {
+ return std::min(std::max(count, 0), OCCURRENCES_TO_RAISE_THE_LEVEL - 1);
+}
+
+/* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) {
+ return std::min(std::max(level, 0), MAX_LEVEL);
+}
+
+/* static */ int ForgettingCurveUtils::clampToValidTimeStepCountRange(const int timeStepCount) {
+ return std::min(std::max(timeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT);
+}
+
+const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4;
+const int ForgettingCurveUtils::ProbabilityTable::WEAK_PROBABILITY_TABLE_ID = 0;
+const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = 1;
+const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2;
+const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3;
+const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127;
+const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 8;
+const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 9;
+const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 10;
+
+
+ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() {
+ mTables.resize(PROBABILITY_TABLE_COUNT);
+ for (int tableId = 0; tableId < PROBABILITY_TABLE_COUNT; ++tableId) {
+ mTables[tableId].resize(MAX_LEVEL + 1);
+ for (int level = 0; level <= MAX_LEVEL; ++level) {
+ mTables[tableId][level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1);
+ const float initialProbability = getBaseProbabilityForLevel(tableId, level);
+ const float endProbability = getBaseProbabilityForLevel(tableId, level - 1);
+ for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT;
+ ++timeStepCount) {
+ if (level < MIN_VISIBLE_LEVEL) {
+ mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY;
+ continue;
+ }
+ const float probability = initialProbability
+ * powf(initialProbability / endProbability,
+ -1.0f * static_cast<float>(timeStepCount)
+ / static_cast<float>(MAX_ELAPSED_TIME_STEP_COUNT + 1));
+ mTables[tableId][level][timeStepCount] =
+ std::min(std::max(static_cast<int>(probability), 1), MAX_PROBABILITY);
+ }
+ }
+ }
+}
+
+/* static */ int ForgettingCurveUtils::ProbabilityTable::getBaseProbabilityForLevel(
+ const int tableId, const int level) {
+ if (tableId == WEAK_PROBABILITY_TABLE_ID) {
+ // Max probability is 127.
+ return static_cast<float>(WEAK_MAX_PROBABILITY / (1 << (MAX_LEVEL - level)));
+ } else if (tableId == MODEST_PROBABILITY_TABLE_ID) {
+ // Max probability is 128.
+ return static_cast<float>(MODEST_BASE_PROBABILITY * (level + 1));
+ } else if (tableId == STRONG_PROBABILITY_TABLE_ID) {
+ // Max probability is 140.
+ return static_cast<float>(STRONG_BASE_PROBABILITY * (level + 1));
+ } else if (tableId == AGGRESSIVE_PROBABILITY_TABLE_ID) {
+ // Max probability is 160.
+ return static_cast<float>(AGGRESSIVE_BASE_PROBABILITY * (level + 1));
+ } else {
+ return NOT_A_PROBABILITY;
+ }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/dictionary/utils/forgetting_curve_utils.h
new file mode 100644
index 000000000..ddaac7e3b
--- /dev/null
+++ b/native/jni/src/dictionary/utils/forgetting_curve_utils.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_FORGETTING_CURVE_UTILS_H
+#define LATINIME_FORGETTING_CURVE_UTILS_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/utils/entry_counters.h"
+
+namespace latinime {
+
+class HeaderPolicy;
+
+class ForgettingCurveUtils {
+ public:
+ static const HistoricalInfo createUpdatedHistoricalInfo(
+ const HistoricalInfo *const originalHistoricalInfo, const int newProbability,
+ const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy);
+
+ static const HistoricalInfo createHistoricalInfoToSave(
+ const HistoricalInfo *const originalHistoricalInfo,
+ const HeaderPolicy *const headerPolicy);
+
+ static int decodeProbability(const HistoricalInfo *const historicalInfo,
+ const HeaderPolicy *const headerPolicy);
+
+ static bool needsToKeep(const HistoricalInfo *const historicalInfo,
+ const HeaderPolicy *const headerPolicy);
+
+ static bool needsToDecay(const bool mindsBlockByDecay, const EntryCounts &entryCounters,
+ const HeaderPolicy *const headerPolicy);
+
+ // TODO: Improve probability computation method and remove this.
+ static int getProbabilityBiasForNgram(const int n) {
+ return (n - 1) * MULTIPLIER_TWO_IN_PROBABILITY_SCALE;
+ }
+
+ AK_FORCE_INLINE static int getEntryCountHardLimit(const int maxEntryCount) {
+ return static_cast<int>(static_cast<float>(maxEntryCount)
+ * ENTRY_COUNT_HARD_LIMIT_WEIGHT);
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils);
+
+ class ProbabilityTable {
+ public:
+ ProbabilityTable();
+
+ int getProbability(const int tableId, const int level,
+ const int elapsedTimeStepCount) const {
+ return mTables[tableId][level][elapsedTimeStepCount];
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(ProbabilityTable);
+
+ static const int PROBABILITY_TABLE_COUNT;
+ static const int WEAK_PROBABILITY_TABLE_ID;
+ static const int MODEST_PROBABILITY_TABLE_ID;
+ static const int STRONG_PROBABILITY_TABLE_ID;
+ static const int AGGRESSIVE_PROBABILITY_TABLE_ID;
+
+ static const int WEAK_MAX_PROBABILITY;
+ static const int MODEST_BASE_PROBABILITY;
+ static const int STRONG_BASE_PROBABILITY;
+ static const int AGGRESSIVE_BASE_PROBABILITY;
+
+ std::vector<std::vector<std::vector<int>>> mTables;
+
+ static int getBaseProbabilityForLevel(const int tableId, const int level);
+ };
+
+ static const int MULTIPLIER_TWO_IN_PROBABILITY_SCALE;
+ static const int DECAY_INTERVAL_SECONDS;
+
+ static const int MAX_LEVEL;
+ static const int MIN_VISIBLE_LEVEL;
+ static const int MAX_ELAPSED_TIME_STEP_COUNT;
+ static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
+ static const int OCCURRENCES_TO_RAISE_THE_LEVEL;
+ static const int DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS;
+
+ static const float ENTRY_COUNT_HARD_LIMIT_WEIGHT;
+
+ static const ProbabilityTable sProbabilityTable;
+
+ static int backoff(const int unigramProbability);
+ static int getElapsedTimeStepCount(const int timestamp, const int durationToLevelDown);
+ static int clampToVisibleEntryLevelRange(const int level);
+ static int clampToValidLevelRange(const int level);
+ static int clampToValidCountRange(const int count, const HeaderPolicy *const headerPolicy);
+ static int clampToValidTimeStepCountRange(const int timeStepCount);
+};
+} // namespace latinime
+#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */
diff --git a/native/jni/src/dictionary/utils/format_utils.cpp b/native/jni/src/dictionary/utils/format_utils.cpp
new file mode 100644
index 000000000..cef3b094c
--- /dev/null
+++ b/native/jni/src/dictionary/utils/format_utils.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/format_utils.h"
+
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE;
+
+// Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12
+const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
+
+/* static */ FormatUtils::FORMAT_VERSION FormatUtils::getFormatVersion(const int formatVersion) {
+ switch (formatVersion) {
+ case VERSION_2:
+ case VERSION_201:
+ AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
+ return UNKNOWN_VERSION;
+ case VERSION_202:
+ return VERSION_202;
+ case VERSION_4_ONLY_FOR_TESTING:
+ return VERSION_4_ONLY_FOR_TESTING;
+ case VERSION_402:
+ return VERSION_402;
+ case VERSION_403:
+ return VERSION_403;
+ default:
+ return UNKNOWN_VERSION;
+ }
+}
+/* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion(
+ const ReadOnlyByteArrayView dictBuffer) {
+ // The magic number is stored big-endian.
+ // If the dictionary is less than 4 bytes, we can't even read the magic number, so we don't
+ // understand this format.
+ if (dictBuffer.size() < DICTIONARY_MINIMUM_SIZE) {
+ return UNKNOWN_VERSION;
+ }
+ const uint32_t magicNumber = ByteArrayUtils::readUint32(dictBuffer.data(), 0);
+ switch (magicNumber) {
+ case MAGIC_NUMBER:
+ // The layout of the header is as follows:
+ // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
+ // Dictionary format version number (2 bytes)
+ // Options (2 bytes)
+ // Header size (4 bytes) : integer, big endian
+ // Conceptually this converts the hardcoded value of the bytes in the file into
+ // the symbolic value we use in the code. But we want the constants to be the
+ // same so we use them for both here.
+ return getFormatVersion(ByteArrayUtils::readUint16(dictBuffer.data(), 4));
+ default:
+ return UNKNOWN_VERSION;
+ }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/format_utils.h b/native/jni/src/dictionary/utils/format_utils.h
new file mode 100644
index 000000000..1616efcce
--- /dev/null
+++ b/native/jni/src/dictionary/utils/format_utils.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_FORMAT_UTILS_H
+#define LATINIME_FORMAT_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+/**
+ * Methods to handle binary dictionary format version.
+ */
+class FormatUtils {
+ public:
+ enum FORMAT_VERSION {
+ // These MUST have the same values as the relevant constants in FormatSpec.java.
+ // TODO: Remove VERSION_2 and VERSION_201 when we:
+ // * Confirm that old versions of LatinIME download old-format dictionaries
+ // * We no longer need the corresponding constants on the Java side for dicttool
+ VERSION_2 = 2,
+ VERSION_201 = 201,
+ VERSION_202 = 202,
+ VERSION_4_ONLY_FOR_TESTING = 399,
+ VERSION_402 = 402,
+ VERSION_403 = 403,
+ UNKNOWN_VERSION = -1
+ };
+
+ // 32 bit magic number is stored at the beginning of the dictionary header to reject
+ // unsupported or obsolete dictionary formats.
+ static const uint32_t MAGIC_NUMBER;
+
+ static FORMAT_VERSION getFormatVersion(const int formatVersion);
+ static FORMAT_VERSION detectFormatVersion(const ReadOnlyByteArrayView dictBuffer);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils);
+
+ static const size_t DICTIONARY_MINIMUM_SIZE;
+};
+} // namespace latinime
+#endif /* LATINIME_FORMAT_UTILS_H */
diff --git a/native/jni/src/dictionary/utils/mmapped_buffer.cpp b/native/jni/src/dictionary/utils/mmapped_buffer.cpp
new file mode 100644
index 000000000..c5259de6d
--- /dev/null
+++ b/native/jni/src/dictionary/utils/mmapped_buffer.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/mmapped_buffer.h"
+
+#include <cerrno>
+#include <climits>
+#include <cstdio>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "dictionary/utils/file_utils.h"
+
+namespace latinime {
+
+/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer(
+ const char *const path, const int bufferOffset, const int bufferSize,
+ const bool isUpdatable) {
+ const int mmapFd = open(path, O_RDONLY);
+ if (mmapFd < 0) {
+ AKLOGE("DICT: Can't open the source. path=%s errno=%d", path, errno);
+ return nullptr;
+ }
+ const int pagesize = sysconf(_SC_PAGESIZE);
+ const int offset = bufferOffset % pagesize;
+ int alignedOffset = bufferOffset - offset;
+ int alignedSize = bufferSize + offset;
+ const int protMode = isUpdatable ? PROT_READ | PROT_WRITE : PROT_READ;
+ void *const mmappedBuffer = mmap(0, alignedSize, protMode, MAP_PRIVATE, mmapFd,
+ alignedOffset);
+ if (mmappedBuffer == MAP_FAILED) {
+ AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno);
+ close(mmapFd);
+ return nullptr;
+ }
+ uint8_t *const buffer = static_cast<uint8_t *>(mmappedBuffer) + offset;
+ if (!buffer) {
+ AKLOGE("DICT: buffer is null");
+ close(mmapFd);
+ return nullptr;
+ }
+ return MmappedBufferPtr(new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize,
+ mmapFd, isUpdatable));
+}
+
+/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer(
+ const char *const path, const bool isUpdatable) {
+ const int fileSize = FileUtils::getFileSize(path);
+ if (fileSize == -1) {
+ return nullptr;
+ } else if (fileSize == 0) {
+ return MmappedBufferPtr(new MmappedBuffer(isUpdatable));
+ } else {
+ return openBuffer(path, 0 /* bufferOffset */, fileSize, isUpdatable);
+ }
+}
+
+/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer(
+ const char *const dirPath, const char *const fileName, const bool isUpdatable) {
+ const int filePathBufferSize = PATH_MAX + 1 /* terminator */;
+ char filePath[filePathBufferSize];
+ const int filePathLength = snprintf(filePath, filePathBufferSize, "%s%s", dirPath,
+ fileName);
+ if (filePathLength >= filePathBufferSize) {
+ return nullptr;
+ }
+ return openBuffer(filePath, isUpdatable);
+}
+
+MmappedBuffer::~MmappedBuffer() {
+ if (mAlignedSize == 0) {
+ return;
+ }
+ int ret = munmap(mMmappedBuffer, mAlignedSize);
+ if (ret != 0) {
+ AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno);
+ }
+ ret = close(mMmapFd);
+ if (ret != 0) {
+ AKLOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno);
+ }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/mmapped_buffer.h b/native/jni/src/dictionary/utils/mmapped_buffer.h
new file mode 100644
index 000000000..e25310373
--- /dev/null
+++ b/native/jni/src/dictionary/utils/mmapped_buffer.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_MMAPPED_BUFFER_H
+#define LATINIME_MMAPPED_BUFFER_H
+
+#include <cstdint>
+#include <memory>
+
+#include "defines.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+class MmappedBuffer {
+ public:
+ typedef std::unique_ptr<const MmappedBuffer> MmappedBufferPtr;
+
+ static MmappedBufferPtr openBuffer(const char *const path,
+ const int bufferOffset, const int bufferSize, const bool isUpdatable);
+
+ // Mmap entire file.
+ static MmappedBufferPtr openBuffer(const char *const path, const bool isUpdatable);
+
+ static MmappedBufferPtr openBuffer(const char *const dirPath, const char *const fileName,
+ const bool isUpdatable);
+
+ ~MmappedBuffer();
+
+ ReadWriteByteArrayView getReadWriteByteArrayView() const {
+ return mByteArrayView;
+ }
+
+ ReadOnlyByteArrayView getReadOnlyByteArrayView() const {
+ return mByteArrayView.getReadOnlyView();
+ }
+
+ AK_FORCE_INLINE bool isUpdatable() const {
+ return mIsUpdatable;
+ }
+
+ private:
+ AK_FORCE_INLINE MmappedBuffer(uint8_t *const buffer, const int bufferSize,
+ void *const mmappedBuffer, const int alignedSize, const int mmapFd,
+ const bool isUpdatable)
+ : mByteArrayView(buffer, bufferSize), mMmappedBuffer(mmappedBuffer),
+ mAlignedSize(alignedSize), mMmapFd(mmapFd), mIsUpdatable(isUpdatable) {}
+
+ // Empty file. We have to handle an empty file as a valid part of a dictionary.
+ AK_FORCE_INLINE MmappedBuffer(const bool isUpdatable)
+ : mByteArrayView(), mMmappedBuffer(nullptr), mAlignedSize(0),
+ mMmapFd(0), mIsUpdatable(isUpdatable) {}
+
+ DISALLOW_IMPLICIT_CONSTRUCTORS(MmappedBuffer);
+
+ const ReadWriteByteArrayView mByteArrayView;
+ void *const mMmappedBuffer;
+ const int mAlignedSize;
+ const int mMmapFd;
+ const bool mIsUpdatable;
+};
+}
+#endif /* LATINIME_MMAPPED_BUFFER_H */
diff --git a/native/jni/src/dictionary/utils/multi_bigram_map.cpp b/native/jni/src/dictionary/utils/multi_bigram_map.cpp
new file mode 100644
index 000000000..e730fff8e
--- /dev/null
+++ b/native/jni/src/dictionary/utils/multi_bigram_map.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/multi_bigram_map.h"
+
+#include <cstddef>
+#include <unordered_map>
+
+namespace latinime {
+
+// Max number of bigram maps (previous word contexts) to be cached. Increasing this number
+// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory
+// usage. Also, there are diminishing returns since the most frequently used bigrams are
+// typically near the beginning of the input and are thus the first ones to be cached. Note
+// that these bigrams are reset for each new composing word.
+const size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25;
+
+// Most common previous word contexts currently have 100 bigrams
+const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100;
+
+// Look up the bigram probability for the given word pair from the cached bigram maps.
+// Also caches the bigrams if there is space remaining and they have not been cached already.
+int MultiBigramMap::getBigramProbability(
+ const DictionaryStructureWithBufferPolicy *const structurePolicy,
+ const WordIdArrayView prevWordIds, const int nextWordId,
+ const int unigramProbability) {
+ if (prevWordIds.empty() || prevWordIds[0] == NOT_A_WORD_ID) {
+ return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);
+ }
+ const auto mapPosition = mBigramMaps.find(prevWordIds[0]);
+ if (mapPosition != mBigramMaps.end()) {
+ return mapPosition->second.getBigramProbability(structurePolicy, nextWordId,
+ unigramProbability);
+ }
+ if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
+ addBigramsForWord(structurePolicy, prevWordIds);
+ return mBigramMaps[prevWordIds[0]].getBigramProbability(structurePolicy,
+ nextWordId, unigramProbability);
+ }
+ return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordIds,
+ nextWordId, unigramProbability);
+}
+
+void MultiBigramMap::BigramMap::init(
+ const DictionaryStructureWithBufferPolicy *const structurePolicy,
+ const WordIdArrayView prevWordIds) {
+ structurePolicy->iterateNgramEntries(prevWordIds, this /* listener */);
+}
+
+int MultiBigramMap::BigramMap::getBigramProbability(
+ const DictionaryStructureWithBufferPolicy *const structurePolicy,
+ const int nextWordId, const int unigramProbability) const {
+ int bigramProbability = NOT_A_PROBABILITY;
+ if (mBloomFilter.isInFilter(nextWordId)) {
+ const auto bigramProbabilityIt = mBigramMap.find(nextWordId);
+ if (bigramProbabilityIt != mBigramMap.end()) {
+ bigramProbability = bigramProbabilityIt->second;
+ }
+ }
+ return structurePolicy->getProbability(unigramProbability, bigramProbability);
+}
+
+void MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability, const int targetWordId) {
+ if (targetWordId == NOT_A_WORD_ID) {
+ return;
+ }
+ mBigramMap[targetWordId] = ngramProbability;
+ mBloomFilter.setInFilter(targetWordId);
+}
+
+void MultiBigramMap::addBigramsForWord(
+ const DictionaryStructureWithBufferPolicy *const structurePolicy,
+ const WordIdArrayView prevWordIds) {
+ mBigramMaps[prevWordIds[0]].init(structurePolicy, prevWordIds);
+}
+
+int MultiBigramMap::readBigramProbabilityFromBinaryDictionary(
+ const DictionaryStructureWithBufferPolicy *const structurePolicy,
+ const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability) {
+ const int bigramProbability = structurePolicy->getProbabilityOfWord(prevWordIds, nextWordId);
+ if (bigramProbability != NOT_A_PROBABILITY) {
+ return bigramProbability;
+ }
+ return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/multi_bigram_map.h b/native/jni/src/dictionary/utils/multi_bigram_map.h
new file mode 100644
index 000000000..6f23d98bc
--- /dev/null
+++ b/native/jni/src/dictionary/utils/multi_bigram_map.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_MULTI_BIGRAM_MAP_H
+#define LATINIME_MULTI_BIGRAM_MAP_H
+
+#include <cstddef>
+#include <unordered_map>
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/interface/ngram_listener.h"
+#include "dictionary/utils/binary_dictionary_bigrams_iterator.h"
+#include "dictionary/utils/bloom_filter.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+// Class for caching bigram maps for multiple previous word contexts. This is useful since the
+// algorithm needs to look up the set of bigrams for every word pair that occurs in every
+// multi-word suggestion.
+class MultiBigramMap {
+ public:
+ MultiBigramMap() : mBigramMaps() {}
+ ~MultiBigramMap() {}
+
+ // Look up the bigram probability for the given word pair from the cached bigram maps.
+ // Also caches the bigrams if there is space remaining and they have not been cached already.
+ int getBigramProbability(const DictionaryStructureWithBufferPolicy *const structurePolicy,
+ const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability);
+
+ void clear() {
+ mBigramMaps.clear();
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(MultiBigramMap);
+
+ class BigramMap : public NgramListener {
+ public:
+ BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP), mBloomFilter() {}
+ // Copy constructor needed for std::unordered_map.
+ BigramMap(const BigramMap &bigramMap)
+ : mBigramMap(bigramMap.mBigramMap), mBloomFilter(bigramMap.mBloomFilter) {}
+ virtual ~BigramMap() {}
+
+ void init(const DictionaryStructureWithBufferPolicy *const structurePolicy,
+ const WordIdArrayView prevWordIds);
+ int getBigramProbability(
+ const DictionaryStructureWithBufferPolicy *const structurePolicy,
+ const int nextWordId, const int unigramProbability) const;
+ virtual void onVisitEntry(const int ngramProbability, const int targetWordId);
+
+ private:
+ static const int DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP;
+ std::unordered_map<int, int> mBigramMap;
+ BloomFilter mBloomFilter;
+ };
+
+ void addBigramsForWord(const DictionaryStructureWithBufferPolicy *const structurePolicy,
+ const WordIdArrayView prevWordIds);
+
+ int readBigramProbabilityFromBinaryDictionary(
+ const DictionaryStructureWithBufferPolicy *const structurePolicy,
+ const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability);
+
+ static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP;
+ std::unordered_map<int, BigramMap> mBigramMaps;
+};
+} // namespace latinime
+#endif // LATINIME_MULTI_BIGRAM_MAP_H
diff --git a/native/jni/src/dictionary/utils/probability_utils.cpp b/native/jni/src/dictionary/utils/probability_utils.cpp
new file mode 100644
index 000000000..426a0e783
--- /dev/null
+++ b/native/jni/src/dictionary/utils/probability_utils.cpp
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/probability_utils.h"
+
+namespace latinime {
+
+const float ProbabilityUtils::PROBABILITY_ENCODING_SCALER = 8.58923700372f;
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/probability_utils.h b/native/jni/src/dictionary/utils/probability_utils.h
new file mode 100644
index 000000000..2050af1e9
--- /dev/null
+++ b/native/jni/src/dictionary/utils/probability_utils.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PROBABILITY_UTILS_H
+#define LATINIME_PROBABILITY_UTILS_H
+
+#include <algorithm>
+#include <cmath>
+
+#include "defines.h"
+
+namespace latinime {
+
+// TODO: Quit using bigram probability to indicate the delta.
+class ProbabilityUtils {
+ public:
+ static AK_FORCE_INLINE int backoff(const int unigramProbability) {
+ return unigramProbability;
+ // For some reason, applying the backoff weight gives bad results in tests. To apply the
+ // backoff weight, we divide the probability by 2, which in our storing format means
+ // decreasing the score by 8.
+ // TODO: figure out what's wrong with this.
+ // return unigramProbability > 8 ?
+ // unigramProbability - 8 : (0 == unigramProbability ? 0 : 8);
+ }
+
+ static AK_FORCE_INLINE int computeProbabilityForBigram(
+ const int unigramProbability, const int bigramProbability) {
+ // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want
+ // the unigram probability to be the median value of the 17th step from the top. A value of
+ // 0 for the bigram probability represents the middle of the 16th step from the top,
+ // while a value of 15 represents the middle of the top step.
+ // See makedict.BinaryDictEncoder#makeBigramFlags for details.
+ const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability)
+ / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY);
+ return unigramProbability
+ + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize);
+ }
+
+ // Encode probability using the same way as we are doing for main dictionaries.
+ static AK_FORCE_INLINE int encodeRawProbability(const float rawProbability) {
+ const float probability = static_cast<float>(MAX_PROBABILITY)
+ + log2f(rawProbability) * PROBABILITY_ENCODING_SCALER;
+ if (probability < 0.0f) {
+ return 0;
+ }
+ return std::min(static_cast<int>(probability + 0.5f), MAX_PROBABILITY);
+ }
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils);
+
+ static const float PROBABILITY_ENCODING_SCALER;
+};
+}
+#endif /* LATINIME_PROBABILITY_UTILS_H */
diff --git a/native/jni/src/dictionary/utils/sparse_table.cpp b/native/jni/src/dictionary/utils/sparse_table.cpp
new file mode 100644
index 000000000..029329fab
--- /dev/null
+++ b/native/jni/src/dictionary/utils/sparse_table.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/sparse_table.h"
+
+namespace latinime {
+
+const int SparseTable::NOT_EXIST = -1;
+const int SparseTable::INDEX_SIZE = 4;
+
+bool SparseTable::contains(const int id) const {
+ const int readingPos = getPosInIndexTable(id);
+ if (id < 0 || mIndexTableBuffer->getTailPosition() <= readingPos) {
+ return false;
+ }
+ const int index = mIndexTableBuffer->readUint(INDEX_SIZE, readingPos);
+ return index != NOT_EXIST;
+}
+
+uint32_t SparseTable::get(const int id) const {
+ const int indexTableReadingPos = getPosInIndexTable(id);
+ const int index = mIndexTableBuffer->readUint(INDEX_SIZE, indexTableReadingPos);
+ const int contentTableReadingPos = getPosInContentTable(id, index);
+ if (contentTableReadingPos < 0
+ || contentTableReadingPos >= mContentTableBuffer->getTailPosition()) {
+ AKLOGE("contentTableReadingPos(%d) is invalid. id: %d, index: %d",
+ contentTableReadingPos, id, index);
+ return NOT_A_DICT_POS;
+ }
+ const int contentValue = mContentTableBuffer->readUint(mDataSize, contentTableReadingPos);
+ return contentValue == NOT_EXIST ? NOT_A_DICT_POS : contentValue;
+}
+
+bool SparseTable::set(const int id, const uint32_t value) {
+ const int posInIndexTable = getPosInIndexTable(id);
+ // Extends the index table if needed.
+ int tailPos = mIndexTableBuffer->getTailPosition();
+ while (tailPos <= posInIndexTable) {
+ if (!mIndexTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, INDEX_SIZE, &tailPos)) {
+ AKLOGE("cannot extend index table. tailPos: %d to: %d", tailPos, posInIndexTable);
+ return false;
+ }
+ }
+ if (contains(id)) {
+ // The entry is already in the content table.
+ const int index = mIndexTableBuffer->readUint(INDEX_SIZE, posInIndexTable);
+ if (!mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index))) {
+ AKLOGE("cannot update value %d. pos: %d, tailPos: %d, mDataSize: %d", value,
+ getPosInContentTable(id, index), mContentTableBuffer->getTailPosition(),
+ mDataSize);
+ return false;
+ }
+ return true;
+ }
+ // The entry is not in the content table.
+ // Create new entry in the content table.
+ const int index = getIndexFromContentTablePos(mContentTableBuffer->getTailPosition());
+ if (!mIndexTableBuffer->writeUint(index, INDEX_SIZE, posInIndexTable)) {
+ AKLOGE("cannot write index %d. pos %d", index, posInIndexTable);
+ return false;
+ }
+ // Write a new block that containing the entry to be set.
+ int writingPos = getPosInContentTable(0 /* id */, index);
+ for (int i = 0; i < mBlockSize; ++i) {
+ if (!mContentTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, mDataSize,
+ &writingPos)) {
+ AKLOGE("cannot write content table to extend. writingPos: %d, tailPos: %d, "
+ "mDataSize: %d", writingPos, mContentTableBuffer->getTailPosition(), mDataSize);
+ return false;
+ }
+ }
+ return mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index));
+}
+
+int SparseTable::getIndexFromContentTablePos(const int contentTablePos) const {
+ return contentTablePos / mDataSize / mBlockSize;
+}
+
+int SparseTable::getPosInIndexTable(const int id) const {
+ return (id / mBlockSize) * INDEX_SIZE;
+}
+
+int SparseTable::getPosInContentTable(const int id, const int index) const {
+ const int offset = id % mBlockSize;
+ return (index * mBlockSize + offset) * mDataSize;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/sparse_table.h b/native/jni/src/dictionary/utils/sparse_table.h
new file mode 100644
index 000000000..bd1190e8b
--- /dev/null
+++ b/native/jni/src/dictionary/utils/sparse_table.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_SPARSE_TABLE_H
+#define LATINIME_SPARSE_TABLE_H
+
+#include <cstdint>
+
+#include "defines.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+// TODO: Support multiple content buffers.
+class SparseTable {
+ public:
+ SparseTable(BufferWithExtendableBuffer *const indexTableBuffer,
+ BufferWithExtendableBuffer *const contentTableBuffer, const int blockSize,
+ const int dataSize)
+ : mIndexTableBuffer(indexTableBuffer), mContentTableBuffer(contentTableBuffer),
+ mBlockSize(blockSize), mDataSize(dataSize) {}
+
+ bool contains(const int id) const;
+
+ uint32_t get(const int id) const;
+
+ bool set(const int id, const uint32_t value);
+
+ private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTable);
+
+ int getIndexFromContentTablePos(const int contentTablePos) const;
+
+ int getPosInIndexTable(const int id) const;
+
+ int getPosInContentTable(const int id, const int index) const;
+
+ static const int NOT_EXIST;
+ static const int INDEX_SIZE;
+
+ BufferWithExtendableBuffer *const mIndexTableBuffer;
+ BufferWithExtendableBuffer *const mContentTableBuffer;
+ const int mBlockSize;
+ const int mDataSize;
+};
+} // namespace latinime
+#endif /* LATINIME_SPARSE_TABLE_H */
diff --git a/native/jni/src/dictionary/utils/trie_map.cpp b/native/jni/src/dictionary/utils/trie_map.cpp
new file mode 100644
index 000000000..0bef8c702
--- /dev/null
+++ b/native/jni/src/dictionary/utils/trie_map.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/trie_map.h"
+
+#include "dictionary/utils/dict_file_writing_utils.h"
+
+namespace latinime {
+
+const int TrieMap::INVALID_INDEX = -1;
+const int TrieMap::FIELD0_SIZE = 4;
+const int TrieMap::FIELD1_SIZE = 3;
+const int TrieMap::ENTRY_SIZE = FIELD0_SIZE + FIELD1_SIZE;
+const uint32_t TrieMap::VALUE_FLAG = 0x400000;
+const uint32_t TrieMap::VALUE_MASK = 0x3FFFFF;
+const uint32_t TrieMap::INVALID_VALUE_IN_KEY_VALUE_ENTRY = VALUE_MASK;
+const uint32_t TrieMap::TERMINAL_LINK_FLAG = 0x800000;
+const uint32_t TrieMap::TERMINAL_LINK_MASK = 0x7FFFFF;
+const int TrieMap::NUM_OF_BITS_USED_FOR_ONE_LEVEL = 5;
+const uint32_t TrieMap::LABEL_MASK = 0x1F;
+const int TrieMap::MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL = 1 << NUM_OF_BITS_USED_FOR_ONE_LEVEL;
+const int TrieMap::ROOT_BITMAP_ENTRY_INDEX = 0;
+const int TrieMap::ROOT_BITMAP_ENTRY_POS = MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL * FIELD0_SIZE;
+const TrieMap::Entry TrieMap::EMPTY_BITMAP_ENTRY = TrieMap::Entry(0, 0);
+const int TrieMap::TERMINAL_LINKED_ENTRY_COUNT = 2; // Value entry and bitmap entry.
+const uint64_t TrieMap::MAX_VALUE =
+ (static_cast<uint64_t>(1) << ((FIELD0_SIZE + FIELD1_SIZE) * CHAR_BIT)) - 1;
+const int TrieMap::MAX_BUFFER_SIZE = TERMINAL_LINK_MASK * ENTRY_SIZE;
+
+TrieMap::TrieMap() : mBuffer(MAX_BUFFER_SIZE) {
+ mBuffer.extend(ROOT_BITMAP_ENTRY_POS);
+ writeEntry(EMPTY_BITMAP_ENTRY, ROOT_BITMAP_ENTRY_INDEX);
+}
+
+TrieMap::TrieMap(const ReadWriteByteArrayView buffer)
+ : mBuffer(buffer, BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {}
+
+void TrieMap::dump(const int from, const int to) const {
+ AKLOGI("BufSize: %d", mBuffer.getTailPosition());
+ for (int i = from; i < to; ++i) {
+ AKLOGI("Entry[%d]: %x, %x", i, readField0(i), readField1(i));
+ }
+ int unusedRegionSize = 0;
+ for (int i = 1; i <= MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL; ++i) {
+ int index = readEmptyTableLink(i);
+ while (index != ROOT_BITMAP_ENTRY_INDEX) {
+ index = readField0(index);
+ unusedRegionSize += i;
+ }
+ }
+ AKLOGI("Unused Size: %d", unusedRegionSize);
+}
+
+int TrieMap::getNextLevelBitmapEntryIndex(const int key, const int bitmapEntryIndex) {
+ const Entry bitmapEntry = readEntry(bitmapEntryIndex);
+ const uint32_t unsignedKey = static_cast<uint32_t>(key);
+ const int terminalEntryIndex = getTerminalEntryIndex(
+ unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntry, 0 /* level */);
+ if (terminalEntryIndex == INVALID_INDEX) {
+ // Not found.
+ return INVALID_INDEX;
+ }
+ const Entry terminalEntry = readEntry(terminalEntryIndex);
+ if (terminalEntry.hasTerminalLink()) {
+ return terminalEntry.getValueEntryIndex() + 1;
+ }
+ // Create a value entry and a bitmap entry.
+ const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT);
+ if (valueEntryIndex == INVALID_INDEX) {
+ return INVALID_INDEX;
+ }
+ if (!writeEntry(Entry(0, terminalEntry.getValue()), valueEntryIndex)) {
+ return INVALID_INDEX;
+ }
+ if (!writeEntry(EMPTY_BITMAP_ENTRY, valueEntryIndex + 1)) {
+ return INVALID_INDEX;
+ }
+ if (!writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, terminalEntryIndex)) {
+ return INVALID_INDEX;
+ }
+ return valueEntryIndex + 1;
+}
+
+const TrieMap::Result TrieMap::get(const int key, const int bitmapEntryIndex) const {
+ const uint32_t unsignedKey = static_cast<uint32_t>(key);
+ return getInternal(unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntryIndex,
+ 0 /* level */);
+}
+
+bool TrieMap::put(const int key, const uint64_t value, const int bitmapEntryIndex) {
+ if (value > MAX_VALUE) {
+ return false;
+ }
+ const uint32_t unsignedKey = static_cast<uint32_t>(key);
+ return putInternal(unsignedKey, value, getBitShuffledKey(unsignedKey), bitmapEntryIndex,
+ readEntry(bitmapEntryIndex), 0 /* level */);
+}
+
+bool TrieMap::save(FILE *const file) const {
+ return DictFileWritingUtils::writeBufferToFileTail(file, &mBuffer);
+}
+
+bool TrieMap::remove(const int key, const int bitmapEntryIndex) {
+ const Entry bitmapEntry = readEntry(bitmapEntryIndex);
+ const uint32_t unsignedKey = static_cast<uint32_t>(key);
+ const int terminalEntryIndex = getTerminalEntryIndex(
+ unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntry, 0 /* level */);
+ if (terminalEntryIndex == INVALID_INDEX) {
+ // Not found.
+ return false;
+ }
+ const Entry terminalEntry = readEntry(terminalEntryIndex);
+ if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , terminalEntryIndex)) {
+ return false;
+ }
+ if (terminalEntry.hasTerminalLink()) {
+ const Entry nextLevelBitmapEntry = readEntry(terminalEntry.getValueEntryIndex() + 1);
+ if (!freeTable(terminalEntry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) {
+ return false;
+ }
+ if (!removeInner(nextLevelBitmapEntry)){
+ return false;
+ }
+ }
+ return true;
+}
+
+/**
+ * Iterate next entry in a certain level.
+ *
+ * @param iterationState the iteration state that will be read and updated in this method.
+ * @param outKey the output key
+ * @return Result instance. mIsValid is false when all entries are iterated.
+ */
+const TrieMap::Result TrieMap::iterateNext(std::vector<TableIterationState> *const iterationState,
+ int *const outKey) const {
+ while (!iterationState->empty()) {
+ TableIterationState &state = iterationState->back();
+ if (state.mTableSize <= state.mCurrentIndex) {
+ // Move to parent.
+ iterationState->pop_back();
+ } else {
+ const int entryIndex = state.mTableIndex + state.mCurrentIndex;
+ state.mCurrentIndex += 1;
+ const Entry entry = readEntry(entryIndex);
+ if (entry.isBitmapEntry()) {
+ // Move to child.
+ iterationState->emplace_back(popCount(entry.getBitmap()), entry.getTableIndex());
+ } else if (entry.isValidTerminalEntry()) {
+ if (outKey) {
+ *outKey = entry.getKey();
+ }
+ if (!entry.hasTerminalLink()) {
+ return Result(entry.getValue(), true, INVALID_INDEX);
+ }
+ const int valueEntryIndex = entry.getValueEntryIndex();
+ const Entry valueEntry = readEntry(valueEntryIndex);
+ return Result(valueEntry.getValueOfValueEntry(), true, valueEntryIndex + 1);
+ }
+ }
+ }
+ // Visited all entries.
+ return Result(0, false, INVALID_INDEX);
+}
+
+/**
+ * Shuffle bits of the key in the fixed order.
+ *
+ * This method is used as a hash function. This returns different values for different inputs.
+ */
+uint32_t TrieMap::getBitShuffledKey(const uint32_t key) const {
+ uint32_t shuffledKey = 0;
+ for (int i = 0; i < 4; ++i) {
+ const uint32_t keyPiece = (key >> (i * 8)) & 0xFF;
+ shuffledKey ^= ((keyPiece ^ (keyPiece << 7) ^ (keyPiece << 14) ^ (keyPiece << 21))
+ & 0x11111111) << i;
+ }
+ return shuffledKey;
+}
+
+bool TrieMap::writeValue(const uint64_t value, const int terminalEntryIndex) {
+ if (value < VALUE_MASK) {
+ // Write value into the terminal entry.
+ return writeField1(value | VALUE_FLAG, terminalEntryIndex);
+ }
+ // Create value entry and write value.
+ const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT);
+ if (valueEntryIndex == INVALID_INDEX) {
+ return false;
+ }
+ if (!writeEntry(Entry(value >> (FIELD1_SIZE * CHAR_BIT), value), valueEntryIndex)) {
+ return false;
+ }
+ if (!writeEntry(EMPTY_BITMAP_ENTRY, valueEntryIndex + 1)) {
+ return false;
+ }
+ return writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, terminalEntryIndex);
+}
+
+bool TrieMap::updateValue(const Entry &terminalEntry, const uint64_t value,
+ const int terminalEntryIndex) {
+ if (!terminalEntry.hasTerminalLink()) {
+ return writeValue(value, terminalEntryIndex);
+ }
+ const int valueEntryIndex = terminalEntry.getValueEntryIndex();
+ return writeEntry(Entry(value >> (FIELD1_SIZE * CHAR_BIT), value), valueEntryIndex);
+}
+
+bool TrieMap::freeTable(const int tableIndex, const int entryCount) {
+ if (!writeField0(readEmptyTableLink(entryCount), tableIndex)) {
+ return false;
+ }
+ return writeEmptyTableLink(tableIndex, entryCount);
+}
+
+/**
+ * Allocate table with entryCount-entries. Reuse freed table if possible.
+ */
+int TrieMap::allocateTable(const int entryCount) {
+ if (entryCount > 0 && entryCount <= MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL) {
+ const int tableIndex = readEmptyTableLink(entryCount);
+ if (tableIndex > 0) {
+ if (!writeEmptyTableLink(readField0(tableIndex), entryCount)) {
+ return INVALID_INDEX;
+ }
+ // Reuse the table.
+ return tableIndex;
+ }
+ }
+ // Allocate memory space at tail position of the buffer.
+ const int mapIndex = getTailEntryIndex();
+ if (!mBuffer.extend(entryCount * ENTRY_SIZE)) {
+ return INVALID_INDEX;
+ }
+ return mapIndex;
+}
+
+int TrieMap::getTerminalEntryIndex(const uint32_t key, const uint32_t hashedKey,
+ const Entry &bitmapEntry, const int level) const {
+ const int label = getLabel(hashedKey, level);
+ if (!exists(bitmapEntry.getBitmap(), label)) {
+ return INVALID_INDEX;
+ }
+ const int entryIndex = bitmapEntry.getTableIndex() + popCount(bitmapEntry.getBitmap(), label);
+ const Entry entry = readEntry(entryIndex);
+ if (entry.isBitmapEntry()) {
+ // Move to the next level.
+ return getTerminalEntryIndex(key, hashedKey, entry, level + 1);
+ }
+ if (!entry.isValidTerminalEntry()) {
+ return INVALID_INDEX;
+ }
+ if (entry.getKey() == key) {
+ // Terminal entry is found.
+ return entryIndex;
+ }
+ return INVALID_INDEX;
+}
+
+/**
+ * Get Result corresponding to the key.
+ *
+ * @param key the key.
+ * @param hashedKey the hashed key.
+ * @param bitmapEntryIndex the index of bitmap entry
+ * @param level current level
+ * @return Result instance corresponding to the key. mIsValid indicates whether the key is in the
+ * map.
+ */
+const TrieMap::Result TrieMap::getInternal(const uint32_t key, const uint32_t hashedKey,
+ const int bitmapEntryIndex, const int level) const {
+ const int terminalEntryIndex = getTerminalEntryIndex(key, hashedKey,
+ readEntry(bitmapEntryIndex), level);
+ if (terminalEntryIndex == INVALID_INDEX) {
+ // Not found.
+ return Result(0, false, INVALID_INDEX);
+ }
+ const Entry terminalEntry = readEntry(terminalEntryIndex);
+ if (!terminalEntry.hasTerminalLink()) {
+ return Result(terminalEntry.getValue(), true, INVALID_INDEX);
+ }
+ const int valueEntryIndex = terminalEntry.getValueEntryIndex();
+ const Entry valueEntry = readEntry(valueEntryIndex);
+ return Result(valueEntry.getValueOfValueEntry(), true, valueEntryIndex + 1);
+}
+
+/**
+ * Put key to value mapping to the map.
+ *
+ * @param key the key.
+ * @param value the value
+ * @param hashedKey the hashed key.
+ * @param bitmapEntryIndex the index of bitmap entry
+ * @param bitmapEntry the bitmap entry
+ * @param level current level
+ * @return whether the key-value has been correctly inserted to the map or not.
+ */
+bool TrieMap::putInternal(const uint32_t key, const uint64_t value, const uint32_t hashedKey,
+ const int bitmapEntryIndex, const Entry &bitmapEntry, const int level) {
+ const int label = getLabel(hashedKey, level);
+ const uint32_t bitmap = bitmapEntry.getBitmap();
+ const int mapIndex = bitmapEntry.getTableIndex();
+ if (!exists(bitmap, label)) {
+ // Current map doesn't contain the label.
+ return addNewEntryByExpandingTable(key, value, mapIndex, bitmap, bitmapEntryIndex, label);
+ }
+ const int entryIndex = mapIndex + popCount(bitmap, label);
+ const Entry entry = readEntry(entryIndex);
+ if (entry.isBitmapEntry()) {
+ // Bitmap entry is found. Go to the next level.
+ return putInternal(key, value, hashedKey, entryIndex, entry, level + 1);
+ }
+ if (!entry.isValidTerminalEntry()) {
+ // Overwrite invalid terminal entry.
+ return writeTerminalEntry(key, value, entryIndex);
+ }
+ if (entry.getKey() == key) {
+ // Terminal entry for the key is found. Update the value.
+ return updateValue(entry, value, entryIndex);
+ }
+ // Conflict with the existing key.
+ return addNewEntryByResolvingConflict(key, value, hashedKey, entry, entryIndex, level);
+}
+
+/**
+ * Resolve a conflict in the current level and add new entry.
+ *
+ * @param key the key
+ * @param value the value
+ * @param hashedKey the hashed key
+ * @param conflictedEntry the existing conflicted entry
+ * @param conflictedEntryIndex the index of existing conflicted entry
+ * @param level current level
+ * @return whether the key-value has been correctly inserted to the map or not.
+ */
+bool TrieMap::addNewEntryByResolvingConflict(const uint32_t key, const uint64_t value,
+ const uint32_t hashedKey, const Entry &conflictedEntry, const int conflictedEntryIndex,
+ const int level) {
+ const int conflictedKeyNextLabel =
+ getLabel(getBitShuffledKey(conflictedEntry.getKey()), level + 1);
+ const int nextLabel = getLabel(hashedKey, level + 1);
+ if (conflictedKeyNextLabel == nextLabel) {
+ // Conflicted again in the next level.
+ const int newTableIndex = allocateTable(1 /* entryCount */);
+ if (newTableIndex == INVALID_INDEX) {
+ return false;
+ }
+ if (!writeEntry(conflictedEntry, newTableIndex)) {
+ return false;
+ }
+ const Entry newBitmapEntry(setExist(0 /* bitmap */, nextLabel), newTableIndex);
+ if (!writeEntry(newBitmapEntry, conflictedEntryIndex)) {
+ return false;
+ }
+ return putInternal(key, value, hashedKey, conflictedEntryIndex, newBitmapEntry, level + 1);
+ }
+ // The conflict has been resolved. Create a table that contains 2 entries.
+ const int newTableIndex = allocateTable(2 /* entryCount */);
+ if (newTableIndex == INVALID_INDEX) {
+ return false;
+ }
+ if (nextLabel < conflictedKeyNextLabel) {
+ if (!writeTerminalEntry(key, value, newTableIndex)) {
+ return false;
+ }
+ if (!writeEntry(conflictedEntry, newTableIndex + 1)) {
+ return false;
+ }
+ } else { // nextLabel > conflictedKeyNextLabel
+ if (!writeEntry(conflictedEntry, newTableIndex)) {
+ return false;
+ }
+ if (!writeTerminalEntry(key, value, newTableIndex + 1)) {
+ return false;
+ }
+ }
+ const uint32_t updatedBitmap =
+ setExist(setExist(0 /* bitmap */, nextLabel), conflictedKeyNextLabel);
+ return writeEntry(Entry(updatedBitmap, newTableIndex), conflictedEntryIndex);
+}
+
+/**
+ * Add new entry to the existing table.
+ */
+bool TrieMap::addNewEntryByExpandingTable(const uint32_t key, const uint64_t value,
+ const int tableIndex, const uint32_t bitmap, const int bitmapEntryIndex, const int label) {
+ // Current map doesn't contain the label.
+ const int entryCount = popCount(bitmap);
+ const int newTableIndex = allocateTable(entryCount + 1);
+ if (newTableIndex == INVALID_INDEX) {
+ return false;
+ }
+ const int newEntryIndexInTable = popCount(bitmap, label);
+ // Copy from existing table to the new table.
+ for (int i = 0; i < entryCount; ++i) {
+ if (!copyEntry(tableIndex + i, newTableIndex + i + (i >= newEntryIndexInTable ? 1 : 0))) {
+ return false;
+ }
+ }
+ // Write new terminal entry.
+ if (!writeTerminalEntry(key, value, newTableIndex + newEntryIndexInTable)) {
+ return false;
+ }
+ // Update bitmap.
+ if (!writeEntry(Entry(setExist(bitmap, label), newTableIndex), bitmapEntryIndex)) {
+ return false;
+ }
+ if (entryCount > 0) {
+ return freeTable(tableIndex, entryCount);
+ }
+ return true;
+}
+
+bool TrieMap::removeInner(const Entry &bitmapEntry) {
+ const int tableSize = popCount(bitmapEntry.getBitmap());
+ if (tableSize <= 0) {
+ // The table is empty. No need to remove any entries.
+ return true;
+ }
+ for (int i = 0; i < tableSize; ++i) {
+ const int entryIndex = bitmapEntry.getTableIndex() + i;
+ const Entry entry = readEntry(entryIndex);
+ if (entry.isBitmapEntry()) {
+ // Delete next bitmap entry recursively.
+ if (!removeInner(entry)) {
+ return false;
+ }
+ } else {
+ // Invalidate terminal entry just in case.
+ if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , entryIndex)) {
+ return false;
+ }
+ if (entry.hasTerminalLink()) {
+ const Entry nextLevelBitmapEntry = readEntry(entry.getValueEntryIndex() + 1);
+ if (!freeTable(entry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) {
+ return false;
+ }
+ if (!removeInner(nextLevelBitmapEntry)) {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/trie_map.h b/native/jni/src/dictionary/utils/trie_map.h
new file mode 100644
index 000000000..5fc6c2690
--- /dev/null
+++ b/native/jni/src/dictionary/utils/trie_map.h
@@ -0,0 +1,399 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_TRIE_MAP_H
+#define LATINIME_TRIE_MAP_H
+
+#include <climits>
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+/**
+ * Trie map derived from Phil Bagwell's Hash Array Mapped Trie.
+ * key is int and value is uint64_t.
+ * This supports multiple level map. Terminal entries can have a bitmap for the next level map.
+ * This doesn't support root map resizing.
+ */
+class TrieMap {
+ public:
+ struct Result {
+ const uint64_t mValue;
+ const bool mIsValid;
+ const int mNextLevelBitmapEntryIndex;
+
+ Result(const uint64_t value, const bool isValid, const int nextLevelBitmapEntryIndex)
+ : mValue(value), mIsValid(isValid),
+ mNextLevelBitmapEntryIndex(nextLevelBitmapEntryIndex) {}
+ };
+
+ /**
+ * Struct to record iteration state in a table.
+ */
+ struct TableIterationState {
+ int mTableSize;
+ int mTableIndex;
+ int mCurrentIndex;
+
+ TableIterationState(const int tableSize, const int tableIndex)
+ : mTableSize(tableSize), mTableIndex(tableIndex), mCurrentIndex(0) {}
+ };
+
+ class TrieMapRange;
+ class TrieMapIterator {
+ public:
+ class IterationResult {
+ public:
+ IterationResult(const TrieMap *const trieMap, const int key, const uint64_t value,
+ const int nextLeveBitmapEntryIndex)
+ : mTrieMap(trieMap), mKey(key), mValue(value),
+ mNextLevelBitmapEntryIndex(nextLeveBitmapEntryIndex) {}
+
+ const TrieMapRange getEntriesInNextLevel() const {
+ return TrieMapRange(mTrieMap, mNextLevelBitmapEntryIndex);
+ }
+
+ bool hasNextLevelMap() const {
+ return mNextLevelBitmapEntryIndex != INVALID_INDEX;
+ }
+
+ AK_FORCE_INLINE int key() const {
+ return mKey;
+ }
+
+ AK_FORCE_INLINE uint64_t value() const {
+ return mValue;
+ }
+
+ AK_FORCE_INLINE int getNextLevelBitmapEntryIndex() const {
+ return mNextLevelBitmapEntryIndex;
+ }
+
+ private:
+ const TrieMap *const mTrieMap;
+ const int mKey;
+ const uint64_t mValue;
+ const int mNextLevelBitmapEntryIndex;
+ };
+
+ TrieMapIterator(const TrieMap *const trieMap, const int bitmapEntryIndex)
+ : mTrieMap(trieMap), mStateStack(), mBaseBitmapEntryIndex(bitmapEntryIndex),
+ mKey(0), mValue(0), mIsValid(false), mNextLevelBitmapEntryIndex(INVALID_INDEX) {
+ if (!trieMap || mBaseBitmapEntryIndex == INVALID_INDEX) {
+ return;
+ }
+ const Entry bitmapEntry = mTrieMap->readEntry(mBaseBitmapEntryIndex);
+ mStateStack.emplace_back(
+ mTrieMap->popCount(bitmapEntry.getBitmap()), bitmapEntry.getTableIndex());
+ this->operator++();
+ }
+
+ const IterationResult operator*() const {
+ return IterationResult(mTrieMap, mKey, mValue, mNextLevelBitmapEntryIndex);
+ }
+
+ bool operator!=(const TrieMapIterator &other) const {
+ // Caveat: This works only for for loops.
+ return mIsValid || other.mIsValid;
+ }
+
+ const TrieMapIterator &operator++() {
+ const Result result = mTrieMap->iterateNext(&mStateStack, &mKey);
+ mValue = result.mValue;
+ mIsValid = result.mIsValid;
+ mNextLevelBitmapEntryIndex = result.mNextLevelBitmapEntryIndex;
+ return *this;
+ }
+
+ private:
+ DISALLOW_DEFAULT_CONSTRUCTOR(TrieMapIterator);
+ DISALLOW_ASSIGNMENT_OPERATOR(TrieMapIterator);
+
+ const TrieMap *const mTrieMap;
+ std::vector<TrieMap::TableIterationState> mStateStack;
+ const int mBaseBitmapEntryIndex;
+ int mKey;
+ uint64_t mValue;
+ bool mIsValid;
+ int mNextLevelBitmapEntryIndex;
+ };
+
+ /**
+ * Class to support iterating entries in TrieMap by range base for loops.
+ */
+ class TrieMapRange {
+ public:
+ TrieMapRange(const TrieMap *const trieMap, const int bitmapEntryIndex)
+ : mTrieMap(trieMap), mBaseBitmapEntryIndex(bitmapEntryIndex) {};
+
+ TrieMapIterator begin() const {
+ return TrieMapIterator(mTrieMap, mBaseBitmapEntryIndex);
+ }
+
+ const TrieMapIterator end() const {
+ return TrieMapIterator(nullptr, INVALID_INDEX);
+ }
+
+ private:
+ DISALLOW_DEFAULT_CONSTRUCTOR(TrieMapRange);
+ DISALLOW_ASSIGNMENT_OPERATOR(TrieMapRange);
+
+ const TrieMap *const mTrieMap;
+ const int mBaseBitmapEntryIndex;
+ };
+
+ static const int INVALID_INDEX;
+ static const uint64_t MAX_VALUE;
+
+ TrieMap();
+ // Construct TrieMap using existing data in the memory region written by save().
+ TrieMap(const ReadWriteByteArrayView buffer);
+ void dump(const int from = 0, const int to = 0) const;
+
+ bool isNearSizeLimit() const {
+ return mBuffer.isNearSizeLimit();
+ }
+
+ int getRootBitmapEntryIndex() const {
+ return ROOT_BITMAP_ENTRY_INDEX;
+ }
+
+ // Returns bitmapEntryIndex. Create the next level map if it doesn't exist.
+ int getNextLevelBitmapEntryIndex(const int key) {
+ return getNextLevelBitmapEntryIndex(key, ROOT_BITMAP_ENTRY_INDEX);
+ }
+
+ int getNextLevelBitmapEntryIndex(const int key, const int bitmapEntryIndex);
+
+ const Result getRoot(const int key) const {
+ return get(key, ROOT_BITMAP_ENTRY_INDEX);
+ }
+
+ const Result get(const int key, const int bitmapEntryIndex) const;
+
+ bool putRoot(const int key, const uint64_t value) {
+ return put(key, value, ROOT_BITMAP_ENTRY_INDEX);
+ }
+
+ bool put(const int key, const uint64_t value, const int bitmapEntryIndex);
+
+ const TrieMapRange getEntriesInRootLevel() const {
+ return getEntriesInSpecifiedLevel(ROOT_BITMAP_ENTRY_INDEX);
+ }
+
+ const TrieMapRange getEntriesInSpecifiedLevel(const int bitmapEntryIndex) const {
+ return TrieMapRange(this, bitmapEntryIndex);
+ }
+
+ bool save(FILE *const file) const;
+
+ bool remove(const int key, const int bitmapEntryIndex);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(TrieMap);
+
+ /**
+ * Struct represents an entry.
+ *
+ * Entry is one of these entry types. All entries are fixed size and have 2 fields FIELD_0 and
+ * FIELD_1.
+ * 1. bitmap entry. bitmap entry contains bitmap and the link to hash table.
+ * FIELD_0(bitmap) FIELD_1(LINK_TO_HASH_TABLE)
+ * 2. terminal entry. terminal entry contains hashed key and value or terminal link. terminal
+ * entry have terminal link when the value is not fit to FIELD_1 or there is a next level map
+ * for the key.
+ * FIELD_0(hashed key) (FIELD_1(VALUE_FLAG VALUE) | FIELD_1(TERMINAL_LINK_FLAG TERMINAL_LINK))
+ * 3. value entry. value entry represents a value. Upper order bytes are stored in FIELD_0 and
+ * lower order bytes are stored in FIELD_1.
+ * FIELD_0(value (upper order bytes)) FIELD_1(value (lower order bytes))
+ */
+ struct Entry {
+ const uint32_t mData0;
+ const uint32_t mData1;
+
+ Entry(const uint32_t data0, const uint32_t data1) : mData0(data0), mData1(data1) {}
+
+ AK_FORCE_INLINE bool isBitmapEntry() const {
+ return (mData1 & VALUE_FLAG) == 0 && (mData1 & TERMINAL_LINK_FLAG) == 0;
+ }
+
+ AK_FORCE_INLINE bool hasTerminalLink() const {
+ return (mData1 & TERMINAL_LINK_FLAG) != 0;
+ }
+
+ // For terminal entry.
+ AK_FORCE_INLINE uint32_t getKey() const {
+ return mData0;
+ }
+
+ // For terminal entry.
+ AK_FORCE_INLINE uint32_t getValue() const {
+ return mData1 & VALUE_MASK;
+ }
+
+ // For terminal entry.
+ AK_FORCE_INLINE bool isValidTerminalEntry() const {
+ return hasTerminalLink() || ((mData1 & VALUE_MASK) != INVALID_VALUE_IN_KEY_VALUE_ENTRY);
+ }
+
+ // For terminal entry.
+ AK_FORCE_INLINE uint32_t getValueEntryIndex() const {
+ return mData1 & TERMINAL_LINK_MASK;
+ }
+
+ // For bitmap entry.
+ AK_FORCE_INLINE uint32_t getBitmap() const {
+ return mData0;
+ }
+
+ // For bitmap entry.
+ AK_FORCE_INLINE int getTableIndex() const {
+ return static_cast<int>(mData1);
+ }
+
+ // For value entry.
+ AK_FORCE_INLINE uint64_t getValueOfValueEntry() const {
+ return ((static_cast<uint64_t>(mData0) << (FIELD1_SIZE * CHAR_BIT)) ^ mData1);
+ }
+ };
+
+ BufferWithExtendableBuffer mBuffer;
+
+ static const int FIELD0_SIZE;
+ static const int FIELD1_SIZE;
+ static const int ENTRY_SIZE;
+ static const uint32_t VALUE_FLAG;
+ static const uint32_t VALUE_MASK;
+ static const uint32_t INVALID_VALUE_IN_KEY_VALUE_ENTRY;
+ static const uint32_t TERMINAL_LINK_FLAG;
+ static const uint32_t TERMINAL_LINK_MASK;
+ static const int NUM_OF_BITS_USED_FOR_ONE_LEVEL;
+ static const uint32_t LABEL_MASK;
+ static const int MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL;
+ static const int ROOT_BITMAP_ENTRY_INDEX;
+ static const int ROOT_BITMAP_ENTRY_POS;
+ static const Entry EMPTY_BITMAP_ENTRY;
+ static const int TERMINAL_LINKED_ENTRY_COUNT;
+ static const int MAX_BUFFER_SIZE;
+
+ uint32_t getBitShuffledKey(const uint32_t key) const;
+ bool writeValue(const uint64_t value, const int terminalEntryIndex);
+ bool updateValue(const Entry &terminalEntry, const uint64_t value,
+ const int terminalEntryIndex);
+ bool freeTable(const int tableIndex, const int entryCount);
+ int allocateTable(const int entryCount);
+ int getTerminalEntryIndex(const uint32_t key, const uint32_t hashedKey,
+ const Entry &bitmapEntry, const int level) const;
+ const Result getInternal(const uint32_t key, const uint32_t hashedKey,
+ const int bitmapEntryIndex, const int level) const;
+ bool putInternal(const uint32_t key, const uint64_t value, const uint32_t hashedKey,
+ const int bitmapEntryIndex, const Entry &bitmapEntry, const int level);
+ bool addNewEntryByResolvingConflict(const uint32_t key, const uint64_t value,
+ const uint32_t hashedKey, const Entry &conflictedEntry, const int conflictedEntryIndex,
+ const int level);
+ bool addNewEntryByExpandingTable(const uint32_t key, const uint64_t value,
+ const int tableIndex, const uint32_t bitmap, const int bitmapEntryIndex,
+ const int label);
+ const Result iterateNext(std::vector<TableIterationState> *const iterationState,
+ int *const outKey) const;
+
+ AK_FORCE_INLINE const Entry readEntry(const int entryIndex) const {
+ return Entry(readField0(entryIndex), readField1(entryIndex));
+ }
+
+ // Returns whether an entry for the index is existing by testing if the index-th bit in the
+ // bitmap is set or not.
+ AK_FORCE_INLINE bool exists(const uint32_t bitmap, const int index) const {
+ return (bitmap & (1 << index)) != 0;
+ }
+
+ // Set index-th bit in the bitmap.
+ AK_FORCE_INLINE uint32_t setExist(const uint32_t bitmap, const int index) const {
+ return bitmap | (1 << index);
+ }
+
+ // Count set bits before index in the bitmap.
+ AK_FORCE_INLINE int popCount(const uint32_t bitmap, const int index) const {
+ return popCount(bitmap & ((1 << index) - 1));
+ }
+
+ // Count set bits in the bitmap.
+ AK_FORCE_INLINE int popCount(const uint32_t bitmap) const {
+ return __builtin_popcount(bitmap);
+ // int v = bitmap - ((bitmap >> 1) & 0x55555555);
+ // v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+ // return (((v + (v >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
+ }
+
+ AK_FORCE_INLINE int getLabel(const uint32_t hashedKey, const int level) const {
+ return (hashedKey >> (level * NUM_OF_BITS_USED_FOR_ONE_LEVEL)) & LABEL_MASK;
+ }
+
+ AK_FORCE_INLINE uint32_t readField0(const int entryIndex) const {
+ return mBuffer.readUint(FIELD0_SIZE, ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE);
+ }
+
+ AK_FORCE_INLINE uint32_t readField1(const int entryIndex) const {
+ return mBuffer.readUint(FIELD1_SIZE,
+ ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE + FIELD0_SIZE);
+ }
+
+ AK_FORCE_INLINE int readEmptyTableLink(const int entryCount) const {
+ return mBuffer.readUint(FIELD1_SIZE, (entryCount - 1) * FIELD1_SIZE);
+ }
+
+ AK_FORCE_INLINE bool writeEmptyTableLink(const int tableIndex, const int entryCount) {
+ return mBuffer.writeUint(tableIndex, FIELD1_SIZE, (entryCount - 1) * FIELD1_SIZE);
+ }
+
+ AK_FORCE_INLINE bool writeField0(const uint32_t data, const int entryIndex) {
+ return mBuffer.writeUint(data, FIELD0_SIZE,
+ ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE);
+ }
+
+ AK_FORCE_INLINE bool writeField1(const uint32_t data, const int entryIndex) {
+ return mBuffer.writeUint(data, FIELD1_SIZE,
+ ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE + FIELD0_SIZE);
+ }
+
+ AK_FORCE_INLINE bool writeEntry(const Entry &entry, const int entryIndex) {
+ return writeField0(entry.mData0, entryIndex) && writeField1(entry.mData1, entryIndex);
+ }
+
+ AK_FORCE_INLINE bool writeTerminalEntry(const uint32_t key, const uint64_t value,
+ const int entryIndex) {
+ return writeField0(key, entryIndex) && writeValue(value, entryIndex);
+ }
+
+ AK_FORCE_INLINE bool copyEntry(const int originalEntryIndex, const int newEntryIndex) {
+ return writeEntry(readEntry(originalEntryIndex), newEntryIndex);
+ }
+
+ AK_FORCE_INLINE int getTailEntryIndex() const {
+ return (mBuffer.getTailPosition() - ROOT_BITMAP_ENTRY_POS) / ENTRY_SIZE;
+ }
+
+ bool removeInner(const Entry &bitmapEntry);
+};
+
+} // namespace latinime
+#endif /* LATINIME_TRIE_MAP_H */