aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni')
-rw-r--r--native/jni/NativeFileList.mk1
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp37
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h114
3 files changed, 152 insertions, 0 deletions
diff --git a/native/jni/NativeFileList.mk b/native/jni/NativeFileList.mk
index 72bddbba1..55bb68344 100644
--- a/native/jni/NativeFileList.mk
+++ b/native/jni/NativeFileList.mk
@@ -71,6 +71,7 @@ LATIN_IME_CORE_SRC_FILES := \
ver4_patricia_trie_writing_helper.cpp \
ver4_pt_node_array_reader.cpp) \
$(addprefix suggest/policyimpl/dictionary/structure/v4/content/, \
+ dynamic_language_model_probability_utils.cpp \
language_model_dict_content.cpp \
language_model_dict_content_global_counters.cpp \
shortcut_dict_content.cpp \
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp
new file mode 100644
index 000000000..b0fbb3e72
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h"
+
+namespace latinime {
+
+// These counts are used to provide stable probabilities even if the user's input count is small.
+const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNT_FOR_UNIGRAMS = 8192;
+const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNT_FOR_BIGRAMS = 2;
+const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNT_FOR_TRIGRAMS = 2;
+
+// These are encoded backoff weights.
+// Note that we give positive value for trigrams that means the weight is more than 1.
+// TODO: Apply backoff for main dictionaries and quit giving a positive backoff weight.
+const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHT_FOR_UNIGRAMS = -32;
+const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHT_FOR_BIGRAMS = 0;
+const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHT_FOR_TRIGRAMS = 8;
+
+// This value is used to remove too old entries from the dictionary.
+const int DynamicLanguageModelProbabilityUtils::DURATION_TO_DISCARD_ENTRY_IN_SECONDS =
+ 300 * 24 * 60 * 60; // 300 days
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h
new file mode 100644
index 000000000..88bc58fe8
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H
+#define LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H
+
+#include <algorithm>
+
+#include "defines.h"
+#include "suggest/core/dictionary/property/historical_info.h"
+#include "utils/time_keeper.h"
+
+namespace latinime {
+
+class DynamicLanguageModelProbabilityUtils {
+ public:
+ static float computeRawProbabilityFromCounts(const int count, const int contextCount,
+ const int matchedWordCountInContext) {
+ int minCount = 0;
+ switch (matchedWordCountInContext) {
+ case 1:
+ minCount = ASSUMED_MIN_COUNT_FOR_UNIGRAMS;
+ break;
+ case 2:
+ minCount = ASSUMED_MIN_COUNT_FOR_BIGRAMS;
+ break;
+ case 3:
+ minCount = ASSUMED_MIN_COUNT_FOR_TRIGRAMS;
+ break;
+ default:
+ AKLOGE("computeRawProbabilityFromCounts is called with invalid "
+ "matchedWordCountInContext (%d).", matchedWordCountInContext);
+ ASSERT(false);
+ return 0.0f;
+ }
+ return static_cast<float>(count) / static_cast<float>(std::max(contextCount, minCount));
+ }
+
+ static float backoff(const int ngramProbability, const int matchedWordCountInContext) {
+ int probability = NOT_A_PROBABILITY;
+
+ switch (matchedWordCountInContext) {
+ case 1:
+ probability = ngramProbability + ENCODED_BACKOFF_WEIGHT_FOR_UNIGRAMS;
+ break;
+ case 2:
+ probability = ngramProbability + ENCODED_BACKOFF_WEIGHT_FOR_BIGRAMS;
+ break;
+ case 3:
+ probability = ngramProbability + ENCODED_BACKOFF_WEIGHT_FOR_TRIGRAMS;
+ break;
+ default:
+ AKLOGE("backoff is called with invalid matchedWordCountInContext (%d).",
+ matchedWordCountInContext);
+ ASSERT(false);
+ return NOT_A_PROBABILITY;
+ }
+ return std::min(std::max(probability, NOT_A_PROBABILITY), MAX_PROBABILITY);
+ }
+
+ static int getDecayedProbability(const int probability, const HistoricalInfo historicalInfo) {
+ const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp();
+ if (elapsedTime < 0) {
+ AKLOGE("The elapsed time is negatime value. Timestamp overflow?");
+ return NOT_A_PROBABILITY;
+ }
+ // TODO: Improve this logic.
+ // We don't modify probability depending on the elapsed time.
+ return probability;
+ }
+
+ static int shouldRemoveEntryDuringGC(const HistoricalInfo historicalInfo) {
+ // TODO: Improve this logic.
+ const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp();
+ return elapsedTime > DURATION_TO_DISCARD_ENTRY_IN_SECONDS;
+ }
+
+ static int getPriorityToPreventFromEviction(const HistoricalInfo historicalInfo) {
+ // TODO: Improve this logic.
+ // More recently input entries get higher priority.
+ return historicalInfo.getTimestamp();
+ }
+
+private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicLanguageModelProbabilityUtils);
+
+ static_assert(MAX_PREV_WORD_COUNT_FOR_N_GRAM <= 2, "Max supported Ngram is Trigram.");
+
+ static const int ASSUMED_MIN_COUNT_FOR_UNIGRAMS;
+ static const int ASSUMED_MIN_COUNT_FOR_BIGRAMS;
+ static const int ASSUMED_MIN_COUNT_FOR_TRIGRAMS;
+
+ static const int ENCODED_BACKOFF_WEIGHT_FOR_UNIGRAMS;
+ static const int ENCODED_BACKOFF_WEIGHT_FOR_BIGRAMS;
+ static const int ENCODED_BACKOFF_WEIGHT_FOR_TRIGRAMS;
+
+ static const int DURATION_TO_DISCARD_ENTRY_IN_SECONDS;
+};
+
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H */