aboutsummaryrefslogtreecommitdiffstats
path: root/native
diff options
context:
space:
mode:
Diffstat (limited to 'native')
-rw-r--r--native/dicttoolkit/Android.mk67
-rw-r--r--native/dicttoolkit/CleanupNativeFileList.mk17
-rw-r--r--native/dicttoolkit/NativeFileList.mk21
-rw-r--r--native/dicttoolkit/UnitTests.mk68
-rw-r--r--native/dicttoolkit/dict_toolkit_main.cpp23
-rwxr-xr-xnative/dicttoolkit/run_tests.sh34
-rw-r--r--native/dicttoolkit/src/dict_toolkit_defines.h22
-rw-r--r--native/dicttoolkit/tests/dict_toolkit_defines_test.cpp32
-rw-r--r--native/jni/NativeFileList.mk3
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary.cpp3
-rw-r--r--native/jni/src/suggest/core/dictionary/error_type_utils.cpp1
-rw-r--r--native/jni/src/suggest/core/dictionary/error_type_utils.h5
-rw-r--r--native/jni/src/suggest/core/dictionary/ngram_listener.h2
-rw-r--r--native/jni/src/suggest/core/dictionary/property/historical_info.h1
-rw-r--r--native/jni/src/suggest/core/policy/scoring.h2
-rw-r--r--native/jni/src/suggest/core/result/suggestions_output_utils.cpp66
-rw-r--r--native/jni/src/suggest/core/result/suggestions_output_utils.h9
-rw-r--r--native/jni/src/suggest/core/suggest.cpp5
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h16
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp7
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp2
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp3
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp18
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h11
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp12
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp37
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h114
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp127
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h26
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h2
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h1
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp65
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp4
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp14
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h8
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.cpp23
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h15
-rw-r--r--native/jni/src/suggest/policyimpl/typing/scoring_params.cpp1
-rw-r--r--native/jni/src/suggest/policyimpl/typing/scoring_params.h1
-rw-r--r--native/jni/src/suggest/policyimpl/typing/typing_scoring.h47
-rw-r--r--native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp8
-rw-r--r--native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp8
-rw-r--r--native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp8
-rw-r--r--native/jni/tests/suggest/policyimpl/dictionary/utils/probability_utils_test.cpp33
47 files changed, 824 insertions, 180 deletions
diff --git a/native/dicttoolkit/Android.mk b/native/dicttoolkit/Android.mk
new file mode 100644
index 000000000..118682dfc
--- /dev/null
+++ b/native/dicttoolkit/Android.mk
@@ -0,0 +1,67 @@
+# Copyright (C) 2014 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ifeq (,$(TARGET_BUILD_APPS))
+
+# Only build if it's explicitly requested, or running mm/mmm.
+ifneq ($(ONE_SHOT_MAKEFILE)$(filter $(MAKECMDGOALS),dicttoolkit),)
+
+# HACK: Temporarily disable host tool build on Mac until the build system is ready for C++11.
+LATINIME_HOST_OSNAME := $(shell uname -s)
+ifneq ($(LATINIME_HOST_OSNAME), Darwin) # TODO: Remove this
+
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LATIN_IME_CORE_PATH := $(LOCAL_PATH)/../jni
+
+LATIN_IME_DICT_TOOLKIT_SRC_DIR := src
+LATIN_IME_CORE_SRC_DIR := ../jni/src
+
+LOCAL_CFLAGS += -Werror -Wall -Wextra -Weffc++ -Wformat=2 -Wcast-qual -Wcast-align \
+ -Wwrite-strings -Wfloat-equal -Wpointer-arith -Winit-self -Wredundant-decls \
+ -Woverloaded-virtual -Wsign-promo -Wno-system-headers
+
+# To suppress compiler warnings for unused variables/functions used for debug features etc.
+LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function
+# TODO: Remove -std=c++11 once it is set by default on host build.
+LOCAL_CFLAGS += -std=c++11
+
+include $(LOCAL_PATH)/NativeFileList.mk
+include $(LATIN_IME_CORE_PATH)/NativeFileList.mk
+
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_DICT_TOOLKIT_SRC_DIR) \
+ $(LATIN_IME_CORE_PATH)/$(LATIN_IME_CORE_SRC_DIR)
+
+LOCAL_SRC_FILES := $(LATIN_IME_DICT_TOOLKIT_MAIN_SRC_FILES) \
+ $(addprefix $(LATIN_IME_DICT_TOOLKIT_SRC_DIR)/, $(LATIN_IME_DICT_TOOLKIT_SRC_FILES)) \
+ $(addprefix $(LATIN_IME_CORE_SRC_DIR)/, $(LATIN_IME_CORE_SRC_FILES))
+
+LOCAL_MODULE := dicttoolkit
+LOCAL_MODULE_TAGS := optional
+
+LOCAL_CLANG := true
+LOCAL_CXX_STL := libc++
+
+include $(BUILD_HOST_EXECUTABLE)
+#################### Clean up the tmp vars
+include $(LOCAL_PATH)/CleanupNativeFileList.mk
+#################### Unit test
+include $(LOCAL_PATH)/UnitTests.mk
+
+endif # Darwin - TODO: Remove this
+
+endif
+
+endif # TARGET_BUILD_APPS
diff --git a/native/dicttoolkit/CleanupNativeFileList.mk b/native/dicttoolkit/CleanupNativeFileList.mk
new file mode 100644
index 000000000..b804b41ed
--- /dev/null
+++ b/native/dicttoolkit/CleanupNativeFileList.mk
@@ -0,0 +1,17 @@
+# Copyright (C) 2014 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+LATIN_IME_DICT_TOOLKIT_MAIN_SRC_FILES :=
+LATIN_IME_DICT_TOOLKIT_SRC_FILES :=
+LATIN_IME_DICT_TOOLKIT_TEST_FILES :=
diff --git a/native/dicttoolkit/NativeFileList.mk b/native/dicttoolkit/NativeFileList.mk
new file mode 100644
index 000000000..b6be9c541
--- /dev/null
+++ b/native/dicttoolkit/NativeFileList.mk
@@ -0,0 +1,21 @@
+# Copyright (C) 2014 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+LATIN_IME_DICT_TOOLKIT_MAIN_SRC_FILES := \
+ dict_toolkit_main.cpp
+
+LATIN_IME_DICT_TOOLKIT_SRC_FILES :=
+
+LATIN_IME_DICT_TOOLKIT_TEST_FILES := \
+ dict_toolkit_defines_test.cpp
diff --git a/native/dicttoolkit/UnitTests.mk b/native/dicttoolkit/UnitTests.mk
new file mode 100644
index 000000000..d568db44a
--- /dev/null
+++ b/native/dicttoolkit/UnitTests.mk
@@ -0,0 +1,68 @@
+# Copyright (C) 2014 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ifeq (,$(TARGET_BUILD_APPS))
+
+LOCAL_PATH := $(call my-dir)
+
+######################################
+include $(CLEAR_VARS)
+
+LATIN_IME_CORE_PATH := $(LOCAL_PATH)/../jni
+
+LATIN_IME_DICT_TOOLKIT_SRC_DIR := src
+LATIN_IME_CORE_SRC_DIR := ../jni/src
+LATIN_DICT_TOOLKIT_TEST_SRC_DIR := tests
+
+include $(LOCAL_PATH)/NativeFileList.mk
+include $(LATIN_IME_CORE_PATH)/NativeFileList.mk
+
+# TODO: Remove -std=c++11 once it is set by default on host build.
+LATIN_IME_SRC_DIR := src
+LOCAL_ADDRESS_SANITIZER := true
+LOCAL_CFLAGS += -std=c++11 -Wno-unused-parameter -Wno-unused-function
+LOCAL_CLANG := true
+LOCAL_CXX_STL := libc++
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_DICT_TOOLKIT_SRC_DIR) \
+ $(LATIN_IME_CORE_PATH)/$(LATIN_IME_CORE_SRC_DIR)
+LOCAL_MODULE := liblatinime_dicttoolkit_host_static_for_unittests
+LOCAL_MODULE_TAGS := optional
+LOCAL_SRC_FILES := \
+ $(addprefix $(LATIN_IME_DICT_TOOLKIT_SRC_DIR)/, $(LATIN_IME_DICT_TOOLKIT_SRC_FILES)) \
+ $(addprefix $(LATIN_IME_CORE_SRC_DIR)/, $(LATIN_IME_CORE_SRC_FILES))
+include $(BUILD_HOST_STATIC_LIBRARY)
+
+include $(CLEAR_VARS)
+
+# TODO: Remove -std=c++11 once it is set by default on host build.
+LOCAL_CFLAGS += -std=c++11 -Wno-unused-parameter -Wno-unused-function
+LOCAL_CLANG := true
+LOCAL_CXX_STL := libc++
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_DICT_TOOLKIT_SRC_DIR) \
+ $(LATIN_IME_CORE_PATH)/$(LATIN_IME_CORE_SRC_DIR)
+LOCAL_MODULE := dicttoolkit_unittests
+LOCAL_MODULE_TAGS := tests
+LOCAL_SRC_FILES := \
+ $(addprefix $(LATIN_DICT_TOOLKIT_TEST_SRC_DIR)/, $(LATIN_IME_DICT_TOOLKIT_TEST_FILES))
+LOCAL_STATIC_LIBRARIES += liblatinime_dicttoolkit_host_static_for_unittests
+include $(BUILD_HOST_NATIVE_TEST)
+
+include $(LOCAL_PATH)/CleanupNativeFileList.mk
+
+#################### Clean up the tmp vars
+LATINIME_HOST_OSNAME :=
+LATIN_IME_SRC_DIR :=
+LATIN_IME_TEST_SRC_DIR :=
+# The test source directory variable that this file actually defines.
+LATIN_DICT_TOOLKIT_TEST_SRC_DIR :=
+
+endif # TARGET_BUILD_APPS
diff --git a/native/dicttoolkit/dict_toolkit_main.cpp b/native/dicttoolkit/dict_toolkit_main.cpp
new file mode 100644
index 000000000..d71b50eb4
--- /dev/null
+++ b/native/dicttoolkit/dict_toolkit_main.cpp
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdio>
+
+/*
+ * Entry point of the dictionary toolkit. For now it is a placeholder that only prints the name
+ * the program was invoked with and exits successfully.
+ */
+int main(int argc, char **argv) {
+ // TODO: Implement.
+ // argv[0] is a null pointer when argc is 0, and a null pointer must not be passed for %s.
+ printf("%s\n", (argc > 0 && argv[0] != nullptr) ? argv[0] : "dicttoolkit");
+ return 0;
+}
diff --git a/native/dicttoolkit/run_tests.sh b/native/dicttoolkit/run_tests.sh
new file mode 100755
index 000000000..44c99c144
--- /dev/null
+++ b/native/dicttoolkit/run_tests.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Copyright 2014, The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Make sure the Android build environment is set up: mmm must be available as a shell function.
+if [[ $(type -t mmm) != function ]]; then
+ echo "Android build environment is not set up (mmm is not a shell function)." 1>&2
+ # Use return when this script is being sourced so that the calling shell is not terminated.
+ if [[ ${BASH_SOURCE[0]} != $0 ]]; then return; else exit 1; fi
+fi
+
+# Host build is never supported in unbundled (NDK/tapas) build
+if [[ -n $TARGET_BUILD_APPS ]]; then
+ echo "Host build is never supported in tapas build." 1>&2
+ echo "Use lunch command instead." 1>&2
+ if [[ ${BASH_SOURCE[0]} != $0 ]]; then return; else exit 1; fi
+fi
+
+test_name=dicttoolkit_unittests
+
+pushd "$PWD" > /dev/null
+cd "$(gettop)"
+# Run the tests only when one of the two build attempts succeeded, so that a failed build never
+# ends up executing a stale or missing test binary.
+{ (mmm -j16 packages/inputmethods/LatinIME/native/dicttoolkit) || (make -j16 $test_name); } \
+    && "$ANDROID_HOST_OUT/bin/$test_name"
+popd > /dev/null
diff --git a/native/dicttoolkit/src/dict_toolkit_defines.h b/native/dicttoolkit/src/dict_toolkit_defines.h
new file mode 100644
index 000000000..2a2104e26
--- /dev/null
+++ b/native/dicttoolkit/src/dict_toolkit_defines.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICT_TOOLKIT_DEFINES_H
+#define LATINIME_DICT_TOOLKIT_DEFINES_H
+
+#include "defines.h"
+
+#endif // LATINIME_DICT_TOOLKIT_DEFINES_H
diff --git a/native/dicttoolkit/tests/dict_toolkit_defines_test.cpp b/native/dicttoolkit/tests/dict_toolkit_defines_test.cpp
new file mode 100644
index 000000000..3445bd0c5
--- /dev/null
+++ b/native/dicttoolkit/tests/dict_toolkit_defines_test.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dict_toolkit_defines.h"
+
+#include <gtest/gtest.h>
+
+namespace latinime {
+namespace dicttoolkit {
+namespace {
+
+// Initial trivial test case: checks that KEYCODE_SPACE, which this file only sees through
+// dict_toolkit_defines.h, equals the space character.
+TEST(DictToolkitDefinesTest, TestKeycodeSpace) {
+ EXPECT_EQ(' ', KEYCODE_SPACE);
+}
+
+} // namespace
+} // namespace dicttoolkit
+} // namespace latinime
diff --git a/native/jni/NativeFileList.mk b/native/jni/NativeFileList.mk
index ca40ba8a5..55bb68344 100644
--- a/native/jni/NativeFileList.mk
+++ b/native/jni/NativeFileList.mk
@@ -71,6 +71,7 @@ LATIN_IME_CORE_SRC_FILES := \
ver4_patricia_trie_writing_helper.cpp \
ver4_pt_node_array_reader.cpp) \
$(addprefix suggest/policyimpl/dictionary/structure/v4/content/, \
+ dynamic_language_model_probability_utils.cpp \
language_model_dict_content.cpp \
language_model_dict_content_global_counters.cpp \
shortcut_dict_content.cpp \
@@ -84,6 +85,7 @@ LATIN_IME_CORE_SRC_FILES := \
forgetting_curve_utils.cpp \
format_utils.cpp \
mmapped_buffer.cpp \
+ probability_utils.cpp \
sparse_table.cpp \
trie_map.cpp ) \
suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp \
@@ -135,6 +137,7 @@ LATIN_IME_CORE_TEST_FILES := \
suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer_test.cpp \
suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp \
suggest/policyimpl/dictionary/utils/format_utils_test.cpp \
+ suggest/policyimpl/dictionary/utils/probability_utils_test.cpp \
suggest/policyimpl/dictionary/utils/sparse_table_test.cpp \
suggest/policyimpl/dictionary/utils/trie_map_test.cpp \
suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy_test.cpp \
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp
index bfe17cc4c..6a5df9d95 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.cpp
+++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp
@@ -81,6 +81,9 @@ void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbabi
}
const WordAttributes wordAttributes = mDictStructurePolicy->getWordAttributesInContext(
mPrevWordIds, targetWordId, nullptr /* multiBigramMap */);
+ if (wordAttributes.getProbability() == NOT_A_PROBABILITY) {
+ return;
+ }
mSuggestionResults->addPrediction(targetWordCodePoints, codePointCount,
wordAttributes.getProbability());
}
diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp
index 1e2494e92..8f07ce275 100644
--- a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp
+++ b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp
@@ -31,6 +31,7 @@ const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x100;
const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH =
NOT_AN_ERROR | MATCH_WITH_WRONG_CASE | MATCH_WITH_MISSING_ACCENT | MATCH_WITH_DIGRAPH;
+const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_A_PERFECT_MATCH = NOT_AN_ERROR;
const ErrorTypeUtils::ErrorType
ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION =
diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.h b/native/jni/src/suggest/core/dictionary/error_type_utils.h
index fd1d5fcff..e92c509fa 100644
--- a/native/jni/src/suggest/core/dictionary/error_type_utils.h
+++ b/native/jni/src/suggest/core/dictionary/error_type_utils.h
@@ -52,6 +52,10 @@ class ErrorTypeUtils {
return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0;
}
+ // Returns true when containedErrorTypes has no bits set outside of
+ // ERRORS_TREATED_AS_A_PERFECT_MATCH.
+ static bool isPerfectMatch(const ErrorType containedErrorTypes) {
+     const ErrorType unexpectedErrorTypes =
+             containedErrorTypes & ~ERRORS_TREATED_AS_A_PERFECT_MATCH;
+     return unexpectedErrorTypes == 0;
+ }
+
static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) {
return (containedErrorTypes
& ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0;
@@ -73,6 +77,7 @@ class ErrorTypeUtils {
DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils);
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH;
+ static const ErrorType ERRORS_TREATED_AS_A_PERFECT_MATCH;
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION;
};
} // namespace latinime
diff --git a/native/jni/src/suggest/core/dictionary/ngram_listener.h b/native/jni/src/suggest/core/dictionary/ngram_listener.h
index e9b3c1aaf..2eb5e9fd1 100644
--- a/native/jni/src/suggest/core/dictionary/ngram_listener.h
+++ b/native/jni/src/suggest/core/dictionary/ngram_listener.h
@@ -26,6 +26,8 @@ namespace latinime {
*/
class NgramListener {
public:
+ // ngramProbability is always 0 for v403 decaying dictionary.
+ // TODO: Remove ngramProbability.
virtual void onVisitEntry(const int ngramProbability, const int targetWordId) = 0;
virtual ~NgramListener() {};
diff --git a/native/jni/src/suggest/core/dictionary/property/historical_info.h b/native/jni/src/suggest/core/dictionary/property/historical_info.h
index f9bd6fd8c..e5ce1ea25 100644
--- a/native/jni/src/suggest/core/dictionary/property/historical_info.h
+++ b/native/jni/src/suggest/core/dictionary/property/historical_info.h
@@ -38,6 +38,7 @@ class HistoricalInfo {
return mTimestamp;
}
+ // TODO: Remove
int getLevel() const {
return mLevel;
}
diff --git a/native/jni/src/suggest/core/policy/scoring.h b/native/jni/src/suggest/core/policy/scoring.h
index ce3684a1c..b9dda83ad 100644
--- a/native/jni/src/suggest/core/policy/scoring.h
+++ b/native/jni/src/suggest/core/policy/scoring.h
@@ -30,7 +30,7 @@ class Scoring {
public:
virtual int calculateFinalScore(const float compoundDistance, const int inputSize,
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
- const bool boostExactMatches) const = 0;
+ const bool boostExactMatches, const bool hasProbabilityZero) const = 0;
virtual void getMostProbableString(const DicTraverseSession *const traverseSession,
const float weightOfLangModelVsSpatialModel,
SuggestionResults *const outSuggestionResults) const = 0;
diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp
index 3283f6deb..74db95953 100644
--- a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp
+++ b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp
@@ -76,6 +76,52 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults);
}
+/* static */ bool SuggestionsOutputUtils::shouldBlockWord(
+        const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode,
+        const WordAttributes wordAttributes, const bool isLastWord) {
+    // Whether the input, as typed, counts as an exact match of the terminal node.
+    const bool typedWordIsExactMatch =
+            ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
+    // A word is a candidate for blocking only when the options ask for offensive words to be
+    // blocked and the word itself is flagged as possibly offensive.
+    bool isBlockingCandidate = false;
+    if (suggestOptions->blockOffensiveWords()) {
+        isBlockingCandidate = wordAttributes.isPossiblyOffensive();
+    }
+
+    // There are two kinds of callers:
+    //
+    // 1) The end of a search (isLastWord == true). terminalDicNode is the last DicNode of the
+    //    search. When the input exactly matches the word, the word is always let through,
+    //    even if offensive words are blocked, because it is exactly what the user typed.
+    //
+    // 2) The middle of a search (isLastWord == false). A terminal node has just been hit and
+    //    the caller is deciding whether to start a new search at the root to match the rest
+    //    of the input. Even an exact match must NOT be let through here: splitting an input
+    //    such as "<offensive>this" into "<offensive> this" while offensive words are blocked
+    //    would be a bad idea.
+    //
+    // A multi-word correction whose last word is the offensive one (e.g. "all<offensive>")
+    // arrives with isLastWord == true but without an exact match, so it is not exempted
+    // either.
+    const bool isExactlyTypedLastWord = isLastWord && typedWordIsExactMatch;
+    return !isExactlyTypedLastWord && isBlockingCandidate;
+}
+
/* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel,
@@ -98,24 +144,16 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
const bool isExactMatchWithIntentionalOmission =
ErrorTypeUtils::isExactMatchWithIntentionalOmission(
terminalDicNode->getContainedErrorTypes());
- const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
- // Heuristic: We exclude probability=0 first-char-uppercase words from exact match.
- // (e.g. "AMD" and "and")
- const bool isSafeExactMatch = isExactMatch
- && !(wordAttributes.isPossiblyOffensive() && isFirstCharUppercase);
const int outputTypeFlags =
(wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
- | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
+ | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
| (isExactMatchWithIntentionalOmission ?
Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0);
-
// Entries that are blacklisted or do not represent a word should not be output.
const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord());
- // When we have to block offensive words, non-exact matched offensive words should not be
- // output.
- const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords();
- const bool isBlockedOffensiveWord = blockOffensiveWords && wordAttributes.isPossiblyOffensive()
- && !isSafeExactMatch;
+
+ const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(),
+ terminalDicNode, wordAttributes, true /* isLastWord */);
// Increase output score of top typing suggestion to ensure autocorrection.
// TODO: Better integration with java side autocorrection logic.
@@ -123,11 +161,11 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
compoundDistance, traverseSession->getInputSize(),
terminalDicNode->getContainedErrorTypes(),
(forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
- boostExactMatches);
+ boostExactMatches, wordAttributes.getProbability() == 0);
// Don't output invalid or blocked offensive words. However, we still need to submit their
// shortcuts if any.
- if (isValidWord && !isBlockedOffensiveWord) {
+ if (isValidWord && !shouldBlockThisWord) {
int codePoints[MAX_WORD_LENGTH];
terminalDicNode->outputResult(codePoints);
const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.h b/native/jni/src/suggest/core/result/suggestions_output_utils.h
index bf8497828..eca1f78b2 100644
--- a/native/jni/src/suggest/core/result/suggestions_output_utils.h
+++ b/native/jni/src/suggest/core/result/suggestions_output_utils.h
@@ -18,6 +18,7 @@
#define LATINIME_SUGGESTIONS_OUTPUT_UTILS
#include "defines.h"
+#include "suggest/core/dictionary/word_attributes.h"
namespace latinime {
@@ -25,11 +26,19 @@ class BinaryDictionaryShortcutIterator;
class DicNode;
class DicTraverseSession;
class Scoring;
+class SuggestOptions;
class SuggestionResults;
class SuggestionsOutputUtils {
public:
/**
+ * Returns true if we should block the incoming word, in the context of the user's
+ * preferences to include or not include possibly offensive words
+ */
+ static bool shouldBlockWord(const SuggestOptions *const suggestOptions,
+ const DicNode *const terminalDicNode, const WordAttributes wordAttributes,
+ const bool isLastWord);
+ /**
* Outputs the final list of suggestions (i.e., terminal nodes).
*/
static void outputSuggestions(const Scoring *const scoringPolicy,
diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp
index 68a36454e..c372d668b 100644
--- a/native/jni/src/suggest/core/suggest.cpp
+++ b/native/jni/src/suggest/core/suggest.cpp
@@ -416,6 +416,11 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode
traverseSession->getDictionaryStructurePolicy()->getWordAttributesInContext(
dicNode->getPrevWordIds(), dicNode->getWordId(),
traverseSession->getMultiBigramMap());
+ if (SuggestionsOutputUtils::shouldBlockWord(traverseSession->getSuggestOptions(),
+ dicNode, wordAttributes, false /* isLastWord */)) {
+ return;
+ }
+
if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode, wordAttributes.getProbability())) {
return;
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
index 44c2f443f..7a5acd7d5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
@@ -134,15 +134,17 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
// same so we use them for both here.
switch (mDictFormatVersion) {
case FormatUtils::VERSION_2:
- return FormatUtils::VERSION_2;
case FormatUtils::VERSION_201:
- return FormatUtils::VERSION_201;
+ AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
+ return FormatUtils::UNKNOWN_VERSION;
+ case FormatUtils::VERSION_202:
+ return FormatUtils::VERSION_202;
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
- case FormatUtils::VERSION_4:
- return FormatUtils::VERSION_4;
- case FormatUtils::VERSION_4_DEV:
- return FormatUtils::VERSION_4_DEV;
+ case FormatUtils::VERSION_402:
+ return FormatUtils::VERSION_402;
+ case FormatUtils::VERSION_403:
+ return FormatUtils::VERSION_403;
default:
return FormatUtils::UNKNOWN_VERSION;
}
@@ -245,7 +247,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
}
bool supportsBeginningOfSentence() const {
- return mDictFormatVersion >= FormatUtils::VERSION_4;
+ return mDictFormatVersion >= FormatUtils::VERSION_402;
}
const int *getCodePointTable() const {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
index 41a8b13b8..19ed0d468 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
@@ -111,11 +111,12 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
switch (version) {
case FormatUtils::VERSION_2:
case FormatUtils::VERSION_201:
- // Version 2 or 201 dictionary writing is not supported.
+ case FormatUtils::VERSION_202:
+ // None of the static dictionaries (v2x) support writing
return false;
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
- case FormatUtils::VERSION_4:
- case FormatUtils::VERSION_4_DEV:
+ case FormatUtils::VERSION_402:
+ case FormatUtils::VERSION_403:
return buffer->writeUintAndAdvancePosition(version /* data */,
HEADER_DICTIONARY_VERSION_SIZE, writingPos);
default:
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp
index 9e1adff70..15ac88319 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp
@@ -65,6 +65,8 @@ const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition(
(encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ?
Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId;
if (mHasHistoricalInfo) {
+ // Hack for better migration.
+ count += level;
const HistoricalInfo historicalInfo(timestamp, level, count);
return BigramEntry(hasNext, probability, &historicalInfo, targetTerminalId);
} else {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp
index ef6166ffd..61ef4aa42 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp
@@ -50,7 +50,8 @@ const ProbabilityEntry ProbabilityDictContent::getProbabilityEntry(const int ter
Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &entryPos);
const int count = buffer->readUintAndAdvancePosition(
Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &entryPos);
- const HistoricalInfo historicalInfo(timestamp, level, count);
+ // Hack for better migration.
+ const HistoricalInfo historicalInfo(timestamp, level, count + level);
return ProbabilityEntry(flags, probability, &historicalInfo);
} else {
return ProbabilityEntry(flags, probability);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
index 08e39ce43..9455222dd 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
@@ -140,7 +140,7 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability,
const PtNodeParams &ptNodeParams) const {
- return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(),
+ return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
ptNodeParams.getProbability() == 0);
}
@@ -164,7 +164,7 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI
}
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
- if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) {
+ if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) {
return NOT_A_PROBABILITY;
}
if (prevWordIds.empty()) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
index 372c9e36f..9a9a21b6b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
@@ -58,7 +58,7 @@ namespace latinime {
const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) {
FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion);
switch (dictFormatVersion) {
- case FormatUtils::VERSION_4: {
+ case FormatUtils::VERSION_402: {
return newPolicyForOnMemoryV4Dict<backward::v402::Ver4DictConstants,
backward::v402::Ver4DictBuffers,
backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr,
@@ -66,7 +66,7 @@ namespace latinime {
dictFormatVersion, locale, attributeMap);
}
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
- case FormatUtils::VERSION_4_DEV: {
+ case FormatUtils::VERSION_403: {
return newPolicyForOnMemoryV4Dict<Ver4DictConstants, Ver4DictBuffers,
Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>(
dictFormatVersion, locale, attributeMap);
@@ -115,9 +115,10 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
switch (formatVersion) {
case FormatUtils::VERSION_2:
case FormatUtils::VERSION_201:
- AKLOGE("Given path is a directory but the format is version 2 or 201. path: %s", path);
+ case FormatUtils::VERSION_202:
+ AKLOGE("Given path is a directory but the format is version 2xx. path: %s", path);
break;
- case FormatUtils::VERSION_4: {
+ case FormatUtils::VERSION_402: {
return newPolicyForV4Dict<backward::v402::Ver4DictConstants,
backward::v402::Ver4DictBuffers,
backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr,
@@ -125,7 +126,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
headerFilePath, formatVersion, std::move(mmappedBuffer));
}
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
- case FormatUtils::VERSION_4_DEV: {
+ case FormatUtils::VERSION_403: {
return newPolicyForV4Dict<Ver4DictConstants, Ver4DictBuffers,
Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>(
headerFilePath, formatVersion, std::move(mmappedBuffer));
@@ -177,11 +178,14 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) {
case FormatUtils::VERSION_2:
case FormatUtils::VERSION_201:
+ AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
+ break;
+ case FormatUtils::VERSION_202:
return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
new PatriciaTriePolicy(std::move(mmappedBuffer)));
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
- case FormatUtils::VERSION_4:
- case FormatUtils::VERSION_4_DEV:
+ case FormatUtils::VERSION_402:
+ case FormatUtils::VERSION_403:
AKLOGE("Given path is a file but the format is version 4. path: %s", path);
break;
default:
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
index 585e87a24..e52706e07 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
@@ -144,17 +144,6 @@ class PtNodeParams {
return PatriciaTrieReadingUtils::isTerminal(mFlags);
}
- AK_FORCE_INLINE bool isBlacklisted() const {
- // Note: this method will be removed in the next change.
- // It is used in getProbabilityOfWord and getWordAttributes for both v402 and v403.
- // * getProbabilityOfWord will be changed to no longer return NOT_A_PROBABILITY
- // when isBlacklisted (i.e. to only check if isNotAWord or isDeleted)
- // * getWordAttributes will be changed to always return blacklisted=false and
- // isPossiblyOffensive according to the function below (instead of the current
- // behaviour of checking if the probability is zero)
- return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags);
- }
-
AK_FORCE_INLINE bool isPossiblyOffensive() const {
return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags);
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
index 66fd18a52..59873612a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
@@ -14,7 +14,6 @@
* limitations under the License.
*/
-
#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h"
#include "defines.h"
@@ -317,8 +316,8 @@ const WordAttributes PatriciaTriePolicy::getWordAttributesInContext(
const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability,
const PtNodeParams &ptNodeParams) const {
- return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(),
- ptNodeParams.getProbability() == 0);
+ return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
+ ptNodeParams.isPossiblyOffensive());
}
int PatriciaTriePolicy::getProbability(const int unigramProbability,
@@ -345,10 +344,9 @@ int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
const PtNodeParams ptNodeParams =
mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
- if (ptNodeParams.isNotAWord() || ptNodeParams.isBlacklisted()) {
- // If this is not a word, or if it's a blacklisted entry, it should behave as
- // having no probability outside of the suggestion process (where it should be used
- // for shortcuts).
+ if (ptNodeParams.isNotAWord()) {
+ // If this is not a word, it should behave as having no probability outside of the
+ // suggestion process (where it should be used for shortcuts).
return NOT_A_PROBABILITY;
}
if (!prevWordIds.empty()) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp
new file mode 100644
index 000000000..b0fbb3e72
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h"
+
+namespace latinime {
+
+// Floors for the context count; they keep probabilities stable when the user's input count is small.
+const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNT_FOR_UNIGRAMS = 8192;
+const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNT_FOR_BIGRAMS = 2;
+const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNT_FOR_TRIGRAMS = 2;
+
+// Encoded backoff weights that backoff() adds to the given n-gram probability.
+// Note that trigrams get a positive value, which means their weight is more than 1.
+// TODO: Apply backoff for main dictionaries and quit giving a positive backoff weight.
+const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHT_FOR_UNIGRAMS = -32;
+const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHT_FOR_BIGRAMS = 0;
+const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHT_FOR_TRIGRAMS = 8;
+
+// Entries whose timestamp is older than this duration are removed from the dictionary during GC.
+const int DynamicLanguageModelProbabilityUtils::DURATION_TO_DISCARD_ENTRY_IN_SECONDS =
+ 300 * 24 * 60 * 60; // 300 days
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h
new file mode 100644
index 000000000..88bc58fe8
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H
+#define LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H
+
+#include <algorithm>
+
+#include "defines.h"
+#include "suggest/core/dictionary/property/historical_info.h"
+#include "utils/time_keeper.h"
+
+namespace latinime {
+
+// Static helpers that derive probabilities from the counts kept in HistoricalInfo and that
+// decide, from its timestamp, which entries are discarded during GC or kept on eviction.
+class DynamicLanguageModelProbabilityUtils {
+ public:
+    // Returns count / max(contextCount, minCount), where minCount is the assumed minimum count
+    // for the n-gram order given by matchedWordCountInContext (1: unigram, 2: bigram,
+    // 3: trigram). Flooring the denominator keeps the result stable when contextCount is small.
+    // Any other order is logged, asserts and yields 0.0f.
+    static float computeRawProbabilityFromCounts(const int count, const int contextCount,
+            const int matchedWordCountInContext) {
+        int minCount = 0;
+        switch (matchedWordCountInContext) {
+            case 1:
+                minCount = ASSUMED_MIN_COUNT_FOR_UNIGRAMS;
+                break;
+            case 2:
+                minCount = ASSUMED_MIN_COUNT_FOR_BIGRAMS;
+                break;
+            case 3:
+                minCount = ASSUMED_MIN_COUNT_FOR_TRIGRAMS;
+                break;
+            default:
+                AKLOGE("computeRawProbabilityFromCounts is called with invalid "
+                        "matchedWordCountInContext (%d).", matchedWordCountInContext);
+                ASSERT(false);
+                return 0.0f;
+        }
+        return static_cast<float>(count) / static_cast<float>(std::max(contextCount, minCount));
+    }
+
+    // Adds the encoded backoff weight for the n-gram order given by matchedWordCountInContext
+    // (1: unigram, 2: bigram, 3: trigram) to ngramProbability, then bounds the sum below by
+    // NOT_A_PROBABILITY and above by MAX_PROBABILITY. Any other order is logged, asserts and
+    // yields NOT_A_PROBABILITY.
+    // NOTE(review): the result is computed as an int but the declared return type is float;
+    // confirm whether int was intended before changing the signature.
+    static float backoff(const int ngramProbability, const int matchedWordCountInContext) {
+        int probability = NOT_A_PROBABILITY;
+
+        switch (matchedWordCountInContext) {
+            case 1:
+                probability = ngramProbability + ENCODED_BACKOFF_WEIGHT_FOR_UNIGRAMS;
+                break;
+            case 2:
+                probability = ngramProbability + ENCODED_BACKOFF_WEIGHT_FOR_BIGRAMS;
+                break;
+            case 3:
+                probability = ngramProbability + ENCODED_BACKOFF_WEIGHT_FOR_TRIGRAMS;
+                break;
+            default:
+                AKLOGE("backoff is called with invalid matchedWordCountInContext (%d).",
+                        matchedWordCountInContext);
+                ASSERT(false);
+                return NOT_A_PROBABILITY;
+        }
+        return std::min(std::max(probability, NOT_A_PROBABILITY), MAX_PROBABILITY);
+    }
+
+    // Returns probability unchanged, except that NOT_A_PROBABILITY is returned when the
+    // entry's timestamp lies after TimeKeeper::peekCurrentTime() (negative elapsed time).
+    static int getDecayedProbability(const int probability, const HistoricalInfo historicalInfo) {
+        const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp();
+        if (elapsedTime < 0) {
+            AKLOGE("The elapsed time is negative value. Timestamp overflow?");
+            return NOT_A_PROBABILITY;
+        }
+        // TODO: Improve this logic.
+        // We don't modify probability depending on the elapsed time.
+        return probability;
+    }
+
+    // Returns nonzero when more than DURATION_TO_DISCARD_ENTRY_IN_SECONDS has elapsed since
+    // the entry's timestamp.
+    // NOTE(review): the result is a boolean expression returned as int; confirm whether bool
+    // was intended before changing the signature.
+    static int shouldRemoveEntryDuringGC(const HistoricalInfo historicalInfo) {
+        // TODO: Improve this logic.
+        const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp();
+        return elapsedTime > DURATION_TO_DISCARD_ENTRY_IN_SECONDS;
+    }
+
+    // Returns the entry's timestamp as its eviction priority.
+    static int getPriorityToPreventFromEviction(const HistoricalInfo historicalInfo) {
+        // TODO: Improve this logic.
+        // More recently input entries get higher priority.
+        return historicalInfo.getTimestamp();
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicLanguageModelProbabilityUtils);
+
+    // The switch statements above only handle orders 1 to 3.
+    static_assert(MAX_PREV_WORD_COUNT_FOR_N_GRAM <= 2, "Max supported Ngram is Trigram.");
+
+    // Floors applied to contextCount in computeRawProbabilityFromCounts().
+    static const int ASSUMED_MIN_COUNT_FOR_UNIGRAMS;
+    static const int ASSUMED_MIN_COUNT_FOR_BIGRAMS;
+    static const int ASSUMED_MIN_COUNT_FOR_TRIGRAMS;
+
+    // Weights added to the n-gram probability in backoff().
+    static const int ENCODED_BACKOFF_WEIGHT_FOR_UNIGRAMS;
+    static const int ENCODED_BACKOFF_WEIGHT_FOR_BIGRAMS;
+    static const int ENCODED_BACKOFF_WEIGHT_FOR_TRIGRAMS;
+
+    // Age threshold used by shouldRemoveEntryDuringGC().
+    static const int DURATION_TO_DISCARD_ENTRY_IN_SECONDS;
+};
+
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp
index 05a3a6356..31b1ea696 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp
@@ -19,11 +19,11 @@
#include <algorithm>
#include <cstring>
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
+#include "suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h"
+#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
namespace latinime {
-const int LanguageModelDictContent::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1;
const int LanguageModelDictContent::TRIE_MAP_BUFFER_INDEX = 0;
const int LanguageModelDictContent::GLOBAL_COUNTERS_BUFFER_INDEX = 1;
@@ -39,7 +39,8 @@ bool LanguageModelDictContent::runGC(
}
const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds,
- const int wordId, const HeaderPolicy *const headerPolicy) const {
+ const int wordId, const bool mustMatchAllPrevWords,
+ const HeaderPolicy *const headerPolicy) const {
int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1];
bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex();
int maxPrevWordCount = 0;
@@ -53,7 +54,15 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr
bitmapEntryIndices[i + 1] = nextBitmapEntryIndex;
}
+ const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId);
+ if (mHasHistoricalInfo && unigramProbabilityEntry.getHistoricalInfo()->getCount() == 0) {
+ // The word should be treated as an invalid word.
+ return WordAttributes();
+ }
for (int i = maxPrevWordCount; i >= 0; --i) {
+ if (mustMatchAllPrevWords && prevWordIds.size() > static_cast<size_t>(i)) {
+ break;
+ }
const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndices[i]);
if (!result.mIsValid) {
continue;
@@ -62,36 +71,39 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr
ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo);
int probability = NOT_A_PROBABILITY;
if (mHasHistoricalInfo) {
- const int rawProbability = ForgettingCurveUtils::decodeProbability(
- probabilityEntry.getHistoricalInfo(), headerPolicy);
- if (rawProbability == NOT_A_PROBABILITY) {
- // The entry should not be treated as a valid entry.
- continue;
- }
+ const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
+ int contextCount = 0;
if (i == 0) {
// unigram
- probability = rawProbability;
+ contextCount = mGlobalCounters.getTotalCount();
} else {
const ProbabilityEntry prevWordProbabilityEntry = getNgramProbabilityEntry(
prevWordIds.skip(1 /* n */).limit(i - 1), prevWordIds[0]);
if (!prevWordProbabilityEntry.isValid()) {
continue;
}
- if (prevWordProbabilityEntry.representsBeginningOfSentence()) {
- probability = rawProbability;
- } else {
- const int prevWordRawProbability = ForgettingCurveUtils::decodeProbability(
- prevWordProbabilityEntry.getHistoricalInfo(), headerPolicy);
- probability = std::min(MAX_PROBABILITY - prevWordRawProbability
- + rawProbability, MAX_PROBABILITY);
+ if (prevWordProbabilityEntry.representsBeginningOfSentence()
+ && historicalInfo->getCount() == 1) {
+ // BoS ngram requires multiple contextCount.
+ continue;
}
+ contextCount = prevWordProbabilityEntry.getHistoricalInfo()->getCount();
}
+ const float rawProbability =
+ DynamicLanguageModelProbabilityUtils::computeRawProbabilityFromCounts(
+ historicalInfo->getCount(), contextCount, i + 1);
+ const int encodedRawProbability =
+ ProbabilityUtils::encodeRawProbability(rawProbability);
+ const int decayedProbability =
+ DynamicLanguageModelProbabilityUtils::getDecayedProbability(
+ encodedRawProbability, *historicalInfo);
+ probability = DynamicLanguageModelProbabilityUtils::backoff(
+ decayedProbability, i + 1 /* n */);
} else {
probability = probabilityEntry.getProbability();
}
// TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in
// probabilityEntry.
- const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId);
return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(),
unigramProbabilityEntry.isNotAWord(),
unigramProbabilityEntry.isPossiblyOffensive());
@@ -167,7 +179,8 @@ void LanguageModelDictContent::exportAllNgramEntriesRelatedToWordInner(
ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo);
if (probabilityEntry.isValid()) {
const WordAttributes wordAttributes = getWordAttributes(
- WordIdArrayView(*prevWordIds), wordId, headerPolicy);
+ WordIdArrayView(*prevWordIds), wordId, true /* mustMatchAllPrevWords */,
+ headerPolicy);
outBummpedFullEntryInfo->emplace_back(*prevWordIds, wordId,
wordAttributes, probabilityEntry);
}
@@ -231,7 +244,7 @@ bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView
return false;
}
mGlobalCounters.updateMaxValueOfCounters(
- updatedUnigramProbabilityEntry.getHistoricalInfo()->getCount());
+ updatedNgramProbabilityEntry.getHistoricalInfo()->getCount());
if (!originalNgramProbabilityEntry.isValid()) {
entryCountersToUpdate->incrementNgramCount(i + 2);
}
@@ -242,10 +255,9 @@ bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView
const ProbabilityEntry LanguageModelDictContent::createUpdatedEntryFrom(
const ProbabilityEntry &originalProbabilityEntry, const bool isValid,
const HistoricalInfo historicalInfo, const HeaderPolicy *const headerPolicy) const {
- const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo(
- originalProbabilityEntry.getHistoricalInfo(), isValid ?
- DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY,
- &historicalInfo, headerPolicy);
+ const HistoricalInfo updatedHistoricalInfo = HistoricalInfo(historicalInfo.getTimestamp(),
+ 0 /* level */, originalProbabilityEntry.getHistoricalInfo()->getCount()
+ + historicalInfo.getCount());
if (originalProbabilityEntry.isValid()) {
return ProbabilityEntry(originalProbabilityEntry.getFlags(), &updatedHistoricalInfo);
} else {
@@ -311,7 +323,7 @@ int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWord
bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex,
const int prevWordCount, const HeaderPolicy *const headerPolicy,
- MutableEntryCounters *const outEntryCounters) {
+ const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) {
for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
if (prevWordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.",
@@ -328,33 +340,41 @@ bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int b
}
continue;
}
- if (mHasHistoricalInfo && !probabilityEntry.representsBeginningOfSentence()
- && probabilityEntry.isValid()) {
- const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
- probabilityEntry.getHistoricalInfo(), headerPolicy);
- if (ForgettingCurveUtils::needsToKeep(&historicalInfo, headerPolicy)) {
- // Update the entry.
- const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(), &historicalInfo);
- if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo),
- bitmapEntryIndex)) {
- return false;
- }
- } else {
+ if (mHasHistoricalInfo && probabilityEntry.isValid()) {
+ const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo();
+ if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC(
+ *originalHistoricalInfo)) {
// Remove the entry.
if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
return false;
}
continue;
}
+ if (needsToHalveCounters) {
+ const int updatedCount = originalHistoricalInfo->getCount() / 2;
+ if (updatedCount == 0) {
+ // Remove the entry.
+ if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
+ return false;
+ }
+ continue;
+ }
+ const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(),
+ originalHistoricalInfo->getLevel(), updatedCount);
+ const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(),
+ &historicalInfoToSave);
+ if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo),
+ bitmapEntryIndex)) {
+ return false;
+ }
+ }
}
- if (!probabilityEntry.representsBeginningOfSentence()) {
- outEntryCounters->incrementNgramCount(prevWordCount + 1);
- }
+ outEntryCounters->incrementNgramCount(prevWordCount + 1);
if (!entry.hasNextLevelMap()) {
continue;
}
if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(),
- prevWordCount + 1, headerPolicy, outEntryCounters)) {
+ prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) {
return false;
}
}
@@ -408,11 +428,11 @@ bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPoli
}
const ProbabilityEntry probabilityEntry =
ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo);
- const int probability = (mHasHistoricalInfo) ?
- ForgettingCurveUtils::decodeProbability(probabilityEntry.getHistoricalInfo(),
- headerPolicy) : probabilityEntry.getProbability();
- outEntryInfo->emplace_back(probability,
- probabilityEntry.getHistoricalInfo()->getTimestamp(),
+ const int priority = mHasHistoricalInfo
+ ? DynamicLanguageModelProbabilityUtils::getPriorityToPreventFromEviction(
+ *probabilityEntry.getHistoricalInfo())
+ : probabilityEntry.getProbability();
+ outEntryInfo->emplace_back(priority, probabilityEntry.getHistoricalInfo()->getCount(),
entry.key(), targetLevel, prevWordIds->data());
}
return true;
@@ -420,11 +440,11 @@ bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPoli
bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()(
const EntryInfoToTurncate &left, const EntryInfoToTurncate &right) const {
- if (left.mProbability != right.mProbability) {
- return left.mProbability < right.mProbability;
+ if (left.mPriority != right.mPriority) {
+ return left.mPriority < right.mPriority;
}
- if (left.mTimestamp != right.mTimestamp) {
- return left.mTimestamp > right.mTimestamp;
+ if (left.mCount != right.mCount) {
+ return left.mCount < right.mCount;
}
if (left.mKey != right.mKey) {
return left.mKey < right.mKey;
@@ -441,10 +461,9 @@ bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()(
return false;
}
-LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int probability,
- const int timestamp, const int key, const int prevWordCount, const int *const prevWordIds)
- : mProbability(probability), mTimestamp(timestamp), mKey(key),
- mPrevWordCount(prevWordCount) {
+LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int priority,
+ const int count, const int key, const int prevWordCount, const int *const prevWordIds)
+ : mPriority(priority), mCount(count), mKey(key), mPrevWordCount(prevWordCount) {
memmove(mPrevWordIds, prevWordIds, mPrevWordCount * sizeof(mPrevWordIds[0]));
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h
index 5b92b96e3..9678c35f9 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h
@@ -151,13 +151,14 @@ class LanguageModelDictContent {
const LanguageModelDictContent *const originalContent);
const WordAttributes getWordAttributes(const WordIdArrayView prevWordIds, const int wordId,
- const HeaderPolicy *const headerPolicy) const;
+ const bool mustMatchAllPrevWords, const HeaderPolicy *const headerPolicy) const;
ProbabilityEntry getProbabilityEntry(const int wordId) const {
return getNgramProbabilityEntry(WordIdArrayView(), wordId);
}
bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) {
+ mGlobalCounters.addToTotalCount(probabilityEntry->getHistoricalInfo()->getCount());
return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry);
}
@@ -180,8 +181,15 @@ class LanguageModelDictContent {
bool updateAllProbabilityEntriesForGC(const HeaderPolicy *const headerPolicy,
MutableEntryCounters *const outEntryCounters) {
- return updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(),
- 0 /* prevWordCount */, headerPolicy, outEntryCounters);
+ if (!updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(),
+ 0 /* prevWordCount */, headerPolicy, mGlobalCounters.needsToHalveCounters(),
+ outEntryCounters)) {
+ return false;
+ }
+ if (mGlobalCounters.needsToHalveCounters()) {
+ mGlobalCounters.halveCounters();
+ }
+ return true;
}
// entryCounts should be created by updateAllProbabilityEntries.
@@ -206,11 +214,12 @@ class LanguageModelDictContent {
DISALLOW_ASSIGNMENT_OPERATOR(Comparator);
};
- EntryInfoToTurncate(const int probability, const int timestamp, const int key,
+ EntryInfoToTurncate(const int priority, const int count, const int key,
const int prevWordCount, const int *const prevWordIds);
- int mProbability;
- int mTimestamp;
+ int mPriority;
+ // TODO: Remove.
+ int mCount;
int mKey;
int mPrevWordCount;
int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1];
@@ -219,8 +228,6 @@ class LanguageModelDictContent {
DISALLOW_DEFAULT_CONSTRUCTOR(EntryInfoToTurncate);
};
- // TODO: Remove
- static const int DUMMY_PROBABILITY_FOR_VALID_WORDS;
static const int TRIE_MAP_BUFFER_INDEX;
static const int GLOBAL_COUNTERS_BUFFER_INDEX;
@@ -233,7 +240,8 @@ class LanguageModelDictContent {
int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds);
int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const;
bool updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount,
- const HeaderPolicy *const headerPolicy, MutableEntryCounters *const outEntryCounters);
+ const HeaderPolicy *const headerPolicy, const bool needsToHalveCounters,
+ MutableEntryCounters *const outEntryCounters);
bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy,
const int maxEntryCount, const int targetLevel, int *const outEntryCount);
bool getEntryInfo(const HeaderPolicy *const headerPolicy, const int targetLevel,
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h
index 9953aa425..283c2691a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h
@@ -63,6 +63,10 @@ class LanguageModelDictContentGlobalCounters {
mTotalCount += 1;
}
+ void addToTotalCount(const int count) {
+ mTotalCount += count;
+ }
+
void updateMaxValueOfCounters(const int count) {
mMaxValueOfCounters = std::max(count, mMaxValueOfCounters);
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h
index f4d340f86..9c4ab18e4 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h
@@ -105,7 +105,7 @@ class ProbabilityEntry {
encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_LEVEL_FIELD_SIZE * CHAR_BIT))
| static_cast<uint8_t>(mHistoricalInfo.getLevel());
encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT))
- | static_cast<uint8_t>(mHistoricalInfo.getCount());
+ | static_cast<uint16_t>(mHistoricalInfo.getCount());
} else {
encodedEntry = (encodedEntry << (Ver4DictConstants::PROBABILITY_SIZE * CHAR_BIT))
| static_cast<uint8_t>(mProbability);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp
index eb6080a24..bd89b8da7 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp
@@ -49,8 +49,8 @@ const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0;
const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4;
const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4;
-const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1;
-const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1;
+const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 0;
+const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 2;
const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1;
const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h
index 600b5ffe4..13d7a5714 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h
@@ -47,6 +47,7 @@ class Ver4DictConstants {
static const int NOT_A_TERMINAL_ADDRESS;
static const int TERMINAL_ID_FIELD_SIZE;
static const int TIME_STAMP_FIELD_SIZE;
+ // TODO: Remove
static const int WORD_LEVEL_FIELD_SIZE;
static const int WORD_COUNT_FIELD_SIZE;
// Flags in probability entry.
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
index d3de322f9..1992d4a5a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
@@ -110,7 +110,7 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
return WordAttributes();
}
return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId,
- mHeaderPolicy);
+ false /* mustMatchAllPrevWords */, mHeaderPolicy);
}
int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
@@ -118,18 +118,13 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI
if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) {
return NOT_A_PROBABILITY;
}
- const ProbabilityEntry probabilityEntry =
- mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(prevWordIds, wordId);
- if (!probabilityEntry.isValid() || probabilityEntry.isBlacklisted()
- || probabilityEntry.isNotAWord()) {
+ const WordAttributes wordAttributes =
+ mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId,
+ true /* mustMatchAllPrevWords */, mHeaderPolicy);
+ if (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()) {
return NOT_A_PROBABILITY;
}
- if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
- return ForgettingCurveUtils::decodeProbability(probabilityEntry.getHistoricalInfo(),
- mHeaderPolicy);
- } else {
- return probabilityEntry.getProbability();
- }
+ return wordAttributes.getProbability();
}
BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator(
@@ -151,10 +146,16 @@ void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordI
if (!probabilityEntry.isValid()) {
continue;
}
- const int probability = probabilityEntry.hasHistoricalInfo() ?
- ForgettingCurveUtils::decodeProbability(
- probabilityEntry.getHistoricalInfo(), mHeaderPolicy) :
- probabilityEntry.getProbability();
+ int probability = NOT_A_PROBABILITY;
+ if (probabilityEntry.hasHistoricalInfo()) {
+ // TODO: Quit checking count here.
+ // If count <= 1, the word can be an invalid word. The actual probability should
+ // be checked using getWordAttributesInContext() in onVisitEntry().
+ probability = probabilityEntry.getHistoricalInfo()->getCount() <= 1 ?
+ NOT_A_PROBABILITY : 0;
+ } else {
+ probability = probabilityEntry.getProbability();
+ }
listener->onVisitEntry(probability, entry.getWordId());
}
}
@@ -386,25 +387,35 @@ bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext().");
return false;
}
+ if (!isValidWord) {
+ return true;
+ }
wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
}
WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
false /* tryLowerCaseSearch */);
- if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID
- && ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
- const UnigramProperty beginningOfSentenceUnigramProperty(
- true /* representsBeginningOfSentence */,
- true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY,
- HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
- if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
- &beginningOfSentenceUnigramProperty)) {
- AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext().");
+ if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
+ if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
+ const UnigramProperty beginningOfSentenceUnigramProperty(
+ true /* representsBeginningOfSentence */,
+ true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY,
+ HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
+ if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
+ &beginningOfSentenceUnigramProperty)) {
+ AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext().");
+ return false;
+ }
+ // Refresh word ids.
+ ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
+ }
+ // Update entries for beginning of sentence.
+ if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(
+ prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isValid */, historicalInfo,
+ mHeaderPolicy, &mEntryCounters)) {
return false;
}
- // Refresh word ids.
- ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
}
if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds,
wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) {
@@ -542,7 +553,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
}
}
const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes(
- WordIdArrayView(), wordId, mHeaderPolicy);
+ WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy);
const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId);
const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
index 9d8e86675..edcb43678 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
@@ -44,13 +44,13 @@ const int DictFileWritingUtils::SIZE_OF_BUFFER_SIZE_FIELD = 4;
TimeKeeper::setCurrentTime();
const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion);
switch (formatVersion) {
- case FormatUtils::VERSION_4:
+ case FormatUtils::VERSION_402:
return createEmptyV4DictFile<backward::v402::Ver4DictConstants,
backward::v402::Ver4DictBuffers,
backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr>(
filePath, localeAsCodePointVector, attributeMap, formatVersion);
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
- case FormatUtils::VERSION_4_DEV:
+ case FormatUtils::VERSION_403:
return createEmptyV4DictFile<Ver4DictConstants, Ver4DictBuffers,
Ver4DictBuffers::Ver4DictBuffersPtr>(
filePath, localeAsCodePointVector, attributeMap, formatVersion);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp
index 0cffe569d..e225c235e 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp
@@ -28,15 +28,17 @@ const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
/* static */ FormatUtils::FORMAT_VERSION FormatUtils::getFormatVersion(const int formatVersion) {
switch (formatVersion) {
case VERSION_2:
- return VERSION_2;
case VERSION_201:
- return VERSION_201;
+ AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
+ return UNKNOWN_VERSION;
+ case VERSION_202:
+ return VERSION_202;
case VERSION_4_ONLY_FOR_TESTING:
return VERSION_4_ONLY_FOR_TESTING;
- case VERSION_4:
- return VERSION_4;
- case VERSION_4_DEV:
- return VERSION_4_DEV;
+ case VERSION_402:
+ return VERSION_402;
+ case VERSION_403:
+ return VERSION_403;
default:
return UNKNOWN_VERSION;
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h
index 96310086b..1616efcce 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h
@@ -31,11 +31,15 @@ class FormatUtils {
public:
enum FORMAT_VERSION {
// These MUST have the same values as the relevant constants in FormatSpec.java.
+ // TODO: Remove VERSION_2 and VERSION_201 when:
+ // * We confirm that old versions of LatinIME download old-format dictionaries
+ // * We no longer need the corresponding constants on the Java side for dicttool
VERSION_2 = 2,
VERSION_201 = 201,
+ VERSION_202 = 202,
VERSION_4_ONLY_FOR_TESTING = 399,
- VERSION_4 = 402,
- VERSION_4_DEV = 403,
+ VERSION_402 = 402,
+ VERSION_403 = 403,
UNKNOWN_VERSION = -1
};
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.cpp
new file mode 100644
index 000000000..e8fa06942
--- /dev/null
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.cpp
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
+
+namespace latinime {
+
+const float ProbabilityUtils::PROBABILITY_ENCODING_SCALER = 8.58923700372f;  // log2f() multiplier.
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h
index 3b339e61a..2050af1e9 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h
@@ -17,6 +17,9 @@
#ifndef LATINIME_PROBABILITY_UTILS_H
#define LATINIME_PROBABILITY_UTILS_H
+#include <algorithm>
+#include <cmath>
+
#include "defines.h"
namespace latinime {
@@ -47,8 +50,20 @@ class ProbabilityUtils {
+ static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize);
}
+ // Encodes a raw probability in the same way as is done for main dictionaries.
+ //
+ // The base-2 logarithm of rawProbability is scaled by PROBABILITY_ENCODING_SCALER and added
+ // to MAX_PROBABILITY; the sum is rounded to the nearest integer and clamped, so the returned
+ // value always lies in [0, MAX_PROBABILITY]. A rawProbability of 1 maps to MAX_PROBABILITY.
+ // Inputs whose encoding is negative or not a number (zero, negative or NaN rawProbability)
+ // yield 0.
+ static AK_FORCE_INLINE int encodeRawProbability(const float rawProbability) {
+     const float maxProbability = static_cast<float>(MAX_PROBABILITY);
+     const float probability = maxProbability
+             + log2f(rawProbability) * PROBABILITY_ENCODING_SCALER;
+     // The negated comparison also rejects NaN, which must never reach the float-to-int
+     // conversion below because converting it is undefined behavior.
+     if (!(probability >= 0.0f)) {
+         return 0;
+     }
+     // Clamp in floating point before converting so that an infinite value is never cast.
+     return static_cast<int>(std::min(probability + 0.5f, maxProbability));
+ }
+
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils);
+
+ static const float PROBABILITY_ENCODING_SCALER;
};
}
#endif /* LATINIME_PROBABILITY_UTILS_H */
diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp
index a6f9a8b23..856808a74 100644
--- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp
+++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp
@@ -24,6 +24,7 @@ const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED = 120;
const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f;
const float ScoringParams::EXACT_MATCH_PROMOTION = 1.1f;
+const float ScoringParams::PERFECT_MATCH_PROMOTION = 1.1f;
const float ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH = 0.01f;
const float ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH = 0.02f;
const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f;
diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.h b/native/jni/src/suggest/policyimpl/typing/scoring_params.h
index b8f889559..6f327a370 100644
--- a/native/jni/src/suggest/policyimpl/typing/scoring_params.h
+++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.h
@@ -34,6 +34,7 @@ class ScoringParams {
static const int THRESHOLD_SHORT_WORD_LENGTH;
static const float EXACT_MATCH_PROMOTION;
+ static const float PERFECT_MATCH_PROMOTION;
static const float CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
static const float ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
static const float DIGRAPH_PENALTY_FOR_EXACT_MATCH;
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h
index 0240bcf54..6acd767ea 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h
+++ b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h
@@ -44,23 +44,50 @@ class TypingScoring : public Scoring {
AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, const int inputSize,
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
- const bool boostExactMatches) const {
+ const bool boostExactMatches, const bool hasProbabilityZero) const {
const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE
+ static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT;
float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - compoundDistance / maxDistance;
if (forceCommit) {
score += ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD;
}
- if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) {
- score += ScoringParams::EXACT_MATCH_PROMOTION;
- if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) {
- score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
+ if (hasProbabilityZero) {
+ // Previously, when both legitimate 0-frequency words (such as distracters) and
+ // offensive words were encoded in the same way, distracters would never show up
+ // when the user blocked offensive words (the default setting, as well as the
+ // setting for regression tests).
+ //
+ // When b/11031090 was fixed and a separate encoding was used for offensive words,
+ // 0-frequency words would no longer be blocked when they were an "exact match"
+ // (where case mismatches and accent mismatches would be considered an "exact
+ // match"). The exact match boosting functionality meant that, for example, when
+ // the user typed "mt" they would be suggested the word "Mt", although they most
+ // probably meant to type "my".
+ //
+ // For this reason, we introduced this change, which does the following:
+ // * Defines the "perfect match" as a really exact match, with no room for case or
+ // accent mismatches
+ // * When the target word has probability zero (as "Mt" does, because it is a
+ // distracter), ONLY boost its score if it is a perfect match.
+ //
+ // By doing this, when the user types "mt", the word "Mt" will NOT be boosted, and
+ // they will get "my". However, if the user makes an explicit effort to type "Mt",
+ // we do boost the word "Mt" so that the user's input is not autocorrected to "My".
+ if (boostExactMatches && ErrorTypeUtils::isPerfectMatch(containedErrorTypes)) {
+ score += ScoringParams::PERFECT_MATCH_PROMOTION;
}
- if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) {
- score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
- }
- if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) {
- score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH;
+ } else {
+ if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) {
+ score += ScoringParams::EXACT_MATCH_PROMOTION;
+ if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) {
+ score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
+ }
+ if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) {
+ score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
+ }
+ if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) {
+ score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH;
+ }
}
}
return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE);
diff --git a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp
index 4469dc715..313a9af10 100644
--- a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp
+++ b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp
@@ -52,16 +52,14 @@ TEST(LanguageModelDictContentTest, TestUnigramProbabilityWithHistoricalInfo) {
const int flag = 0xF0;
const int timestamp = 0x3FFFFFFF;
- const int level = 3;
const int count = 10;
const int wordId = 100;
- const HistoricalInfo historicalInfo(timestamp, level, count);
+ const HistoricalInfo historicalInfo(timestamp, 0 /* level */, count);
const ProbabilityEntry probabilityEntry(flag, &historicalInfo);
languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry);
const ProbabilityEntry entry = languageModelDictContent.getProbabilityEntry(wordId);
EXPECT_EQ(flag, entry.getFlags());
EXPECT_EQ(timestamp, entry.getHistoricalInfo()->getTimestamp());
- EXPECT_EQ(level, entry.getHistoricalInfo()->getLevel());
EXPECT_EQ(count, entry.getHistoricalInfo()->getCount());
// Remove
@@ -108,14 +106,14 @@ TEST(LanguageModelDictContentTest, TestGetWordProbability) {
languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(1), wordId,
&bigramProbabilityEntry);
EXPECT_EQ(bigramProbability, languageModelDictContent.getWordAttributes(prevWordIds, wordId,
- nullptr /* headerPolicy */).getProbability());
+ false /* mustMatchAllPrevWords */, nullptr /* headerPolicy */).getProbability());
const ProbabilityEntry trigramProbabilityEntry(flag, trigramProbability);
languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(1),
prevWordIds[1], &probabilityEntry);
languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(2), wordId,
&trigramProbabilityEntry);
EXPECT_EQ(trigramProbability, languageModelDictContent.getWordAttributes(prevWordIds, wordId,
- nullptr /* headerPolicy */).getProbability());
+ false /* mustMatchAllPrevWords */, nullptr /* headerPolicy */).getProbability());
}
} // namespace
diff --git a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp
index 260b347ce..eb78034ba 100644
--- a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp
+++ b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp
@@ -39,20 +39,18 @@ TEST(ProbabilityEntryTest, TestEncodeDecode) {
TEST(ProbabilityEntryTest, TestEncodeDecodeWithHistoricalInfo) {
const int flag = 0xF0;
const int timestamp = 0x3FFFFFFF;
- const int level = 3;
- const int count = 10;
+ const int count = 0xABCD;  // Needs both bytes of the 2-byte count field.
- const HistoricalInfo historicalInfo(timestamp, level, count);
+ const HistoricalInfo historicalInfo(timestamp, 0 /* level */, count);
const ProbabilityEntry entry(flag, &historicalInfo);
const uint64_t encodedEntry = entry.encode(true /* hasHistoricalInfo */);
- EXPECT_EQ(0xF03FFFFFFF030Aull, encodedEntry);
+ EXPECT_EQ(0xF03FFFFFFFABCDull, encodedEntry);  // flag | timestamp | count
const ProbabilityEntry decodedEntry =
ProbabilityEntry::decode(encodedEntry, true /* hasHistoricalInfo */);
EXPECT_EQ(flag, decodedEntry.getFlags());
EXPECT_EQ(timestamp, decodedEntry.getHistoricalInfo()->getTimestamp());
- EXPECT_EQ(level, decodedEntry.getHistoricalInfo()->getLevel());
EXPECT_EQ(count, decodedEntry.getHistoricalInfo()->getCount());
}
diff --git a/native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp
index 15f560cd1..494200568 100644
--- a/native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp
+++ b/native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp
@@ -62,14 +62,14 @@ TEST(FormatUtilsTest, TestDetectFormatVersion) {
}
{
const std::vector<uint8_t> buffer =
- getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_4, 0, 0);
- EXPECT_EQ(FormatUtils::VERSION_4, FormatUtils::detectFormatVersion(
+ getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_402, 0, 0);
+ EXPECT_EQ(FormatUtils::VERSION_402, FormatUtils::detectFormatVersion(
ReadOnlyByteArrayView(buffer.data(), buffer.size())));
}
{
const std::vector<uint8_t> buffer =
- getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_4_DEV, 0, 0);
- EXPECT_EQ(FormatUtils::VERSION_4_DEV, FormatUtils::detectFormatVersion(
+ getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_403, 0, 0);
+ EXPECT_EQ(FormatUtils::VERSION_403, FormatUtils::detectFormatVersion(
ReadOnlyByteArrayView(buffer.data(), buffer.size())));
}
diff --git a/native/jni/tests/suggest/policyimpl/dictionary/utils/probability_utils_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/utils/probability_utils_test.cpp
new file mode 100644
index 000000000..be1f278c6
--- /dev/null
+++ b/native/jni/tests/suggest/policyimpl/dictionary/utils/probability_utils_test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
+
+#include <gtest/gtest.h>
+
+#include "defines.h"
+
+namespace latinime {
+namespace {
+
+TEST(ProbabilityUtilsTest, TestEncodeRawProbability) {  // MAX + log2(p) * ~8.59, rounded.
+ EXPECT_EQ(MAX_PROBABILITY, ProbabilityUtils::encodeRawProbability(1.0f));  // log2(1) = 0
+ EXPECT_EQ(MAX_PROBABILITY - 9, ProbabilityUtils::encodeRawProbability(0.5f));  // -8.59 -> -9
+ EXPECT_EQ(0, ProbabilityUtils::encodeRawProbability(0.0f));  // log2(0) = -inf, clamped to 0
+}
+
+} // namespace
+} // namespace latinime