diff options
Diffstat (limited to 'native')
47 files changed, 824 insertions, 180 deletions
diff --git a/native/dicttoolkit/Android.mk b/native/dicttoolkit/Android.mk new file mode 100644 index 000000000..118682dfc --- /dev/null +++ b/native/dicttoolkit/Android.mk @@ -0,0 +1,67 @@ +# Copyright (C) 2014 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ifeq (,$(TARGET_BUILD_APPS)) + +# Only build if it's explicitly requested, or running mm/mmm. +ifneq ($(ONE_SHOT_MAKEFILE)$(filter $(MAKECMDGOALS),dicttoolkit),) + +# HACK: Temporarily disable host tool build on Mac until the build system is ready for C++11. +LATINIME_HOST_OSNAME := $(shell uname -s) +ifneq ($(LATINIME_HOST_OSNAME), Darwin) # TODO: Remove this + +LOCAL_PATH := $(call my-dir) + +include $(CLEAR_VARS) + +LATIN_IME_CORE_PATH := $(LOCAL_PATH)/../jni + +LATIN_IME_DICT_TOOLKIT_SRC_DIR := src +LATIN_IME_CORE_SRC_DIR := ../jni/src + +LOCAL_CFLAGS += -Werror -Wall -Wextra -Weffc++ -Wformat=2 -Wcast-qual -Wcast-align \ + -Wwrite-strings -Wfloat-equal -Wpointer-arith -Winit-self -Wredundant-decls \ + -Woverloaded-virtual -Wsign-promo -Wno-system-headers + +# To suppress compiler warnings for unused variables/functions used for debug features etc. +LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function +LOCAL_CFLAGS += -std=c++11 -Wno-unused-parameter -Wno-unused-function + +include $(LOCAL_PATH)/NativeFileList.mk +include $(LATIN_IME_CORE_PATH)/NativeFileList.mk + +LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_DICT_TOOLKIT_SRC_DIR) \ + $(LATIN_IME_CORE_PATH)/$(LATIN_IME_CORE_SRC_DIR) + +LOCAL_SRC_FILES := $(LATIN_IME_DICT_TOOLKIT_MAIN_SRC_FILES) \ + $(addprefix $(LATIN_IME_DICT_TOOLKIT_SRC_DIR)/, $(LATIN_IME_DICT_TOOLKIT_SRC_FILES)) \ + $(addprefix $(LATIN_IME_CORE_SRC_DIR)/, $(LATIN_IME_CORE_SRC_FILES)) + +LOCAL_MODULE := dicttoolkit +LOCAL_MODULE_TAGS := optional + +LOCAL_CLANG := true +LOCAL_CXX_STL := libc++ + +include $(BUILD_HOST_EXECUTABLE) +#################### Clean up the tmp vars +include $(LOCAL_PATH)/CleanupNativeFileList.mk +#################### Unit test +include $(LOCAL_PATH)/UnitTests.mk + +endif # Darwin - TODO: Remove this + +endif + +endif # TARGET_BUILD_APPS diff --git a/native/dicttoolkit/CleanupNativeFileList.mk b/native/dicttoolkit/CleanupNativeFileList.mk new file mode 100644 index 000000000..b804b41ed --- /dev/null +++ b/native/dicttoolkit/CleanupNativeFileList.mk @@ -0,0 +1,17 @@ +# Copyright (C) 2014 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LATIN_IME_DICT_TOOLKIT_MAIN_SRC_FILES := +LATIN_IME_DICT_TOOLKIT_SRC_FILES := +LATIN_IME_DICT_TOOLKIT_TEST_FILES := diff --git a/native/dicttoolkit/NativeFileList.mk b/native/dicttoolkit/NativeFileList.mk new file mode 100644 index 000000000..b6be9c541 --- /dev/null +++ b/native/dicttoolkit/NativeFileList.mk @@ -0,0 +1,21 @@ +# Copyright (C) 2014 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LATIN_IME_DICT_TOOLKIT_MAIN_SRC_FILES := \ + dict_toolkit_main.cpp + +LATIN_IME_DICT_TOOLKIT_SRC_FILES := + +LATIN_IME_DICT_TOOLKIT_TEST_FILES := \ + dict_toolkit_defines_test.cpp diff --git a/native/dicttoolkit/UnitTests.mk b/native/dicttoolkit/UnitTests.mk new file mode 100644 index 000000000..d568db44a --- /dev/null +++ b/native/dicttoolkit/UnitTests.mk @@ -0,0 +1,68 @@ +# Copyright (C) 2014 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ifeq (,$(TARGET_BUILD_APPS)) + +LOCAL_PATH := $(call my-dir) + +###################################### +include $(CLEAR_VARS) + +LATIN_IME_CORE_PATH := $(LOCAL_PATH)/../jni + +LATIN_IME_DICT_TOOLKIT_SRC_DIR := src +LATIN_IME_CORE_SRC_DIR := ../jni/src +LATIN_DICT_TOOLKIT_TEST_SRC_DIR := tests + +include $(LOCAL_PATH)/NativeFileList.mk +include $(LATIN_IME_CORE_PATH)/NativeFileList.mk + +# TODO: Remove -std=c++11 once it is set by default on host build. +LATIN_IME_SRC_DIR := src +LOCAL_ADDRESS_SANITIZER := true +LOCAL_CFLAGS += -std=c++11 -Wno-unused-parameter -Wno-unused-function +LOCAL_CLANG := true +LOCAL_CXX_STL := libc++ +LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_DICT_TOOLKIT_SRC_DIR) \ + $(LATIN_IME_CORE_PATH)/$(LATIN_IME_CORE_SRC_DIR) +LOCAL_MODULE := liblatinime_dicttoolkit_host_static_for_unittests +LOCAL_MODULE_TAGS := optional +LOCAL_SRC_FILES := \ + $(addprefix $(LATIN_IME_DICT_TOOLKIT_SRC_DIR)/, $(LATIN_IME_DICT_TOOLKIT_SRC_FILES)) \ + $(addprefix $(LATIN_IME_CORE_SRC_DIR)/, $(LATIN_IME_CORE_SRC_FILES)) +include $(BUILD_HOST_STATIC_LIBRARY) + +include $(CLEAR_VARS) + +# TODO: Remove -std=c++11 once it is set by default on host build. +LOCAL_CFLAGS += -std=c++11 -Wno-unused-parameter -Wno-unused-function +LOCAL_CLANG := true +LOCAL_CXX_STL := libc++ +LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_DICT_TOOLKIT_SRC_DIR) \ + $(LATIN_IME_CORE_PATH)/$(LATIN_IME_CORE_SRC_DIR) +LOCAL_MODULE := dicttoolkit_unittests +LOCAL_MODULE_TAGS := tests +LOCAL_SRC_FILES := \ + $(addprefix $(LATIN_DICT_TOOLKIT_TEST_SRC_DIR)/, $(LATIN_IME_DICT_TOOLKIT_TEST_FILES)) +LOCAL_STATIC_LIBRARIES += liblatinime_dicttoolkit_host_static_for_unittests +include $(BUILD_HOST_NATIVE_TEST) + +include $(LOCAL_PATH)/CleanupNativeFileList.mk + +#################### Clean up the tmp vars +LATINIME_HOST_OSNAME := +LATIN_IME_SRC_DIR := +LATIN_IME_TEST_SRC_DIR := + +endif # TARGET_BUILD_APPS diff --git a/native/dicttoolkit/dict_toolkit_main.cpp b/native/dicttoolkit/dict_toolkit_main.cpp new file mode 100644 index 000000000..d71b50eb4 --- /dev/null +++ b/native/dicttoolkit/dict_toolkit_main.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstdio> + +int main(int argc, char **argv) { + // TODO: Implement. + printf("%s\n", argv[0]); + return 0; +} diff --git a/native/dicttoolkit/run_tests.sh b/native/dicttoolkit/run_tests.sh new file mode 100755 index 000000000..44c99c144 --- /dev/null +++ b/native/dicttoolkit/run_tests.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2014, The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# check script arguments +if [[ $(type -t mmm) != function ]]; then +if [[ ${BASH_SOURCE[0]} != $0 ]]; then return; else exit 1; fi +fi + +# Host build is never supported in unbundled (NDK/tapas) build +if [[ -n $TARGET_BUILD_APPS ]]; then + echo "Host build is never supported in tapas build." 1>&2 + echo "Use lunch command instead." 1>&2 + if [[ ${BASH_SOURCE[0]} != $0 ]]; then return; else exit 1; fi +fi + +test_name=dicttoolkit_unittests + +pushd $PWD > /dev/null +cd $(gettop) +(mmm -j16 packages/inputmethods/LatinIME/native/dicttoolkit) || (make -j16 $test_name) +$ANDROID_HOST_OUT/bin/$test_name +popd > /dev/null diff --git a/native/dicttoolkit/src/dict_toolkit_defines.h b/native/dicttoolkit/src/dict_toolkit_defines.h new file mode 100644 index 000000000..2a2104e26 --- /dev/null +++ b/native/dicttoolkit/src/dict_toolkit_defines.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICT_TOOLKIT_DEFINES_H +#define LATINIME_DICT_TOOLKIT_DEFINES_H + +#include "defines.h" + +#endif // LATINIME_DICT_TOOLKIT_DEFINES_H diff --git a/native/dicttoolkit/tests/dict_toolkit_defines_test.cpp b/native/dicttoolkit/tests/dict_toolkit_defines_test.cpp new file mode 100644 index 000000000..3445bd0c5 --- /dev/null +++ b/native/dicttoolkit/tests/dict_toolkit_defines_test.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dict_toolkit_defines.h" + +#include <gtest/gtest.h> + +namespace latinime { +namespace dicttoolkit { +namespace { + +// Initial trivial test case. +TEST(DictToolkitDefinesTest, TestKeycodeSpace) { + EXPECT_EQ(' ', KEYCODE_SPACE); +} + +} // namespace +} // namespace dicttoolkit +} // namespace latinime diff --git a/native/jni/NativeFileList.mk b/native/jni/NativeFileList.mk index ca40ba8a5..55bb68344 100644 --- a/native/jni/NativeFileList.mk +++ b/native/jni/NativeFileList.mk @@ -71,6 +71,7 @@ LATIN_IME_CORE_SRC_FILES := \ ver4_patricia_trie_writing_helper.cpp \ ver4_pt_node_array_reader.cpp) \ $(addprefix suggest/policyimpl/dictionary/structure/v4/content/, \ + dynamic_language_model_probability_utils.cpp \ language_model_dict_content.cpp \ language_model_dict_content_global_counters.cpp \ shortcut_dict_content.cpp \ @@ -84,6 +85,7 @@ LATIN_IME_CORE_SRC_FILES := \ forgetting_curve_utils.cpp \ format_utils.cpp \ mmapped_buffer.cpp \ + probability_utils.cpp \ sparse_table.cpp \ trie_map.cpp ) \ suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp \ @@ -135,6 +137,7 @@ LATIN_IME_CORE_TEST_FILES := \ suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer_test.cpp \ suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp \ suggest/policyimpl/dictionary/utils/format_utils_test.cpp \ + suggest/policyimpl/dictionary/utils/probability_utils_test.cpp \ suggest/policyimpl/dictionary/utils/sparse_table_test.cpp \ suggest/policyimpl/dictionary/utils/trie_map_test.cpp \ suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy_test.cpp \ diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index bfe17cc4c..6a5df9d95 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -81,6 +81,9 @@ void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbabi } const WordAttributes wordAttributes = mDictStructurePolicy->getWordAttributesInContext( mPrevWordIds, targetWordId, nullptr /* multiBigramMap */); + if (wordAttributes.getProbability() == NOT_A_PROBABILITY) { + return; + } mSuggestionResults->addPrediction(targetWordCodePoints, codePointCount, wordAttributes.getProbability()); } diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp index 1e2494e92..8f07ce275 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp @@ -31,6 +31,7 @@ const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x100; const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH = NOT_AN_ERROR | MATCH_WITH_WRONG_CASE | MATCH_WITH_MISSING_ACCENT | MATCH_WITH_DIGRAPH; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_A_PERFECT_MATCH = NOT_AN_ERROR; const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.h b/native/jni/src/suggest/core/dictionary/error_type_utils.h index fd1d5fcff..e92c509fa 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.h +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.h @@ -52,6 +52,10 @@ class ErrorTypeUtils { return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0; } + static bool isPerfectMatch(const ErrorType containedErrorTypes) { + return (containedErrorTypes & ~ERRORS_TREATED_AS_A_PERFECT_MATCH) == 0; + } + static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) { return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0; @@ -73,6 +77,7 @@ class ErrorTypeUtils { DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils); static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH; + static const ErrorType ERRORS_TREATED_AS_A_PERFECT_MATCH; static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION; }; } // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/ngram_listener.h b/native/jni/src/suggest/core/dictionary/ngram_listener.h index e9b3c1aaf..2eb5e9fd1 100644 --- a/native/jni/src/suggest/core/dictionary/ngram_listener.h +++ b/native/jni/src/suggest/core/dictionary/ngram_listener.h @@ -26,6 +26,8 @@ namespace latinime { */ class NgramListener { public: + // ngramProbability is always 0 for v403 decaying dictionary. + // TODO: Remove ngramProbability. virtual void onVisitEntry(const int ngramProbability, const int targetWordId) = 0; virtual ~NgramListener() {}; diff --git a/native/jni/src/suggest/core/dictionary/property/historical_info.h b/native/jni/src/suggest/core/dictionary/property/historical_info.h index f9bd6fd8c..e5ce1ea25 100644 --- a/native/jni/src/suggest/core/dictionary/property/historical_info.h +++ b/native/jni/src/suggest/core/dictionary/property/historical_info.h @@ -38,6 +38,7 @@ class HistoricalInfo { return mTimestamp; } + // TODO: Remove int getLevel() const { return mLevel; } diff --git a/native/jni/src/suggest/core/policy/scoring.h b/native/jni/src/suggest/core/policy/scoring.h index ce3684a1c..b9dda83ad 100644 --- a/native/jni/src/suggest/core/policy/scoring.h +++ b/native/jni/src/suggest/core/policy/scoring.h @@ -30,7 +30,7 @@ class Scoring { public: virtual int calculateFinalScore(const float compoundDistance, const int inputSize, const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit, - const bool boostExactMatches) const = 0; + const bool boostExactMatches, const bool hasProbabilityZero) const = 0; virtual void getMostProbableString(const DicTraverseSession *const traverseSession, const float weightOfLangModelVsSpatialModel, SuggestionResults *const outSuggestionResults) const = 0; diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp index 3283f6deb..74db95953 100644 --- a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp +++ b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp @@ -76,6 +76,52 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults); } +/* static */ bool SuggestionsOutputUtils::shouldBlockWord( + const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode, + const WordAttributes wordAttributes, const bool isLastWord) { + const bool currentWordExactMatch = + ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); + // When we have to block offensive words, non-exact matched offensive words should not be + // output. + const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords(); + + const bool isBlockedOffensiveWord = shouldBlockOffensiveWords && + wordAttributes.isPossiblyOffensive(); + + // This function is called in two situations: + // + // 1) At the end of a search, in which case terminalDicNode will point to the last DicNode + // of the search, and isLastWord will be true. + // "fuck" + // | + // \ terminalDicNode (isLastWord=true, currentWordExactMatch=true) + // In this case, if the current word is an exact match, we will always let the word + // through, even if the user is blocking offensive words (it's exactly what they typed!) + // + // 2) In the middle of the search, when we hit a terminal node, to decide whether or not + // to start a new search at root, to try to match the rest of the input. In this case, + // terminalDicNode will point to the terminal node we just hit, and isLastWord will be + // false. + // "fuckvthis" + // | + // \ terminalDicNode (isLastWord=false, currentWordExactMatch=true) + // + // In this case, we should NOT allow the match through (correcting "fuckthis" to "fuck this" + // when offensive words are blocked would be a bad idea). + // + // In the case of a multi-word correction where the offensive word is typed last (eg. + // for the input "allfuck"), this function will be called with isLastWord==true, but + // currentWordExactMatch==false. So we are OK in this case as well. + // "allfuck" + // | + // \ terminalDicNode (isLastWord=true, currentWordExactMatch=false) + if (isLastWord && currentWordExactMatch) { + return false; + } else { + return isBlockedOffensiveWord; + } +} + /* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode( const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel, @@ -98,24 +144,16 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; const bool isExactMatchWithIntentionalOmission = ErrorTypeUtils::isExactMatchWithIntentionalOmission( terminalDicNode->getContainedErrorTypes()); - const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); - // Heuristic: We exclude probability=0 first-char-uppercase words from exact match. - // (e.g. "AMD" and "and") - const bool isSafeExactMatch = isExactMatch - && !(wordAttributes.isPossiblyOffensive() && isFirstCharUppercase); const int outputTypeFlags = (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) - | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) + | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) | (isExactMatchWithIntentionalOmission ? Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0); - // Entries that are blacklisted or do not represent a word should not be output. const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()); - // When we have to block offensive words, non-exact matched offensive words should not be - // output. - const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords(); - const bool isBlockedOffensiveWord = blockOffensiveWords && wordAttributes.isPossiblyOffensive() - && !isSafeExactMatch; + + const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(), + terminalDicNode, wordAttributes, true /* isLastWord */); // Increase output score of top typing suggestion to ensure autocorrection. // TODO: Better integration with java side autocorrection logic. @@ -123,11 +161,11 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; compoundDistance, traverseSession->getInputSize(), terminalDicNode->getContainedErrorTypes(), (forceCommitMultiWords && terminalDicNode->hasMultipleWords()), - boostExactMatches); + boostExactMatches, wordAttributes.getProbability() == 0); // Don't output invalid or blocked offensive words. However, we still need to submit their // shortcuts if any. - if (isValidWord && !isBlockedOffensiveWord) { + if (isValidWord && !shouldBlockThisWord) { int codePoints[MAX_WORD_LENGTH]; terminalDicNode->outputResult(codePoints); const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ? diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.h b/native/jni/src/suggest/core/result/suggestions_output_utils.h index bf8497828..eca1f78b2 100644 --- a/native/jni/src/suggest/core/result/suggestions_output_utils.h +++ b/native/jni/src/suggest/core/result/suggestions_output_utils.h @@ -18,6 +18,7 @@ #define LATINIME_SUGGESTIONS_OUTPUT_UTILS #include "defines.h" +#include "suggest/core/dictionary/word_attributes.h" namespace latinime { @@ -25,11 +26,19 @@ class BinaryDictionaryShortcutIterator; class DicNode; class DicTraverseSession; class Scoring; +class SuggestOptions; class SuggestionResults; class SuggestionsOutputUtils { public: /** + * Returns true if we should block the incoming word, in the context of the user's + * preferences to include or not include possibly offensive words + */ + static bool shouldBlockWord(const SuggestOptions *const suggestOptions, + const DicNode *const terminalDicNode, const WordAttributes wordAttributes, + const bool isLastWord); + /** * Outputs the final list of suggestions (i.e., terminal nodes). */ static void outputSuggestions(const Scoring *const scoringPolicy, diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp index 68a36454e..c372d668b 100644 --- a/native/jni/src/suggest/core/suggest.cpp +++ b/native/jni/src/suggest/core/suggest.cpp @@ -416,6 +416,11 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode traverseSession->getDictionaryStructurePolicy()->getWordAttributesInContext( dicNode->getPrevWordIds(), dicNode->getWordId(), traverseSession->getMultiBigramMap()); + if (SuggestionsOutputUtils::shouldBlockWord(traverseSession->getSuggestOptions(), + dicNode, wordAttributes, false /* isLastWord */)) { + return; + } + if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode, wordAttributes.getProbability())) { return; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 44c2f443f..7a5acd7d5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -134,15 +134,17 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { // same so we use them for both here. switch (mDictFormatVersion) { case FormatUtils::VERSION_2: - return FormatUtils::VERSION_2; case FormatUtils::VERSION_201: - return FormatUtils::VERSION_201; + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + return FormatUtils::UNKNOWN_VERSION; + case FormatUtils::VERSION_202: + return FormatUtils::VERSION_202; case FormatUtils::VERSION_4_ONLY_FOR_TESTING: return FormatUtils::VERSION_4_ONLY_FOR_TESTING; - case FormatUtils::VERSION_4: - return FormatUtils::VERSION_4; - case FormatUtils::VERSION_4_DEV: - return FormatUtils::VERSION_4_DEV; + case FormatUtils::VERSION_402: + return FormatUtils::VERSION_402; + case FormatUtils::VERSION_403: + return FormatUtils::VERSION_403; default: return FormatUtils::UNKNOWN_VERSION; } @@ -245,7 +247,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { } bool supportsBeginningOfSentence() const { - return mDictFormatVersion >= FormatUtils::VERSION_4; + return mDictFormatVersion >= FormatUtils::VERSION_402; } const int *getCodePointTable() const { diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp index 41a8b13b8..19ed0d468 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp @@ -111,11 +111,12 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; switch (version) { case FormatUtils::VERSION_2: case FormatUtils::VERSION_201: - // Version 2 or 201 dictionary writing is not supported. + case FormatUtils::VERSION_202: + // None of the static dictionaries (v2x) support writing return false; case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_4: - case FormatUtils::VERSION_4_DEV: + case FormatUtils::VERSION_402: + case FormatUtils::VERSION_403: return buffer->writeUintAndAdvancePosition(version /* data */, HEADER_DICTIONARY_VERSION_SIZE, writingPos); default: diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp index 9e1adff70..15ac88319 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp @@ -65,6 +65,8 @@ const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( (encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ? Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId; if (mHasHistoricalInfo) { + // Hack for better migration. + count += level; const HistoricalInfo historicalInfo(timestamp, level, count); return BigramEntry(hasNext, probability, &historicalInfo, targetTerminalId); } else { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp index ef6166ffd..61ef4aa42 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp @@ -50,7 +50,8 @@ const ProbabilityEntry ProbabilityDictContent::getProbabilityEntry(const int ter Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &entryPos); const int count = buffer->readUintAndAdvancePosition( Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &entryPos); - const HistoricalInfo historicalInfo(timestamp, level, count); + // Hack for better migration. + const HistoricalInfo historicalInfo(timestamp, level, count + level); return ProbabilityEntry(flags, probability, &historicalInfo); } else { return ProbabilityEntry(flags, probability); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp index 08e39ce43..9455222dd 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp @@ -140,7 +140,7 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability, const PtNodeParams &ptNodeParams) const { - return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(), + return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), ptNodeParams.getProbability() == 0); } @@ -164,7 +164,7 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI } const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); - if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { + if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) { return NOT_A_PROBABILITY; } if (prevWordIds.empty()) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp index 372c9e36f..9a9a21b6b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp @@ -58,7 +58,7 @@ namespace latinime { const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion); switch (dictFormatVersion) { - case FormatUtils::VERSION_4: { + case FormatUtils::VERSION_402: { return newPolicyForOnMemoryV4Dict<backward::v402::Ver4DictConstants, backward::v402::Ver4DictBuffers, backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr, @@ -66,7 +66,7 @@ namespace latinime { dictFormatVersion, locale, attributeMap); } case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_4_DEV: { + case FormatUtils::VERSION_403: { return newPolicyForOnMemoryV4Dict<Ver4DictConstants, Ver4DictBuffers, Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>( dictFormatVersion, locale, attributeMap); @@ -115,9 +115,10 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str switch (formatVersion) { case FormatUtils::VERSION_2: case FormatUtils::VERSION_201: - AKLOGE("Given path is a directory but the format is version 2 or 201. path: %s", path); + case FormatUtils::VERSION_202: + AKLOGE("Given path is a directory but the format is version 2xx. path: %s", path); break; - case FormatUtils::VERSION_4: { + case FormatUtils::VERSION_402: { return newPolicyForV4Dict<backward::v402::Ver4DictConstants, backward::v402::Ver4DictBuffers, backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr, @@ -125,7 +126,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str headerFilePath, formatVersion, std::move(mmappedBuffer)); } case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_4_DEV: { + case FormatUtils::VERSION_403: { return newPolicyForV4Dict<Ver4DictConstants, Ver4DictBuffers, Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>( headerFilePath, formatVersion, std::move(mmappedBuffer)); @@ -177,11 +178,14 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) { case FormatUtils::VERSION_2: case FormatUtils::VERSION_201: + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + break; + case FormatUtils::VERSION_202: return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( new PatriciaTriePolicy(std::move(mmappedBuffer))); case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_4: - case FormatUtils::VERSION_4_DEV: + case FormatUtils::VERSION_402: + case FormatUtils::VERSION_403: AKLOGE("Given path is a file but the format is version 4. path: %s", path); break; default: diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h index 585e87a24..e52706e07 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h @@ -144,17 +144,6 @@ class PtNodeParams { return PatriciaTrieReadingUtils::isTerminal(mFlags); } - AK_FORCE_INLINE bool isBlacklisted() const { - // Note: this method will be removed in the next change. - // It is used in getProbabilityOfWord and getWordAttributes for both v402 and v403. - // * getProbabilityOfWord will be changed to no longer return NOT_A_PROBABILITY - // when isBlacklisted (i.e. to only check if isNotAWord or isDeleted) - // * getWordAttributes will be changed to always return blacklisted=false and - // isPossiblyOffensive according to the function below (instead of the current - // behaviour of checking if the probability is zero) - return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags); - } - AK_FORCE_INLINE bool isPossiblyOffensive() const { return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp index 66fd18a52..59873612a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ - #include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h" #include "defines.h" @@ -317,8 +316,8 @@ const WordAttributes PatriciaTriePolicy::getWordAttributesInContext( const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability, const PtNodeParams &ptNodeParams) const { - return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(), - ptNodeParams.getProbability() == 0); + return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), + ptNodeParams.isPossiblyOffensive()); } int PatriciaTriePolicy::getProbability(const int unigramProbability, @@ -345,10 +344,9 @@ int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); const PtNodeParams ptNodeParams = mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - if (ptNodeParams.isNotAWord() || ptNodeParams.isBlacklisted()) { - // If this is not a word, or if it's a blacklisted entry, it should behave as - // having no probability outside of the suggestion process (where it should be used - // for shortcuts). + if (ptNodeParams.isNotAWord()) { + // If this is not a word, it should behave as having no probability outside of the + // suggestion process (where it should be used for shortcuts). return NOT_A_PROBABILITY; } if (!prevWordIds.empty()) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp new file mode 100644 index 000000000..b0fbb3e72 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h" + +namespace latinime { + +// These counts are used to provide stable probabilities even if the user's input count is small. +const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNT_FOR_UNIGRAMS = 8192; +const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNT_FOR_BIGRAMS = 2; +const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNT_FOR_TRIGRAMS = 2; + +// These are encoded backoff weights. +// Note that we give positive value for trigrams that means the weight is more than 1. +// TODO: Apply backoff for main dictionaries and quit giving a positive backoff weight. +const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHT_FOR_UNIGRAMS = -32; +const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHT_FOR_BIGRAMS = 0; +const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHT_FOR_TRIGRAMS = 8; + +// This value is used to remove too old entries from the dictionary. +const int DynamicLanguageModelProbabilityUtils::DURATION_TO_DISCARD_ENTRY_IN_SECONDS = + 300 * 24 * 60 * 60; // 300 days + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h new file mode 100644 index 000000000..88bc58fe8 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H +#define LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H + +#include <algorithm> + +#include "defines.h" +#include "suggest/core/dictionary/property/historical_info.h" +#include "utils/time_keeper.h" + +namespace latinime { + +class DynamicLanguageModelProbabilityUtils { + public: + static float computeRawProbabilityFromCounts(const int count, const int contextCount, + const int matchedWordCountInContext) { + int minCount = 0; + switch (matchedWordCountInContext) { + case 1: + minCount = ASSUMED_MIN_COUNT_FOR_UNIGRAMS; + break; + case 2: + minCount = ASSUMED_MIN_COUNT_FOR_BIGRAMS; + break; + case 3: + minCount = ASSUMED_MIN_COUNT_FOR_TRIGRAMS; + break; + default: + AKLOGE("computeRawProbabilityFromCounts is called with invalid " + "matchedWordCountInContext (%d).", matchedWordCountInContext); + ASSERT(false); + return 0.0f; + } + return static_cast<float>(count) / static_cast<float>(std::max(contextCount, minCount)); + } + + static float backoff(const int ngramProbability, const int matchedWordCountInContext) { + int probability = NOT_A_PROBABILITY; + + switch (matchedWordCountInContext) { + case 1: + probability = ngramProbability + ENCODED_BACKOFF_WEIGHT_FOR_UNIGRAMS; + break; + case 2: + probability = ngramProbability + ENCODED_BACKOFF_WEIGHT_FOR_BIGRAMS; + break; + case 3: + probability = ngramProbability + ENCODED_BACKOFF_WEIGHT_FOR_TRIGRAMS; + break; + default: + AKLOGE("backoff is called with invalid matchedWordCountInContext (%d).", + matchedWordCountInContext); + ASSERT(false); + return NOT_A_PROBABILITY; + } + return std::min(std::max(probability, NOT_A_PROBABILITY), MAX_PROBABILITY); + } + + static int getDecayedProbability(const int probability, const HistoricalInfo historicalInfo) { + const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp(); + if (elapsedTime < 0) { + AKLOGE("The elapsed time is negatime value. Timestamp overflow?"); + return NOT_A_PROBABILITY; + } + // TODO: Improve this logic. + // We don't modify probability depending on the elapsed time. + return probability; + } + + static int shouldRemoveEntryDuringGC(const HistoricalInfo historicalInfo) { + // TODO: Improve this logic. + const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp(); + return elapsedTime > DURATION_TO_DISCARD_ENTRY_IN_SECONDS; + } + + static int getPriorityToPreventFromEviction(const HistoricalInfo historicalInfo) { + // TODO: Improve this logic. + // More recently input entries get higher priority. + return historicalInfo.getTimestamp(); + } + +private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicLanguageModelProbabilityUtils); + + static_assert(MAX_PREV_WORD_COUNT_FOR_N_GRAM <= 2, "Max supported Ngram is Trigram."); + + static const int ASSUMED_MIN_COUNT_FOR_UNIGRAMS; + static const int ASSUMED_MIN_COUNT_FOR_BIGRAMS; + static const int ASSUMED_MIN_COUNT_FOR_TRIGRAMS; + + static const int ENCODED_BACKOFF_WEIGHT_FOR_UNIGRAMS; + static const int ENCODED_BACKOFF_WEIGHT_FOR_BIGRAMS; + static const int ENCODED_BACKOFF_WEIGHT_FOR_TRIGRAMS; + + static const int DURATION_TO_DISCARD_ENTRY_IN_SECONDS; +}; + +} // namespace latinime +#endif /* LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp index 05a3a6356..31b1ea696 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp @@ -19,11 +19,11 @@ #include <algorithm> #include <cstring> -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h" +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" namespace latinime { -const int LanguageModelDictContent::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1; const int LanguageModelDictContent::TRIE_MAP_BUFFER_INDEX = 0; const int LanguageModelDictContent::GLOBAL_COUNTERS_BUFFER_INDEX = 1; @@ -39,7 +39,8 @@ bool LanguageModelDictContent::runGC( } const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds, - const int wordId, const HeaderPolicy *const headerPolicy) const { + const int wordId, const bool mustMatchAllPrevWords, + const HeaderPolicy *const headerPolicy) const { int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex(); int maxPrevWordCount = 0; @@ -53,7 +54,15 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr bitmapEntryIndices[i + 1] = nextBitmapEntryIndex; } + const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); + if (mHasHistoricalInfo && unigramProbabilityEntry.getHistoricalInfo()->getCount() == 0) { + // The word should be treated as a invalid word. + return WordAttributes(); + } for (int i = maxPrevWordCount; i >= 0; --i) { + if (mustMatchAllPrevWords && prevWordIds.size() > static_cast<size_t>(i)) { + break; + } const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndices[i]); if (!result.mIsValid) { continue; @@ -62,36 +71,39 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); int probability = NOT_A_PROBABILITY; if (mHasHistoricalInfo) { - const int rawProbability = ForgettingCurveUtils::decodeProbability( - probabilityEntry.getHistoricalInfo(), headerPolicy); - if (rawProbability == NOT_A_PROBABILITY) { - // The entry should not be treated as a valid entry. - continue; - } + const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); + int contextCount = 0; if (i == 0) { // unigram - probability = rawProbability; + contextCount = mGlobalCounters.getTotalCount(); } else { const ProbabilityEntry prevWordProbabilityEntry = getNgramProbabilityEntry( prevWordIds.skip(1 /* n */).limit(i - 1), prevWordIds[0]); if (!prevWordProbabilityEntry.isValid()) { continue; } - if (prevWordProbabilityEntry.representsBeginningOfSentence()) { - probability = rawProbability; - } else { - const int prevWordRawProbability = ForgettingCurveUtils::decodeProbability( - prevWordProbabilityEntry.getHistoricalInfo(), headerPolicy); - probability = std::min(MAX_PROBABILITY - prevWordRawProbability - + rawProbability, MAX_PROBABILITY); + if (prevWordProbabilityEntry.representsBeginningOfSentence() + && historicalInfo->getCount() == 1) { + // BoS ngram requires multiple contextCount. + continue; } + contextCount = prevWordProbabilityEntry.getHistoricalInfo()->getCount(); } + const float rawProbability = + DynamicLanguageModelProbabilityUtils::computeRawProbabilityFromCounts( + historicalInfo->getCount(), contextCount, i + 1); + const int encodedRawProbability = + ProbabilityUtils::encodeRawProbability(rawProbability); + const int decayedProbability = + DynamicLanguageModelProbabilityUtils::getDecayedProbability( + encodedRawProbability, *historicalInfo); + probability = DynamicLanguageModelProbabilityUtils::backoff( + decayedProbability, i + 1 /* n */); } else { probability = probabilityEntry.getProbability(); } // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in // probabilityEntry. - const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(), unigramProbabilityEntry.isNotAWord(), unigramProbabilityEntry.isPossiblyOffensive()); @@ -167,7 +179,8 @@ void LanguageModelDictContent::exportAllNgramEntriesRelatedToWordInner( ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); if (probabilityEntry.isValid()) { const WordAttributes wordAttributes = getWordAttributes( - WordIdArrayView(*prevWordIds), wordId, headerPolicy); + WordIdArrayView(*prevWordIds), wordId, true /* mustMatchAllPrevWords */, + headerPolicy); outBummpedFullEntryInfo->emplace_back(*prevWordIds, wordId, wordAttributes, probabilityEntry); } @@ -231,7 +244,7 @@ bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView return false; } mGlobalCounters.updateMaxValueOfCounters( - updatedUnigramProbabilityEntry.getHistoricalInfo()->getCount()); + updatedNgramProbabilityEntry.getHistoricalInfo()->getCount()); if (!originalNgramProbabilityEntry.isValid()) { entryCountersToUpdate->incrementNgramCount(i + 2); } @@ -242,10 +255,9 @@ bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView const ProbabilityEntry LanguageModelDictContent::createUpdatedEntryFrom( const ProbabilityEntry &originalProbabilityEntry, const bool isValid, const HistoricalInfo historicalInfo, const HeaderPolicy *const headerPolicy) const { - const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalProbabilityEntry.getHistoricalInfo(), isValid ? - DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY, - &historicalInfo, headerPolicy); + const HistoricalInfo updatedHistoricalInfo = HistoricalInfo(historicalInfo.getTimestamp(), + 0 /* level */, originalProbabilityEntry.getHistoricalInfo()->getCount() + + historicalInfo.getCount()); if (originalProbabilityEntry.isValid()) { return ProbabilityEntry(originalProbabilityEntry.getFlags(), &updatedHistoricalInfo); } else { @@ -311,7 +323,7 @@ int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWord bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount, const HeaderPolicy *const headerPolicy, - MutableEntryCounters *const outEntryCounters) { + const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) { for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { if (prevWordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.", @@ -328,33 +340,41 @@ bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int b } continue; } - if (mHasHistoricalInfo && !probabilityEntry.representsBeginningOfSentence() - && probabilityEntry.isValid()) { - const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( - probabilityEntry.getHistoricalInfo(), headerPolicy); - if (ForgettingCurveUtils::needsToKeep(&historicalInfo, headerPolicy)) { - // Update the entry. - const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(), &historicalInfo); - if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo), - bitmapEntryIndex)) { - return false; - } - } else { + if (mHasHistoricalInfo && probabilityEntry.isValid()) { + const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo(); + if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC( + *originalHistoricalInfo)) { // Remove the entry. if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { return false; } continue; } + if (needsToHalveCounters) { + const int updatedCount = originalHistoricalInfo->getCount() / 2; + if (updatedCount == 0) { + // Remove the entry. + if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { + return false; + } + continue; + } + const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(), + originalHistoricalInfo->getLevel(), updatedCount); + const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(), + &historicalInfoToSave); + if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo), + bitmapEntryIndex)) { + return false; + } + } } - if (!probabilityEntry.representsBeginningOfSentence()) { - outEntryCounters->incrementNgramCount(prevWordCount + 1); - } + outEntryCounters->incrementNgramCount(prevWordCount + 1); if (!entry.hasNextLevelMap()) { continue; } if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(), - prevWordCount + 1, headerPolicy, outEntryCounters)) { + prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) { return false; } } @@ -408,11 +428,11 @@ bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPoli } const ProbabilityEntry probabilityEntry = ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); - const int probability = (mHasHistoricalInfo) ? - ForgettingCurveUtils::decodeProbability(probabilityEntry.getHistoricalInfo(), - headerPolicy) : probabilityEntry.getProbability(); - outEntryInfo->emplace_back(probability, - probabilityEntry.getHistoricalInfo()->getTimestamp(), + const int priority = mHasHistoricalInfo + ? DynamicLanguageModelProbabilityUtils::getPriorityToPreventFromEviction( + *probabilityEntry.getHistoricalInfo()) + : probabilityEntry.getProbability(); + outEntryInfo->emplace_back(priority, probabilityEntry.getHistoricalInfo()->getCount(), entry.key(), targetLevel, prevWordIds->data()); } return true; @@ -420,11 +440,11 @@ bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPoli bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()( const EntryInfoToTurncate &left, const EntryInfoToTurncate &right) const { - if (left.mProbability != right.mProbability) { - return left.mProbability < right.mProbability; + if (left.mPriority != right.mPriority) { + return left.mPriority < right.mPriority; } - if (left.mTimestamp != right.mTimestamp) { - return left.mTimestamp > right.mTimestamp; + if (left.mCount != right.mCount) { + return left.mCount < right.mCount; } if (left.mKey != right.mKey) { return left.mKey < right.mKey; @@ -441,10 +461,9 @@ bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()( return false; } -LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int probability, - const int timestamp, const int key, const int prevWordCount, const int *const prevWordIds) - : mProbability(probability), mTimestamp(timestamp), mKey(key), - mPrevWordCount(prevWordCount) { +LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int priority, + const int count, const int key, const int prevWordCount, const int *const prevWordIds) + : mPriority(priority), mCount(count), mKey(key), mPrevWordCount(prevWordCount) { memmove(mPrevWordIds, prevWordIds, mPrevWordCount * sizeof(mPrevWordIds[0])); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h index 5b92b96e3..9678c35f9 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h @@ -151,13 +151,14 @@ class LanguageModelDictContent { const LanguageModelDictContent *const originalContent); const WordAttributes getWordAttributes(const WordIdArrayView prevWordIds, const int wordId, - const HeaderPolicy *const headerPolicy) const; + const bool mustMatchAllPrevWords, const HeaderPolicy *const headerPolicy) const; ProbabilityEntry getProbabilityEntry(const int wordId) const { return getNgramProbabilityEntry(WordIdArrayView(), wordId); } bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) { + mGlobalCounters.addToTotalCount(probabilityEntry->getHistoricalInfo()->getCount()); return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry); } @@ -180,8 +181,15 @@ class LanguageModelDictContent { bool updateAllProbabilityEntriesForGC(const HeaderPolicy *const headerPolicy, MutableEntryCounters *const outEntryCounters) { - return updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(), - 0 /* prevWordCount */, headerPolicy, outEntryCounters); + if (!updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(), + 0 /* prevWordCount */, headerPolicy, mGlobalCounters.needsToHalveCounters(), + outEntryCounters)) { + return false; + } + if (mGlobalCounters.needsToHalveCounters()) { + mGlobalCounters.halveCounters(); + } + return true; } // entryCounts should be created by updateAllProbabilityEntries. @@ -206,11 +214,12 @@ class LanguageModelDictContent { DISALLOW_ASSIGNMENT_OPERATOR(Comparator); }; - EntryInfoToTurncate(const int probability, const int timestamp, const int key, + EntryInfoToTurncate(const int priority, const int count, const int key, const int prevWordCount, const int *const prevWordIds); - int mProbability; - int mTimestamp; + int mPriority; + // TODO: Remove. + int mCount; int mKey; int mPrevWordCount; int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; @@ -219,8 +228,6 @@ class LanguageModelDictContent { DISALLOW_DEFAULT_CONSTRUCTOR(EntryInfoToTurncate); }; - // TODO: Remove - static const int DUMMY_PROBABILITY_FOR_VALID_WORDS; static const int TRIE_MAP_BUFFER_INDEX; static const int GLOBAL_COUNTERS_BUFFER_INDEX; @@ -233,7 +240,8 @@ class LanguageModelDictContent { int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds); int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const; bool updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount, - const HeaderPolicy *const headerPolicy, MutableEntryCounters *const outEntryCounters); + const HeaderPolicy *const headerPolicy, const bool needsToHalveCounters, + MutableEntryCounters *const outEntryCounters); bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy, const int maxEntryCount, const int targetLevel, int *const outEntryCount); bool getEntryInfo(const HeaderPolicy *const headerPolicy, const int targetLevel, diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h index 9953aa425..283c2691a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h @@ -63,6 +63,10 @@ class LanguageModelDictContentGlobalCounters { mTotalCount += 1; } + void addToTotalCount(const int count) { + mTotalCount += count; + } + void updateMaxValueOfCounters(const int count) { mMaxValueOfCounters = std::max(count, mMaxValueOfCounters); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h index f4d340f86..9c4ab18e4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h @@ -105,7 +105,7 @@ class ProbabilityEntry { encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_LEVEL_FIELD_SIZE * CHAR_BIT)) | static_cast<uint8_t>(mHistoricalInfo.getLevel()); encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) - | static_cast<uint8_t>(mHistoricalInfo.getCount()); + | static_cast<uint16_t>(mHistoricalInfo.getCount()); } else { encodedEntry = (encodedEntry << (Ver4DictConstants::PROBABILITY_SIZE * CHAR_BIT)) | static_cast<uint8_t>(mProbability); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp index eb6080a24..bd89b8da7 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp @@ -49,8 +49,8 @@ const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0; const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4; const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4; -const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1; -const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1; +const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 0; +const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 2; const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1; const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h index 600b5ffe4..13d7a5714 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h @@ -47,6 +47,7 @@ class Ver4DictConstants { static const int NOT_A_TERMINAL_ADDRESS; static const int TERMINAL_ID_FIELD_SIZE; static const int TIME_STAMP_FIELD_SIZE; + // TODO: Remove static const int WORD_LEVEL_FIELD_SIZE; static const int WORD_COUNT_FIELD_SIZE; // Flags in probability entry. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index d3de322f9..1992d4a5a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -110,7 +110,7 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( return WordAttributes(); } return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, - mHeaderPolicy); + false /* mustMatchAllPrevWords */, mHeaderPolicy); } int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, @@ -118,18 +118,13 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) { return NOT_A_PROBABILITY; } - const ProbabilityEntry probabilityEntry = - mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(prevWordIds, wordId); - if (!probabilityEntry.isValid() || probabilityEntry.isBlacklisted() - || probabilityEntry.isNotAWord()) { + const WordAttributes wordAttributes = + mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, + true /* mustMatchAllPrevWords */, mHeaderPolicy); + if (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()) { return NOT_A_PROBABILITY; } - if (mHeaderPolicy->hasHistoricalInfoOfWords()) { - return ForgettingCurveUtils::decodeProbability(probabilityEntry.getHistoricalInfo(), - mHeaderPolicy); - } else { - return probabilityEntry.getProbability(); - } + return wordAttributes.getProbability(); } BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator( @@ -151,10 +146,16 @@ void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordI if (!probabilityEntry.isValid()) { continue; } - const int probability = probabilityEntry.hasHistoricalInfo() ? - ForgettingCurveUtils::decodeProbability( - probabilityEntry.getHistoricalInfo(), mHeaderPolicy) : - probabilityEntry.getProbability(); + int probability = NOT_A_PROBABILITY; + if (probabilityEntry.hasHistoricalInfo()) { + // TODO: Quit checking count here. + // If count <= 1, the word can be an invaild word. The actual probability should + // be checked using getWordAttributesInContext() in onVisitEntry(). + probability = probabilityEntry.getHistoricalInfo()->getCount() <= 1 ? + NOT_A_PROBABILITY : 0; + } else { + probability = probabilityEntry.getProbability(); + } listener->onVisitEntry(probability, entry.getWordId()); } } @@ -386,25 +387,35 @@ bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext()."); return false; } + if (!isValidWord) { + return true; + } wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); } WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); - if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID - && ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { - const UnigramProperty beginningOfSentenceUnigramProperty( - true /* representsBeginningOfSentence */, - true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, - HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); - if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), - &beginningOfSentenceUnigramProperty)) { - AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext()."); + if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { + if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { + const UnigramProperty beginningOfSentenceUnigramProperty( + true /* representsBeginningOfSentence */, + true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, + HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); + if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), + &beginningOfSentenceUnigramProperty)) { + AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext()."); + return false; + } + // Refresh word ids. + ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); + } + // Update entries for beginning of sentence. + if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord( + prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isVaild */, historicalInfo, + mHeaderPolicy, &mEntryCounters)) { return false; } - // Refresh word ids. - ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); } if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds, wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) { @@ -542,7 +553,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty( } } const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes( - WordIdArrayView(), wordId, mHeaderPolicy); + WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy); const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId); const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(), diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp index 9d8e86675..edcb43678 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp @@ -44,13 +44,13 @@ const int DictFileWritingUtils::SIZE_OF_BUFFER_SIZE_FIELD = 4; TimeKeeper::setCurrentTime(); const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion); switch (formatVersion) { - case FormatUtils::VERSION_4: + case FormatUtils::VERSION_402: return createEmptyV4DictFile<backward::v402::Ver4DictConstants, backward::v402::Ver4DictBuffers, backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr>( filePath, localeAsCodePointVector, attributeMap, formatVersion); case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_4_DEV: + case FormatUtils::VERSION_403: return createEmptyV4DictFile<Ver4DictConstants, Ver4DictBuffers, Ver4DictBuffers::Ver4DictBuffersPtr>( filePath, localeAsCodePointVector, attributeMap, formatVersion); diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp index 0cffe569d..e225c235e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp @@ -28,15 +28,17 @@ const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; /* static */ FormatUtils::FORMAT_VERSION FormatUtils::getFormatVersion(const int formatVersion) { switch (formatVersion) { case VERSION_2: - return VERSION_2; case VERSION_201: - return VERSION_201; + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + return UNKNOWN_VERSION; + case VERSION_202: + return VERSION_202; case VERSION_4_ONLY_FOR_TESTING: return VERSION_4_ONLY_FOR_TESTING; - case VERSION_4: - return VERSION_4; - case VERSION_4_DEV: - return VERSION_4_DEV; + case VERSION_402: + return VERSION_402; + case VERSION_403: + return VERSION_403; default: return UNKNOWN_VERSION; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h index 96310086b..1616efcce 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h @@ -31,11 +31,15 @@ class FormatUtils { public: enum FORMAT_VERSION { // These MUST have the same values as the relevant constants in FormatSpec.java. + // TODO: Remove VERSION_2 and VERSION_201 when we: + // * Confirm that old versions of LatinIME download old-format dictionaries + // * We no longer need the corresponding constants on the Java side for dicttool VERSION_2 = 2, VERSION_201 = 201, + VERSION_202 = 202, VERSION_4_ONLY_FOR_TESTING = 399, - VERSION_4 = 402, - VERSION_4_DEV = 403, + VERSION_402 = 402, + VERSION_403 = 403, UNKNOWN_VERSION = -1 }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.cpp new file mode 100644 index 000000000..e8fa06942 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" + +namespace latinime { + +const float ProbabilityUtils::PROBABILITY_ENCODING_SCALER = 8.58923700372f; + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h index 3b339e61a..2050af1e9 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h @@ -17,6 +17,9 @@ #ifndef LATINIME_PROBABILITY_UTILS_H #define LATINIME_PROBABILITY_UTILS_H +#include <algorithm> +#include <cmath> + #include "defines.h" namespace latinime { @@ -47,8 +50,20 @@ class ProbabilityUtils { + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); } + // Encode probability using the same way as we are doing for main dictionaries. + static AK_FORCE_INLINE int encodeRawProbability(const float rawProbability) { + const float probability = static_cast<float>(MAX_PROBABILITY) + + log2f(rawProbability) * PROBABILITY_ENCODING_SCALER; + if (probability < 0.0f) { + return 0; + } + return std::min(static_cast<int>(probability + 0.5f), MAX_PROBABILITY); + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils); + + static const float PROBABILITY_ENCODING_SCALER; }; } #endif /* LATINIME_PROBABILITY_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp index a6f9a8b23..856808a74 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp @@ -24,6 +24,7 @@ const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED = 120; const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f; const float ScoringParams::EXACT_MATCH_PROMOTION = 1.1f; +const float ScoringParams::PERFECT_MATCH_PROMOTION = 1.1f; const float ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH = 0.01f; const float ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH = 0.02f; const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f; diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.h b/native/jni/src/suggest/policyimpl/typing/scoring_params.h index b8f889559..6f327a370 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.h +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.h @@ -34,6 +34,7 @@ class ScoringParams { static const int THRESHOLD_SHORT_WORD_LENGTH; static const float EXACT_MATCH_PROMOTION; + static const float PERFECT_MATCH_PROMOTION; static const float CASE_ERROR_PENALTY_FOR_EXACT_MATCH; static const float ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; static const float DIGRAPH_PENALTY_FOR_EXACT_MATCH; diff --git a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h index 0240bcf54..6acd767ea 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h @@ -44,23 +44,50 @@ class TypingScoring : public Scoring { AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, const int inputSize, const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit, - const bool boostExactMatches) const { + const bool boostExactMatches, const bool hasProbabilityZero) const { const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE + static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT; float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - compoundDistance / maxDistance; if (forceCommit) { score += ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD; } - if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) { - score += ScoringParams::EXACT_MATCH_PROMOTION; - if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) { - score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH; + if (hasProbabilityZero) { + // Previously, when both legitimate 0-frequency words (such as distracters) and + // offensive words were encoded in the same way, distracters would never show up + // when the user blocked offensive words (the default setting, as well as the + // setting for regression tests). + // + // When b/11031090 was fixed and a separate encoding was used for offensive words, + // 0-frequency words would no longer be blocked when they were an "exact match" + // (where case mismatches and accent mismatches would be considered an "exact + // match"). The exact match boosting functionality meant that, for example, when + // the user typed "mt" they would be suggested the word "Mt", although they most + // probably meant to type "my". + // + // For this reason, we introduced this change, which does the following: + // * Defines the "perfect match" as a really exact match, with no room for case or + // accent mismatches + // * When the target word has probability zero (as "Mt" does, because it is a + // distracter), ONLY boost its score if it is a perfect match. + // + // By doing this, when the user types "mt", the word "Mt" will NOT be boosted, and + // they will get "my". However, if the user makes an explicit effort to type "Mt", + // we do boost the word "Mt" so that the user's input is not autocorrected to "My". + if (boostExactMatches && ErrorTypeUtils::isPerfectMatch(containedErrorTypes)) { + score += ScoringParams::PERFECT_MATCH_PROMOTION; } - if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) { - score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; - } - if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) { - score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH; + } else { + if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) { + score += ScoringParams::EXACT_MATCH_PROMOTION; + if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) { + score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH; + } + if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) { + score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; + } + if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) { + score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH; + } } } return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE); diff --git a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp index 4469dc715..313a9af10 100644 --- a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp +++ b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp @@ -52,16 +52,14 @@ TEST(LanguageModelDictContentTest, TestUnigramProbabilityWithHistoricalInfo) { const int flag = 0xF0; const int timestamp = 0x3FFFFFFF; - const int level = 3; const int count = 10; const int wordId = 100; - const HistoricalInfo historicalInfo(timestamp, level, count); + const HistoricalInfo historicalInfo(timestamp, 0 /* level */, count); const ProbabilityEntry probabilityEntry(flag, &historicalInfo); languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry); const ProbabilityEntry entry = languageModelDictContent.getProbabilityEntry(wordId); EXPECT_EQ(flag, entry.getFlags()); EXPECT_EQ(timestamp, entry.getHistoricalInfo()->getTimestamp()); - EXPECT_EQ(level, entry.getHistoricalInfo()->getLevel()); EXPECT_EQ(count, entry.getHistoricalInfo()->getCount()); // Remove @@ -108,14 +106,14 @@ TEST(LanguageModelDictContentTest, TestGetWordProbability) { languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(1), wordId, &bigramProbabilityEntry); EXPECT_EQ(bigramProbability, languageModelDictContent.getWordAttributes(prevWordIds, wordId, - nullptr /* headerPolicy */).getProbability()); + false /* mustMatchAllPrevWords */, nullptr /* headerPolicy */).getProbability()); const ProbabilityEntry trigramProbabilityEntry(flag, trigramProbability); languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(1), prevWordIds[1], &probabilityEntry); languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(2), wordId, &trigramProbabilityEntry); EXPECT_EQ(trigramProbability, languageModelDictContent.getWordAttributes(prevWordIds, wordId, - nullptr /* headerPolicy */).getProbability()); + false /* mustMatchAllPrevWords */, nullptr /* headerPolicy */).getProbability()); } } // namespace diff --git a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp index 260b347ce..eb78034ba 100644 --- a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp +++ b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp @@ -39,20 +39,18 @@ TEST(ProbabilityEntryTest, TestEncodeDecode) { TEST(ProbabilityEntryTest, TestEncodeDecodeWithHistoricalInfo) { const int flag = 0xF0; const int timestamp = 0x3FFFFFFF; - const int level = 3; - const int count = 10; + const int count = 0xABCD; - const HistoricalInfo historicalInfo(timestamp, level, count); + const HistoricalInfo historicalInfo(timestamp, 0 /* level */, count); const ProbabilityEntry entry(flag, &historicalInfo); const uint64_t encodedEntry = entry.encode(true /* hasHistoricalInfo */); - EXPECT_EQ(0xF03FFFFFFF030Aull, encodedEntry); + EXPECT_EQ(0xF03FFFFFFFABCDull, encodedEntry); const ProbabilityEntry decodedEntry = ProbabilityEntry::decode(encodedEntry, true /* hasHistoricalInfo */); EXPECT_EQ(flag, decodedEntry.getFlags()); EXPECT_EQ(timestamp, decodedEntry.getHistoricalInfo()->getTimestamp()); - EXPECT_EQ(level, decodedEntry.getHistoricalInfo()->getLevel()); EXPECT_EQ(count, decodedEntry.getHistoricalInfo()->getCount()); } diff --git a/native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp index 15f560cd1..494200568 100644 --- a/native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp +++ b/native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp @@ -62,14 +62,14 @@ TEST(FormatUtilsTest, TestDetectFormatVersion) { } { const std::vector<uint8_t> buffer = - getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_4, 0, 0); - EXPECT_EQ(FormatUtils::VERSION_4, FormatUtils::detectFormatVersion( + getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_402, 0, 0); + EXPECT_EQ(FormatUtils::VERSION_402, FormatUtils::detectFormatVersion( ReadOnlyByteArrayView(buffer.data(), buffer.size()))); } { const std::vector<uint8_t> buffer = - getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_4_DEV, 0, 0); - EXPECT_EQ(FormatUtils::VERSION_4_DEV, FormatUtils::detectFormatVersion( + getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_403, 0, 0); + EXPECT_EQ(FormatUtils::VERSION_403, FormatUtils::detectFormatVersion( ReadOnlyByteArrayView(buffer.data(), buffer.size()))); } diff --git a/native/jni/tests/suggest/policyimpl/dictionary/utils/probability_utils_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/utils/probability_utils_test.cpp new file mode 100644 index 000000000..be1f278c6 --- /dev/null +++ b/native/jni/tests/suggest/policyimpl/dictionary/utils/probability_utils_test.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" + +#include <gtest/gtest.h> + +#include "defines.h" + +namespace latinime { +namespace { + +TEST(ProbabilityUtilsTest, TestEncodeRawProbability) { + EXPECT_EQ(MAX_PROBABILITY, ProbabilityUtils::encodeRawProbability(1.0f)); + EXPECT_EQ(MAX_PROBABILITY - 9, ProbabilityUtils::encodeRawProbability(0.5f)); + EXPECT_EQ(0, ProbabilityUtils::encodeRawProbability(0.0f)); +} + +} // namespace +} // namespace latinime |