From 88bc312ad34321fb3e81be2dc939a889d065f4a7 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Wed, 17 Dec 2014 16:02:09 +0900 Subject: Move dictionary code to top level dictionary dir. Bug: 18725954 Change-Id: Ia442ba4b5d84311057d83edf6e7aeb151d6a820b --- native/jni/NativeFileList.mk | 94 +-- ..._android_inputmethod_latin_BinaryDictionary.cpp | 8 +- ...oid_inputmethod_latin_BinaryDictionaryUtils.cpp | 2 +- ...ndroid_inputmethod_latin_DicTraverseSession.cpp | 2 +- native/jni/src/dictionary/header/header_policy.cpp | 183 ++++++ native/jni/src/dictionary/header/header_policy.h | 268 +++++++++ .../dictionary/header/header_read_write_utils.cpp | 248 ++++++++ .../dictionary/header/header_read_write_utils.h | 123 ++++ .../dictionary_bigrams_structure_policy.h | 42 ++ .../interface/dictionary_header_structure_policy.h | 63 ++ .../dictionary_shortcuts_structure_policy.h | 46 ++ .../dictionary_structure_with_buffer_policy.h | 124 ++++ .../jni/src/dictionary/interface/ngram_listener.h | 42 ++ .../jni/src/dictionary/property/historical_info.h | 59 ++ .../jni/src/dictionary/property/ngram_context.cpp | 123 ++++ native/jni/src/dictionary/property/ngram_context.h | 78 +++ .../jni/src/dictionary/property/ngram_property.h | 62 ++ .../jni/src/dictionary/property/unigram_property.h | 137 +++++ .../jni/src/dictionary/property/word_attributes.h | 68 +++ native/jni/src/dictionary/property/word_property.h | 62 ++ .../dictionary/structure/backward/v402/Readme.txt | 1 + .../v402/bigram/ver4_bigram_list_policy.cpp | 289 +++++++++ .../backward/v402/bigram/ver4_bigram_list_policy.h | 94 +++ .../backward/v402/content/bigram_dict_content.cpp | 226 +++++++ .../backward/v402/content/bigram_dict_content.h | 135 +++++ .../structure/backward/v402/content/bigram_entry.h | 110 ++++ .../structure/backward/v402/content/dict_content.h | 47 ++ .../v402/content/probability_dict_content.cpp | 170 ++++++ .../v402/content/probability_dict_content.h | 74 +++ 
.../backward/v402/content/probability_entry.h | 90 +++ .../v402/content/shortcut_dict_content.cpp | 199 +++++++ .../backward/v402/content/shortcut_dict_content.h | 101 ++++ .../backward/v402/content/single_dict_content.h | 88 +++ .../v402/content/sparse_table_dict_content.cpp | 50 ++ .../v402/content/sparse_table_dict_content.h | 124 ++++ .../content/terminal_position_lookup_table.cpp | 111 ++++ .../v402/content/terminal_position_lookup_table.h | 73 +++ .../v402/shortcut/ver4_shortcut_list_policy.h | 118 ++++ .../structure/backward/v402/ver4_dict_buffers.cpp | 157 +++++ .../structure/backward/v402/ver4_dict_buffers.h | 152 +++++ .../backward/v402/ver4_dict_constants.cpp | 81 +++ .../structure/backward/v402/ver4_dict_constants.h | 84 +++ .../v402/ver4_patricia_trie_node_reader.cpp | 110 ++++ .../backward/v402/ver4_patricia_trie_node_reader.h | 79 +++ .../v402/ver4_patricia_trie_node_writer.cpp | 442 ++++++++++++++ .../backward/v402/ver4_patricia_trie_node_writer.h | 150 +++++ .../backward/v402/ver4_patricia_trie_policy.cpp | 662 +++++++++++++++++++++ .../backward/v402/ver4_patricia_trie_policy.h | 181 ++++++ .../v402/ver4_patricia_trie_reading_utils.cpp | 39 ++ .../v402/ver4_patricia_trie_reading_utils.h | 52 ++ .../v402/ver4_patricia_trie_writing_helper.cpp | 307 ++++++++++ .../v402/ver4_patricia_trie_writing_helper.h | 140 +++++ .../backward/v402/ver4_pt_node_array_reader.cpp | 90 +++ .../backward/v402/ver4_pt_node_array_reader.h | 57 ++ ...ionary_structure_with_buffer_policy_factory.cpp | 209 +++++++ ...ctionary_structure_with_buffer_policy_factory.h | 64 ++ .../bigram/bigram_list_read_write_utils.cpp | 95 +++ .../bigram/bigram_list_read_write_utils.h | 68 +++ .../pt_common/dynamic_pt_gc_event_listeners.cpp | 144 +++++ .../pt_common/dynamic_pt_gc_event_listeners.h | 173 ++++++ .../pt_common/dynamic_pt_reading_helper.cpp | 321 ++++++++++ .../pt_common/dynamic_pt_reading_helper.h | 282 +++++++++ .../pt_common/dynamic_pt_reading_utils.cpp | 72 +++ 
.../structure/pt_common/dynamic_pt_reading_utils.h | 83 +++ .../pt_common/dynamic_pt_updating_helper.cpp | 299 ++++++++++ .../pt_common/dynamic_pt_updating_helper.h | 96 +++ .../pt_common/dynamic_pt_writing_utils.cpp | 132 ++++ .../structure/pt_common/dynamic_pt_writing_utils.h | 79 +++ .../pt_common/patricia_trie_reading_utils.cpp | 164 +++++ .../pt_common/patricia_trie_reading_utils.h | 133 +++++ .../structure/pt_common/pt_node_array_reader.h | 45 ++ .../structure/pt_common/pt_node_params.h | 262 ++++++++ .../structure/pt_common/pt_node_reader.h | 40 ++ .../structure/pt_common/pt_node_writer.h | 97 +++ .../shortcut/shortcut_list_reading_utils.cpp | 53 ++ .../shortcut/shortcut_list_reading_utils.h | 72 +++ .../structure/v2/bigram/bigram_list_policy.h | 59 ++ .../structure/v2/patricia_trie_policy.cpp | 526 ++++++++++++++++ .../dictionary/structure/v2/patricia_trie_policy.h | 180 ++++++ .../structure/v2/shortcut/shortcut_list_policy.h | 73 +++ .../v2/ver2_patricia_trie_node_reader.cpp | 52 ++ .../structure/v2/ver2_patricia_trie_node_reader.h | 52 ++ .../structure/v2/ver2_pt_node_array_reader.cpp | 54 ++ .../structure/v2/ver2_pt_node_array_reader.h | 43 ++ .../dynamic_language_model_probability_utils.cpp | 34 ++ .../dynamic_language_model_probability_utils.h | 77 +++ .../v4/content/language_model_dict_content.cpp | 478 +++++++++++++++ .../v4/content/language_model_dict_content.h | 258 ++++++++ ...language_model_dict_content_global_counters.cpp | 32 + .../language_model_dict_content_global_counters.h | 101 ++++ .../structure/v4/content/probability_entry.h | 176 ++++++ .../structure/v4/content/shortcut_dict_content.cpp | 182 ++++++ .../structure/v4/content/shortcut_dict_content.h | 92 +++ .../structure/v4/content/single_dict_content.h | 64 ++ .../v4/content/sparse_table_dict_content.cpp | 40 ++ .../v4/content/sparse_table_dict_content.h | 91 +++ .../v4/content/terminal_position_lookup_table.cpp | 98 +++ .../v4/content/terminal_position_lookup_table.h | 63 ++ 
.../v4/shortcut/ver4_shortcut_list_policy.h | 106 ++++ .../dictionary/structure/v4/ver4_dict_buffers.cpp | 194 ++++++ .../dictionary/structure/v4/ver4_dict_buffers.h | 132 ++++ .../structure/v4/ver4_dict_constants.cpp | 72 +++ .../dictionary/structure/v4/ver4_dict_constants.h | 75 +++ .../v4/ver4_patricia_trie_node_reader.cpp | 91 +++ .../structure/v4/ver4_patricia_trie_node_reader.h | 55 ++ .../v4/ver4_patricia_trie_node_writer.cpp | 354 +++++++++++ .../structure/v4/ver4_patricia_trie_node_writer.h | 108 ++++ .../structure/v4/ver4_patricia_trie_policy.cpp | 603 +++++++++++++++++++ .../structure/v4/ver4_patricia_trie_policy.h | 149 +++++ .../v4/ver4_patricia_trie_reading_utils.cpp | 28 + .../v4/ver4_patricia_trie_reading_utils.h | 37 ++ .../v4/ver4_patricia_trie_writing_helper.cpp | 185 ++++++ .../v4/ver4_patricia_trie_writing_helper.h | 76 +++ .../structure/v4/ver4_pt_node_array_reader.cpp | 79 +++ .../structure/v4/ver4_pt_node_array_reader.h | 42 ++ .../utils/binary_dictionary_bigrams_iterator.h | 69 +++ .../utils/binary_dictionary_shortcut_iterator.h | 61 ++ native/jni/src/dictionary/utils/bloom_filter.h | 69 +++ .../utils/buffer_with_extendable_buffer.cpp | 170 ++++++ .../utils/buffer_with_extendable_buffer.h | 125 ++++ .../jni/src/dictionary/utils/byte_array_utils.cpp | 25 + native/jni/src/dictionary/utils/byte_array_utils.h | 290 +++++++++ .../dictionary/utils/dict_file_writing_utils.cpp | 144 +++++ .../src/dictionary/utils/dict_file_writing_utils.h | 67 +++ native/jni/src/dictionary/utils/entry_counters.h | 89 +++ native/jni/src/dictionary/utils/file_utils.cpp | 171 ++++++ native/jni/src/dictionary/utils/file_utils.h | 60 ++ .../dictionary/utils/forgetting_curve_utils.cpp | 234 ++++++++ .../src/dictionary/utils/forgetting_curve_utils.h | 112 ++++ native/jni/src/dictionary/utils/format_utils.cpp | 71 +++ native/jni/src/dictionary/utils/format_utils.h | 59 ++ native/jni/src/dictionary/utils/mmapped_buffer.cpp | 98 +++ 
native/jni/src/dictionary/utils/mmapped_buffer.h | 76 +++ .../jni/src/dictionary/utils/multi_bigram_map.cpp | 100 ++++ native/jni/src/dictionary/utils/multi_bigram_map.h | 84 +++ .../jni/src/dictionary/utils/probability_utils.cpp | 23 + .../jni/src/dictionary/utils/probability_utils.h | 69 +++ native/jni/src/dictionary/utils/sparse_table.cpp | 101 ++++ native/jni/src/dictionary/utils/sparse_table.h | 60 ++ native/jni/src/dictionary/utils/trie_map.cpp | 460 ++++++++++++++ native/jni/src/dictionary/utils/trie_map.h | 399 +++++++++++++ .../src/suggest/core/dicnode/dic_node_utils.cpp | 2 +- .../binary_dictionary_bigrams_iterator.h | 69 --- .../binary_dictionary_shortcut_iterator.h | 61 -- .../jni/src/suggest/core/dictionary/bloom_filter.h | 69 --- .../jni/src/suggest/core/dictionary/dictionary.cpp | 4 +- .../jni/src/suggest/core/dictionary/dictionary.h | 10 +- .../suggest/core/dictionary/dictionary_utils.cpp | 4 +- .../src/suggest/core/dictionary/digraph_utils.cpp | 2 +- .../suggest/core/dictionary/multi_bigram_map.cpp | 100 ---- .../src/suggest/core/dictionary/multi_bigram_map.h | 84 --- .../src/suggest/core/dictionary/ngram_listener.h | 42 -- .../core/dictionary/property/historical_info.h | 59 -- .../core/dictionary/property/ngram_property.h | 62 -- .../core/dictionary/property/unigram_property.h | 137 ----- .../core/dictionary/property/word_property.h | 62 -- .../src/suggest/core/dictionary/word_attributes.h | 68 --- .../policy/dictionary_bigrams_structure_policy.h | 42 -- .../policy/dictionary_header_structure_policy.h | 63 -- .../policy/dictionary_shortcuts_structure_policy.h | 46 -- .../dictionary_structure_with_buffer_policy.h | 124 ---- .../core/result/suggestions_output_utils.cpp | 2 +- .../suggest/core/result/suggestions_output_utils.h | 2 +- .../suggest/core/session/dic_traverse_session.cpp | 6 +- .../suggest/core/session/dic_traverse_session.h | 2 +- .../jni/src/suggest/core/session/ngram_context.cpp | 123 ---- 
.../jni/src/suggest/core/session/ngram_context.h | 78 --- native/jni/src/suggest/core/suggest.cpp | 4 +- .../policyimpl/dictionary/header/header_policy.cpp | 183 ------ .../policyimpl/dictionary/header/header_policy.h | 268 --------- .../dictionary/header/header_read_write_utils.cpp | 248 -------- .../dictionary/header/header_read_write_utils.h | 123 ---- .../dictionary/structure/backward/v402/Readme.txt | 1 - .../v402/bigram/ver4_bigram_list_policy.cpp | 289 --------- .../backward/v402/bigram/ver4_bigram_list_policy.h | 94 --- .../backward/v402/content/bigram_dict_content.cpp | 226 ------- .../backward/v402/content/bigram_dict_content.h | 135 ----- .../structure/backward/v402/content/bigram_entry.h | 110 ---- .../structure/backward/v402/content/dict_content.h | 47 -- .../v402/content/probability_dict_content.cpp | 170 ------ .../v402/content/probability_dict_content.h | 74 --- .../backward/v402/content/probability_entry.h | 90 --- .../v402/content/shortcut_dict_content.cpp | 199 ------- .../backward/v402/content/shortcut_dict_content.h | 101 ---- .../backward/v402/content/single_dict_content.h | 88 --- .../v402/content/sparse_table_dict_content.cpp | 50 -- .../v402/content/sparse_table_dict_content.h | 124 ---- .../content/terminal_position_lookup_table.cpp | 111 ---- .../v402/content/terminal_position_lookup_table.h | 73 --- .../v402/shortcut/ver4_shortcut_list_policy.h | 118 ---- .../structure/backward/v402/ver4_dict_buffers.cpp | 157 ----- .../structure/backward/v402/ver4_dict_buffers.h | 152 ----- .../backward/v402/ver4_dict_constants.cpp | 81 --- .../structure/backward/v402/ver4_dict_constants.h | 84 --- .../v402/ver4_patricia_trie_node_reader.cpp | 110 ---- .../backward/v402/ver4_patricia_trie_node_reader.h | 79 --- .../v402/ver4_patricia_trie_node_writer.cpp | 442 -------------- .../backward/v402/ver4_patricia_trie_node_writer.h | 150 ----- .../backward/v402/ver4_patricia_trie_policy.cpp | 662 --------------------- 
.../backward/v402/ver4_patricia_trie_policy.h | 181 ------ .../v402/ver4_patricia_trie_reading_utils.cpp | 39 -- .../v402/ver4_patricia_trie_reading_utils.h | 52 -- .../v402/ver4_patricia_trie_writing_helper.cpp | 307 ---------- .../v402/ver4_patricia_trie_writing_helper.h | 140 ----- .../backward/v402/ver4_pt_node_array_reader.cpp | 90 --- .../backward/v402/ver4_pt_node_array_reader.h | 57 -- ...ionary_structure_with_buffer_policy_factory.cpp | 209 ------- ...ctionary_structure_with_buffer_policy_factory.h | 64 -- .../bigram/bigram_list_read_write_utils.cpp | 95 --- .../bigram/bigram_list_read_write_utils.h | 68 --- .../pt_common/dynamic_pt_gc_event_listeners.cpp | 144 ----- .../pt_common/dynamic_pt_gc_event_listeners.h | 173 ------ .../pt_common/dynamic_pt_reading_helper.cpp | 321 ---------- .../pt_common/dynamic_pt_reading_helper.h | 282 --------- .../pt_common/dynamic_pt_reading_utils.cpp | 72 --- .../structure/pt_common/dynamic_pt_reading_utils.h | 83 --- .../pt_common/dynamic_pt_updating_helper.cpp | 299 ---------- .../pt_common/dynamic_pt_updating_helper.h | 96 --- .../pt_common/dynamic_pt_writing_utils.cpp | 132 ---- .../structure/pt_common/dynamic_pt_writing_utils.h | 79 --- .../pt_common/patricia_trie_reading_utils.cpp | 164 ----- .../pt_common/patricia_trie_reading_utils.h | 133 ----- .../structure/pt_common/pt_node_array_reader.h | 45 -- .../structure/pt_common/pt_node_params.h | 262 -------- .../structure/pt_common/pt_node_reader.h | 40 -- .../structure/pt_common/pt_node_writer.h | 97 --- .../shortcut/shortcut_list_reading_utils.cpp | 53 -- .../shortcut/shortcut_list_reading_utils.h | 72 --- .../structure/v2/bigram/bigram_list_policy.h | 59 -- .../structure/v2/patricia_trie_policy.cpp | 526 ---------------- .../dictionary/structure/v2/patricia_trie_policy.h | 180 ------ .../structure/v2/shortcut/shortcut_list_policy.h | 73 --- .../v2/ver2_patricia_trie_node_reader.cpp | 52 -- .../structure/v2/ver2_patricia_trie_node_reader.h | 52 -- 
.../structure/v2/ver2_pt_node_array_reader.cpp | 54 -- .../structure/v2/ver2_pt_node_array_reader.h | 43 -- .../dynamic_language_model_probability_utils.cpp | 34 -- .../dynamic_language_model_probability_utils.h | 77 --- .../v4/content/language_model_dict_content.cpp | 478 --------------- .../v4/content/language_model_dict_content.h | 258 -------- ...language_model_dict_content_global_counters.cpp | 32 - .../language_model_dict_content_global_counters.h | 101 ---- .../structure/v4/content/probability_entry.h | 176 ------ .../structure/v4/content/shortcut_dict_content.cpp | 182 ------ .../structure/v4/content/shortcut_dict_content.h | 92 --- .../structure/v4/content/single_dict_content.h | 64 -- .../v4/content/sparse_table_dict_content.cpp | 40 -- .../v4/content/sparse_table_dict_content.h | 91 --- .../v4/content/terminal_position_lookup_table.cpp | 98 --- .../v4/content/terminal_position_lookup_table.h | 63 -- .../v4/shortcut/ver4_shortcut_list_policy.h | 106 ---- .../dictionary/structure/v4/ver4_dict_buffers.cpp | 194 ------ .../dictionary/structure/v4/ver4_dict_buffers.h | 132 ---- .../structure/v4/ver4_dict_constants.cpp | 72 --- .../dictionary/structure/v4/ver4_dict_constants.h | 75 --- .../v4/ver4_patricia_trie_node_reader.cpp | 91 --- .../structure/v4/ver4_patricia_trie_node_reader.h | 55 -- .../v4/ver4_patricia_trie_node_writer.cpp | 354 ----------- .../structure/v4/ver4_patricia_trie_node_writer.h | 108 ---- .../structure/v4/ver4_patricia_trie_policy.cpp | 603 ------------------- .../structure/v4/ver4_patricia_trie_policy.h | 149 ----- .../v4/ver4_patricia_trie_reading_utils.cpp | 28 - .../v4/ver4_patricia_trie_reading_utils.h | 37 -- .../v4/ver4_patricia_trie_writing_helper.cpp | 185 ------ .../v4/ver4_patricia_trie_writing_helper.h | 76 --- .../structure/v4/ver4_pt_node_array_reader.cpp | 79 --- .../structure/v4/ver4_pt_node_array_reader.h | 42 -- .../utils/buffer_with_extendable_buffer.cpp | 170 ------ .../utils/buffer_with_extendable_buffer.h | 125 ---- 
.../dictionary/utils/byte_array_utils.cpp | 25 - .../policyimpl/dictionary/utils/byte_array_utils.h | 290 --------- .../dictionary/utils/dict_file_writing_utils.cpp | 144 ----- .../dictionary/utils/dict_file_writing_utils.h | 67 --- .../policyimpl/dictionary/utils/entry_counters.h | 89 --- .../policyimpl/dictionary/utils/file_utils.cpp | 171 ------ .../policyimpl/dictionary/utils/file_utils.h | 60 -- .../dictionary/utils/forgetting_curve_utils.cpp | 234 -------- .../dictionary/utils/forgetting_curve_utils.h | 112 ---- .../policyimpl/dictionary/utils/format_utils.cpp | 71 --- .../policyimpl/dictionary/utils/format_utils.h | 59 -- .../policyimpl/dictionary/utils/mmapped_buffer.cpp | 98 --- .../policyimpl/dictionary/utils/mmapped_buffer.h | 76 --- .../dictionary/utils/probability_utils.cpp | 23 - .../dictionary/utils/probability_utils.h | 69 --- .../policyimpl/dictionary/utils/sparse_table.cpp | 101 ---- .../policyimpl/dictionary/utils/sparse_table.h | 60 -- .../policyimpl/dictionary/utils/trie_map.cpp | 460 -------------- .../suggest/policyimpl/dictionary/utils/trie_map.h | 399 ------------- native/jni/src/utils/jni_data_utils.h | 8 +- .../header/header_read_write_utils_test.cpp | 78 +++ ...age_model_dict_content_global_counters_test.cpp | 60 ++ .../content/language_model_dict_content_test.cpp | 120 ++++ .../v4/content/probability_entry_test.cpp | 58 ++ .../terminal_position_lookup_table_test.cpp | 76 +++ .../tests/dictionary/utils/bloom_filter_test.cpp | 80 +++ .../utils/buffer_with_extendable_buffer_test.cpp | 94 +++ .../dictionary/utils/byte_array_utils_test.cpp | 105 ++++ .../tests/dictionary/utils/format_utils_test.cpp | 97 +++ .../dictionary/utils/probability_utils_test.cpp | 33 + .../tests/dictionary/utils/sparse_table_test.cpp | 47 ++ .../jni/tests/dictionary/utils/trie_map_test.cpp | 252 ++++++++ .../suggest/core/dictionary/bloom_filter_test.cpp | 80 --- .../header/header_read_write_utils_test.cpp | 78 --- ...age_model_dict_content_global_counters_test.cpp | 
60 -- .../content/language_model_dict_content_test.cpp | 120 ---- .../v4/content/probability_entry_test.cpp | 58 -- .../terminal_position_lookup_table_test.cpp | 76 --- .../utils/buffer_with_extendable_buffer_test.cpp | 94 --- .../dictionary/utils/byte_array_utils_test.cpp | 105 ---- .../dictionary/utils/format_utils_test.cpp | 97 --- .../dictionary/utils/probability_utils_test.cpp | 33 - .../dictionary/utils/sparse_table_test.cpp | 47 -- .../policyimpl/dictionary/utils/trie_map_test.cpp | 252 -------- 313 files changed, 19267 insertions(+), 19267 deletions(-) create mode 100644 native/jni/src/dictionary/header/header_policy.cpp create mode 100644 native/jni/src/dictionary/header/header_policy.h create mode 100644 native/jni/src/dictionary/header/header_read_write_utils.cpp create mode 100644 native/jni/src/dictionary/header/header_read_write_utils.h create mode 100644 native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h create mode 100644 native/jni/src/dictionary/interface/dictionary_header_structure_policy.h create mode 100644 native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h create mode 100644 native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h create mode 100644 native/jni/src/dictionary/interface/ngram_listener.h create mode 100644 native/jni/src/dictionary/property/historical_info.h create mode 100644 native/jni/src/dictionary/property/ngram_context.cpp create mode 100644 native/jni/src/dictionary/property/ngram_context.h create mode 100644 native/jni/src/dictionary/property/ngram_property.h create mode 100644 native/jni/src/dictionary/property/unigram_property.h create mode 100644 native/jni/src/dictionary/property/word_attributes.h create mode 100644 native/jni/src/dictionary/property/word_property.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/Readme.txt create mode 100644 native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp 
create mode 100644 native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/dict_content.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h 
create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp create mode 100644 native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h create mode 100644 native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp create mode 100644 native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h create mode 100644 native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp create mode 100644 native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h create mode 100644 native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp create mode 100644 native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h create mode 100644 native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp create mode 100644 
native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h create mode 100644 native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp create mode 100644 native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h create mode 100644 native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp create mode 100644 native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h create mode 100644 native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp create mode 100644 native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h create mode 100644 native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp create mode 100644 native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h create mode 100644 native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h create mode 100644 native/jni/src/dictionary/structure/pt_common/pt_node_params.h create mode 100644 native/jni/src/dictionary/structure/pt_common/pt_node_reader.h create mode 100644 native/jni/src/dictionary/structure/pt_common/pt_node_writer.h create mode 100644 native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp create mode 100644 native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h create mode 100644 native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h create mode 100644 native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp create mode 100644 native/jni/src/dictionary/structure/v2/patricia_trie_policy.h create mode 100644 native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h create mode 100644 native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp create mode 100644 native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h create mode 100644 native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp create mode 
100644 native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h create mode 100644 native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp create mode 100644 native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h create mode 100644 native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp create mode 100644 native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h create mode 100644 native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp create mode 100644 native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h create mode 100644 native/jni/src/dictionary/structure/v4/content/probability_entry.h create mode 100644 native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp create mode 100644 native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h create mode 100644 native/jni/src/dictionary/structure/v4/content/single_dict_content.h create mode 100644 native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp create mode 100644 native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h create mode 100644 native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp create mode 100644 native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h create mode 100644 native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h create mode 100644 native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp create mode 100644 native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h create mode 100644 native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp create mode 100644 native/jni/src/dictionary/structure/v4/ver4_dict_constants.h create mode 100644 native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp create mode 100644 
native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h create mode 100644 native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp create mode 100644 native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h create mode 100644 native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp create mode 100644 native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h create mode 100644 native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp create mode 100644 native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h create mode 100644 native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp create mode 100644 native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h create mode 100644 native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp create mode 100644 native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h create mode 100644 native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h create mode 100644 native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h create mode 100644 native/jni/src/dictionary/utils/bloom_filter.h create mode 100644 native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp create mode 100644 native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h create mode 100644 native/jni/src/dictionary/utils/byte_array_utils.cpp create mode 100644 native/jni/src/dictionary/utils/byte_array_utils.h create mode 100644 native/jni/src/dictionary/utils/dict_file_writing_utils.cpp create mode 100644 native/jni/src/dictionary/utils/dict_file_writing_utils.h create mode 100644 native/jni/src/dictionary/utils/entry_counters.h create mode 100644 native/jni/src/dictionary/utils/file_utils.cpp create mode 100644 native/jni/src/dictionary/utils/file_utils.h create mode 100644 native/jni/src/dictionary/utils/forgetting_curve_utils.cpp create mode 100644 
native/jni/src/dictionary/utils/forgetting_curve_utils.h create mode 100644 native/jni/src/dictionary/utils/format_utils.cpp create mode 100644 native/jni/src/dictionary/utils/format_utils.h create mode 100644 native/jni/src/dictionary/utils/mmapped_buffer.cpp create mode 100644 native/jni/src/dictionary/utils/mmapped_buffer.h create mode 100644 native/jni/src/dictionary/utils/multi_bigram_map.cpp create mode 100644 native/jni/src/dictionary/utils/multi_bigram_map.h create mode 100644 native/jni/src/dictionary/utils/probability_utils.cpp create mode 100644 native/jni/src/dictionary/utils/probability_utils.h create mode 100644 native/jni/src/dictionary/utils/sparse_table.cpp create mode 100644 native/jni/src/dictionary/utils/sparse_table.h create mode 100644 native/jni/src/dictionary/utils/trie_map.cpp create mode 100644 native/jni/src/dictionary/utils/trie_map.h delete mode 100644 native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h delete mode 100644 native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h delete mode 100644 native/jni/src/suggest/core/dictionary/bloom_filter.h delete mode 100644 native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp delete mode 100644 native/jni/src/suggest/core/dictionary/multi_bigram_map.h delete mode 100644 native/jni/src/suggest/core/dictionary/ngram_listener.h delete mode 100644 native/jni/src/suggest/core/dictionary/property/historical_info.h delete mode 100644 native/jni/src/suggest/core/dictionary/property/ngram_property.h delete mode 100644 native/jni/src/suggest/core/dictionary/property/unigram_property.h delete mode 100644 native/jni/src/suggest/core/dictionary/property/word_property.h delete mode 100644 native/jni/src/suggest/core/dictionary/word_attributes.h delete mode 100644 native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h delete mode 100644 native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h delete mode 100644 
native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h delete mode 100644 native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h delete mode 100644 native/jni/src/suggest/core/session/ngram_context.cpp delete mode 100644 native/jni/src/suggest/core/session/ngram_context.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp delete mode 100644 
native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp delete mode 100644 
native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp delete mode 100644 
native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp delete mode 100644 
native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h delete mode 100644 
native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp delete mode 100644 
native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/entry_counters.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp delete mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.h create mode 100644 native/jni/tests/dictionary/header/header_read_write_utils_test.cpp create mode 100644 native/jni/tests/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp create mode 100644 
native/jni/tests/dictionary/structure/v4/content/language_model_dict_content_test.cpp create mode 100644 native/jni/tests/dictionary/structure/v4/content/probability_entry_test.cpp create mode 100644 native/jni/tests/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp create mode 100644 native/jni/tests/dictionary/utils/bloom_filter_test.cpp create mode 100644 native/jni/tests/dictionary/utils/buffer_with_extendable_buffer_test.cpp create mode 100644 native/jni/tests/dictionary/utils/byte_array_utils_test.cpp create mode 100644 native/jni/tests/dictionary/utils/format_utils_test.cpp create mode 100644 native/jni/tests/dictionary/utils/probability_utils_test.cpp create mode 100644 native/jni/tests/dictionary/utils/sparse_table_test.cpp create mode 100644 native/jni/tests/dictionary/utils/trie_map_test.cpp delete mode 100644 native/jni/tests/suggest/core/dictionary/bloom_filter_test.cpp delete mode 100644 native/jni/tests/suggest/policyimpl/dictionary/header/header_read_write_utils_test.cpp delete mode 100644 native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp delete mode 100644 native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp delete mode 100644 native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp delete mode 100644 native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp delete mode 100644 native/jni/tests/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer_test.cpp delete mode 100644 native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp delete mode 100644 native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp delete mode 100644 native/jni/tests/suggest/policyimpl/dictionary/utils/probability_utils_test.cpp delete mode 100644 
native/jni/tests/suggest/policyimpl/dictionary/utils/sparse_table_test.cpp delete mode 100644 native/jni/tests/suggest/policyimpl/dictionary/utils/trie_map_test.cpp (limited to 'native') diff --git a/native/jni/NativeFileList.mk b/native/jni/NativeFileList.mk index 0be7153d6..d8b69bfd7 100644 --- a/native/jni/NativeFileList.mk +++ b/native/jni/NativeFileList.mk @@ -20,34 +20,12 @@ LATIN_IME_JNI_SRC_FILES := \ jni_common.cpp LATIN_IME_CORE_SRC_FILES := \ - suggest/core/suggest.cpp \ - $(addprefix suggest/core/dicnode/, \ - dic_node.cpp \ - dic_node_utils.cpp \ - dic_nodes_cache.cpp) \ - $(addprefix suggest/core/dictionary/, \ - dictionary.cpp \ - dictionary_utils.cpp \ - digraph_utils.cpp \ - error_type_utils.cpp \ - multi_bigram_map.cpp) \ - $(addprefix suggest/core/layout/, \ - additional_proximity_chars.cpp \ - proximity_info.cpp \ - proximity_info_params.cpp \ - proximity_info_state.cpp \ - proximity_info_state_utils.cpp) \ - suggest/core/policy/weighting.cpp \ - suggest/core/session/dic_traverse_session.cpp \ - suggest/core/session/ngram_context.cpp \ - $(addprefix suggest/core/result/, \ - suggestion_results.cpp \ - suggestions_output_utils.cpp) \ - $(addprefix suggest/policyimpl/dictionary/, \ - header/header_policy.cpp \ - header/header_read_write_utils.cpp \ - structure/dictionary_structure_with_buffer_policy_factory.cpp) \ - $(addprefix suggest/policyimpl/dictionary/structure/pt_common/, \ + $(addprefix dictionary/header/, \ + header_policy.cpp \ + header_read_write_utils.cpp) \ + dictionary/property/ngram_context.cpp \ + dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp \ + $(addprefix dictionary/structure/pt_common/, \ bigram/bigram_list_read_write_utils.cpp \ dynamic_pt_gc_event_listeners.cpp \ dynamic_pt_reading_helper.cpp \ @@ -56,11 +34,11 @@ LATIN_IME_CORE_SRC_FILES := \ dynamic_pt_writing_utils.cpp \ patricia_trie_reading_utils.cpp \ shortcut/shortcut_list_reading_utils.cpp) \ - $(addprefix 
suggest/policyimpl/dictionary/structure/v2/, \ + $(addprefix dictionary/structure/v2/, \ patricia_trie_policy.cpp \ ver2_patricia_trie_node_reader.cpp \ ver2_pt_node_array_reader.cpp) \ - $(addprefix suggest/policyimpl/dictionary/structure/v4/, \ + $(addprefix dictionary/structure/v4/, \ ver4_dict_buffers.cpp \ ver4_dict_constants.cpp \ ver4_patricia_trie_node_reader.cpp \ @@ -69,14 +47,14 @@ LATIN_IME_CORE_SRC_FILES := \ ver4_patricia_trie_reading_utils.cpp \ ver4_patricia_trie_writing_helper.cpp \ ver4_pt_node_array_reader.cpp) \ - $(addprefix suggest/policyimpl/dictionary/structure/v4/content/, \ + $(addprefix dictionary/structure/v4/content/, \ dynamic_language_model_probability_utils.cpp \ language_model_dict_content.cpp \ language_model_dict_content_global_counters.cpp \ shortcut_dict_content.cpp \ sparse_table_dict_content.cpp \ terminal_position_lookup_table.cpp) \ - $(addprefix suggest/policyimpl/dictionary/utils/, \ + $(addprefix dictionary/utils/, \ buffer_with_extendable_buffer.cpp \ byte_array_utils.cpp \ dict_file_writing_utils.cpp \ @@ -84,9 +62,31 @@ LATIN_IME_CORE_SRC_FILES := \ forgetting_curve_utils.cpp \ format_utils.cpp \ mmapped_buffer.cpp \ + multi_bigram_map.cpp \ probability_utils.cpp \ sparse_table.cpp \ trie_map.cpp ) \ + suggest/core/suggest.cpp \ + $(addprefix suggest/core/dicnode/, \ + dic_node.cpp \ + dic_node_utils.cpp \ + dic_nodes_cache.cpp) \ + $(addprefix suggest/core/dictionary/, \ + dictionary.cpp \ + dictionary_utils.cpp \ + digraph_utils.cpp \ + error_type_utils.cpp ) \ + $(addprefix suggest/core/layout/, \ + additional_proximity_chars.cpp \ + proximity_info.cpp \ + proximity_info_params.cpp \ + proximity_info_state.cpp \ + proximity_info_state_utils.cpp) \ + suggest/core/policy/weighting.cpp \ + suggest/core/session/dic_traverse_session.cpp \ + $(addprefix suggest/core/result/, \ + suggestion_results.cpp \ + suggestions_output_utils.cpp) \ suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp \ $(addprefix 
suggest/policyimpl/typing/, \ scoring_params.cpp \ @@ -102,7 +102,7 @@ LATIN_IME_CORE_SRC_FILES := \ time_keeper.cpp) LATIN_IME_CORE_SRC_FILES_BACKWARD_V402 := \ - $(addprefix suggest/policyimpl/dictionary/structure/backward/v402/, \ + $(addprefix dictionary/structure/backward/v402/, \ ver4_dict_buffers.cpp \ ver4_dict_constants.cpp \ ver4_patricia_trie_node_reader.cpp \ @@ -111,34 +111,34 @@ LATIN_IME_CORE_SRC_FILES_BACKWARD_V402 := \ ver4_patricia_trie_reading_utils.cpp \ ver4_patricia_trie_writing_helper.cpp \ ver4_pt_node_array_reader.cpp) \ - $(addprefix suggest/policyimpl/dictionary/structure/backward/v402/content/, \ + $(addprefix dictionary/structure/backward/v402/content/, \ bigram_dict_content.cpp \ probability_dict_content.cpp \ shortcut_dict_content.cpp \ sparse_table_dict_content.cpp \ terminal_position_lookup_table.cpp) \ - $(addprefix suggest/policyimpl/dictionary/structure/backward/v402/bigram/, \ + $(addprefix dictionary/structure/backward/v402/bigram/, \ ver4_bigram_list_policy.cpp) LATIN_IME_CORE_SRC_FILES += $(LATIN_IME_CORE_SRC_FILES_BACKWARD_V402) LATIN_IME_CORE_TEST_FILES := \ defines_test.cpp \ + dictionary/header/header_read_write_utils_test.cpp \ + dictionary/structure/v4/content/language_model_dict_content_test.cpp \ + dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp \ + dictionary/structure/v4/content/probability_entry_test.cpp \ + dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp \ + dictionary/utils/bloom_filter_test.cpp \ + dictionary/utils/buffer_with_extendable_buffer_test.cpp \ + dictionary/utils/byte_array_utils_test.cpp \ + dictionary/utils/format_utils_test.cpp \ + dictionary/utils/probability_utils_test.cpp \ + dictionary/utils/sparse_table_test.cpp \ + dictionary/utils/trie_map_test.cpp \ suggest/core/dicnode/dic_node_pool_test.cpp \ - suggest/core/dictionary/bloom_filter_test.cpp \ suggest/core/layout/geometry_utils_test.cpp \ 
suggest/core/layout/normal_distribution_2d_test.cpp \ - suggest/policyimpl/dictionary/header/header_read_write_utils_test.cpp \ - suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp \ - suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp \ - suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp \ - suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp \ - suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer_test.cpp \ - suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp \ - suggest/policyimpl/dictionary/utils/format_utils_test.cpp \ - suggest/policyimpl/dictionary/utils/probability_utils_test.cpp \ - suggest/policyimpl/dictionary/utils/sparse_table_test.cpp \ - suggest/policyimpl/dictionary/utils/trie_map_test.cpp \ suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy_test.cpp \ utils/autocorrection_threshold_utils_test.cpp \ utils/char_utils_test.cpp \ diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 8f1e35e0f..3341e1163 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -22,15 +22,15 @@ #include #include "defines.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/property/ngram_context.h" +#include "dictionary/property/word_property.h" +#include "dictionary/structure/dictionary_structure_with_buffer_policy_factory.h" #include "jni.h" #include "jni_common.h" #include "suggest/core/dictionary/dictionary.h" -#include "suggest/core/dictionary/property/unigram_property.h" -#include "suggest/core/dictionary/property/word_property.h" #include "suggest/core/result/suggestion_results.h" -#include "suggest/core/session/ngram_context.h" #include "suggest/core/suggest_options.h" 
-#include "suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h" #include "utils/char_utils.h" #include "utils/int_array_view.h" #include "utils/jni_data_utils.h" diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionaryUtils.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionaryUtils.cpp index 68bf417e5..0885f2de9 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionaryUtils.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionaryUtils.cpp @@ -19,9 +19,9 @@ #include "com_android_inputmethod_latin_BinaryDictionaryUtils.h" #include "defines.h" +#include "dictionary/utils/dict_file_writing_utils.h" #include "jni.h" #include "jni_common.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" #include "utils/autocorrection_threshold_utils.h" #include "utils/char_utils.h" #include "utils/jni_data_utils.h" diff --git a/native/jni/com_android_inputmethod_latin_DicTraverseSession.cpp b/native/jni/com_android_inputmethod_latin_DicTraverseSession.cpp index 3c6bff3b6..45f5492b1 100644 --- a/native/jni/com_android_inputmethod_latin_DicTraverseSession.cpp +++ b/native/jni/com_android_inputmethod_latin_DicTraverseSession.cpp @@ -19,10 +19,10 @@ #include "com_android_inputmethod_latin_DicTraverseSession.h" #include "defines.h" +#include "dictionary/property/ngram_context.h" #include "jni.h" #include "jni_common.h" #include "suggest/core/session/dic_traverse_session.h" -#include "suggest/core/session/ngram_context.h" namespace latinime { class Dictionary; diff --git a/native/jni/src/dictionary/header/header_policy.cpp b/native/jni/src/dictionary/header/header_policy.cpp new file mode 100644 index 000000000..d4f84d39f --- /dev/null +++ b/native/jni/src/dictionary/header/header_policy.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/header/header_policy.h" + +#include + +#include "utils/ngram_utils.h" + +namespace latinime { + +// Note that these are corresponding definitions in Java side in DictionaryHeader. +const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE"; +const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = + "REQUIRES_GERMAN_UMLAUT_PROCESSING"; +// TODO: Change attribute string to "IS_DECAYING_DICT". +const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; +const char *const HeaderPolicy::DATE_KEY = "date"; +const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; +const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] = + {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT", "QUADGRAM_COUNT"}; +const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] = + {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT", + "MAX_QUADGRAM_ENTRY_COUNT"}; +const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000, 30000}; +const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; +// Historical info is information that is needed to support decaying such as timestamp, level and +// count. 
+const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO"; +const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration +const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = + "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; + +const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; +const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; +const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3; + +// Used for logging. Question mark is used to indicate that the key is not found. +void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue, + int outValueSize) const { + if (outValueSize <= 0) return; + if (outValueSize == 1) { + outValue[0] = '\0'; + return; + } + std::vector keyCodePointVector; + HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector); + DictionaryHeaderStructurePolicy::AttributeMap::const_iterator it = + mAttributeMap.find(keyCodePointVector); + if (it == mAttributeMap.end()) { + // The key was not found. 
+ outValue[0] = '?'; + outValue[1] = '\0'; + return; + } + const int terminalIndex = std::min(static_cast(it->second.size()), outValueSize - 1); + for (int i = 0; i < terminalIndex; ++i) { + outValue[i] = it->second[i]; + } + outValue[terminalIndex] = '\0'; +} + +const std::vector HeaderPolicy::readLocale() const { + return HeaderReadWriteUtils::readCodePointVectorAttributeValue(&mAttributeMap, LOCALE_KEY); +} + +float HeaderPolicy::readMultipleWordCostMultiplier() const { + const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE); + if (demotionRate <= 0) { + return static_cast(MAX_VALUE_FOR_WEIGHTING); + } + return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast(demotionRate); +} + +bool HeaderPolicy::readRequiresGermanUmlautProcessing() const { + return HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, + REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false); +} + +bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, + const EntryCounts &entryCounts, const int extendedRegionSize, + BufferWithExtendableBuffer *const outBuffer) const { + int writingPos = 0; + DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap); + fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite); + if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion, + &writingPos)) { + return false; + } + if (!HeaderReadWriteUtils::writeDictionaryFlags(outBuffer, mDictionaryFlags, + &writingPos)) { + return false; + } + // Temporarily writes a dummy header size. + int headerSizeFieldPos = writingPos; + if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, 0 /* size */, + &writingPos)) { + return false; + } + if (!HeaderReadWriteUtils::writeHeaderAttributes(outBuffer, &attributeMapToWrite, + &writingPos)) { + return false; + } + // Writes the actual header size. 
+ if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, writingPos, + &headerSizeFieldPos)) { + return false; + } + return true; +} + +namespace { + +int getIndexFromNgramType(const NgramType ngramType) { + return static_cast(ngramType); +} + +} // namespace + +void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, + const EntryCounts &entryCounts, const int extendedRegionSize, + DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const { + for (const auto ngramType : AllNgramTypes::ASCENDING) { + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, + NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], + entryCounts.getNgramCount(ngramType)); + } + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY, + extendedRegionSize); + // Set the current time as the generation time. + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, DATE_KEY, + TimeKeeper::peekCurrentTime()); + HeaderReadWriteUtils::setCodePointVectorAttribute(outAttributeMap, LOCALE_KEY, mLocale); + if (updatesLastDecayedTime) { + // Set current time as the last updated time. 
+ HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_DECAYED_TIME_KEY, + TimeKeeper::peekCurrentTime()); + } +} + +/* static */ DictionaryHeaderStructurePolicy::AttributeMap + HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) { + DictionaryHeaderStructurePolicy::AttributeMap attributeMap; + HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap); + return attributeMap; +} + +/* static */ const EntryCounts HeaderPolicy::readNgramCounts() const { + MutableEntryCounters entryCounters; + for (const auto ngramType : AllNgramTypes::ASCENDING) { + const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */); + entryCounters.setNgramCount(ngramType, entryCount); + } + return entryCounters.getEntryCounts(); +} + +/* static */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const { + MutableEntryCounters entryCounters; + for (const auto ngramType : AllNgramTypes::ASCENDING) { + const int index = getIndexFromNgramType(ngramType); + const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]); + entryCounters.setNgramCount(ngramType, maxEntryCount); + } + return entryCounters.getEntryCounts(); +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/header/header_policy.h b/native/jni/src/dictionary/header/header_policy.h new file mode 100644 index 000000000..47cc9196a --- /dev/null +++ b/native/jni/src/dictionary/header/header_policy.h @@ -0,0 +1,268 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_HEADER_POLICY_H +#define LATINIME_HEADER_POLICY_H + +#include + +#include "defines.h" +#include "dictionary/header/header_read_write_utils.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/utils/entry_counters.h" +#include "dictionary/utils/format_utils.h" +#include "utils/char_utils.h" +#include "utils/time_keeper.h" + +namespace latinime { + +class HeaderPolicy : public DictionaryHeaderStructurePolicy { + public: + // Reads information from existing dictionary buffer. + HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion) + : mDictFormatVersion(formatVersion), + mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)), + mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), + mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), + mLocale(readLocale()), + mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), + mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), + mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, + IS_DECAYING_DICT_KEY, false /* defaultValue */)), + mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), + 
mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+                      EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)),
+              mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
+                      &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
+              mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
+                      &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
+                      DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
+              mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
+
+    // Constructs header information using an attribute map.
+    // The map is copied into mAttributeMap and the attribute-backed fields are read from that
+    // copy. The locale is taken from the argument; the header size and the extended region size
+    // are set to 0.
+    HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
+            const std::vector<int> &locale,
+            const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap)
+            : mDictFormatVersion(dictFormatVersion),
+              mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
+                      attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale),
+              mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
+              mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
+              mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
+                      IS_DECAYING_DICT_KEY, false /* defaultValue */)),
+              mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+                      DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
+              // Read from LAST_DECAYED_TIME_KEY, the same key the buffer-reading constructor
+              // uses; reading DATE_KEY here would ignore a last decayed time stored in the map.
+              mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+                      LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
+              mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
+              mExtendedRegionSize(0),
+              mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
+                      &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
+              mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
+                      &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
+
DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), + mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} + + // Copy header information + HeaderPolicy(const HeaderPolicy *const headerPolicy) + : mDictFormatVersion(headerPolicy->mDictFormatVersion), + mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize), + mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale), + mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier), + mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing), + mIsDecayingDict(headerPolicy->mIsDecayingDict), + mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime), + mNgramCounts(headerPolicy->mNgramCounts), + mMaxNgramCounts(headerPolicy->mMaxNgramCounts), + mExtendedRegionSize(headerPolicy->mExtendedRegionSize), + mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords), + mForgettingCurveProbabilityValuesTableId( + headerPolicy->mForgettingCurveProbabilityValuesTableId), + mCodePointTable(headerPolicy->mCodePointTable) {} + + // Temporary dummy header. + HeaderPolicy() + : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), + mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), + mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), + mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(), + mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), + mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {} + + ~HeaderPolicy() {} + + virtual int getFormatVersionNumber() const { + // Conceptually this converts the symbolic value we use in the code into the + // hardcoded of the bytes in the file. But we want the constants to be the + // same so we use them for both here. 
+ switch (mDictFormatVersion) { + case FormatUtils::VERSION_2: + case FormatUtils::VERSION_201: + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + return FormatUtils::UNKNOWN_VERSION; + case FormatUtils::VERSION_202: + return FormatUtils::VERSION_202; + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + return FormatUtils::VERSION_4_ONLY_FOR_TESTING; + case FormatUtils::VERSION_402: + return FormatUtils::VERSION_402; + case FormatUtils::VERSION_403: + return FormatUtils::VERSION_403; + default: + return FormatUtils::UNKNOWN_VERSION; + } + } + + AK_FORCE_INLINE bool isValid() const { + // Decaying dictionary must have historical information. + if (!mIsDecayingDict) { + return true; + } + if (mHasHistoricalInfoOfWords) { + return true; + } else { + return false; + } + } + + AK_FORCE_INLINE int getSize() const { + return mSize; + } + + AK_FORCE_INLINE float getMultiWordCostMultiplier() const { + return mMultiWordCostMultiplier; + } + + AK_FORCE_INLINE bool isDecayingDict() const { + return mIsDecayingDict; + } + + AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { + return mRequiresGermanUmlautProcessing; + } + + AK_FORCE_INLINE int getDate() const { + return mDate; + } + + AK_FORCE_INLINE int getLastDecayedTime() const { + return mLastDecayedTime; + } + + AK_FORCE_INLINE const EntryCounts &getNgramCounts() const { + return mNgramCounts; + } + + AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const { + return mMaxNgramCounts; + } + + AK_FORCE_INLINE int getExtendedRegionSize() const { + return mExtendedRegionSize; + } + + AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const { + return mHasHistoricalInfoOfWords; + } + + AK_FORCE_INLINE bool shouldBoostExactMatches() const { + // TODO: Investigate better ways to handle exact matches for personalized dictionaries. 
+ return !isDecayingDict(); + } + + const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const { + return &mAttributeMap; + } + + AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const { + return mForgettingCurveProbabilityValuesTableId; + } + + void readHeaderValueOrQuestionMark(const char *const key, + int *outValue, int outValueSize) const; + + bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, + const EntryCounts &entryCounts, const int extendedRegionSize, + BufferWithExtendableBuffer *const outBuffer) const; + + void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts, + const int extendedRegionSize, + DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const; + + AK_FORCE_INLINE const std::vector *getLocale() const { + return &mLocale; + } + + bool supportsBeginningOfSentence() const { + return mDictFormatVersion >= FormatUtils::VERSION_402; + } + + const int *getCodePointTable() const { + return mCodePointTable; + } + + private: + DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); + + static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; + static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; + static const char *const IS_DECAYING_DICT_KEY; + static const char *const DATE_KEY; + static const char *const LAST_DECAYED_TIME_KEY; + static const char *const NGRAM_COUNT_KEYS[]; + static const char *const MAX_NGRAM_COUNT_KEYS[]; + static const int DEFAULT_MAX_NGRAM_COUNTS[]; + static const char *const EXTENDED_REGION_SIZE_KEY; + static const char *const HAS_HISTORICAL_INFO_KEY; + static const char *const LOCALE_KEY; + static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; + static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; + static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; + static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; + static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; + static const 
int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; + + const FormatUtils::FORMAT_VERSION mDictFormatVersion; + const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; + const int mSize; + DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap; + const std::vector mLocale; + const float mMultiWordCostMultiplier; + const bool mRequiresGermanUmlautProcessing; + const bool mIsDecayingDict; + const int mDate; + const int mLastDecayedTime; + const EntryCounts mNgramCounts; + const EntryCounts mMaxNgramCounts; + const int mExtendedRegionSize; + const bool mHasHistoricalInfoOfWords; + const int mForgettingCurveProbabilityValuesTableId; + const int *const mCodePointTable; + + const std::vector readLocale() const; + float readMultipleWordCostMultiplier() const; + bool readRequiresGermanUmlautProcessing() const; + const EntryCounts readNgramCounts() const; + const EntryCounts readMaxNgramCounts() const; + static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes( + const uint8_t *const dictBuf); +}; +} // namespace latinime +#endif /* LATINIME_HEADER_POLICY_H */ diff --git a/native/jni/src/dictionary/header/header_read_write_utils.cpp b/native/jni/src/dictionary/header/header_read_write_utils.cpp new file mode 100644 index 000000000..779f8b8c3 --- /dev/null +++ b/native/jni/src/dictionary/header/header_read_write_utils.cpp @@ -0,0 +1,248 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/header/header_read_write_utils.h"
+
+#include <cctype>
+#include <cstdio>
+#include <memory>
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+// Size of a string buffer large enough to hold any 32-bit int printed with "%d": up to 10
+// base-10 digits, 1 character for a leading '-', and 1 for the zero terminator. The longest
+// value is "-2147483648" (11 characters). This is used e.g. in the code that writes the header
+// of the binary dictionary where a {key,value} string pair scheme is used. A smaller buffer
+// makes snprintf() truncate large negative values.
+const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 12;
+
+// Maximum number of code points read for an attribute key and for an attribute value.
+const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
+const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048;
+
+// Sizes in bytes of the fixed header fields, in the order they are stored.
+const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
+const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
+const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
+const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
+const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable";
+
+const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
+
+typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
+
+// Reads the 32-bit header size field, which follows the magic number, version and flags.
+/* static */ int HeaderReadWriteUtils::getHeaderSize(const uint8_t *const dictBuf) {
+    // See the format of the header in the comment in
+    // BinaryDictionaryFormatUtils::detectFormatVersion()
+    return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE
+            + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE);
+}
+
+// Reads the 16-bit flags field, which follows the magic number and the version.
+/* static */ HeaderReadWriteUtils::DictionaryFlags
+        HeaderReadWriteUtils::getFlags(const uint8_t *const dictBuf) {
+    return
ByteArrayUtils::readUint16(dictBuf,
+            HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE);
+}
+
+// Flags are not derived from the attribute map; this always returns NO_FLAGS.
+/* static */ HeaderReadWriteUtils::DictionaryFlags
+        HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
+                const AttributeMap *const attributeMap) {
+    return NO_FLAGS;
+}
+
+// Reads consecutive {key, value} strings from dictBuf, starting at getHeaderOptionsPosition()
+// and continuing while the read position is before the header size, and inserts each pair into
+// headerAttributes. std::map::insert() keeps the existing entry when a key is already present.
+/* static */ void HeaderReadWriteUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf,
+        AttributeMap *const headerAttributes) {
+    const int headerSize = getHeaderSize(dictBuf);
+    int pos = getHeaderOptionsPosition();
+    if (pos == NOT_A_DICT_POS) {
+        // The header doesn't have header options.
+        return;
+    }
+    int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH];
+    // The value buffer is allocated dynamically and released automatically on return.
+    std::unique_ptr<int[]> valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]);
+    while (pos < headerSize) {
+        // The values in the header don't use the code point table for their encoding.
+        const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
+                MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos);
+        std::vector<int> key;
+        key.insert(key.end(), keyBuffer, keyBuffer + keyLength);
+        const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
+                MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos);
+        std::vector<int> value;
+        value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength);
+        headerAttributes->insert(AttributeMap::value_type(key, value));
+    }
+}
+
+// Returns a pointer to the value stored under CODE_POINT_TABLE_KEY, or nullptr when the key is
+// absent. The pointer refers to storage owned by headerAttributes.
+/* static */ const int *HeaderReadWriteUtils::readCodePointTable(
+        AttributeMap *const headerAttributes) {
+    AttributeMap::key_type keyVector;
+    insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector);
+    AttributeMap::const_iterator it = headerAttributes->find(keyVector);
+    if (it == headerAttributes->end()) {
+        return nullptr;
+    }
+    return it->second.data();
+}
+
+// Writes the magic number followed by the format version and advances *writingPos.
+// Returns false when a write fails or when the version is not one that can be written.
+/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
+        BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
+        int *const writingPos) {
+    if
(!buffer->writeUintAndAdvancePosition(FormatUtils::MAGIC_NUMBER, HEADER_MAGIC_NUMBER_SIZE,
+            writingPos)) {
+        return false;
+    }
+    // The magic number has already been written at this point, including for versions that
+    // are rejected below.
+    switch (version) {
+        case FormatUtils::VERSION_2:
+        case FormatUtils::VERSION_201:
+        case FormatUtils::VERSION_202:
+            // None of the static dictionaries (v2x) support writing
+            return false;
+        case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+        case FormatUtils::VERSION_402:
+        case FormatUtils::VERSION_403:
+            // The enum value itself is stored as the version field.
+            return buffer->writeUintAndAdvancePosition(version /* data */,
+                    HEADER_DICTIONARY_VERSION_SIZE, writingPos);
+        default:
+            return false;
+    }
+}
+
+// Writes flags as a HEADER_FLAG_SIZE-byte field and advances *writingPos.
+// Returns the result of the underlying buffer write.
+/* static */ bool HeaderReadWriteUtils::writeDictionaryFlags(
+        BufferWithExtendableBuffer *const buffer, const DictionaryFlags flags,
+        int *const writingPos) {
+    return buffer->writeUintAndAdvancePosition(flags, HEADER_FLAG_SIZE, writingPos);
+}
+
+// Writes size as a HEADER_SIZE_FIELD_SIZE-byte field and advances *writingPos.
+// Returns the result of the underlying buffer write.
+/* static */ bool HeaderReadWriteUtils::writeDictionaryHeaderSize(
+        BufferWithExtendableBuffer *const buffer, const int size, int *const writingPos) {
+    return buffer->writeUintAndAdvancePosition(size, HEADER_SIZE_FIELD_SIZE, writingPos);
+}
+
+// Writes every attribute as a key string followed by a value string, each with a terminator.
+// Attributes whose key or value is empty are skipped. Returns false as soon as a write fails.
+/* static */ bool HeaderReadWriteUtils::writeHeaderAttributes(
+        BufferWithExtendableBuffer *const buffer, const AttributeMap *const headerAttributes,
+        int *const writingPos) {
+    for (AttributeMap::const_iterator it = headerAttributes->begin();
+            it != headerAttributes->end(); ++it) {
+        if (it->first.empty() || it->second.empty()) {
+            continue;
+        }
+        // Write a key.
+        if (!buffer->writeCodePointsAndAdvancePosition(&(it->first.at(0)), it->first.size(),
+                true /* writesTerminator */, writingPos)) {
+            return false;
+        }
+        // Write a value.
+        if (!buffer->writeCodePointsAndAdvancePosition(&(it->second.at(0)), it->second.size(),
+                true /* writesTerminator */, writingPos)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Stores value under key, replacing any value already stored under that key.
+/* static */ void HeaderReadWriteUtils::setCodePointVectorAttribute(
+        AttributeMap *const headerAttributes, const char *const key,
+        const std::vector<int> &value) {
+    AttributeMap::key_type keyVector;
+    insertCharactersIntoVector(key, &keyVector);
+    (*headerAttributes)[keyVector] = value;
+}
+
+// Stores a boolean as the integer attribute 1 (true) or 0 (false).
+/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes,
+        const char *const key, const bool value) {
+    setIntAttribute(headerAttributes, key, value ? 1 : 0);
+}
+
+// Converts the C string key into a map key and stores value under it as decimal text.
+/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes,
+        const char *const key, const int value) {
+    AttributeMap::key_type keyVector;
+    insertCharactersIntoVector(key, &keyVector);
+    setIntAttributeInner(headerAttributes, &keyVector, value);
+}
+
+// Formats value with "%d" and stores the resulting characters under *key, replacing any
+// value already stored under that key.
+/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes,
+        const AttributeMap::key_type *const key, const int value) {
+    AttributeMap::mapped_type valueVector;
+    char charBuf[LARGEST_INT_DIGIT_COUNT];
+    // snprintf() never writes past charBuf; text that does not fit is truncated.
+    snprintf(charBuf, sizeof(charBuf), "%d", value);
+    insertCharactersIntoVector(charBuf, &valueVector);
+    (*headerAttributes)[*key] = valueVector;
+}
+
+// Returns a copy of the value stored under key, or an empty vector when the key is absent.
+/* static */ const std::vector<int> HeaderReadWriteUtils::readCodePointVectorAttributeValue(
+        const AttributeMap *const headerAttributes, const char *const key) {
+    AttributeMap::key_type keyVector;
+    insertCharactersIntoVector(key, &keyVector);
+    AttributeMap::const_iterator it = headerAttributes->find(keyVector);
+    if (it == headerAttributes->end()) {
+        return std::vector<int>();
+    } else {
+        return it->second;
+    }
+}
+
+// Reads the attribute as an integer and reports whether it is non-zero. defaultValue is
+// passed on as 1 or 0.
+/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue(
+        const AttributeMap *const headerAttributes, const char *const key,
+        const bool defaultValue) {
+    const int intDefaultValue = defaultValue ?
1 : 0;
+    const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue);
+    return intValue != 0;
+}
+
+// Converts the C string key into a map key and reads the integer stored under it.
+// Returns defaultValue when the key is absent or its value is not a decimal integer.
+/* static */ int HeaderReadWriteUtils::readIntAttributeValue(
+        const AttributeMap *const headerAttributes, const char *const key,
+        const int defaultValue) {
+    AttributeMap::key_type keyVector;
+    insertCharactersIntoVector(key, &keyVector);
+    return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue);
+}
+
+// Parses the value stored under *key as a decimal integer with an optional leading '-'.
+// Returns defaultValue when the key is absent or when any other element is not '0'..'9'.
+// NOTE(review): the accumulation is not guarded against overflow for very long digit strings.
+/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner(
+        const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
+        const int defaultValue) {
+    AttributeMap::const_iterator it = headerAttributes->find(*key);
+    if (it != headerAttributes->end()) {
+        int value = 0;
+        bool isNegative = false;
+        for (size_t i = 0; i < it->second.size(); ++i) {
+            const int codePoint = it->second.at(i);
+            if (i == 0 && codePoint == '-') {
+                isNegative = true;
+            } else {
+                // Compare against the digit range directly: the element is an arbitrary int,
+                // and isdigit() is only defined for unsigned char values and EOF.
+                if (codePoint < '0' || codePoint > '9') {
+                    // If not a number.
+                    return defaultValue;
+                }
+                value *= 10;
+                value += codePoint - '0';
+            }
+        }
+        return isNegative ? -value : value;
+    }
+    return defaultValue;
+}
+
+// Appends every character of the zero-terminated string characters to *vector.
+/* static */ void HeaderReadWriteUtils::insertCharactersIntoVector(const char *const characters,
+        std::vector<int> *const vector) {
+    for (int i = 0; characters[i]; ++i) {
+        vector->push_back(characters[i]);
+    }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/header/header_read_write_utils.h b/native/jni/src/dictionary/header/header_read_write_utils.h
new file mode 100644
index 000000000..f67d614df
--- /dev/null
+++ b/native/jni/src/dictionary/header/header_read_write_utils.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_HEADER_READ_WRITE_UTILS_H +#define LATINIME_HEADER_READ_WRITE_UTILS_H + +#include + +#include "defines.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/utils/format_utils.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class HeaderReadWriteUtils { + public: + typedef uint16_t DictionaryFlags; + + static int getHeaderSize(const uint8_t *const dictBuf); + + static DictionaryFlags getFlags(const uint8_t *const dictBuf); + + static AK_FORCE_INLINE int getHeaderOptionsPosition() { + return HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE + + HEADER_SIZE_FIELD_SIZE; + } + + static DictionaryFlags createAndGetDictionaryFlagsUsingAttributeMap( + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); + + static void fetchAllHeaderAttributes(const uint8_t *const dictBuf, + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes); + + static const int *readCodePointTable( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes); + + static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer, + const FormatUtils::FORMAT_VERSION version, int *const writingPos); + + static bool writeDictionaryFlags(BufferWithExtendableBuffer *const buffer, + const DictionaryFlags flags, int *const writingPos); + + static bool writeDictionaryHeaderSize(BufferWithExtendableBuffer *const buffer, + const int size, int *const writingPos); + + static bool writeHeaderAttributes(BufferWithExtendableBuffer 
*const buffer, + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + int *const writingPos); + + /** + * Methods for header attributes. + */ + static void setCodePointVectorAttribute( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key, const std::vector &value); + + static void setBoolAttribute( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key, const bool value); + + static void setIntAttribute( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key, const int value); + + static const std::vector readCodePointVectorAttributeValue( + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key); + + static bool readBoolAttributeValue( + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key, const bool defaultValue); + + static int readIntAttributeValue( + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const char *const key, const int defaultValue); + + static void insertCharactersIntoVector(const char *const characters, + DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils); + + static const int LARGEST_INT_DIGIT_COUNT; + static const int MAX_ATTRIBUTE_KEY_LENGTH; + static const int MAX_ATTRIBUTE_VALUE_LENGTH; + + static const int HEADER_MAGIC_NUMBER_SIZE; + static const int HEADER_DICTIONARY_VERSION_SIZE; + static const int HEADER_FLAG_SIZE; + static const int HEADER_SIZE_FIELD_SIZE; + + static const char *const CODE_POINT_TABLE_KEY; + + // Value for the "flags" field. It's unused at the moment. 
+ static const DictionaryFlags NO_FLAGS; + + static void setIntAttributeInner( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key, + const int value); + + static int readIntAttributeValueInner( + const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, + const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key, + const int defaultValue); +}; +} +#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */ diff --git a/native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h b/native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h new file mode 100644 index 000000000..aa0d068aa --- /dev/null +++ b/native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H +#define LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H + +#include "defines.h" + +namespace latinime { + +/* + * This class abstracts structure of bigrams. 
+ */ +class DictionaryBigramsStructurePolicy { + public: + virtual ~DictionaryBigramsStructurePolicy() {} + + virtual void getNextBigram(int *const outBigramPos, int *const outProbability, + bool *const outHasNext, int *const pos) const = 0; + virtual bool skipAllBigrams(int *const pos) const = 0; + + protected: + DictionaryBigramsStructurePolicy() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictionaryBigramsStructurePolicy); +}; +} // namespace latinime +#endif /* LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H */ diff --git a/native/jni/src/dictionary/interface/dictionary_header_structure_policy.h b/native/jni/src/dictionary/interface/dictionary_header_structure_policy.h new file mode 100644 index 000000000..6da390e55 --- /dev/null +++ b/native/jni/src/dictionary/interface/dictionary_header_structure_policy.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H +#define LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H + +#include +#include + +#include "defines.h" + +namespace latinime { + +/* + * This class abstracts structure of dictionaries. + * Implement this policy to support additional dictionaries. 
+ */
+class DictionaryHeaderStructurePolicy {
+ public:
+    // Header attributes as {key, value} pairs; both the key and the value are vectors of int.
+    typedef std::map<std::vector<int>, std::vector<int>> AttributeMap;
+
+    virtual ~DictionaryHeaderStructurePolicy() {}
+
+    virtual int getFormatVersionNumber() const = 0;
+
+    virtual int getSize() const = 0;
+
+    virtual const AttributeMap *getAttributeMap() const = 0;
+
+    virtual bool requiresGermanUmlautProcessing() const = 0;
+
+    virtual float getMultiWordCostMultiplier() const = 0;
+
+    // outValue is an output buffer and outValueSize is passed along with it.
+    // NOTE(review): presumably outValueSize is the capacity of outValue; confirm against the
+    // implementation.
+    virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue,
+            int outValueSize) const = 0;
+
+    virtual bool shouldBoostExactMatches() const = 0;
+
+    virtual const std::vector<int> *getLocale() const = 0;
+
+    virtual bool supportsBeginningOfSentence() const = 0;
+
+ protected:
+    // Only subclasses can construct this interface.
+    DictionaryHeaderStructurePolicy() {}
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(DictionaryHeaderStructurePolicy);
+};
+} // namespace latinime
+#endif /* LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H */
diff --git a/native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h b/native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h
new file mode 100644
index 000000000..40b6c2de1
--- /dev/null
+++ b/native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H +#define LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H + +#include "defines.h" + +namespace latinime { + +/* + * This class abstracts structure of shortcuts. + */ +class DictionaryShortcutsStructurePolicy { + public: + virtual ~DictionaryShortcutsStructurePolicy() {} + + virtual int getStartPos(const int pos) const = 0; + + virtual void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, + int *const pos) const = 0; + + virtual void skipAllShortcuts(int *const pos) const = 0; + + protected: + DictionaryShortcutsStructurePolicy() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictionaryShortcutsStructurePolicy); +}; +} // namespace latinime +#endif /* LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H */ diff --git a/native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h b/native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h new file mode 100644 index 000000000..ace48491d --- /dev/null +++ b/native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef LATINIME_DICTIONARY_STRUCTURE_POLICY_H
+#define LATINIME_DICTIONARY_STRUCTURE_POLICY_H
+
+#include <memory>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/property/word_attributes.h"
+#include "dictionary/property/word_property.h"
+#include "dictionary/utils/binary_dictionary_shortcut_iterator.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class DicNode;
+class DicNodeVector;
+class DictionaryHeaderStructurePolicy;
+class MultiBigramMap;
+class NgramListener;
+class NgramContext;
+class UnigramProperty;
+
+/*
+ * This class abstracts the structure of dictionaries.
+ * Implement this policy to support additional dictionaries.
+ */
+class DictionaryStructureWithBufferPolicy {
+ public:
+    typedef std::unique_ptr<DictionaryStructureWithBufferPolicy> StructurePolicyPtr;
+
+    virtual ~DictionaryStructureWithBufferPolicy() {}
+
+    virtual int getRootPosition() const = 0;
+
+    virtual void createAndGetAllChildDicNodes(const DicNode *const dicNode,
+            DicNodeVector *const childDicNodes) const = 0;
+
+    virtual int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
+            int *const outCodePoints) const = 0;
+
+    virtual int getWordId(const CodePointArrayView wordCodePoints,
+            const bool forceLowerCaseSearch) const = 0;
+
+    virtual const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
+            const int wordId, MultiBigramMap *const multiBigramMap) const = 0;
+
+    // TODO: Remove
+    virtual int getProbability(const int unigramProbability, const int bigramProbability) const = 0;
+
+    virtual int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const = 0;
+
+    virtual void iterateNgramEntries(const WordIdArrayView prevWordIds,
+            NgramListener *const listener) const = 0;
+
+    virtual BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const = 0;
+
+    virtual const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const = 0;
+
+    // Returns whether the update was successful or not.
+    virtual bool addUnigramEntry(const CodePointArrayView wordCodePoints,
+            const UnigramProperty *const unigramProperty) = 0;
+
+    // Returns whether the update was successful or not.
+    virtual bool removeUnigramEntry(const CodePointArrayView wordCodePoints) = 0;
+
+    // Returns whether the update was successful or not.
+    virtual bool addNgramEntry(const NgramProperty *const ngramProperty) = 0;
+
+    // Returns whether the update was successful or not.
+    virtual bool removeNgramEntry(const NgramContext *const ngramContext,
+            const CodePointArrayView wordCodePoints) = 0;
+
+    // Returns whether the update was successful or not.
+    virtual bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+            const CodePointArrayView wordCodePoints, const bool isValidWord,
+            const HistoricalInfo historicalInfo) = 0;
+
+    // Returns whether the flush was successful or not.
+    virtual bool flush(const char *const filePath) = 0;
+
+    // Returns whether the GC and flush were successful or not.
+    virtual bool flushWithGC(const char *const filePath) = 0;
+
+    virtual bool needsToRunGC(const bool mindsBlockByGC) const = 0;
+
+    // Currently, this method is used only for testing. You may want to consider creating a new
+    // dedicated method instead of this one if you want to use it in production.
+    virtual void getProperty(const char *const query, const int queryLength, char *const outResult,
+            const int maxResultLength) = 0;
+
+    virtual const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const = 0;
+
+    // Method to iterate all words in the dictionary.
+    // The returned token has to be used to get the next word. If token is 0, this method newly
+    // starts iterating the dictionary.
+    virtual int getNextWordAndNextToken(const int token, int *const outCodePoints,
+            int *const outCodePointCount) = 0;
+
+    virtual bool isCorrupted() const = 0;
+
+ protected:
+    DictionaryStructureWithBufferPolicy() {}
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(DictionaryStructureWithBufferPolicy);
+};
+} // namespace latinime
+#endif /* LATINIME_DICTIONARY_STRUCTURE_POLICY_H */
diff --git a/native/jni/src/dictionary/interface/ngram_listener.h b/native/jni/src/dictionary/interface/ngram_listener.h
new file mode 100644
index 000000000..2eb5e9fd1
--- /dev/null
+++ b/native/jni/src/dictionary/interface/ngram_listener.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_LISTENER_H
+#define LATINIME_NGRAM_LISTENER_H
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Interface to iterate ngram entries.
+ */
+class NgramListener {
+ public:
+    // ngramProbability is always 0 for v403 decaying dictionary.
+    // TODO: Remove ngramProbability.
+    virtual void onVisitEntry(const int ngramProbability, const int targetWordId) = 0;
+    virtual ~NgramListener() {}
+
+ protected:
+    NgramListener() {}
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(NgramListener);
+
+};
+} // namespace latinime
+#endif /* LATINIME_NGRAM_LISTENER_H */
diff --git a/native/jni/src/dictionary/property/historical_info.h b/native/jni/src/dictionary/property/historical_info.h
new file mode 100644
index 000000000..e5ce1ea25
--- /dev/null
+++ b/native/jni/src/dictionary/property/historical_info.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_HISTORICAL_INFO_H
+#define LATINIME_HISTORICAL_INFO_H
+
+#include "defines.h"
+
+namespace latinime {
+
+class HistoricalInfo {
+ public:
+    // Invalid historical info.
+    HistoricalInfo()
+            : mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0) {}
+
+    HistoricalInfo(const int timestamp, const int level, const int count)
+            : mTimestamp(timestamp), mLevel(level), mCount(count) {}
+
+    bool isValid() const {
+        return mTimestamp != NOT_A_TIMESTAMP;
+    }
+
+    int getTimestamp() const {
+        return mTimestamp;
+    }
+
+    // TODO: Remove
+    int getLevel() const {
+        return mLevel;
+    }
+
+    int getCount() const {
+        return mCount;
+    }
+
+ private:
+    // The default copy constructor is kept so that instances can be stored in std::vector.
+    DISALLOW_ASSIGNMENT_OPERATOR(HistoricalInfo);
+
+    const int mTimestamp;
+    const int mLevel;
+    const int mCount;
+};
+} // namespace latinime
+#endif /* LATINIME_HISTORICAL_INFO_H */
diff --git a/native/jni/src/dictionary/property/ngram_context.cpp b/native/jni/src/dictionary/property/ngram_context.cpp
new file mode 100644
index 000000000..7b9c3eff6
--- /dev/null
+++ b/native/jni/src/dictionary/property/ngram_context.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/property/ngram_context.h"
+
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+NgramContext::NgramContext() : mPrevWordCount(0) {
+    clear();
+}
+
+NgramContext::NgramContext(const NgramContext &ngramContext)
+        : mPrevWordCount(ngramContext.mPrevWordCount) {
+    // Reset all slots first; isValid() reads slot 0 even when mPrevWordCount is 0.
+    clear();
+    for (size_t i = 0; i < mPrevWordCount; ++i) {
+        mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i];
+        memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i],
+                sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
+        mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i];
+    }
+}
+
+NgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
+        const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
+        const size_t prevWordCount)
+        : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) {
+    clear();
+    for (size_t i = 0; i < mPrevWordCount; ++i) {
+        if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) {
+            continue;
+        }
+        memmove(mPrevWordCodePoints[i], prevWordCodePoints[i],
+                sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]);
+        mPrevWordCodePointCount[i] = prevWordCodePointCount[i];
+        mIsBeginningOfSentence[i] = isBeginningOfSentence[i];
+    }
+}
+
+NgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
+        const bool isBeginningOfSentence) : mPrevWordCount(1) {
+    clear();
+    if (!prevWordCodePoints || prevWordCodePointCount < 0
+            || prevWordCodePointCount > MAX_WORD_LENGTH) {
+        return;
+    }
+    memmove(mPrevWordCodePoints[0], prevWordCodePoints,
+            sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
+    mPrevWordCodePointCount[0] = prevWordCodePointCount;
+    mIsBeginningOfSentence[0] = isBeginningOfSentence;
+}
+
+bool NgramContext::isValid() const {
+    return mPrevWordCodePointCount[0] > 0 || mIsBeginningOfSentence[0];
+}
+
+const CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const {
+    if (n <= 0 || n > mPrevWordCount) {
+        return CodePointArrayView();
+    }
+    return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]);
+}
+
+bool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const {
+    if (n <= 0 || n > mPrevWordCount) {
+        return false;
+    }
+    return mIsBeginningOfSentence[n - 1];
+}
+
+/* static */ int NgramContext::getWordId(
+        const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
+        const int *const wordCodePoints, const int wordCodePointCount,
+        const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
+    if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount < 0
+            || wordCodePointCount > MAX_WORD_LENGTH) {
+        return NOT_A_WORD_ID;
+    }
+    int codePoints[MAX_WORD_LENGTH];
+    int codePointCount = wordCodePointCount;
+    memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
+    if (isBeginningOfSentence) {
+        codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount,
+                MAX_WORD_LENGTH);
+        if (codePointCount <= 0) {
+            return NOT_A_WORD_ID;
+        }
+    }
+    const CodePointArrayView codePointArrayView(codePoints, codePointCount);
+    const int wordId = dictStructurePolicy->getWordId(codePointArrayView,
+            false /* forceLowerCaseSearch */);
+    if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) {
+        // Return the id when the word was found or when lower case search is not requested.
+        return wordId;
+    }
+    // Check bigrams for lower-cased previous word if original was not found. Useful for
+    // auto-capitalized words like "The [current_word]".
+    return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */);
+}
+
+void NgramContext::clear() {
+    for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
+        mPrevWordCodePointCount[i] = 0;
+        mIsBeginningOfSentence[i] = false;
+    }
+}
+} // namespace latinime
diff --git a/native/jni/src/dictionary/property/ngram_context.h b/native/jni/src/dictionary/property/ngram_context.h
new file mode 100644
index 000000000..9b36199c9
--- /dev/null
+++ b/native/jni/src/dictionary/property/ngram_context.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_CONTEXT_H
+#define LATINIME_NGRAM_CONTEXT_H
+
+#include <array>
+
+#include "defines.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class DictionaryStructureWithBufferPolicy;
+
+class NgramContext {
+ public:
+    // No prev word information. All slots are cleared, so isValid() returns false.
+    NgramContext();
+    // Copy constructor to use this class with std::vector and use this class as a return value.
+    NgramContext(const NgramContext &ngramContext);
+    // Construct from previous words. Entries with an out-of-range code point count are skipped.
+    NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
+            const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
+            const size_t prevWordCount);
+    // Construct from a previous word. A null word or out-of-range count leaves the slot empty.
+    NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
+            const bool isBeginningOfSentence);
+
+    size_t getPrevWordCount() const {
+        return mPrevWordCount;
+    }
+    bool isValid() const;
+
+    template<size_t N>
+    const WordIdArrayView getPrevWordIds(
+            const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
+            WordIdArray<N> *const prevWordIdBuffer, const bool tryLowerCaseSearch) const {
+        for (size_t i = 0; i < std::min(mPrevWordCount, N); ++i) {
+            prevWordIdBuffer->at(i) = getWordId(dictStructurePolicy, mPrevWordCodePoints[i],
+                    mPrevWordCodePointCount[i], mIsBeginningOfSentence[i], tryLowerCaseSearch);
+        }
+        return WordIdArrayView::fromArray(*prevWordIdBuffer).limit(mPrevWordCount);
+    }
+
+    // n is 1-indexed.
+    const CodePointArrayView getNthPrevWordCodePoints(const size_t n) const;
+    // n is 1-indexed.
+    bool isNthPrevWordBeginningOfSentence(const size_t n) const;
+
+ private:
+    DISALLOW_ASSIGNMENT_OPERATOR(NgramContext);
+
+    static int getWordId(const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
+            const int *const wordCodePoints, const int wordCodePointCount,
+            const bool isBeginningOfSentence, const bool tryLowerCaseSearch);
+    void clear();
+
+    const size_t mPrevWordCount;
+    int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
+    int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+    bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+};
+} // namespace latinime
+#endif // LATINIME_NGRAM_CONTEXT_H
diff --git a/native/jni/src/dictionary/property/ngram_property.h b/native/jni/src/dictionary/property/ngram_property.h
new file mode 100644
index 000000000..5f259ec59
--- /dev/null
+++ b/native/jni/src/dictionary/property/ngram_property.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_PROPERTY_H
+#define LATINIME_NGRAM_PROPERTY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/property/ngram_context.h"
+
+namespace latinime {
+
+class NgramProperty {
+ public:
+    NgramProperty(const NgramContext &ngramContext, const std::vector<int> &&targetCodePoints,
+            const int probability, const HistoricalInfo historicalInfo)
+            : mNgramContext(ngramContext), mTargetCodePoints(std::move(targetCodePoints)),
+              mProbability(probability), mHistoricalInfo(historicalInfo) {}
+
+    const NgramContext *getNgramContext() const {
+        return &mNgramContext;
+    }
+
+    const std::vector<int> *getTargetCodePoints() const {
+        return &mTargetCodePoints;
+    }
+
+    int getProbability() const {
+        return mProbability;
+    }
+
+    const HistoricalInfo getHistoricalInfo() const {
+        return mHistoricalInfo;
+    }
+
+ private:
+    // The default copy constructor is kept so that instances can be stored in std::vector.
+    DISALLOW_DEFAULT_CONSTRUCTOR(NgramProperty);
+    DISALLOW_ASSIGNMENT_OPERATOR(NgramProperty);
+
+    const NgramContext mNgramContext;
+    const std::vector<int> mTargetCodePoints;
+    const int mProbability;
+    const HistoricalInfo mHistoricalInfo;
+};
+} // namespace latinime
+#endif // LATINIME_NGRAM_PROPERTY_H
diff --git a/native/jni/src/dictionary/property/unigram_property.h b/native/jni/src/dictionary/property/unigram_property.h
new file mode 100644
index 000000000..92f61b85d
--- /dev/null
+++ b/native/jni/src/dictionary/property/unigram_property.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_UNIGRAM_PROPERTY_H
+#define LATINIME_UNIGRAM_PROPERTY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+
+namespace latinime {
+
+class UnigramProperty {
+ public:
+    class ShortcutProperty {
+     public:
+        ShortcutProperty(const std::vector<int> &&targetCodePoints, const int probability)
+                : mTargetCodePoints(std::move(targetCodePoints)),
+                  mProbability(probability) {}
+
+        const std::vector<int> *getTargetCodePoints() const {
+            return &mTargetCodePoints;
+        }
+
+        int getProbability() const {
+            return mProbability;
+        }
+
+     private:
+        // The default copy constructor is kept so that instances can be stored in std::vector.
+        DISALLOW_DEFAULT_CONSTRUCTOR(ShortcutProperty);
+
+        const std::vector<int> mTargetCodePoints;
+        const int mProbability;
+    };
+
+    UnigramProperty()
+            : mRepresentsBeginningOfSentence(false), mIsNotAWord(false),
+              mIsBlacklisted(false), mIsPossiblyOffensive(false), mProbability(NOT_A_PROBABILITY),
+              mHistoricalInfo(), mShortcuts() {}
+
+    // In contexts which do not support the Blacklisted flag (v2, v4<403)
+    UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+            const bool isPossiblyOffensive, const int probability,
+            const HistoricalInfo historicalInfo, const std::vector<ShortcutProperty> &&shortcuts)
+            : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+              mIsNotAWord(isNotAWord), mIsBlacklisted(false),
+              mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+              mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {}
+
+    // Without shortcuts, in contexts which do not support the Blacklisted flag (v2, v4<403)
+    UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+            const bool isPossiblyOffensive, const int probability,
+            const HistoricalInfo historicalInfo)
+            : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+              mIsNotAWord(isNotAWord), mIsBlacklisted(false),
+              mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+              mHistoricalInfo(historicalInfo), mShortcuts() {}
+
+    // In contexts which DO support the Blacklisted flag (v403)
+    UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+            const bool isBlacklisted, const bool isPossiblyOffensive, const int probability,
+            const HistoricalInfo historicalInfo, const std::vector<ShortcutProperty> &&shortcuts)
+            : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+              mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted),
+              mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+              mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {}
+
+    // Without shortcuts, in contexts which DO support the Blacklisted flag (v403)
+    UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+            const bool isBlacklisted, const bool isPossiblyOffensive, const int probability,
+            const HistoricalInfo historicalInfo)
+            : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+              mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted),
+              mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+              mHistoricalInfo(historicalInfo), mShortcuts() {}
+
+    bool representsBeginningOfSentence() const {
+        return mRepresentsBeginningOfSentence;
+    }
+
+    bool isNotAWord() const {
+        return mIsNotAWord;
+    }
+
+    bool isPossiblyOffensive() const {
+        return mIsPossiblyOffensive;
+    }
+
+    bool isBlacklisted() const {
+        return mIsBlacklisted;
+    }
+
+    bool hasShortcuts() const {
+        return !mShortcuts.empty();
+    }
+
+    int getProbability() const {
+        return mProbability;
+    }
+
+    const HistoricalInfo getHistoricalInfo() const {
+        return mHistoricalInfo;
+    }
+
+    const std::vector<ShortcutProperty> &getShortcuts() const {
+        return mShortcuts;
+    }
+
+ private:
+    // The default copy constructor is kept so that instances can be used as a return value.
+    DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty);
+
+    const bool mRepresentsBeginningOfSentence;
+    const bool mIsNotAWord;
+    const bool mIsBlacklisted;
+    const bool mIsPossiblyOffensive;
+    const int mProbability;
+    const HistoricalInfo mHistoricalInfo;
+    const std::vector<ShortcutProperty> mShortcuts;
+};
+} // namespace latinime
+#endif // LATINIME_UNIGRAM_PROPERTY_H
diff --git a/native/jni/src/dictionary/property/word_attributes.h b/native/jni/src/dictionary/property/word_attributes.h
new file mode 100644
index 000000000..5351e7d7d
--- /dev/null
+++ b/native/jni/src/dictionary/property/word_attributes.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_WORD_ATTRIBUTES_H
+#define LATINIME_WORD_ATTRIBUTES_H
+
+#include "defines.h"
+
+class WordAttributes {
+ public:
+    // Invalid word attributes.
+    WordAttributes()
+            : mProbability(NOT_A_PROBABILITY), mIsBlacklisted(false), mIsNotAWord(false),
+              mIsPossiblyOffensive(false) {}
+
+    WordAttributes(const int probability, const bool isBlacklisted, const bool isNotAWord,
+            const bool isPossiblyOffensive)
+            : mProbability(probability), mIsBlacklisted(isBlacklisted), mIsNotAWord(isNotAWord),
+              mIsPossiblyOffensive(isPossiblyOffensive) {}
+
+    int getProbability() const {
+        return mProbability;
+    }
+
+    bool isBlacklisted() const {
+        return mIsBlacklisted;
+    }
+
+    bool isNotAWord() const {
+        return mIsNotAWord;
+    }
+
+    // Whether or not a word is possibly offensive.
+    // * Static dictionaries <v202, as well as dynamic dictionaries <v403, will set this based on
+    //   whether or not the probability of the word is zero. (restored text; confirm upstream)
+    // * Static dictionaries >=v203 will set this based on the IS_POSSIBLY_OFFENSIVE PtNode flag.
+    // * Dynamic dictionaries >=v403 will set this based on the IS_POSSIBLY_OFFENSIVE language model
+    //   flag (the PtNode flag IS_BLACKLISTED is ignored and kept as zero)
+    //
+    // See the ::getWordAttributes function for each of these dictionary policies for more details.
+    bool isPossiblyOffensive() const {
+        return mIsPossiblyOffensive;
+    }
+
+ private:
+    DISALLOW_ASSIGNMENT_OPERATOR(WordAttributes);
+
+    int mProbability;
+    bool mIsBlacklisted;
+    bool mIsNotAWord;
+    bool mIsPossiblyOffensive;
+};
+
+// Note: no namespace is opened in this header; WordAttributes is in the global namespace.
+#endif /* LATINIME_WORD_ATTRIBUTES_H */
diff --git a/native/jni/src/dictionary/property/word_property.h b/native/jni/src/dictionary/property/word_property.h
new file mode 100644
index 000000000..3028e020a
--- /dev/null
+++ b/native/jni/src/dictionary/property/word_property.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_WORD_PROPERTY_H
+#define LATINIME_WORD_PROPERTY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/property/unigram_property.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+// This class is used for returning information belonging to a word to java side.
+class WordProperty {
+ public:
+    // Default constructor is used to create an instance that indicates an invalid word.
+    WordProperty()
+            : mCodePoints(), mUnigramProperty(), mNgrams() {}
+
+    WordProperty(const std::vector<int> &&codePoints, const UnigramProperty &unigramProperty,
+            const std::vector<NgramProperty> &ngrams)
+            : mCodePoints(std::move(codePoints)), mUnigramProperty(unigramProperty),
+              mNgrams(ngrams) {}
+
+    const CodePointArrayView getCodePoints() const {
+        return CodePointArrayView(mCodePoints);
+    }
+
+    const UnigramProperty &getUnigramProperty() const {
+        return mUnigramProperty;
+    }
+
+    const std::vector<NgramProperty> &getNgramProperties() const {
+        return mNgrams;
+    }
+
+ private:
+    // The default copy constructor is kept so that instances can be used as a return value.
+    DISALLOW_ASSIGNMENT_OPERATOR(WordProperty);
+
+    const std::vector<int> mCodePoints;
+    const UnigramProperty mUnigramProperty;
+    const std::vector<NgramProperty> mNgrams;
+};
+} // namespace latinime
+#endif // LATINIME_WORD_PROPERTY_H
diff --git a/native/jni/src/dictionary/structure/backward/v402/Readme.txt b/native/jni/src/dictionary/structure/backward/v402/Readme.txt
new file mode 100644
index 000000000..9e29e836c
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/Readme.txt
@@ -0,0 +1 @@
+Files under this directory have been auto generated.
diff --git a/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp
new file mode 100644
index 000000000..60749bce6
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp
@@ -0,0 +1,289 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
+ * Do not edit this file other than updating policy's interface.
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp
+ */
+
+#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
+#include "dictionary/structure/backward/v402/content/bigram_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability,
+        bool *const outHasNext, int *const bigramEntryPos) const {
+    const BigramEntry bigramEntry =
+            mBigramDictContent->getBigramEntryAndAdvancePosition(bigramEntryPos);
+    if (outBigramPos) {
+        // Lookup target PtNode position.
+        *outBigramPos = mTerminalPositionLookupTable->getTerminalPtNodePosition(
+                bigramEntry.getTargetTerminalId());
+    }
+    if (outProbability) {
+        if (bigramEntry.hasHistoricalInfo()) {
+            *outProbability =
+                    ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo(),
+                            mHeaderPolicy);
+        } else {
+            *outProbability = bigramEntry.getProbability();
+        }
+    }
+    if (outHasNext) {
+        *outHasNext = bigramEntry.hasNext();
+    }
+}
+
+bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId,
+        const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) {
+    // 1. The word has no bigrams yet.
+    // 2. The word has bigrams, and there is the target in the list.
+    // 3. The word has bigrams, and there is an invalid entry that can be reclaimed.
+    // 4. The word has bigrams. We have to append new bigram entry to the list.
+    // 5. Same as 4, but the list is the last entry of the content file.
+    if (outAddedNewEntry) {
+        *outAddedNewEntry = false;
+    }
+    const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
+    if (bigramListPos == NOT_A_DICT_POS) {
+        // Case 1. PtNode that doesn't have a bigram list.
+        // Create new bigram list.
+        if (!mBigramDictContent->createNewBigramList(terminalId)) {
+            return false;
+        }
+        const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
+                newTargetTerminalId);
+        const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry,
+                ngramProperty);
+        // Write an entry.
+        const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId);
+        if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) {
+            return false;
+        }
+        if (outAddedNewEntry) {
+            *outAddedNewEntry = true;
+        }
+        return true;
+    }
+
+    int tailEntryPos = NOT_A_DICT_POS;
+    const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos,
+            &tailEntryPos);
+    if (tailEntryPos != NOT_A_DICT_POS || entryPosToUpdate == NOT_A_DICT_POS) {
+        // Case 4, 5.
+        // Add new entry to the bigram list.
+        if (tailEntryPos == NOT_A_DICT_POS) {
+            // Case 4. Create new bigram list.
+            if (!mBigramDictContent->createNewBigramList(terminalId)) {
+                return false;
+            }
+            const int destPos = mBigramDictContent->getBigramListHeadPos(terminalId);
+            // Copy existing bigram list.
+            if (!mBigramDictContent->copyBigramList(bigramListPos, destPos, &tailEntryPos)) {
+                return false;
+            }
+        }
+        // Write new entry at the tail position of the bigram content.
+        const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
+                newTargetTerminalId);
+        const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
+                &newBigramEntry, ngramProperty);
+        if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) {
+            return false;
+        }
+        // Update has next flag of the tail entry.
+        if (!updateHasNextFlag(true /* hasNext */, tailEntryPos)) {
+            return false;
+        }
+        if (outAddedNewEntry) {
+            *outAddedNewEntry = true;
+        }
+        return true;
+    }
+
+    // Case 2. Overwrite the existing entry. Case 3. Reclaim and reuse the existing invalid entry.
+    const BigramEntry originalBigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate);
+    if (!originalBigramEntry.isValid()) {
+        // Case 3. Reuse the existing invalid entry. outAddedNewEntry stays false only when
+        // a valid existing entry is overwritten (case 2).
+        if (outAddedNewEntry) {
+            *outAddedNewEntry = true;
+        }
+    }
+    const BigramEntry updatedBigramEntry =
+            originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId);
+    const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
+            &updatedBigramEntry, ngramProperty);
+    return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate);
+}
+
+bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) {
+    const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
+    if (bigramListPos == NOT_A_DICT_POS) {
+        // Bigram list doesn't exist.
+        return false;
+    }
+    const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos,
+            nullptr /* outTailEntryPos */);
+    if (entryPosToUpdate == NOT_A_DICT_POS) {
+        // Bigram entry doesn't exist.
+        return false;
+    }
+    const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate);
+    if (targetTerminalId != bigramEntry.getTargetTerminalId()) {
+        // Only a reusable invalid entry was found; the bigram entry doesn't exist.
+        return false;
+    }
+    // Remove bigram entry by marking it as invalid entry and overwriting the original entry.
+    const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry();
+    return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPosToUpdate);
+}
+
+bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId,
+        int *const outBigramCount) {
+    const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
+    if (bigramListPos == NOT_A_DICT_POS) {
+        // Bigram list doesn't exist.
+        return true;
+    }
+    bool hasNext = true;
+    int readingPos = bigramListPos;
+    while (hasNext) {
+        const int entryPos = readingPos;
+        const BigramEntry bigramEntry =
+                mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+        hasNext = bigramEntry.hasNext();
+        if (!bigramEntry.isValid()) {
+            continue;
+        }
+        const int targetPtNodePos = mTerminalPositionLookupTable->getTerminalPtNodePosition(
+                bigramEntry.getTargetTerminalId());
+        if (targetPtNodePos == NOT_A_DICT_POS) {
+            // Invalidate bigram entry.
+            const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry();
+            if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
+                return false;
+            }
+        } else if (bigramEntry.hasHistoricalInfo()) {
+            const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
+                    bigramEntry.getHistoricalInfo(), mHeaderPolicy);
+            if (ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy)) {
+                const BigramEntry updatedBigramEntry =
+                        bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo);
+                if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
+                    return false;
+                }
+                *outBigramCount += 1;
+            } else {
+                // Remove entry.
+                const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry();
+                if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
+                    return false;
+                }
+            }
+        } else {
+            *outBigramCount += 1;
+        }
+    }
+    return true;
+}
+
+int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) {
+    const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
+    if (bigramListPos == NOT_A_DICT_POS) {
+        // Bigram list doesn't exist.
+        return 0;
+    }
+    int bigramCount = 0;
+    bool hasNext = true;
+    int readingPos = bigramListPos;
+    while (hasNext) {
+        const BigramEntry bigramEntry =
+                mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+        hasNext = bigramEntry.hasNext();
+        if (bigramEntry.isValid()) {
+            bigramCount++;
+        }
+    }
+    return bigramCount;
+}
+
+int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
+        const int bigramListPos, int *const outTailEntryPos) const {
+    if (outTailEntryPos) {
+        *outTailEntryPos = NOT_A_DICT_POS;
+    }
+    bool hasNext = true;
+    int invalidEntryPos = NOT_A_DICT_POS;
+    int readingPos = bigramListPos;
+    while (hasNext) {
+        const int entryPos = readingPos;
+        const BigramEntry bigramEntry =
+                mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+        hasNext = bigramEntry.hasNext();
+        if (bigramEntry.getTargetTerminalId() == targetTerminalIdToFind) {
+            // Entry with same target is found.
+            return entryPos;
+        } else if (!bigramEntry.isValid()) {
+            // Invalid entry that can be reused is found.
+            invalidEntryPos = entryPos;
+        }
+        if (!hasNext && mBigramDictContent->isContentTailPos(readingPos)) {
+            if (outTailEntryPos) {
+                *outTailEntryPos = entryPos;
+            }
+        }
+    }
+    return invalidEntryPos;
+}
+
+const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom(
+        const BigramEntry *const originalBigramEntry,
+        const NgramProperty *const ngramProperty) const {
+    // TODO: Consolidate historical info and probability.
+    if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
+        const HistoricalInfo &historicalInfoForUpdate = ngramProperty->getHistoricalInfo();
+        const HistoricalInfo updatedHistoricalInfo =
+                ForgettingCurveUtils::createUpdatedHistoricalInfo(
+                        originalBigramEntry->getHistoricalInfo(), ngramProperty->getProbability(),
+                        &historicalInfoForUpdate, mHeaderPolicy);
+        return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo);
+    } else {
+        return originalBigramEntry->updateProbabilityAndGetEntry(ngramProperty->getProbability());
+    }
+}
+
+bool Ver4BigramListPolicy::updateHasNextFlag(const bool hasNext, const int bigramEntryPos) {
+    const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(bigramEntryPos);
+    const BigramEntry updatedBigramEntry = bigramEntry.updateHasNextAndGetEntry(hasNext);
+    return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, bigramEntryPos);
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h
new file mode 100644
index 000000000..58c88ce8a
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!!
+ * DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
+ * Do not edit this file other than updating policy's interface.
+ *
+ * This file was generated from
+ *   suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H
+#define LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_bigrams_structure_policy.h"
+#include "dictionary/structure/backward/v402/content/bigram_entry.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Forward declarations. NgramProperty and HeaderPolicy are declared directly in latinime, so
+// the backward::v402 namespaces are closed and reopened around them. The empty
+// backward::v402 block between them is presumably an artifact of the generator.
+class BigramDictContent;
+} // namespace v402
+} // namespace backward
+class NgramProperty;
+namespace backward {
+namespace v402 {
+} // namespace v402
+} // namespace backward
+class HeaderPolicy;
+namespace backward {
+namespace v402 {
+class TerminalPositionLookupTable;
+
+// Bigram list policy of the backward-compatible v402 dictionary format.
+// Reads and modifies bigram lists held in a BigramDictContent, using a
+// TerminalPositionLookupTable and a HeaderPolicy. None of the three pointers is owned: they
+// are only stored, and this class declares no destructor.
+class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
+ public:
+    Ver4BigramListPolicy(BigramDictContent *const bigramDictContent,
+            const TerminalPositionLookupTable *const terminalPositionLookupTable,
+            const HeaderPolicy *const headerPolicy)
+            : mBigramDictContent(bigramDictContent),
+              mTerminalPositionLookupTable(terminalPositionLookupTable),
+              mHeaderPolicy(headerPolicy) {}
+
+    // Defined in the .cpp file.
+    // NOTE(review): presumably reads the entry at *bigramEntryPos into the out parameters and
+    // advances the position; confirm against the implementation.
+    void getNextBigram(int *const outBigramPos, int *const outProbability,
+            bool *const outHasNext, int *const bigramEntryPos) const;
+
+    bool skipAllBigrams(int *const pos) const {
+        // Do nothing because we don't need to skip bigram lists in ver4 dictionaries.
+        return true;
+    }
+
+    // Defined in the .cpp file.
+    // NOTE(review): presumably adds or updates the bigram from terminalId to
+    // newTargetTerminalId and reports through outAddedNewEntry (may be null) whether a new
+    // entry was created; confirm against the implementation.
+    bool addNewEntry(const int terminalId, const int newTargetTerminalId,
+            const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
+
+    // Invalidates the entry for targetTerminalId in the bigram list of terminalId.
+    // Returns false when the list or the entry does not exist.
+    bool removeEntry(const int terminalId, const int targetTerminalId);
+
+    // Rewrites the bigram list of terminalId in place, invalidating entries that are no longer
+    // useful, and adds the number of surviving entries to *outBigramCount.
+    bool updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId,
+            int *const outBigramCount);
+
+    // Returns the number of valid entries in the bigram list of terminalId (0 if none).
+    // NOTE(review): "Conut" looks like a misspelling of "Count"; renaming would change the
+    // interface used by callers.
+    int getBigramEntryConut(const int terminalId);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy);
+
+    // Finds the entry to overwrite for targetTerminalIdToFind in the list at bigramListPos.
+    int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos,
+            int *const outTailEntryPos) const;
+
+    // Returns originalBigramEntry updated with the probability / historical info of
+    // ngramProperty.
+    const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry,
+            const NgramProperty *const ngramProperty) const;
+
+    // Rewrites the has-next flag of the entry at bigramEntryPos.
+    bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos);
+
+    BigramDictContent *const mBigramDictContent;
+    const TerminalPositionLookupTable *const mTerminalPositionLookupTable;
+    const HeaderPolicy *const mHeaderPolicy;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp
new file mode 100644
index 000000000..7fa85dec2
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/bigram_dict_content.cpp
+ */
+
+#include "dictionary/structure/backward/v402/content/bigram_dict_content.h"
+
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Reads the bigram entry stored at *bigramEntryPos and advances *bigramEntryPos past it.
+// Field order: flags, then either (timestamp, level, count) when mHasHistoricalInfo is set or
+// the probability otherwise, then the target terminal id.
+// When the position is negative or the entry would extend past the buffer tail, logs an error
+// and returns an invalid entry (no next, NOT_A_PROBABILITY, NOT_A_TERMINAL_ID) without
+// advancing the position.
+const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition(
+        int *const bigramEntryPos) const {
+    const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer();
+    const int bigramEntryTailPos = (*bigramEntryPos) + getBigramEntrySize();
+    if (*bigramEntryPos < 0 || bigramEntryTailPos > bigramListBuffer->getTailPosition()) {
+        AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bigramEntryTailPos: %d, "
+                "bufSize: %d", *bigramEntryPos, bigramEntryTailPos,
+                bigramListBuffer->getTailPosition());
+        ASSERT(false);
+        return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
+                Ver4DictConstants::NOT_A_TERMINAL_ID);
+    }
+    const int bigramFlags = bigramListBuffer->readUintAndAdvancePosition(
+            Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, bigramEntryPos);
+    const bool hasNext = (bigramFlags & Ver4DictConstants::BIGRAM_HAS_NEXT_MASK) != 0;
+    int probability = NOT_A_PROBABILITY;
+    int timestamp = NOT_A_TIMESTAMP;
+    int level = 0;
+    int count = 0;
+    if (mHasHistoricalInfo) {
+        timestamp = bigramListBuffer->readUintAndAdvancePosition(
+                Ver4DictConstants::TIME_STAMP_FIELD_SIZE, bigramEntryPos);
+        level = bigramListBuffer->readUintAndAdvancePosition(
+                Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, bigramEntryPos);
+        count = bigramListBuffer->readUintAndAdvancePosition(
+                Ver4DictConstants::WORD_COUNT_FIELD_SIZE, bigramEntryPos);
+    } else {
+        probability = bigramListBuffer->readUintAndAdvancePosition(
+                Ver4DictConstants::PROBABILITY_SIZE, bigramEntryPos);
+    }
+    const int encodedTargetTerminalId = bigramListBuffer->readUintAndAdvancePosition(
+            Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, bigramEntryPos);
+    // The stored "invalid" marker is translated to the in-memory NOT_A_TERMINAL_ID.
+    const int targetTerminalId =
+            (encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ?
+                    Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId;
+    if (mHasHistoricalInfo) {
+        // Hack for better migration.
+        // The returned count is the stored count plus the stored level.
+        count += level;
+        const HistoricalInfo historicalInfo(timestamp, level, count);
+        return BigramEntry(hasNext, probability, &historicalInfo, targetTerminalId);
+    } else {
+        return BigramEntry(hasNext, probability, targetTerminalId);
+    }
+}
+
+// Writes bigramEntryToWrite at *entryWritingPos, using the same field order as the reader, and
+// advances *entryWritingPos past each field that was written.
+// Returns false, after logging, as soon as one field cannot be written.
+bool BigramDictContent::writeBigramEntryAndAdvancePosition(
+        const BigramEntry *const bigramEntryToWrite, int *const entryWritingPos) {
+    BufferWithExtendableBuffer *const bigramListBuffer = getWritableContentBuffer();
+    const int bigramFlags = createAndGetBigramFlags(bigramEntryToWrite->hasNext());
+    if (!bigramListBuffer->writeUintAndAdvancePosition(bigramFlags,
+            Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, entryWritingPos)) {
+        AKLOGE("Cannot write bigram flags. pos: %d, flags: %x", *entryWritingPos, bigramFlags);
+        return false;
+    }
+    if (mHasHistoricalInfo) {
+        const HistoricalInfo *const historicalInfo = bigramEntryToWrite->getHistoricalInfo();
+        if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getTimestamp(),
+                Ver4DictConstants::TIME_STAMP_FIELD_SIZE, entryWritingPos)) {
+            AKLOGE("Cannot write bigram timestamps. pos: %d, timestamp: %d", *entryWritingPos,
+                    historicalInfo->getTimestamp());
+            return false;
+        }
+        if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getLevel(),
+                Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, entryWritingPos)) {
+            AKLOGE("Cannot write bigram level. pos: %d, level: %d", *entryWritingPos,
+                    historicalInfo->getLevel());
+            return false;
+        }
+        if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getCount(),
+                Ver4DictConstants::WORD_COUNT_FIELD_SIZE, entryWritingPos)) {
+            AKLOGE("Cannot write bigram count. pos: %d, count: %d", *entryWritingPos,
+                    historicalInfo->getCount());
+            return false;
+        }
+    } else {
+        if (!bigramListBuffer->writeUintAndAdvancePosition(bigramEntryToWrite->getProbability(),
+                Ver4DictConstants::PROBABILITY_SIZE, entryWritingPos)) {
+            AKLOGE("Cannot write bigram probability. pos: %d, probability: %d", *entryWritingPos,
+                    bigramEntryToWrite->getProbability());
+            return false;
+        }
+    }
+    // The in-memory NOT_A_TERMINAL_ID is translated back to the stored "invalid" marker.
+    const int targetTerminalIdToWrite =
+            (bigramEntryToWrite->getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) ?
+                    Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID :
+                            bigramEntryToWrite->getTargetTerminalId();
+    if (!bigramListBuffer->writeUintAndAdvancePosition(targetTerminalIdToWrite,
+            Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, entryWritingPos)) {
+        AKLOGE("Cannot write bigram target terminal id. pos: %d, target terminal id: %d",
+                *entryWritingPos, bigramEntryToWrite->getTargetTerminalId());
+        return false;
+    }
+    return true;
+}
+
+// Copies every entry of the list starting at bigramListPos to toPos, invalid entries included.
+// *outTailEntryPos receives the position at which the final entry of the copy is written.
+// Returns false, after logging, when an entry cannot be written.
+bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos,
+        int *const outTailEntryPos) {
+    int readingPos = bigramListPos;
+    int writingPos = toPos;
+    bool hasNext = true;
+    while (hasNext) {
+        const BigramEntry bigramEntry = getBigramEntryAndAdvancePosition(&readingPos);
+        hasNext = bigramEntry.hasNext();
+        if (!hasNext) {
+            *outTailEntryPos = writingPos;
+        }
+        if (!writeBigramEntryAndAdvancePosition(&bigramEntry, &writingPos)) {
+            AKLOGE("Cannot write bigram entry to copy. pos: %d", writingPos);
+            return false;
+        }
+    }
+    return true;
+}
+
+// Rebuilds this content from originalBigramDictContent.
+// For each pair in terminalIdMap, `first` is used to look up the list in the original content
+// and `second` is the id under which the rebuilt list is registered here (presumably old and
+// new terminal ids respectively). Lists that end up with no entry are not registered.
+// *outBigramEntryCount is incremented by the number of entries written.
+bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+        const BigramDictContent *const originalBigramDictContent,
+        int *const outBigramEntryCount) {
+    for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
+            it != terminalIdMap->end(); ++it) {
+        const int originalBigramListPos =
+                originalBigramDictContent->getBigramListHeadPos(it->first);
+        if (originalBigramListPos == NOT_A_DICT_POS) {
+            // This terminal does not have a bigram list.
+            continue;
+        }
+        const int bigramListPos = getContentBuffer()->getTailPosition();
+        int bigramEntryCount = 0;
+        // Copy bigram list with GC from original content.
+        if (!runGCBigramList(originalBigramListPos, originalBigramDictContent, bigramListPos,
+                terminalIdMap, &bigramEntryCount)) {
+            AKLOGE("Cannot complete GC for the bigram list. original pos: %d, pos: %d",
+                    originalBigramListPos, bigramListPos);
+            return false;
+        }
+        if (bigramEntryCount == 0) {
+            // All bigram entries are useless. This terminal does not have a bigram list.
+            continue;
+        }
+        *outBigramEntryCount += bigramEntryCount;
+        // Set bigram list position to the lookup table.
+        if (!getUpdatableAddressLookupTable()->set(it->second, bigramListPos)) {
+            AKLOGE("Cannot set bigram list position. terminal id: %d, pos: %d",
+                    it->second, bigramListPos);
+            return false;
+        }
+    }
+    return true;
+}
+
+// Returns whether GC for the bigram list succeeded.
+// Copies the list at bigramListPos in sourceBigramDictContent to toPos in this content,
+// dropping invalid entries and entries whose target is missing from terminalIdMap, and
+// replacing each kept target id with its mapped value. *outEntrycount is incremented once per
+// entry written.
+bool BigramDictContent::runGCBigramList(const int bigramListPos,
+        const BigramDictContent *const sourceBigramDictContent, const int toPos,
+        const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+        int *const outEntrycount) {
+    bool hasNext = true;
+    int readingPos = bigramListPos;
+    int writingPos = toPos;
+    int lastEntryPos = NOT_A_DICT_POS;
+    while (hasNext) {
+        const BigramEntry originalBigramEntry =
+                sourceBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+        hasNext = originalBigramEntry.hasNext();
+        if (originalBigramEntry.getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) {
+            continue;
+        }
+        TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
+                terminalIdMap->find(originalBigramEntry.getTargetTerminalId());
+        if (it == terminalIdMap->end()) {
+            // Target word has been removed.
+            continue;
+        }
+        // Remember the entry only while it still claims a successor: if every later source
+        // entry is dropped, its has-next flag must be cleared after the loop.
+        lastEntryPos = hasNext ? writingPos : NOT_A_DICT_POS;
+        const BigramEntry updatedBigramEntry =
+                originalBigramEntry.updateTargetTerminalIdAndGetEntry(it->second);
+        if (!writeBigramEntryAndAdvancePosition(&updatedBigramEntry, &writingPos)) {
+            AKLOGE("Cannot write bigram entry to run GC. pos: %d", writingPos);
+            return false;
+        }
+        *outEntrycount += 1;
+    }
+    if (lastEntryPos != NOT_A_DICT_POS) {
+        // Update has next flag in the last written entry.
+        const BigramEntry bigramEntry = getBigramEntry(lastEntryPos).updateHasNextAndGetEntry(
+                false /* hasNext */);
+        if (!writeBigramEntry(&bigramEntry, lastEntryPos)) {
+            // NOTE(review): the message prints writingPos although the failed write targeted
+            // lastEntryPos.
+            AKLOGE("Cannot write bigram entry to set hasNext flag after GC. pos: %d", writingPos);
+            return false;
+        }
+    }
+    return true;
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h
new file mode 100644
index 000000000..14f334a12
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/bigram_dict_content.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H
+#define LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/content/bigram_entry.h"
+#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Bigram list storage of the backward-compatible v402 dictionary format, built on
+// SparseTableDictContent. The layout of one entry depends on mHasHistoricalInfo; see
+// getBigramEntrySize().
+class BigramDictContent : public SparseTableDictContent {
+ public:
+    // Forwards dictPath, the three bigram file extensions, isUpdatable and the address table
+    // block/data sizes to SparseTableDictContent.
+    BigramDictContent(const char *const dictPath, const bool hasHistoricalInfo,
+            const bool isUpdatable)
+            : SparseTableDictContent(dictPath,
+                      Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION,
+                      Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION,
+                      Ver4DictConstants::BIGRAM_FILE_EXTENSION, isUpdatable,
+                      Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE,
+                      Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE),
+              mHasHistoricalInfo(hasHistoricalInfo) {}
+
+    // Forwards only the address table block/data sizes; presumably creates content that is
+    // not backed by files.
+    // NOTE(review): this single-argument constructor is not marked explicit.
+    BigramDictContent(const bool hasHistoricalInfo)
+            : SparseTableDictContent(Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE,
+                      Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE),
+              mHasHistoricalInfo(hasHistoricalInfo) {}
+
+    // Reads the entry at bigramEntryPos. The advanced position is discarded.
+    const BigramEntry getBigramEntry(const int bigramEntryPos) const {
+        int readingPos = bigramEntryPos;
+        return getBigramEntryAndAdvancePosition(&readingPos);
+    }
+
+    // Reads the entry at *bigramEntryPos and moves *bigramEntryPos past it.
+    const BigramEntry getBigramEntryAndAdvancePosition(int *const bigramEntryPos) const;
+
+    // Returns head position of bigram list for a PtNode specified by terminalId.
+    // Returns NOT_A_DICT_POS when the address lookup table does not contain terminalId.
+    int getBigramListHeadPos(const int terminalId) const {
+        const SparseTable *const addressLookupTable = getAddressLookupTable();
+        if (!addressLookupTable->contains(terminalId)) {
+            return NOT_A_DICT_POS;
+        }
+        return addressLookupTable->get(terminalId);
+    }
+
+    // Writes the entry at the current tail position of the content buffer.
+    bool writeBigramEntryAtTail(const BigramEntry *const bigramEntryToWrite) {
+        int writingPos = getContentBuffer()->getTailPosition();
+        return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos);
+    }
+
+    // Writes the entry at entryWritingPos. The advanced position is discarded.
+    bool writeBigramEntry(const BigramEntry *const bigramEntryToWrite, const int entryWritingPos) {
+        int writingPos = entryWritingPos;
+        return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos);
+    }
+
+    // Writes the entry at *entryWritingPos and moves *entryWritingPos past the written fields.
+    bool writeBigramEntryAndAdvancePosition(const BigramEntry *const bigramEntryToWrite,
+            int *const entryWritingPos);
+
+    // Registers the current tail position as the list head of terminalId in the address
+    // lookup table. No entry is written by this method itself.
+    bool createNewBigramList(const int terminalId) {
+        const int bigramListPos = getContentBuffer()->getTailPosition();
+        return getUpdatableAddressLookupTable()->set(terminalId, bigramListPos);
+    }
+
+    // Copies the list at bigramListPos to toPos; *outTailEntryPos receives the position of the
+    // final copied entry.
+    bool copyBigramList(const int bigramListPos, const int toPos, int *const outTailEntryPos);
+
+    // Flushes through SparseTableDictContent::flush() with the three bigram file extensions.
+    bool flushToFile(const char *const dictPath) const {
+        return flush(dictPath, Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION,
+                Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION,
+                Ver4DictConstants::BIGRAM_FILE_EXTENSION);
+    }
+
+    // Rebuilds this content from originalBigramDictContent using terminalIdMap and adds the
+    // number of written entries to *outBigramEntryCount.
+    bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+            const BigramDictContent *const originalBigramDictContent,
+            int *const outBigramEntryCount);
+
+    // Returns whether pos equals the tail position of the content buffer.
+    bool isContentTailPos(const int pos) const {
+        return pos == getContentBuffer()->getTailPosition();
+    }
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(BigramDictContent);
+
+    // Returns the flags field value: BIGRAM_HAS_NEXT_MASK when hasNext is set, otherwise 0.
+    int createAndGetBigramFlags(const bool hasNext) const {
+        return hasNext ? Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0;
+    }
+
+    // Returns the size of one stored entry: flags + target terminal id, plus either
+    // timestamp/level/count (with historical info) or the probability (without).
+    int getBigramEntrySize() const {
+        if (mHasHistoricalInfo) {
+            return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE
+                    + Ver4DictConstants::TIME_STAMP_FIELD_SIZE
+                    + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE
+                    + Ver4DictConstants::WORD_COUNT_FIELD_SIZE
+                    + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE;
+        } else {
+            return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE
+                    + Ver4DictConstants::PROBABILITY_SIZE
+                    + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE;
+        }
+    }
+
+    // GC helper for a single list; see runGC().
+    bool runGCBigramList(const int bigramListPos,
+            const BigramDictContent *const sourceBigramDictContent, const int toPos,
+            const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+            int *const outEntryCount);
+
+    // Selects the entry layout (historical info vs. plain probability).
+    bool mHasHistoricalInfo;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h b/native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h
new file mode 100644
index 000000000..36ad855ee
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/bigram_entry.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H
+#define LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Immutable value object describing one entry of a bigram list: the has-next flag, the
+// probability, the historical info and the target terminal id.
+// The update*AndGetEntry() methods never modify the receiver; they return a new entry with one
+// field replaced.
+class BigramEntry {
+ public:
+    // Copies every field of bigramEntry, including its historical info, so that a copy is
+    // indistinguishable from the original even when the compiler does not elide it.
+    BigramEntry(const BigramEntry& bigramEntry)
+            : mHasNext(bigramEntry.mHasNext), mProbability(bigramEntry.mProbability),
+              mHistoricalInfo(bigramEntry.mHistoricalInfo),
+              mTargetTerminalId(bigramEntry.mTargetTerminalId) {}
+
+    // Entry without historical information.
+    // mHistoricalInfo is default-constructed.
+    BigramEntry(const bool hasNext, const int probability, const int targetTerminalId)
+            : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(),
+              mTargetTerminalId(targetTerminalId) {}
+
+    // Entry with historical information.
+    // historicalInfo must not be null; it is copied into the entry.
+    BigramEntry(const bool hasNext, const int probability,
+            const HistoricalInfo *const historicalInfo, const int targetTerminalId)
+            : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(*historicalInfo),
+              mTargetTerminalId(targetTerminalId) {}
+
+    // Returns a copy whose target is NOT_A_TERMINAL_ID, i.e. an entry for which isValid() is
+    // false.
+    const BigramEntry getInvalidatedEntry() const {
+        return updateTargetTerminalIdAndGetEntry(Ver4DictConstants::NOT_A_TERMINAL_ID);
+    }
+
+    // Returns a copy with the has-next flag replaced.
+    const BigramEntry updateHasNextAndGetEntry(const bool hasNext) const {
+        return BigramEntry(hasNext, mProbability, &mHistoricalInfo, mTargetTerminalId);
+    }
+
+    // Returns a copy with the target terminal id replaced.
+    const BigramEntry updateTargetTerminalIdAndGetEntry(const int newTargetTerminalId) const {
+        return BigramEntry(mHasNext, mProbability, &mHistoricalInfo, newTargetTerminalId);
+    }
+
+    // Returns a copy with the probability replaced.
+    const BigramEntry updateProbabilityAndGetEntry(const int probability) const {
+        return BigramEntry(mHasNext, probability, &mHistoricalInfo, mTargetTerminalId);
+    }
+
+    // Returns a copy with the historical info replaced. historicalInfo must not be null.
+    const BigramEntry updateHistoricalInfoAndGetEntry(
+            const HistoricalInfo *const historicalInfo) const {
+        return BigramEntry(mHasNext, mProbability, historicalInfo, mTargetTerminalId);
+    }
+
+    // An entry is valid when its target is not NOT_A_TERMINAL_ID.
+    bool isValid() const {
+        return mTargetTerminalId != Ver4DictConstants::NOT_A_TERMINAL_ID;
+    }
+
+    bool hasNext() const {
+        return mHasNext;
+    }
+
+    int getProbability() const {
+        return mProbability;
+    }
+
+    // Delegates to HistoricalInfo::isValid().
+    bool hasHistoricalInfo() const {
+        return mHistoricalInfo.isValid();
+    }
+
+    // The returned pointer refers to a member of this entry and is only usable while the
+    // entry is alive.
+    const HistoricalInfo *getHistoricalInfo() const {
+        return &mHistoricalInfo;
+    }
+
+    int getTargetTerminalId() const {
+        return mTargetTerminalId;
+    }
+
+ private:
+    // Copy constructor is public to use this class as a type of return value.
+    DISALLOW_DEFAULT_CONSTRUCTOR(BigramEntry);
+    DISALLOW_ASSIGNMENT_OPERATOR(BigramEntry);
+
+    const bool mHasNext;
+    const int mProbability;
+    const HistoricalInfo mHistoricalInfo;
+    const int mTargetTerminalId;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/dict_content.h
new file mode 100644
index 000000000..d3b84fa04
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/dict_content.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/dict_content.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_DICT_CONTENT_H
+#define LATINIME_BACKWARD_V402_DICT_CONTENT_H
+
+#include "defines.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Abstract base class of the v402 dictionary content classes.
+// It only declares a virtual destructor and the pure virtual isValid(); the constructor is
+// protected, so the class can be instantiated through subclasses only.
+class DictContent {
+ public:
+    virtual ~DictContent() {}
+    // Implemented by subclasses to report whether the content is usable.
+    virtual bool isValid() const = 0;
+
+ protected:
+    DictContent() {}
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(DictContent);
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp
new file mode 100644
index 000000000..b167f0ab2
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/probability_dict_content.cpp
+ */
+
+#include "dictionary/structure/backward/v402/content/probability_dict_content.h"
+
+#include "dictionary/structure/backward/v402/content/probability_entry.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Reads the probability entry of terminalId.
+// Field order: flags, probability and, when mHasHistoricalInfo is set, timestamp, level and
+// count. For a terminalId outside [0, mSize) an entry with flags 0 and NOT_A_PROBABILITY is
+// returned instead of reading the buffer.
+const ProbabilityEntry ProbabilityDictContent::getProbabilityEntry(const int terminalId) const {
+    if (terminalId < 0 || terminalId >= mSize) {
+        // This method can be called with invalid terminal id during GC.
+        return ProbabilityEntry(0 /* flags */, NOT_A_PROBABILITY);
+    }
+    const BufferWithExtendableBuffer *const buffer = getBuffer();
+    int entryPos = getEntryPos(terminalId);
+    const int flags = buffer->readUintAndAdvancePosition(
+            Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &entryPos);
+    const int probability = buffer->readUintAndAdvancePosition(
+            Ver4DictConstants::PROBABILITY_SIZE, &entryPos);
+    if (mHasHistoricalInfo) {
+        const int timestamp = buffer->readUintAndAdvancePosition(
+                Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &entryPos);
+        const int level = buffer->readUintAndAdvancePosition(
+                Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &entryPos);
+        const int count = buffer->readUintAndAdvancePosition(
+                Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &entryPos);
+        // Hack for better migration.
+        // The returned count is the stored count plus the stored level.
+        const HistoricalInfo historicalInfo(timestamp, level, count + level);
+        return ProbabilityEntry(flags, probability, &historicalInfo);
+    } else {
+        return ProbabilityEntry(flags, probability);
+    }
+}
+
+// Stores probabilityEntry as the entry of terminalId.
+// Negative ids are rejected. For an id at or beyond mSize, default-constructed entries are
+// written from the current buffer tail up to and including the entry position, mSize becomes
+// terminalId + 1, and the real entry is then written over the last padding entry.
+bool ProbabilityDictContent::setProbabilityEntry(const int terminalId,
+        const ProbabilityEntry *const probabilityEntry) {
+    if (terminalId < 0) {
+        return false;
+    }
+    const int entryPos = getEntryPos(terminalId);
+    if (terminalId >= mSize) {
+        ProbabilityEntry dummyEntry;
+        // Write new entry.
+        int writingPos = getBuffer()->getTailPosition();
+        while (writingPos <= entryPos) {
+            // Fulfilling with dummy entries until writingPos.
+            if (!writeEntry(&dummyEntry, writingPos)) {
+                AKLOGE("Cannot write dummy entry. pos: %d, mSize: %d", writingPos, mSize);
+                return false;
+            }
+            writingPos += getEntrySize();
+        }
+        mSize = terminalId + 1;
+    }
+    return writeEntry(probabilityEntry, entryPos);
+}
+
+// Flushes the content with the FREQ file extension.
+// When the buffer extends beyond the first mSize entries, those mSize entries are first copied
+// into a temporary ProbabilityDictContent and that copy is flushed instead.
+bool ProbabilityDictContent::flushToFile(const char *const dictPath) const {
+    if (getEntryPos(mSize) < getBuffer()->getTailPosition()) {
+        ProbabilityDictContent probabilityDictContentToWrite(mHasHistoricalInfo);
+        for (int i = 0; i < mSize; ++i) {
+            const ProbabilityEntry probabilityEntry = getProbabilityEntry(i);
+            if (!probabilityDictContentToWrite.setProbabilityEntry(i, &probabilityEntry)) {
+                AKLOGE("Cannot set probability entry in flushToFile. terminalId: %d", i);
+                return false;
+            }
+        }
+        return probabilityDictContentToWrite.flush(dictPath,
+                Ver4DictConstants::FREQ_FILE_EXTENSION);
+    } else {
+        return flush(dictPath, Ver4DictConstants::FREQ_FILE_EXTENSION);
+    }
+}
+
+// Rebuilds this content from originalProbabilityDictContent.
+// For each pair in terminalIdMap, the entry read with `first` from the original content is
+// stored here under `second` (presumably old and new terminal ids respectively).
+bool ProbabilityDictContent::runGC(
+        const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+        const ProbabilityDictContent *const originalProbabilityDictContent) {
+    for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
+            it != terminalIdMap->end(); ++it) {
+        const ProbabilityEntry probabilityEntry =
+                originalProbabilityDictContent->getProbabilityEntry(it->first);
+        if (!setProbabilityEntry(it->second, &probabilityEntry)) {
+            AKLOGE("Cannot set probability entry in runGC. terminalId: %d", it->second);
+            return false;
+        }
+    }
+    return true;
+}
+
+// Returns the size of one stored entry: flags + probability, plus timestamp/level/count when
+// mHasHistoricalInfo is set.
+int ProbabilityDictContent::getEntrySize() const {
+    if (mHasHistoricalInfo) {
+        return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
+                + Ver4DictConstants::PROBABILITY_SIZE
+                + Ver4DictConstants::TIME_STAMP_FIELD_SIZE
+                + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE
+                + Ver4DictConstants::WORD_COUNT_FIELD_SIZE;
+    } else {
+        return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
+                + Ver4DictConstants::PROBABILITY_SIZE;
+    }
+}
+
+// Entries are fixed-size and indexed by terminal id, so the position is id * entry size.
+int ProbabilityDictContent::getEntryPos(const int terminalId) const {
+    return terminalId * getEntrySize();
+}
+
+// Writes probabilityEntry at entryPos using the same field order as getProbabilityEntry().
+// Returns false, after logging, as soon as one field cannot be written.
+bool ProbabilityDictContent::writeEntry(const ProbabilityEntry *const probabilityEntry,
+        const int entryPos) {
+    BufferWithExtendableBuffer *const bufferToWrite = getWritableBuffer();
+    int writingPos = entryPos;
+    if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getFlags(),
+            Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) {
+        AKLOGE("Cannot write flags in probability dict content. pos: %d", writingPos);
+        return false;
+    }
+    if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getProbability(),
+            Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) {
+        AKLOGE("Cannot write probability in probability dict content. pos: %d", writingPos);
+        return false;
+    }
+    if (mHasHistoricalInfo) {
+        const HistoricalInfo *const historicalInfo = probabilityEntry->getHistoricalInfo();
+        if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getTimestamp(),
+                Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &writingPos)) {
+            AKLOGE("Cannot write timestamp in probability dict content. pos: %d", writingPos);
+            return false;
+        }
+        if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getLevel(),
+                Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &writingPos)) {
+            AKLOGE("Cannot write level in probability dict content. pos: %d", writingPos);
+            return false;
+        }
+        if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getCount(),
+                Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &writingPos)) {
+            AKLOGE("Cannot write count in probability dict content. pos: %d", writingPos);
+            return false;
+        }
+    }
+    return true;
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h
new file mode 100644
index 000000000..464b29f3f
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/content/probability_dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H + +#include "defines.h" +#include "dictionary/structure/backward/v402/content/single_dict_content.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class ProbabilityEntry; + +class ProbabilityDictContent : public SingleDictContent { + public: + ProbabilityDictContent(const char *const dictPath, const bool hasHistoricalInfo, + const bool isUpdatable) + : SingleDictContent(dictPath, Ver4DictConstants::FREQ_FILE_EXTENSION, isUpdatable), + mHasHistoricalInfo(hasHistoricalInfo), + mSize(getBuffer()->getTailPosition() / getEntrySize()) {} + + ProbabilityDictContent(const bool hasHistoricalInfo) + : mHasHistoricalInfo(hasHistoricalInfo), mSize(0) {} + + const ProbabilityEntry getProbabilityEntry(const int terminalId) const; + + bool setProbabilityEntry(const int terminalId, const ProbabilityEntry *const probabilityEntry); + + bool flushToFile(const char *const dictPath) const; + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ProbabilityDictContent *const originalProbabilityDictContent); + + private: + 
DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent); + + int getEntrySize() const; + + int getEntryPos(const int terminalId) const; + + bool writeEntry(const ProbabilityEntry *const probabilityEntry, const int entryPos); + + bool mHasHistoricalInfo; + int mSize; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H */ diff --git a/native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h b/native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h new file mode 100644 index 000000000..94e36bf51 --- /dev/null +++ b/native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! 
+ * + * This file was generated from + * dictionary/structure/v4/content/probability_entry.h + */ + +#ifndef LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H +#define LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H + +#include "defines.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class ProbabilityEntry { + public: + ProbabilityEntry(const ProbabilityEntry &probabilityEntry) + : mFlags(probabilityEntry.mFlags), mProbability(probabilityEntry.mProbability), + mHistoricalInfo(probabilityEntry.mHistoricalInfo) {} + + // Dummy entry + ProbabilityEntry() + : mFlags(0), mProbability(NOT_A_PROBABILITY), mHistoricalInfo() {} + + // Entry without historical information + ProbabilityEntry(const int flags, const int probability) + : mFlags(flags), mProbability(probability), mHistoricalInfo() {} + + // Entry with historical information. + ProbabilityEntry(const int flags, const int probability, + const HistoricalInfo *const historicalInfo) + : mFlags(flags), mProbability(probability), mHistoricalInfo(*historicalInfo) {} + + const ProbabilityEntry createEntryWithUpdatedProbability(const int probability) const { + return ProbabilityEntry(mFlags, probability, &mHistoricalInfo); + } + + const ProbabilityEntry createEntryWithUpdatedHistoricalInfo( + const HistoricalInfo *const historicalInfo) const { + return ProbabilityEntry(mFlags, mProbability, historicalInfo); + } + + bool hasHistoricalInfo() const { + return mHistoricalInfo.isValid(); + } + + int getFlags() const { + return mFlags; + } + + int getProbability() const { + return mProbability; + } + + const HistoricalInfo *getHistoricalInfo() const { + return &mHistoricalInfo; + } + + private: + // Copy constructor is public to use this class as a type of return value. 
+ DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry); + + const int mFlags; + const int mProbability; + const HistoricalInfo mHistoricalInfo; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H */ diff --git a/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp new file mode 100644 index 000000000..e538a02a1 --- /dev/null +++ b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! 
+ * + * This file was generated from + * dictionary/structure/v4/content/shortcut_dict_content.cpp + */ + +#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h" + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoint, int *const outCodePointCount, int *const outProbability, + bool *const outhasNext, int *const shortcutEntryPos) const { + const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer(); + if (*shortcutEntryPos < 0 || *shortcutEntryPos >= shortcutListBuffer->getTailPosition()) { + AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d", + *shortcutEntryPos, shortcutListBuffer->getTailPosition()); + ASSERT(false); + if (outhasNext) { + *outhasNext = false; + } + if (outCodePointCount) { + *outCodePointCount = 0; + } + return; + } + + const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); + if (outProbability) { + *outProbability = shortcutFlags & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK; + } + if (outhasNext) { + *outhasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; + } + if (outCodePoint && outCodePointCount) { + shortcutListBuffer->readCodePointsAndAdvancePosition( + maxCodePointCount, outCodePoint, outCodePointCount, shortcutEntryPos); + } +} + +int ShortcutDictContent::getShortcutListHeadPos(const int terminalId) const { + const SparseTable *const addressLookupTable = getAddressLookupTable(); + if (!addressLookupTable->contains(terminalId)) { + return NOT_A_DICT_POS; + } + return addressLookupTable->get(terminalId); +} + +bool ShortcutDictContent::flushToFile(const char *const dictPath) const { + return flush(dictPath, Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION, + 
Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION, + Ver4DictConstants::SHORTCUT_FILE_EXTENSION); +} + +bool ShortcutDictContent::runGC( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ShortcutDictContent *const originalShortcutDictContent) { + for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); + it != terminalIdMap->end(); ++it) { + const int originalShortcutListPos = + originalShortcutDictContent->getShortcutListHeadPos(it->first); + if (originalShortcutListPos == NOT_A_DICT_POS) { + continue; + } + const int shortcutListPos = getContentBuffer()->getTailPosition(); + // Copy shortcut list from original content. + if (!copyShortcutListFromDictContent(originalShortcutListPos, originalShortcutDictContent, + shortcutListPos)) { + AKLOGE("Cannot copy shortcut list during GC. original pos: %d, pos: %d", + originalShortcutListPos, shortcutListPos); + return false; + } + // Set shortcut list position to the lookup table. + if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) { + AKLOGE("Cannot set shortcut list position. 
terminal id: %d, pos: %d", + it->second, shortcutListPos); + return false; + } + } + return true; +} + +bool ShortcutDictContent::createNewShortcutList(const int terminalId) { + const int shortcutListListPos = getContentBuffer()->getTailPosition(); + return getUpdatableAddressLookupTable()->set(terminalId, shortcutListListPos); +} + +bool ShortcutDictContent::copyShortcutList(const int shortcutListPos, const int toPos) { + return copyShortcutListFromDictContent(shortcutListPos, this, toPos); +} + +bool ShortcutDictContent::copyShortcutListFromDictContent(const int shortcutListPos, + const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) { + bool hasNext = true; + int readingPos = shortcutListPos; + int writingPos = toPos; + int codePoints[MAX_WORD_LENGTH]; + while (hasNext) { + int probability = 0; + int codePointCount = 0; + sourceShortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, + codePoints, &codePointCount, &probability, &hasNext, &readingPos); + if (!writeShortcutEntryAndAdvancePosition(codePoints, codePointCount, probability, + hasNext, &writingPos)) { + AKLOGE("Cannot write shortcut entry to copy. 
pos: %d", writingPos); + return false; + } + } + return true; +} + +bool ShortcutDictContent::setProbability(const int probability, const int shortcutEntryPos) { + BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); + const int shortcutFlags = shortcutListBuffer->readUint( + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); + const bool hasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; + const int shortcutFlagsToWrite = createAndGetShortcutFlags(probability, hasNext); + return shortcutListBuffer->writeUint(shortcutFlagsToWrite, + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); +} + +bool ShortcutDictContent::writeShortcutEntryAndAdvancePosition(const int *const codePoint, + const int codePointCount, const int probability, const bool hasNext, + int *const shortcutEntryPos) { + BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); + const int shortcutFlags = createAndGetShortcutFlags(probability, hasNext); + if (!shortcutListBuffer->writeUintAndAdvancePosition(shortcutFlags, + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos)) { + AKLOGE("Cannot write shortcut flags. flags; %x, pos: %d", shortcutFlags, *shortcutEntryPos); + return false; + } + if (!shortcutListBuffer->writeCodePointsAndAdvancePosition(codePoint, codePointCount, + true /* writesTerminator */, shortcutEntryPos)) { + AKLOGE("Cannot write shortcut target code points. pos: %d", *shortcutEntryPos); + return false; + } + return true; +} + +// Find a shortcut entry that has specified target and return its position. 
+int ShortcutDictContent::findShortcutEntryAndGetPos(const int shortcutListPos, + const int *const targetCodePointsToFind, const int codePointCount) const { + bool hasNext = true; + int readingPos = shortcutListPos; + int targetCodePoints[MAX_WORD_LENGTH]; + while (hasNext) { + const int entryPos = readingPos; + int probability = 0; + int targetCodePointCount = 0; + getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, targetCodePoints, &targetCodePointCount, + &probability, &hasNext, &readingPos); + if (targetCodePointCount != codePointCount) { + continue; + } + bool matched = true; + for (int i = 0; i < codePointCount; ++i) { + if (targetCodePointsToFind[i] != targetCodePoints[i]) { + matched = false; + break; + } + } + if (matched) { + return entryPos; + } + } + return NOT_A_DICT_POS; +} + +int ShortcutDictContent::createAndGetShortcutFlags(const int probability, + const bool hasNext) const { + return (probability & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK) + | (hasNext ? Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK : 0); +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h new file mode 100644 index 000000000..3b725e896 --- /dev/null +++ b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/content/shortcut_dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H + +#include "defines.h" +#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class ShortcutDictContent : public SparseTableDictContent { + public: + ShortcutDictContent(const char *const dictPath, const bool isUpdatable) + : SparseTableDictContent(dictPath, + Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION, + Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION, + Ver4DictConstants::SHORTCUT_FILE_EXTENSION, isUpdatable, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} + + ShortcutDictContent() + : SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} + + void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, int *const outProbability, bool *const outhasNext, + const int shortcutEntryPos) { + int readingPos = shortcutEntryPos; + return getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint, + outCodePointCount, outProbability, outhasNext, &readingPos); + } + + void getShortcutEntryAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoint, int *const outCodePointCount, int *const outProbability, + bool *const outhasNext, int *const shortcutEntryPos) const; + + // Returns head position of shortcut list for a PtNode 
specified by terminalId. + int getShortcutListHeadPos(const int terminalId) const; + + bool flushToFile(const char *const dictPath) const; + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ShortcutDictContent *const originalShortcutDictContent); + + bool createNewShortcutList(const int terminalId); + + bool copyShortcutList(const int shortcutListPos, const int toPos); + + bool setProbability(const int probability, const int shortcutEntryPos); + + bool writeShortcutEntry(const int *const codePoint, const int codePointCount, + const int probability, const bool hasNext, const int shortcutEntryPos) { + int writingPos = shortcutEntryPos; + return writeShortcutEntryAndAdvancePosition(codePoint, codePointCount, probability, + hasNext, &writingPos); + } + + bool writeShortcutEntryAndAdvancePosition(const int *const codePoint, + const int codePointCount, const int probability, const bool hasNext, + int *const shortcutEntryPos); + + int findShortcutEntryAndGetPos(const int shortcutListPos, + const int *const targetCodePointsToFind, const int codePointCount) const; + + private: + DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent); + + bool copyShortcutListFromDictContent(const int shortcutListPos, + const ShortcutDictContent *const sourceShortcutDictContent, const int toPos); + + int createAndGetShortcutFlags(const int probability, const bool hasNext) const; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H */ diff --git a/native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h new file mode 100644 index 000000000..89df2a1e0 --- /dev/null +++ b/native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 
(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/content/single_dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H + +#include "defines.h" +#include "dictionary/structure/backward/v402/content/dict_content.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/mmapped_buffer.h" +#include "utils/byte_array_view.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class SingleDictContent : public DictContent { + public: + SingleDictContent(const char *const dictPath, const char *const contentFileName, + const bool isUpdatable) + : mMmappedBuffer(MmappedBuffer::openBuffer(dictPath, contentFileName, isUpdatable)), + mExpandableContentBuffer( + mMmappedBuffer ? 
mMmappedBuffer->getReadWriteByteArrayView() : + ReadWriteByteArrayView(), + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mIsValid(mMmappedBuffer) {} + + SingleDictContent() + : mMmappedBuffer(nullptr), + mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), mIsValid(true) {} + + virtual ~SingleDictContent() {} + + virtual bool isValid() const { + return mIsValid; + } + + bool isNearSizeLimit() const { + return mExpandableContentBuffer.isNearSizeLimit(); + } + + protected: + BufferWithExtendableBuffer *getWritableBuffer() { + return &mExpandableContentBuffer; + } + + const BufferWithExtendableBuffer *getBuffer() const { + return &mExpandableContentBuffer; + } + + bool flush(const char *const dictPath, const char *const contentFileNameSuffix) const { + return DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, + contentFileNameSuffix, &mExpandableContentBuffer); + } + + private: + DISALLOW_COPY_AND_ASSIGN(SingleDictContent); + + const MmappedBuffer::MmappedBufferPtr mMmappedBuffer; + BufferWithExtendableBuffer mExpandableContentBuffer; + const bool mIsValid; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H */ diff --git a/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp new file mode 100644 index 000000000..280f0f85a --- /dev/null +++ b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/content/sparse_table_dict_content.cpp + */ + +#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h" + +namespace latinime { +namespace backward { +namespace v402 { + +bool SparseTableDictContent::flush(const char *const dictPath, + const char *const lookupTableFileNameSuffix, const char *const addressTableFileNameSuffix, + const char *const contentFileNameSuffix) const { + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, lookupTableFileNameSuffix, + &mExpandableLookupTableBuffer)){ + return false; + } + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, addressTableFileNameSuffix, + &mExpandableAddressTableBuffer)) { + return false; + } + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, contentFileNameSuffix, + &mExpandableContentBuffer)) { + return false; + } + return true; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h new file mode 100644 index 000000000..4b5af87ad --- /dev/null +++ b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/content/sparse_table_dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H + +#include "defines.h" +#include "dictionary/structure/backward/v402/content/dict_content.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/mmapped_buffer.h" +#include "dictionary/utils/sparse_table.h" +#include "utils/byte_array_view.h" + +namespace latinime { +namespace backward { +namespace v402 { + +// TODO: Support multiple contents. +class SparseTableDictContent : public DictContent { + public: + AK_FORCE_INLINE SparseTableDictContent(const char *const dictPath, + const char *const lookupTableFileName, const char *const addressTableFileName, + const char *const contentFileName, const bool isUpdatable, + const int sparseTableBlockSize, const int sparseTableDataSize) + : mLookupTableBuffer( + MmappedBuffer::openBuffer(dictPath, lookupTableFileName, isUpdatable)), + mAddressTableBuffer( + MmappedBuffer::openBuffer(dictPath, addressTableFileName, isUpdatable)), + mContentBuffer( + MmappedBuffer::openBuffer(dictPath, contentFileName, isUpdatable)), + mExpandableLookupTableBuffer( + mLookupTableBuffer ? 
mLookupTableBuffer->getReadWriteByteArrayView() : + ReadWriteByteArrayView(), + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mExpandableAddressTableBuffer( + mAddressTableBuffer ? mAddressTableBuffer->getReadWriteByteArrayView() : + ReadWriteByteArrayView(), + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mExpandableContentBuffer( + mContentBuffer ? mContentBuffer->getReadWriteByteArrayView() : + ReadWriteByteArrayView(), + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer, + sparseTableBlockSize, sparseTableDataSize), + mIsValid(mLookupTableBuffer && mAddressTableBuffer && mContentBuffer) {} + + SparseTableDictContent(const int sparseTableBlockSize, const int sparseTableDataSize) + : mLookupTableBuffer(), mAddressTableBuffer(), mContentBuffer(), + mExpandableLookupTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mExpandableAddressTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer, + sparseTableBlockSize, sparseTableDataSize), mIsValid(true) {} + + virtual ~SparseTableDictContent() {} + + virtual bool isValid() const { + return mIsValid; + } + + bool isNearSizeLimit() const { + return mExpandableLookupTableBuffer.isNearSizeLimit() + || mExpandableAddressTableBuffer.isNearSizeLimit() + || mExpandableContentBuffer.isNearSizeLimit(); + } + + protected: + SparseTable *getUpdatableAddressLookupTable() { + return &mAddressLookupTable; + } + + const SparseTable *getAddressLookupTable() const { + return &mAddressLookupTable; + } + + BufferWithExtendableBuffer *getWritableContentBuffer() { + return &mExpandableContentBuffer; + } + + const BufferWithExtendableBuffer *getContentBuffer() const { + return &mExpandableContentBuffer; + } + + bool flush(const char *const dictDirPath, 
const char *const lookupTableFileName, + const char *const addressTableFileName, const char *const contentFileName) const; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTableDictContent); + + const MmappedBuffer::MmappedBufferPtr mLookupTableBuffer; + const MmappedBuffer::MmappedBufferPtr mAddressTableBuffer; + const MmappedBuffer::MmappedBufferPtr mContentBuffer; + BufferWithExtendableBuffer mExpandableLookupTableBuffer; + BufferWithExtendableBuffer mExpandableAddressTableBuffer; + BufferWithExtendableBuffer mExpandableContentBuffer; + SparseTable mAddressLookupTable; + const bool mIsValid; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H */ diff --git a/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp new file mode 100644 index 000000000..30b72bbd1 --- /dev/null +++ b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! 
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/terminal_position_lookup_table.cpp
+ */
+
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Returns the PtNode position stored for terminalId. Yields NOT_A_DICT_POS when terminalId is
+// outside [0, mSize) or when the stored value is NOT_A_TERMINAL_ADDRESS.
+int TerminalPositionLookupTable::getTerminalPtNodePosition(const int terminalId) const {
+    if (terminalId < 0 || terminalId >= mSize) {
+        return NOT_A_DICT_POS;
+    }
+    const int terminalPos = getBuffer()->readUint(
+            Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId));
+    return (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) ?
+            NOT_A_DICT_POS : terminalPos;
+}
+
+// Stores terminalPtNodePos for terminalId, growing the table with NOT_A_TERMINAL_ADDRESS
+// entries until terminalId is addressable. NOT_A_DICT_POS is stored as NOT_A_TERMINAL_ADDRESS.
+// Returns false for a negative terminalId or when any buffer write fails.
+bool TerminalPositionLookupTable::setTerminalPtNodePosition(
+        const int terminalId, const int terminalPtNodePos) {
+    if (terminalId < 0) {
+        // This function returns bool. The previous "return NOT_A_DICT_POS;" converted the
+        // (presumably nonzero) sentinel to true and reported an invalid id as a success.
+        return false;
+    }
+    while (terminalId >= mSize) {
+        // Write new entry.
+        if (!getWritableBuffer()->writeUint(Ver4DictConstants::NOT_A_TERMINAL_ADDRESS,
+                Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(mSize))) {
+            return false;
+        }
+        mSize++;
+    }
+    const int terminalPos = (terminalPtNodePos != NOT_A_DICT_POS) ?
+            terminalPtNodePos : Ver4DictConstants::NOT_A_TERMINAL_ADDRESS;
+    return getWritableBuffer()->writeUint(terminalPos,
+            Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId));
+}
+
+// Writes the table to dictPath + TERMINAL_ADDRESS_TABLE_FILE_EXTENSION. Returns false when an
+// entry cannot be copied or the flush fails.
+bool TerminalPositionLookupTable::flushToFile(const char *const dictPath) const {
+    // If the used buffer size is smaller than the actual buffer size, regenerate the lookup
+    // table and write the new table to the file.
+    if (getEntryPos(mSize) < getBuffer()->getTailPosition()) {
+        TerminalPositionLookupTable lookupTableToWrite;
+        for (int i = 0; i < mSize; ++i) {
+            const int terminalPtNodePosition = getTerminalPtNodePosition(i);
+            if (!lookupTableToWrite.setTerminalPtNodePosition(i, terminalPtNodePosition)) {
+                AKLOGE("Cannot set terminal position to lookupTableToWrite."
+                        " terminalId: %d, position: %d", i, terminalPtNodePosition);
+                return false;
+            }
+        }
+        return lookupTableToWrite.flush(dictPath,
+                Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
+    } else {
+        // We can simply use this lookup table because the buffer size has not been
+        // changed.
+        return flush(dictPath, Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
+    }
+}
+
+// Compacts the table in place: every entry that is not NOT_A_TERMINAL_ADDRESS is moved to the
+// next free slot, and the old id -> new id pair is recorded in terminalIdMap. mSize becomes the
+// number of surviving entries. Returns false when a buffer write fails.
+bool TerminalPositionLookupTable::runGCTerminalIds(TerminalIdMap *const terminalIdMap) {
+    int removedEntryCount = 0;
+    int nextNewTerminalId = 0;
+    for (int i = 0; i < mSize; ++i) {
+        const int terminalPos = getBuffer()->readUint(
+                Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(i));
+        if (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) {
+            // This entry is a garbage.
+            removedEntryCount++;
+        } else {
+            // Give a new terminal id to the entry.
+            if (!getWritableBuffer()->writeUint(terminalPos,
+                    Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE,
+                    getEntryPos(nextNewTerminalId))) {
+                return false;
+            }
+            // Memorize the mapping to the old terminal id to the new terminal id.
+            terminalIdMap->insert(TerminalIdMap::value_type(i, nextNewTerminalId));
+            nextNewTerminalId++;
+        }
+    }
+    mSize = nextNewTerminalId;
+    return true;
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h
new file mode 100644
index 000000000..641c7496f
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/terminal_position_lookup_table.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H
+#define LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H
+
+#include <unordered_map>
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/content/single_dict_content.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Table of fixed-size entries (TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE bytes each), indexed by
+// terminal id, each holding the position of the corresponding terminal PtNode.
+class TerminalPositionLookupTable : public SingleDictContent {
+ public:
+    // Old terminal id -> new terminal id, filled in by runGCTerminalIds().
+    typedef std::unordered_map<int, int> TerminalIdMap;
+
+    // Opens the table stored at dictPath + TERMINAL_ADDRESS_TABLE_FILE_EXTENSION. The entry
+    // count is derived from the tail position of the opened buffer.
+    TerminalPositionLookupTable(const char *const dictPath, const bool isUpdatable)
+            : SingleDictContent(dictPath,
+                      Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION, isUpdatable),
+              mSize(getBuffer()->getTailPosition()
+                      / Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE) {}
+
+    // Creates an empty table.
+    TerminalPositionLookupTable() : mSize(0) {}
+
+    int getTerminalPtNodePosition(const int terminalId) const;
+
+    bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos);
+
+    // The id following the last entry, i.e. the current number of entries.
+    int getNextTerminalId() const {
+        return mSize;
+    }
+
+    bool flushToFile(const char *const dictPath) const;
+
+    bool runGCTerminalIds(TerminalIdMap *const terminalIdMap);
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable);
+
+    // Byte offset of the entry for terminalId inside the buffer.
+    int getEntryPos(const int terminalId) const {
+        return terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
+    }
+
+    int mSize;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif // LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H
diff --git a/native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h
new file mode 100644
index 000000000..8cda8c5cf
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
+ * Do not edit this file other than updating policy's interface.
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H
+#define LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
+#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Shortcut policy that reads and writes shortcut lists through a ShortcutDictContent.
+class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
+ public:
+    // terminalPositionLookupTable is accepted but not stored or used by this class.
+    Ver4ShortcutListPolicy(ShortcutDictContent *const shortcutDictContent,
+            const TerminalPositionLookupTable *const terminalPositionLookupTable)
+            : mShortcutDictContent(shortcutDictContent) {}
+
+    ~Ver4ShortcutListPolicy() {}
+
+    int getStartPos(const int pos) const {
+        // The first shortcut entry is located at the head position of the shortcut list.
+        return pos;
+    }
+
+    // Reads the entry at *pos and advances *pos. outIsWhitelist may be null; it is written only
+    // when non-null.
+    void getNextShortcut(const int maxCodePointCount, int *const outCodePoint,
+            int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext,
+            int *const pos) const {
+        int probability = 0;
+        mShortcutDictContent->getShortcutEntryAndAdvancePosition(maxCodePointCount,
+                outCodePoint, outCodePointCount, &probability, outHasNext, pos);
+        if (outIsWhitelist) {
+            *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(probability);
+        }
+    }
+
+    void skipAllShortcuts(int *const pos) const {
+        // Do nothing because we don't need to skip shortcut lists in ver4 dictionaries.
+    }
+
+    // Adds the shortcut (codePoints, probability) to the list of terminalId. Three cases:
+    //  - no list yet: a new list holding only this entry is created;
+    //  - list exists without this target: a new list is created with this entry first
+    //    (hasNext = true), followed by a copy of the old list;
+    //  - target already present: the entry is overwritten in place, keeping its hasNext flag.
+    // Returns false when any creation or write step fails.
+    bool addNewShortcut(const int terminalId, const int *const codePoints, const int codePointCount,
+            const int probability) {
+        const int shortcutListPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+        if (shortcutListPos == NOT_A_DICT_POS) {
+            // Create shortcut list.
+            if (!mShortcutDictContent->createNewShortcutList(terminalId)) {
+                AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId);
+                return false;
+            }
+            const int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+            return mShortcutDictContent->writeShortcutEntry(codePoints, codePointCount, probability,
+                    false /* hasNext */, writingPos);
+        }
+        const int entryPos = mShortcutDictContent->findShortcutEntryAndGetPos(shortcutListPos,
+                codePoints, codePointCount);
+        if (entryPos == NOT_A_DICT_POS) {
+            // Add new entry to the shortcut list.
+            // Create new shortcut list.
+            if (!mShortcutDictContent->createNewShortcutList(terminalId)) {
+                AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId);
+                return false;
+            }
+            int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+            if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints,
+                    codePointCount, probability, true /* hasNext */, &writingPos)) {
+                AKLOGE("Cannot write shortcut entry. terminal id: %d, pos: %d", terminalId,
+                        writingPos);
+                return false;
+            }
+            return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos);
+        }
+        // Overwrite existing entry.
+        // Only the hasNext flag of the existing entry is needed, so the other out-parameters
+        // are passed as null pointers (nullptr instead of the literal 0 used before).
+        bool hasNext = false;
+        mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, nullptr /* outCodePoint */,
+                nullptr /* outCodePointCount */ , nullptr /* probability */, &hasNext, entryPos);
+        if (!mShortcutDictContent->writeShortcutEntry(codePoints,
+                codePointCount, probability, hasNext, entryPos)) {
+            AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId,
+                    entryPos);
+            return false;
+        }
+        return true;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4ShortcutListPolicy);
+
+    ShortcutDictContent *const mShortcutDictContent;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif // LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp
new file mode 100644
index 000000000..4a9704f4d
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_dict_buffers.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+
+#include <cerrno>
+#include <cstring>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/file_utils.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Opens the ver4 buffers that belong to an already opened header buffer. Returns a null
+// pointer when headerBuffer is null. Updatability is taken from the header buffer.
+/* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers(
+        const char *const dictPath, MmappedBuffer::MmappedBufferPtr headerBuffer,
+        const FormatUtils::FORMAT_VERSION formatVersion) {
+    if (!headerBuffer) {
+        ASSERT(false);
+        AKLOGE("The header buffer must be valid to open ver4 dict buffers.");
+        return Ver4DictBuffersPtr(nullptr);
+    }
+    // TODO: take only dictDirPath, and open both header and trie files in the constructor below
+    // Read the flag before headerBuffer is moved from.
+    const bool isUpdatable = headerBuffer->isUpdatable();
+    return Ver4DictBuffersPtr(new Ver4DictBuffers(dictPath, std::move(headerBuffer), isUpdatable,
+            formatVersion));
+}
+
+// Writes the given header buffer and every dictionary content into a fresh temporary
+// directory, then removes dictDirPath and renames the temporary directory over it.
+// Returns false as soon as one step fails; on failure the existing dictDirPath has not been
+// removed unless the failing step is the final rename.
+bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath,
+        const BufferWithExtendableBuffer *const headerBuffer) const {
+    // Create temporary directory.
+    const int tmpDirPathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictDirPath,
+            DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE);
+    char tmpDirPath[tmpDirPathBufSize];
+    FileUtils::getFilePathWithSuffix(dictDirPath,
+            DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE, tmpDirPathBufSize,
+            tmpDirPath);
+    if (FileUtils::existsDir(tmpDirPath)) {
+        if (!FileUtils::removeDirAndFiles(tmpDirPath)) {
+            AKLOGE("Existing directory %s cannot be removed.", tmpDirPath);
+            ASSERT(false);
+            return false;
+        }
+    }
+    // NOTE(review): umask() changes the mask of the whole process and is not restored here.
+    umask(S_IWGRP | S_IWOTH);
+    if (mkdir(tmpDirPath, S_IRWXU) == -1) {
+        AKLOGE("Cannot create directory: %s. errno: %d.", tmpDirPath, errno);
+        return false;
+    }
+    // Get dictionary base path.
+    const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */;
+    char dictName[dictNameBufSize];
+    FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName);
+    const int dictPathBufSize = FileUtils::getFilePathBufSize(tmpDirPath, dictName);
+    char dictPath[dictPathBufSize];
+    FileUtils::getFilePath(tmpDirPath, dictName, dictPathBufSize, dictPath);
+
+    // Write header file.
+    // The file being written is dictPath + extension, so that is what the error logs name
+    // (they previously printed tmpDirPath + extension, which is not an existing path).
+    if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath,
+            Ver4DictConstants::HEADER_FILE_EXTENSION, headerBuffer)) {
+        AKLOGE("Dictionary header file %s%s cannot be written.", dictPath,
+                Ver4DictConstants::HEADER_FILE_EXTENSION);
+        return false;
+    }
+    // Write trie file.
+    if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath,
+            Ver4DictConstants::TRIE_FILE_EXTENSION, &mExpandableTrieBuffer)) {
+        AKLOGE("Dictionary trie file %s%s cannot be written.", dictPath,
+                Ver4DictConstants::TRIE_FILE_EXTENSION);
+        return false;
+    }
+    // Write dictionary contents.
+    if (!mTerminalPositionLookupTable.flushToFile(dictPath)) {
+        AKLOGE("Terminal position lookup table cannot be written. %s", tmpDirPath);
+        return false;
+    }
+    if (!mProbabilityDictContent.flushToFile(dictPath)) {
+        AKLOGE("Probability dict content cannot be written. %s", tmpDirPath);
+        return false;
+    }
+    if (!mBigramDictContent.flushToFile(dictPath)) {
+        AKLOGE("Bigram dict content cannot be written. %s", tmpDirPath);
+        return false;
+    }
+    if (!mShortcutDictContent.flushToFile(dictPath)) {
+        AKLOGE("Shortcut dict content cannot be written. %s", tmpDirPath);
+        return false;
+    }
+    // Remove existing dictionary.
+    if (!FileUtils::removeDirAndFiles(dictDirPath)) {
+        AKLOGE("Existing directory %s cannot be removed.", dictDirPath);
+        ASSERT(false);
+        return false;
+    }
+    // Rename temporary directory.
+    if (rename(tmpDirPath, dictDirPath) != 0) {
+        AKLOGE("%s cannot be renamed to %s", tmpDirPath, dictDirPath);
+        ASSERT(false);
+        return false;
+    }
+    return true;
+}
+
+// Opens every content file of an existing dictionary. headerBuffer must be non-null because
+// it is dereferenced in the initializer list; the trie buffer may fail to open, in which case
+// the expandable trie buffer wraps an empty view.
+Ver4DictBuffers::Ver4DictBuffers(const char *const dictPath,
+        MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable,
+        const FormatUtils::FORMAT_VERSION formatVersion)
+        : mHeaderBuffer(std::move(headerBuffer)),
+          mDictBuffer(MmappedBuffer::openBuffer(dictPath,
+                  Ver4DictConstants::TRIE_FILE_EXTENSION, isUpdatable)),
+          mHeaderPolicy(mHeaderBuffer->getReadOnlyByteArrayView().data(), formatVersion),
+          mExpandableHeaderBuffer(mHeaderBuffer->getReadWriteByteArrayView(),
+                  BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+          mExpandableTrieBuffer(
+                  mDictBuffer ? mDictBuffer->getReadWriteByteArrayView() :
+                          ReadWriteByteArrayView(),
+                  BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+          mTerminalPositionLookupTable(dictPath, isUpdatable),
+          mProbabilityDictContent(dictPath, mHeaderPolicy.hasHistoricalInfoOfWords(), isUpdatable),
+          mBigramDictContent(dictPath, mHeaderPolicy.hasHistoricalInfoOfWords(), isUpdatable),
+          mShortcutDictContent(dictPath, isUpdatable),
+          mIsUpdatable(isUpdatable) {}
+
+// Creates empty, updatable in-memory buffers that are not backed by any file.
+Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize)
+        : mHeaderBuffer(nullptr), mDictBuffer(nullptr), mHeaderPolicy(headerPolicy),
+          mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+          mExpandableTrieBuffer(maxTrieSize), mTerminalPositionLookupTable(),
+          mProbabilityDictContent(headerPolicy->hasHistoricalInfoOfWords()),
+          mBigramDictContent(headerPolicy->hasHistoricalInfoOfWords()), mShortcutDictContent(),
+          mIsUpdatable(true) {}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h
new file mode 100644
index 000000000..0d09fee9a
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_dict_buffers.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H
+#define LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H
+
+#include <memory>
+
+#include "defines.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/backward/v402/content/bigram_dict_content.h"
+#include "dictionary/structure/backward/v402/content/probability_dict_content.h"
+#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/mmapped_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Owns the header and trie buffers of a ver4 dictionary together with its terminal position
+// table and probability, bigram and shortcut contents.
+class Ver4DictBuffers {
+ public:
+    typedef std::unique_ptr<Ver4DictBuffers> Ver4DictBuffersPtr;
+
+    static Ver4DictBuffersPtr openVer4DictBuffers(const char *const dictDirPath,
+            MmappedBuffer::MmappedBufferPtr headerBuffer,
+            const FormatUtils::FORMAT_VERSION formatVersion);
+
+    // Creates empty in-memory buffers that are not backed by files.
+    static AK_FORCE_INLINE Ver4DictBuffersPtr createVer4DictBuffers(
+            const HeaderPolicy *const headerPolicy, const int maxTrieSize) {
+        return Ver4DictBuffersPtr(new Ver4DictBuffers(headerPolicy, maxTrieSize));
+    }
+
+    // True only when both mapped buffers exist and the header policy and every content are
+    // valid.
+    AK_FORCE_INLINE bool isValid() const {
+        return mHeaderBuffer && mDictBuffer && mHeaderPolicy.isValid()
+                && mProbabilityDictContent.isValid() && mTerminalPositionLookupTable.isValid()
+                && mBigramDictContent.isValid() && mShortcutDictContent.isValid();
+    }
+
+    // True when the trie buffer or any content reports being near its size limit.
+    AK_FORCE_INLINE bool isNearSizeLimit() const {
+        return mExpandableTrieBuffer.isNearSizeLimit()
+                || mTerminalPositionLookupTable.isNearSizeLimit()
+                || mProbabilityDictContent.isNearSizeLimit()
+                || mBigramDictContent.isNearSizeLimit()
+                || mShortcutDictContent.isNearSizeLimit();
+    }
+
+    AK_FORCE_INLINE const HeaderPolicy *getHeaderPolicy() const {
+        return &mHeaderPolicy;
+    }
+
+    AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableHeaderBuffer() {
+        return &mExpandableHeaderBuffer;
+    }
+
+    AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableTrieBuffer() {
+        return &mExpandableTrieBuffer;
+    }
+
+    AK_FORCE_INLINE const BufferWithExtendableBuffer *getTrieBuffer() const {
+        return &mExpandableTrieBuffer;
+    }
+
+    AK_FORCE_INLINE TerminalPositionLookupTable *getMutableTerminalPositionLookupTable() {
+        return &mTerminalPositionLookupTable;
+    }
+
+    AK_FORCE_INLINE const TerminalPositionLookupTable *getTerminalPositionLookupTable() const {
+        return &mTerminalPositionLookupTable;
+    }
+
+    AK_FORCE_INLINE ProbabilityDictContent *getMutableProbabilityDictContent() {
+        return &mProbabilityDictContent;
+    }
+
+    AK_FORCE_INLINE const ProbabilityDictContent *getProbabilityDictContent() const {
+        return &mProbabilityDictContent;
+    }
+
+    AK_FORCE_INLINE BigramDictContent *getMutableBigramDictContent() {
+        return &mBigramDictContent;
+    }
+
+    AK_FORCE_INLINE const BigramDictContent *getBigramDictContent() const {
+        return &mBigramDictContent;
+    }
+
+    AK_FORCE_INLINE ShortcutDictContent *getMutableShortcutDictContent() {
+        return &mShortcutDictContent;
+    }
+
+    AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const {
+        return &mShortcutDictContent;
+    }
+
+    AK_FORCE_INLINE bool isUpdatable() const {
+        return mIsUpdatable;
+    }
+
+    // Flushes everything using this object's own expandable header buffer.
+    bool flush(const char *const dictDirPath) const {
+        return flushHeaderAndDictBuffers(dictDirPath, &mExpandableHeaderBuffer);
+    }
+
+    bool flushHeaderAndDictBuffers(const char *const dictDirPath,
+            const BufferWithExtendableBuffer *const headerBuffer) const;
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(Ver4DictBuffers);
+
+    Ver4DictBuffers(const char *const dictDirPath,
+            const MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable,
+            const FormatUtils::FORMAT_VERSION formatVersion);
+
+    Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize);
+
+    const MmappedBuffer::MmappedBufferPtr mHeaderBuffer;
+    const MmappedBuffer::MmappedBufferPtr mDictBuffer;
+    const HeaderPolicy mHeaderPolicy;
+    BufferWithExtendableBuffer mExpandableHeaderBuffer;
+    BufferWithExtendableBuffer mExpandableTrieBuffer;
+    TerminalPositionLookupTable mTerminalPositionLookupTable;
+    ProbabilityDictContent mProbabilityDictContent;
+    BigramDictContent mBigramDictContent;
+    ShortcutDictContent mShortcutDictContent;
+    // Was declared "const int"; it is initialised from bool values and returned as bool.
+    const bool mIsUpdatable;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp
new file mode 100644
index 000000000..2948d0716
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_dict_constants.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Definitions of the constants declared in Ver4DictConstants.
+// These values MUST match the definitions in FormatSpec.java.
+const char *const Ver4DictConstants::TRIE_FILE_EXTENSION = ".trie";
+const char *const Ver4DictConstants::HEADER_FILE_EXTENSION = ".header";
+const char *const Ver4DictConstants::FREQ_FILE_EXTENSION = ".freq";
+// tat = Terminal Address Table
+const char *const Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
+const char *const Ver4DictConstants::BIGRAM_FILE_EXTENSION = ".bigram_freq";
+const char *const Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION = ".bigram_lookup";
+const char *const Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION = ".bigram_index_freq";
+const char *const Ver4DictConstants::SHORTCUT_FILE_EXTENSION = ".shortcut_shortcut";
+const char *const Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION = ".shortcut_lookup";
+const char *const Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION =
+        ".shortcut_index_shortcut";
+
+// Version 4 dictionary size is implicitly limited to 8MB due to 3-byte offsets.
+const int Ver4DictConstants::MAX_DICTIONARY_SIZE = 8 * 1024 * 1024;
+// Extended region size, which is not GCed region size in dict file + additional buffer size, is
+// limited to 1MB to prevent from inefficient traversing.
+const int Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE = 1 * 1024 * 1024;
+
+// Field sizes below are byte counts.
+const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1;
+const int Ver4DictConstants::PROBABILITY_SIZE = 1;
+const int Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1;
+const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
+const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0;
+const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4;
+const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4;
+const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1;
+const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1;
+
+const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16;
+const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE = 4;
+const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
+const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;
+
+const int Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE = 3;
+// Unsigned int max value of BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE-byte is used for representing
+// invalid terminal ID in bigram lists.
+// With the 3-byte field size defined just above, this evaluates to 0xFFFFFF.
+const int Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID =
+        (1 << (BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE * 8)) - 1;
+const int Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE = 1;
+const int Ver4DictConstants::BIGRAM_PROBABILITY_MASK = 0x0F;
+const int Ver4DictConstants::BIGRAM_HAS_NEXT_MASK = 0x80;
+const int Ver4DictConstants::BIGRAM_LARGE_PROBABILITY_FIELD_SIZE = 1;
+
+const int Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1;
+const int Ver4DictConstants::SHORTCUT_PROBABILITY_MASK = 0x0F;
+const int Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK = 0x80;
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h
new file mode 100644
index 000000000..15581d852
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_dict_constants.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H
+#define LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H
+
+#include "defines.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// TODO: Create PtConstants under the pt_common and move some constant values there.
+// Note that there are corresponding definitions in FormatSpec.java.
+// Holder for static constants only; the values are defined in ver4_dict_constants.cpp.
+class Ver4DictConstants {
+ public:
+    // Suffixes of the files that make up a dictionary.
+    static const char *const TRIE_FILE_EXTENSION;
+    static const char *const HEADER_FILE_EXTENSION;
+    static const char *const FREQ_FILE_EXTENSION;
+    static const char *const TERMINAL_ADDRESS_TABLE_FILE_EXTENSION;
+    static const char *const BIGRAM_FILE_EXTENSION;
+    static const char *const BIGRAM_LOOKUP_TABLE_FILE_EXTENSION;
+    static const char *const BIGRAM_CONTENT_TABLE_FILE_EXTENSION;
+    static const char *const SHORTCUT_FILE_EXTENSION;
+    static const char *const SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION;
+    static const char *const SHORTCUT_CONTENT_TABLE_FILE_EXTENSION;
+
+    // Size limits.
+    static const int MAX_DICTIONARY_SIZE;
+    static const int MAX_DICT_EXTENDED_REGION_SIZE;
+
+    // Sentinels and field sizes.
+    static const int NOT_A_TERMINAL_ID;
+    static const int PROBABILITY_SIZE;
+    static const int FLAGS_IN_PROBABILITY_FILE_SIZE;
+    static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
+    static const int NOT_A_TERMINAL_ADDRESS;
+    static const int TERMINAL_ID_FIELD_SIZE;
+    static const int TIME_STAMP_FIELD_SIZE;
+    static const int WORD_LEVEL_FIELD_SIZE;
+    static const int WORD_COUNT_FIELD_SIZE;
+
+    // Sparse address table parameters for bigram and shortcut contents.
+    static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE;
+    static const int BIGRAM_ADDRESS_TABLE_DATA_SIZE;
+    static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
+    static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;
+
+    // Bigram entry layout.
+    static const int BIGRAM_FLAGS_FIELD_SIZE;
+    static const int BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE;
+    static const int INVALID_BIGRAM_TARGET_TERMINAL_ID;
+    static const int BIGRAM_PROBABILITY_MASK;
+    static const int BIGRAM_HAS_NEXT_MASK;
+    // Used when bigram list has time stamp.
+    static const int BIGRAM_LARGE_PROBABILITY_FIELD_SIZE;
+
+    // Shortcut entry layout.
+    static const int SHORTCUT_FLAGS_FIELD_SIZE;
+    static const int SHORTCUT_PROBABILITY_MASK;
+    static const int SHORTCUT_HAS_NEXT_MASK;
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4DictConstants);
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp
new file mode 100644
index 000000000..871ef7aaf
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ * + * This file was generated from + * dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp + */ + +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" + +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/backward/v402/content/probability_dict_content.h" +#include "dictionary/structure/backward/v402/content/probability_entry.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode( + const int ptNodePos, const int siblingNodePos) const { + if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) { + // Reading invalid position because of bug or broken dictionary. 
+ AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d", + ptNodePos, mBuffer->getTailPosition()); + ASSERT(false); + return PtNodeParams(); + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int pos = ptNodePos; + const int headPos = ptNodePos; + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const int parentPosOffset = + DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( + dictBuf, &pos); + const int parentPos = + DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); + int codePoints[MAX_WORD_LENGTH]; + const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( + dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos); + int terminalIdFieldPos = NOT_A_DICT_POS; + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + int probability = NOT_A_PROBABILITY; + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + terminalIdFieldPos = pos; + if (usesAdditionalBuffer) { + terminalIdFieldPos += mBuffer->getOriginalBufferSize(); + } + terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos); + const ProbabilityEntry probabilityEntry = + mProbabilityDictContent->getProbabilityEntry(terminalId); + if (probabilityEntry.hasHistoricalInfo()) { + probability = ForgettingCurveUtils::decodeProbability( + probabilityEntry.getHistoricalInfo(), mHeaderPolicy); + } else { + probability = probabilityEntry.getProbability(); + } + } + int childrenPosFieldPos = pos; + if (usesAdditionalBuffer) { + childrenPosFieldPos += mBuffer->getOriginalBufferSize(); + } + int childrenPos = DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition( + dictBuf, &pos); + if (usesAdditionalBuffer 
&& childrenPos != NOT_A_DICT_POS) { + childrenPos += mBuffer->getOriginalBufferSize(); + } + if (usesAdditionalBuffer) { + pos += mBuffer->getOriginalBufferSize(); + } + // Sibling position is the tail position of original PtNode. + int newSiblingNodePos = (siblingNodePos == NOT_A_DICT_POS) ? pos : siblingNodePos; + // Read destination node if the read node is a moved node. + if (DynamicPtReadingUtils::isMoved(flags)) { + // The destination position is stored at the same place as the parent position. + return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos); + } else { + return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints, + terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos, + newSiblingNodePos); + } +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h new file mode 100644 index 000000000..367d6f9f8 --- /dev/null +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! 
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_patricia_trie_node_reader.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H
+#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_reader.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+} // namespace v402
+} // namespace backward
+// The backward::v402 namespaces are closed and reopened around the next two forward
+// declarations so that BufferWithExtendableBuffer and HeaderPolicy are declared directly in
+// latinime, while ProbabilityDictContent below is declared in latinime::backward::v402.
+class BufferWithExtendableBuffer;
+namespace backward {
+namespace v402 {
+} // namespace v402
+} // namespace backward
+class HeaderPolicy;
+namespace backward {
+namespace v402 {
+class ProbabilityDictContent;
+
+/*
+ * This class is used for helping to read nodes of ver4 patricia trie. This class handles moved
+ * node and reads node attributes including probability from probabilityBuffer.
+ *
+ * The three pointers given to the constructor are stored as they are. The destructor is empty,
+ * so this class does not release them.
+ */
+class Ver4PatriciaTrieNodeReader : public PtNodeReader {
+ public:
+    // buffer                 - buffer the PtNodes are read from.
+    // probabilityDictContent - source of the probability entry of terminal nodes.
+    // headerPolicy           - provides the code point table and is passed along when a
+    //                          probability is decoded from historical info.
+    Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer,
+            const ProbabilityDictContent *const probabilityDictContent,
+            const HeaderPolicy *const headerPolicy)
+            : mBuffer(buffer), mProbabilityDictContent(probabilityDictContent),
+              mHeaderPolicy(headerPolicy) {}
+
+    ~Ver4PatriciaTrieNodeReader() {}
+
+    // Returns the attributes of the PtNode at ptNodePos. No sibling position is supplied
+    // (NOT_A_DICT_POS), so the sibling position is determined by the helper below.
+    virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const {
+        return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos,
+                NOT_A_DICT_POS /* siblingNodePos */);
+    }
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader);
+
+    const BufferWithExtendableBuffer *const mBuffer;
+    const ProbabilityDictContent *const mProbabilityDictContent;
+    const HeaderPolicy *const mHeaderPolicy;
+
+    // Reads the PtNode at ptNodePos; siblingNodePos is the sibling position to report, or
+    // NOT_A_DICT_POS when it has to be derived from the node being read. Defined in the
+    // corresponding .cpp file.
+    const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos,
+            const int siblingNodePos) const;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H */
diff
--git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp
new file mode 100644
index 000000000..e3ab5ec20
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp
@@ -0,0 +1,442 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
+#include "dictionary/structure/backward/v402/content/probability_entry.h"
+#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Passed to PatriciaTrieReadingUtils::createAndGetFlags() by updatePtNodeFlags().
+// Presumably the size in bytes of the children position field -- TODO confirm.
+const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3;
+
+// Sets the "deleted" flag of the PtNode described by toBeUpdatedPtNodeParams and, when the
+// PtNode is a terminal, resets its entry in the terminal position lookup table to
+// NOT_A_DICT_POS. Returns false when one of the writes fails.
+bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted(
+        const PtNodeParams *const toBeUpdatedPtNodeParams) {
+    int pos = toBeUpdatedPtNodeParams->getHeadPos();
+    const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
+    const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
+    if (usesAdditionalBuffer) {
+        // Reads from the additional buffer use a position rebased by the original buffer size.
+        pos -= mTrieBuffer->getOriginalBufferSize();
+    }
+    // Read original flags
+    const PatriciaTrieReadingUtils::NodeFlags originalFlags =
+            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+    const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
+            DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
+                    true /* isDeleted */, false /* willBecomeNonTerminal */);
+    // Writes go through mTrieBuffer and use the head position as it is (not rebased).
+    int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
+    // Update flags.
+    if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
+            &writingPos)) {
+        return false;
+    }
+    if (toBeUpdatedPtNodeParams->isTerminal()) {
+        // The PtNode is a terminal. Delete entry from the terminal position lookup table.
+        return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
+                toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */);
+    } else {
+        return true;
+    }
+}
+
+// Sets the "moved" flag of the PtNode, stores movedPos in its parent offset field, and then
+// rewrites the parent offset of every child PtNode so that it refers to bigramLinkedNodePos.
+// Returns false as soon as one of the writes fails.
+bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
+        const PtNodeParams *const toBeUpdatedPtNodeParams,
+        const int movedPos, const int bigramLinkedNodePos) {
+    int pos = toBeUpdatedPtNodeParams->getHeadPos();
+    const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
+    const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
+    if (usesAdditionalBuffer) {
+        pos -= mTrieBuffer->getOriginalBufferSize();
+    }
+    // Read original flags
+    const PatriciaTrieReadingUtils::NodeFlags originalFlags =
+            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+    const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
+            DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */,
+                    false /* isDeleted */, false /* willBecomeNonTerminal */);
+    int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
+    // Update flags.
+    if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
+            &writingPos)) {
+        return false;
+    }
+    // Update moved position, which is stored in the parent offset field.
+    if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
+            mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
+        return false;
+    }
+    if (toBeUpdatedPtNodeParams->hasChildren()) {
+        // Update children's parent position.
+        mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos());
+        while (!mReadingHelper.isEnd()) {
+            const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams());
+            // The parent offset field is located right after the flags field.
+            int parentOffsetFieldPos = childPtNodeParams.getHeadPos()
+                    + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
+            if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
+                    mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(),
+                    &parentOffsetFieldPos)) {
+                // Parent offset cannot be written because of a bug or a broken dictionary; thus,
+                // we give up to update dictionary.
+                return false;
+            }
+            mReadingHelper.readNextSiblingNode(childPtNodeParams);
+        }
+    }
+    return true;
+}
+
+// Resets the terminal position lookup table entry of the PtNode to NOT_A_DICT_POS and then
+// sets the "will become non-terminal" flag of the PtNode. Returns false when a write fails.
+// NOTE(review): the lookup table is updated before the flags are written; when the flag write
+// fails the lookup table change is not reverted here.
+bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal(
+        const PtNodeParams *const toBeUpdatedPtNodeParams) {
+    int pos = toBeUpdatedPtNodeParams->getHeadPos();
+    const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
+    const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
+    if (usesAdditionalBuffer) {
+        pos -= mTrieBuffer->getOriginalBufferSize();
+    }
+    // Read original flags
+    const PatriciaTrieReadingUtils::NodeFlags originalFlags =
+            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+    const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
+            DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
+                    false /* isDeleted */, true /* willBecomeNonTerminal */);
+    if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
+            toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) {
+        AKLOGE("Cannot update terminal position lookup table. terminal id: %d",
+                toBeUpdatedPtNodeParams->getTerminalId());
+        return false;
+    }
+    // Update flags.
+    int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
+    return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
+            &writingPos);
+}
+
+// Replaces the probability entry of a terminal PtNode by an entry derived from the current
+// entry and unigramProperty (see createUpdatedEntryFrom()). Returns false for a non-terminal
+// PtNode or when the entry cannot be written.
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty(
+        const PtNodeParams *const toBeUpdatedPtNodeParams,
+        const UnigramProperty *const unigramProperty) {
+    // Update probability and historical information.
+    // TODO: Update other information in the unigram property.
+    if (!toBeUpdatedPtNodeParams->isTerminal()) {
+        return false;
+    }
+    const ProbabilityEntry originalProbabilityEntry =
+            mBuffers->getProbabilityDictContent()->getProbabilityEntry(
+                    toBeUpdatedPtNodeParams->getTerminalId());
+    const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry,
+            unigramProperty);
+    return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(
+            toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry);
+}
+
+// For a terminal PtNode whose probability entry has historical info: rewrites the entry with
+// the historical info produced by ForgettingCurveUtils::createHistoricalInfoToSave() and asks
+// ForgettingCurveUtils::needsToKeep() whether the PtNode is still needed. When it is not, the
+// PtNode is marked as "will become non-terminal". Entries without historical info are left
+// untouched and always kept.
+//
+// outNeedsToKeepPtNode is written only when the method returns true.
+// Returns false for a non-terminal PtNode or when a write fails.
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
+        const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) {
+    if (!toBeUpdatedPtNodeParams->isTerminal()) {
+        // NOTE(review): the method name in this message does not match the name of this method.
+        AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode.");
+        return false;
+    }
+    const ProbabilityEntry originalProbabilityEntry =
+            mBuffers->getProbabilityDictContent()->getProbabilityEntry(
+                    toBeUpdatedPtNodeParams->getTerminalId());
+    if (originalProbabilityEntry.hasHistoricalInfo()) {
+        const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
+                originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy);
+        const ProbabilityEntry probabilityEntry =
+                originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo);
+        if (!mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(
+                toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) {
+            AKLOGE("Cannot write updated probability entry. terminalId: %d",
+                    toBeUpdatedPtNodeParams->getTerminalId());
+            return false;
+        }
+        const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy);
+        if (!isValid) {
+            if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
+                AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
+                return false;
+            }
+        }
+        *outNeedsToKeepPtNode = isValid;
+    } else {
+        // No need to update probability.
+        *outNeedsToKeepPtNode = true;
+    }
+    return true;
+}
+
+// Overwrites the children position field of the PtNode with newChildrenPosition.
+bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition(
+        const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) {
+    int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos();
+    return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
+            newChildrenPosition, &childrenPosFieldPos);
+}
+
+// Overwrites the terminal id field of the PtNode with newTerminalId.
+// NOTE(review): the field position is used as it is; for a non-terminal PtNode the reader
+// reports NOT_A_DICT_POS for this field -- callers presumably pass terminals only, confirm.
+bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
+        const int newTerminalId) {
+    return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE,
+            toBeUpdatedPtNodeParams->getTerminalIdFieldPos());
+}
+
+// Writes the PtNode at *ptNodeWritingPos and advances the position. The terminal id that may
+// be assigned while writing is not reported: a null output pointer (written as 0) is passed.
+bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
+        const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) {
+    return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */,
+            ptNodeWritingPos);
+}
+
+
+// Writes the PtNode at *ptNodeWritingPos, advances the position, and stores a probability
+// entry built from a default-constructed entry and unigramProperty under the terminal id used
+// for the written PtNode.
+// NOTE(review): when the written PtNode turns out not to be a terminal, terminalId stays
+// NOT_A_TERMINAL_ID and is still passed to setProbabilityEntry() -- callers presumably pass
+// terminal params only, confirm.
+bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
+        const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty,
+        int *const ptNodeWritingPos) {
+    int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
+    if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId,
+            ptNodeWritingPos)) {
+        return false;
+    }
+    // Write probability.
+    ProbabilityEntry newProbabilityEntry;
+    const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom(
+            &newProbabilityEntry, unigramProperty);
+    return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(terminalId,
+            &probabilityEntryToWrite);
+}
+
+// Adds a bigram entry from prevWordIds[0] to wordId, and sets the "has bigrams" flag of the
+// PtNode of prevWordIds[0] when it is not set yet. Only the first element of prevWordIds is
+// used.
+// NOTE(review): prevWordIds[0] is read without a visible emptiness check, and the PtNode
+// position taken from the lookup table is not validated here before it is read -- confirm
+// that callers guarantee both.
+bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
+        const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) {
+    if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, ngramProperty, outAddedNewEntry)) {
+        AKLOGE("Cannot add new bigram entry. prevWordId: %d, wordId: %d",
+                prevWordIds[0], wordId);
+        return false;
+    }
+    const int ptNodePos =
+            mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(prevWordIds[0]);
+    const PtNodeParams sourcePtNodeParams =
+            mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+    if (!sourcePtNodeParams.hasBigrams()) {
+        // Update has bigrams flag.
+        return updatePtNodeFlags(sourcePtNodeParams.getHeadPos(),
+                sourcePtNodeParams.isPossiblyOffensive(), sourcePtNodeParams.isNotAWord(),
+                sourcePtNodeParams.isTerminal(), sourcePtNodeParams.hasShortcutTargets(),
+                true /* hasBigrams */,
+                sourcePtNodeParams.getCodePointCount() > 1 /* hasMultipleChars */);
+    }
+    return true;
+}
+
+// Removes the bigram entry from prevWordIds[0] to wordId. Only the first element of
+// prevWordIds is used.
+bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds,
+        const int wordId) {
+    return mBigramPolicy->removeEntry(prevWordIds[0], wordId);
+}
+
+// Delegates to the bigram policy for the terminal id of sourcePtNodeParams.
+bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries(
+        const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) {
+    return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(
+            sourcePtNodeParams->getTerminalId(), outBigramEntryCount);
+}
+
+// Rewrites the parent offset and the children position of the PtNode, replacing each position
+// by its relocated value when dictPositionRelocationMap has an entry for it. Positions without
+// an entry are written back unchanged. When outBigramEntryCount is not null, it receives the
+// bigram entry count reported by the bigram policy for the PtNode's terminal id.
+bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
+        const PtNodeParams *const toBeUpdatedPtNodeParams,
+        const DictPositionRelocationMap *const dictPositionRelocationMap,
+        int *const outBigramEntryCount) {
+    int parentPos = toBeUpdatedPtNodeParams->getParentPos();
+    if (parentPos != NOT_A_DICT_POS) {
+        PtNodeWriter::PtNodePositionRelocationMap::const_iterator it =
+                dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos);
+        if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) {
+            parentPos = it->second;
+        }
+    }
+    // The parent offset field is located right after the flags field.
+    int writingPos = toBeUpdatedPtNodeParams->getHeadPos()
+            + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
+    // Write updated parent offset.
+    if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
+            parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
+        return false;
+    }
+
+    // Updates children position.
+    int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos();
+    if (childrenPos != NOT_A_DICT_POS) {
+        PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it =
+                dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos);
+        if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) {
+            childrenPos = it->second;
+        }
+    }
+    if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) {
+        return false;
+    }
+
+    // Counts bigram entries.
+    if (outBigramEntryCount) {
+        // "Conut" is the spelling of the bigram policy's method name.
+        *outBigramEntryCount = mBigramPolicy->getBigramEntryConut(
+                toBeUpdatedPtNodeParams->getTerminalId());
+    }
+    return true;
+}
+
+// Adds a shortcut target for the terminal id of ptNodeParams and sets the "has shortcut
+// targets" flag of the PtNode when ptNodeParams reports that it is not set yet.
+bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams,
+        const int *const targetCodePoints, const int targetCodePointCount,
+        const int shortcutProbability) {
+    if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(),
+            targetCodePoints, targetCodePointCount, shortcutProbability)) {
+        AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId());
+        return false;
+    }
+    if (!ptNodeParams->hasShortcutTargets()) {
+        // Update has shortcut targets flag.
+        return updatePtNodeFlags(ptNodeParams->getHeadPos(),
+                ptNodeParams->isPossiblyOffensive(), ptNodeParams->isNotAWord(),
+                ptNodeParams->isTerminal(), true /* hasShortcutTargets */,
+                ptNodeParams->hasBigrams(),
+                ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
+    }
+    return true;
+}
+
+// Recomputes the "has bigrams" and "has shortcut targets" flags of the PtNode from the bigram
+// and shortcut contents (a list head position other than NOT_A_DICT_POS means "has") and
+// rewrites the flags. The other flag inputs are taken from ptNodeParams.
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeHasBigramsAndShortcutTargetsFlags(
+        const PtNodeParams *const ptNodeParams) {
+    const bool hasBigrams = mBuffers->getBigramDictContent()->getBigramListHeadPos(
+            ptNodeParams->getTerminalId()) != NOT_A_DICT_POS;
+    const bool hasShortcutTargets = mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
+            ptNodeParams->getTerminalId()) != NOT_A_DICT_POS;
+    return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isPossiblyOffensive(),
+            ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), hasShortcutTargets,
+            hasBigrams, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
+}
+
+// Writes a whole PtNode starting at *ptNodeWritingPos and advances the position past it.
+// Fields are written in this order: flags (placeholder), parent offset, code points, terminal
+// id (terminals only), children position. The final flags are written last.
+//
+// outTerminalId may be null; when it is not null and the written PtNode is a terminal, it
+// receives the terminal id that was written.
+// Returns false as soon as one of the writes fails.
+bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
+        const PtNodeParams *const ptNodeParams, int *const outTerminalId,
+        int *const ptNodeWritingPos) {
+    const int nodePos = *ptNodeWritingPos;
+    // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the
+    // PtNode writing.
+    if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer,
+            0 /* nodeFlags */, ptNodeWritingPos)) {
+        return false;
+    }
+    // Calculate a parent offset and write the offset.
+    if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
+            ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) {
+        return false;
+    }
+    // Write code points
+    if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer,
+            ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) {
+        return false;
+    }
+    // Decide the terminal id to write: none when the PtNode will become non-terminal, the
+    // existing id when the params already carry one, or a new id for a terminal without one.
+    int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
+    if (!ptNodeParams->willBecomeNonTerminal()) {
+        if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) {
+            terminalId = ptNodeParams->getTerminalId();
+        } else if (ptNodeParams->isTerminal()) {
+            // Write terminal information using a new terminal id.
+            // Get a new unused terminal id.
+            terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId();
+        }
+    }
+    // Holds a boolean value although it is declared as int.
+    const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID;
+    if (isTerminal) {
+        // Update the lookup table.
+        if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
+                terminalId, nodePos)) {
+            return false;
+        }
+        // Write terminal Id.
+        if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId,
+                Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) {
+            return false;
+        }
+        if (outTerminalId) {
+            *outTerminalId = terminalId;
+        }
+    }
+    // Write children position
+    if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
+            ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
+        return false;
+    }
+    // Replace the placeholder flags written at the beginning with the final flags.
+    return updatePtNodeFlags(nodePos, ptNodeParams->isPossiblyOffensive(),
+            ptNodeParams->isNotAWord(), isTerminal, ptNodeParams->hasShortcutTargets(),
+            ptNodeParams->hasBigrams(),
+            ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
+}
+
+// Builds the probability entry to store for a word from its current entry and
+// unigramProperty. When the header policy reports that words have historical info, the
+// historical info is updated through ForgettingCurveUtils; otherwise only the probability is
+// replaced by the one in unigramProperty.
+const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom(
+        const ProbabilityEntry *const originalProbabilityEntry,
+        const UnigramProperty *const unigramProperty) const {
+    // TODO: Consolidate historical info and probability.
+    if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
+        const HistoricalInfo &historicalInfoForUpdate = unigramProperty->getHistoricalInfo();
+        const HistoricalInfo updatedHistoricalInfo =
+                ForgettingCurveUtils::createUpdatedHistoricalInfo(
+                        originalProbabilityEntry->getHistoricalInfo(),
+                        unigramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy);
+        return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo(
+                &updatedHistoricalInfo);
+    } else {
+        return originalProbabilityEntry->createEntryWithUpdatedProbability(
+                unigramProperty->getProbability());
+    }
+}
+
+// Creates the node flags from the given attributes and writes them at ptNodePos.
+// Callers in this file pass PtNodeParams::isPossiblyOffensive() as isBlacklisted.
+// Returns false (after logging) when the flags cannot be written.
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos,
+        const bool isBlacklisted, const bool isNotAWord, const bool isTerminal,
+        const bool hasShortcutTargets, const bool hasBigrams, const bool hasMultipleChars) {
+    // Create node flags and write them.
+    PatriciaTrieReadingUtils::NodeFlags nodeFlags =
+            PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal,
+                    hasShortcutTargets, hasBigrams, hasMultipleChars,
+                    CHILDREN_POSITION_FIELD_SIZE);
+    if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
+        AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
+        return false;
+    }
+    return true;
+}
+
+// Replaces the probability entry of the PtNode's terminal id by an entry whose historical
+// info is all zeros (timestamp, level and count). Returns false when the dictionary has no
+// historical info of words or when the entry cannot be written.
+bool Ver4PatriciaTrieNodeWriter::suppressUnigramEntry(const PtNodeParams *const ptNodeParams) {
+    if (!mHeaderPolicy->hasHistoricalInfoOfWords()) {
+        // Require historical info to suppress unigram entry.
+        return false;
+    }
+    const HistoricalInfo suppressedHistorycalInfo(0 /* timestamp */, 0 /* level */, 0 /* count */);
+    const ProbabilityEntry probabilityEntryToWrite =
+            ProbabilityEntry().createEntryWithUpdatedHistoricalInfo(&suppressedHistorycalInfo);
+    return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(
+            ptNodeParams->getTerminalId(), &probabilityEntryToWrite);
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h
new file mode 100644
index 000000000..db3cea174
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_patricia_trie_node_writer.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H
+#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+#include "dictionary/structure/backward/v402/content/probability_entry.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+} // namespace v402
+} // namespace backward
+// The backward::v402 namespaces are closed and reopened around the next two forward
+// declarations so that BufferWithExtendableBuffer and HeaderPolicy are declared directly in
+// latinime, while the Ver4* classes below are declared in latinime::backward::v402.
+class BufferWithExtendableBuffer;
+namespace backward {
+namespace v402 {
+} // namespace v402
+} // namespace backward
+class HeaderPolicy;
+namespace backward {
+namespace v402 {
+class Ver4BigramListPolicy;
+class Ver4DictBuffers;
+class Ver4PatriciaTrieNodeReader;
+class Ver4PtNodeArrayReader;
+class Ver4ShortcutListPolicy;
+
+/*
+ * This class is used for helping to write nodes of ver4 patricia trie.
+ *
+ * The pointers given to the constructor are stored as they are. The destructor is empty, so
+ * this class does not release them.
+ */
+class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
+ public:
+    // trieBuffer        - buffer the PtNodes are written to.
+    // buffers           - gives access to the other dictionary contents.
+    // headerPolicy      - dictionary header information.
+    // ptNodeReader      - reader kept by this class and also handed to mReadingHelper.
+    // ptNodeArrayReader - only handed to mReadingHelper.
+    // bigramPolicy      - receives the bigram related requests.
+    // shortcutPolicy    - receives the shortcut related requests.
+    Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer,
+            Ver4DictBuffers *const buffers, const HeaderPolicy *const headerPolicy,
+            const PtNodeReader *const ptNodeReader,
+            const PtNodeArrayReader *const ptNodeArrayReader,
+            Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy)
+            : mTrieBuffer(trieBuffer), mBuffers(buffers), mHeaderPolicy(headerPolicy),
+              mPtNodeReader(ptNodeReader), mReadingHelper(ptNodeReader, ptNodeArrayReader),
+              mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {}
+
+    virtual ~Ver4PatriciaTrieNodeWriter() {}
+
+    // All methods below return a bool. Judging by the names and the bool return type, true
+    // presumably means success -- confirm against the definitions in the .cpp file.
+
+    virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams);
+
+    virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams,
+            const int movedPos, const int bigramLinkedNodePos);
+
+    virtual bool markPtNodeAsWillBecomeNonTerminal(
+            const PtNodeParams *const toBeUpdatedPtNodeParams);
+
+    virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams,
+            const UnigramProperty *const unigramProperty);
+
+    // outNeedsToKeepPtNode is an output parameter.
+    virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
+            const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode);
+
+    virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams,
+            const int newChildrenPosition);
+
+    // Not virtual: specific to this writer.
+    bool updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
+            const int newTerminalId);
+
+    // ptNodeWritingPos is both an input (where to write) and an output (advanced position).
+    virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
+            int *const ptNodeWritingPos);
+
+    virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
+            const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos);
+
+    // outAddedNewEntry is an output parameter.
+    virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
+            const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
+
+    virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId);
+
+    virtual bool updateAllBigramEntriesAndDeleteUselessEntries(
+            const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount);
+
+    virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams,
+            const DictPositionRelocationMap *const dictPositionRelocationMap,
+            int *const outBigramEntryCount);
+
+    virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams,
+            const int *const targetCodePoints, const int targetCodePointCount,
+            const int shortcutProbability);
+
+    // Not virtual: specific to this writer.
+    bool updatePtNodeHasBigramsAndShortcutTargetsFlags(const PtNodeParams *const ptNodeParams);
+
+    // Suppress unigram not to use the word for generating suggestions. So, this method can be used
+    // only for dictionaries with historical info. Also, suppressed entries are included in unigram
+    // count. They will be removed from the dictionary during GC.
+    bool suppressUnigramEntry(const PtNodeParams *const ptNodeParams);
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter);
+
+    // Shared implementation of the PtNode writing methods. outTerminalId is an output
+    // parameter for the terminal id of the written PtNode.
+    bool writePtNodeAndGetTerminalIdAndAdvancePosition(
+            const PtNodeParams *const ptNodeParams, int *const outTerminalId,
+            int *const ptNodeWritingPos);
+
+    // Create updated probability entry using given unigram property. In addition to the
+    // probability, this method updates historical information if needed.
+    // TODO: Update flags belonging to the unigram property.
+    const ProbabilityEntry createUpdatedEntryFrom(
+            const ProbabilityEntry *const originalProbabilityEntry,
+            const UnigramProperty *const unigramProperty) const;
+
+    // Writes the flags field of the PtNode at ptNodePos from the given attributes.
+    bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord,
+            const bool isTerminal, const bool hasShortcutTargets, const bool hasBigrams,
+            const bool hasMultipleChars);
+
+    static const int CHILDREN_POSITION_FIELD_SIZE;
+
+    // Members are initialized by the constructor in this declaration order.
+    BufferWithExtendableBuffer *const mTrieBuffer;
+    Ver4DictBuffers *const mBuffers;
+    const HeaderPolicy *const mHeaderPolicy;
+    const PtNodeReader *const mPtNodeReader;
+    DynamicPtReadingHelper mReadingHelper;
+    Ver4BigramListPolicy *const mBigramPolicy;
+    Ver4ShortcutListPolicy *const mShortcutPolicy;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
new file mode 100644
index 000000000..6fb9cffb7
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
@@ -0,0 +1,662 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
+ * Do not edit this file other than updating policy's interface.
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h"
+
+#include <vector>
+
+#include "suggest/core/dicnode/dic_node.h"
+#include "suggest/core/dicnode/dic_node_vector.h"
+#include "dictionary/interface/ngram_listener.h"
+#include "dictionary/property/ngram_context.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/property/word_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+#include "dictionary/utils/multi_bigram_map.h"
+#include "dictionary/utils/probability_utils.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and
+// BinaryDictionaryDecayingTests.
+const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; +const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; +const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = + Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; +const int Ver4PatriciaTriePolicy::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1; + +void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); + while (!readingHelper.isEnd()) { + const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); + if (!ptNodeParams.isValid()) { + break; + } + bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); + if (isTerminal && mHeaderPolicy->isDecayingDict()) { + // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose + // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a + // valid terminal DicNode. + isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; + } + readingHelper.readNextSiblingNode(ptNodeParams); + if (ptNodeParams.representsNonWordInfo()) { + // Skip PtNodes that represent non-word information. + continue; + } + const int wordId = isTerminal ? 
ptNodeParams.getHeadPos() : NOT_A_WORD_ID; + childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(), + wordId, ptNodeParams.getCodePointArrayView()); + } + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + } +} + +int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, + const int maxCodePointCount, int *const outCodePoints) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); + readingHelper.initWithPtNodePos(ptNodePos); + const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount( + maxCodePointCount, outCodePoints); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); + } + return codePointCount; +} + +int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, + const bool forceLowerCaseSearch) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), + wordCodePoints.size(), forceLowerCaseSearch); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in getWordId()."); + } + return getWordIdFromTerminalPtNodePos(ptNodePos); +} + +const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( + const WordIdArrayView prevWordIds, const int wordId, + MultiBigramMap *const multiBigramMap) const { + if (wordId == NOT_A_WORD_ID) { + return WordAttributes(); + } + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); + if (multiBigramMap) { + const int probability = 
multiBigramMap->getBigramProbability(this /* structurePolicy */, + prevWordIds, wordId, ptNodeParams.getProbability()); + return getWordAttributes(probability, ptNodeParams); + } + if (!prevWordIds.empty()) { + const int probability = getProbabilityOfWord(prevWordIds, wordId); + if (probability != NOT_A_PROBABILITY) { + return getWordAttributes(probability, ptNodeParams); + } + } + return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY), + ptNodeParams); +} + +const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability, + const PtNodeParams &ptNodeParams) const { + return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), + ptNodeParams.getProbability() == 0); +} + +int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, + const int bigramProbability) const { + // In the v4 format, bigramProbability is a conditional probability. + const int bigramConditionalProbability = bigramProbability; + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } + if (bigramConditionalProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } + return bigramConditionalProbability; +} + +int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, + const int wordId) const { + if (wordId == NOT_A_WORD_ID) { + return NOT_A_PROBABILITY; + } + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) { + return NOT_A_PROBABILITY; + } + if (prevWordIds.empty()) { + return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); + } + if (prevWordIds[0] == NOT_A_WORD_ID) { + return NOT_A_PROBABILITY; + } + const PtNodeParams prevWordPtNodeParams = + mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]); + if 
(prevWordPtNodeParams.isDeleted()) { + return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); + } + const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos( + prevWordPtNodeParams.getTerminalId()); + BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); + while (bigramsIt.hasNext()) { + bigramsIt.next(); + if (bigramsIt.getBigramPos() == ptNodePos + && bigramsIt.getProbability() != NOT_A_PROBABILITY) { + const int bigramConditionalProbability = getBigramConditionalProbability( + prevWordPtNodeParams.getProbability(), + prevWordPtNodeParams.representsBeginningOfSentence(), + bigramsIt.getProbability()); + return getProbability(ptNodeParams.getProbability(), bigramConditionalProbability); + } + } + return NOT_A_PROBABILITY; +} + +void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, + NgramListener *const listener) const { + if (prevWordIds.firstOrDefault(NOT_A_DICT_POS) == NOT_A_DICT_POS) { + return; + } + const PtNodeParams prevWordPtNodeParams = + mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]); + if (prevWordPtNodeParams.isDeleted()) { + return; + } + const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos( + prevWordPtNodeParams.getTerminalId()); + BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); + while (bigramsIt.hasNext()) { + bigramsIt.next(); + const int bigramConditionalProbability = getBigramConditionalProbability( + prevWordPtNodeParams.getProbability(), + prevWordPtNodeParams.representsBeginningOfSentence(), bigramsIt.getProbability()); + listener->onVisitEntry(bigramConditionalProbability, + getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos())); + } +} + +int Ver4PatriciaTriePolicy::getBigramConditionalProbability(const int prevWordUnigramProbability, + const bool isInBeginningOfSentenceContext, const int bigramProbability) const { + if (mHeaderPolicy->hasHistoricalInfoOfWords()) { 
+ if (isInBeginningOfSentenceContext) { + return bigramProbability; + } + // Calculate conditional probability. + return std::min(MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability, + MAX_PROBABILITY); + } else { + // bigramProbability is a conditional probability. + return bigramProbability; + } +} + +BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator( + const int wordId) const { + const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId)); + return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos); +} + +int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted()) { + return NOT_A_DICT_POS; + } + return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( + ptNodeParams.getTerminalId()); +} + +int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted()) { + return NOT_A_DICT_POS; + } + return mBuffers->getBigramDictContent()->getBigramListHeadPos( + ptNodeParams.getTerminalId()); +} + +bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints, + const UnigramProperty *const unigramProperty) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (wordCodePoints.size() > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert to the dictionary, length: %zd", + wordCodePoints.size()); + return false; + } + for (const auto &shortcut : unigramProperty->getShortcuts()) { + if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", + shortcut.getTargetCodePoints()->size()); + return false; + } + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + bool addedNewUnigram = false; + int codePointsToAdd[MAX_WORD_LENGTH]; + int codePointCountToAdd = wordCodePoints.size(); + memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd); + if (unigramProperty->representsBeginningOfSentence()) { + codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, + codePointCountToAdd, MAX_WORD_LENGTH); + } + if (codePointCountToAdd <= 0) { + return false; + } + const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd); + if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, + &addedNewUnigram)) { + if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { + mEntryCounters.incrementNgramCount(NgramType::Unigram); + } + if (unigramProperty->getShortcuts().size() > 0) { + // Add shortcut target. 
+ const int wordPos = getTerminalPtNodePosFromWordId( + getWordId(codePointArrayView, false /* forceLowerCaseSearch */)); + if (wordPos == NOT_A_DICT_POS) { + AKLOGE("Cannot find terminal PtNode position to add shortcut target."); + return false; + } + for (const auto &shortcut : unigramProperty->getShortcuts()) { + if (!mUpdatingHelper.addShortcutTarget(wordPos, + CodePointArrayView(*shortcut.getTargetCodePoints()), + shortcut.getProbability())) { + AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " + "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), + shortcut.getProbability()); + return false; + } + } + } + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); + return false; + } + const int ptNodePos = getTerminalPtNodePosFromWordId( + getWordId(wordCodePoints, false /* forceLowerCaseSearch */)); + if (ptNodePos == NOT_A_DICT_POS) { + return false; + } + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + return mNodeWriter.suppressUnigramEntry(&ptNodeParams); +} + +bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + const NgramContext *const ngramContext = ngramProperty->getNgramContext(); + if (!ngramContext->isValid()) { + AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary."); + return false; + } + if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert the ngram to the dictionary. " + "length: %zd", ngramProperty->getTargetCodePoints()->size()); + return false; + } + WordIdArray prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, + false /* tryLowerCaseSearch */); + if (prevWordIds.empty()) { + return false; + } + if (prevWordIds[0] == NOT_A_WORD_ID) { + if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { + const UnigramProperty beginningOfSentenceUnigramProperty( + true /* representsBeginningOfSentence */, true /* isNotAWord */, + false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo()); + if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), + &beginningOfSentenceUnigramProperty)) { + AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); + return false; + } + // Refresh word ids. 
+ ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); + } else { + return false; + } + } + const int wordPos = getTerminalPtNodePosFromWordId(getWordId( + CodePointArrayView(*ngramProperty->getTargetCodePoints()), + false /* forceLowerCaseSearch */)); + if (wordPos == NOT_A_DICT_POS) { + return false; + } + bool addedNewBigram = false; + const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); + if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos), + wordPos, ngramProperty, &addedNewBigram)) { + if (addedNewBigram) { + mEntryCounters.incrementNgramCount(NgramType::Bigram); + } + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (!ngramContext->isValid()) { + AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary."); + return false; + } + if (wordCodePoints.size() > MAX_WORD_LENGTH) { + AKLOGE("word is too long to remove n-gram entry form the dictionary. 
length: %zd", + wordCodePoints.size()); + } + WordIdArray prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, + false /* tryLowerCaseSerch */); + if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { + return false; + } + const int wordPos = getTerminalPtNodePosFromWordId(getWordId(wordCodePoints, + false /* forceLowerCaseSearch */)); + if (wordPos == NOT_A_DICT_POS) { + return false; + } + const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); + if (mUpdatingHelper.removeNgramEntry( + PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) { + mEntryCounters.decrementNgramCount(NgramType::Bigram); + return true; + } else { + return false; + } +} + + +bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( + const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints, + const bool isValidWord, const HistoricalInfo historicalInfo) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " + "dictionary."); + return false; + } + const int probability = isValidWord ? DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY; + const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, + false /* isNotAWord */, false /*isBlacklisted*/, probability, historicalInfo); + if (!addUnigramEntry(wordCodePoints, &unigramProperty)) { + AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext()."); + return false; + } + const int probabilityForNgram = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) + ? 
NOT_A_PROBABILITY : probability; + const NgramProperty ngramProperty(*ngramContext, wordCodePoints.toVector(), probabilityForNgram, + historicalInfo); + if (!addNgramEntry(&ngramProperty)) { + AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext()."); + return false; + } + return true; +} + +bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); + return false; + } + if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) { + AKLOGE("Cannot flush the dictionary to file."); + mIsCorrupted = true; + return false; + } + return true; +} + +bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return false; + } + if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { + AKLOGE("Cannot flush the dictionary to file with GC."); + mIsCorrupted = true; + return false; + } + return true; +} + +bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); + return false; + } + if (mBuffers->isNearSizeLimit()) { + // Additional buffer size is near the limit. + return true; + } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() + > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { + // Total extended region size of the trie exceeds the limit. + return true; + } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS + && mDictBuffer->getUsedAdditionalBufferSize() > 0) { + // Needs to reduce dictionary size. 
+ return true; + } else if (mHeaderPolicy->isDecayingDict()) { + return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), + mHeaderPolicy); + } + return false; +} + +void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, + char *const outResult, const int maxResultLength) { + const int compareLength = queryLength + 1 /* terminator */; + if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mEntryCounters.getNgramCount(NgramType::Unigram)); + } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram)); + } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy->isDecayingDict() ? + ForgettingCurveUtils::getEntryCountHardLimit( + mHeaderPolicy->getMaxNgramCounts().getNgramCount( + NgramType::Unigram)) : + static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); + } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy->isDecayingDict() ? 
+ ForgettingCurveUtils::getEntryCountHardLimit( + mHeaderPolicy->getMaxNgramCounts().getNgramCount( + NgramType::Bigram)) : + static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); + } +} + +const WordProperty Ver4PatriciaTriePolicy::getWordProperty( + const CodePointArrayView wordCodePoints) const { + const int ptNodePos = getTerminalPtNodePosFromWordId( + getWordId(wordCodePoints, false /* forceLowerCaseSearch */)); + if (ptNodePos == NOT_A_DICT_POS) { + AKLOGE("getWordProperty is called for invalid word."); + return WordProperty(); + } + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + const ProbabilityEntry probabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry( + ptNodeParams.getTerminalId()); + const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); + // Fetch bigram information. + std::vector ngrams; + const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); + if (bigramListPos != NOT_A_DICT_POS) { + int bigramWord1CodePoints[MAX_WORD_LENGTH]; + const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent(); + const TerminalPositionLookupTable *const terminalPositionLookupTable = + mBuffers->getTerminalPositionLookupTable(); + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const BigramEntry bigramEntry = + bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + const int word1TerminalId = bigramEntry.getTargetTerminalId(); + const int word1TerminalPtNodePos = + terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId); + if (word1TerminalPtNodePos == NOT_A_DICT_POS) { + continue; + } + const int codePointCount = getCodePointsAndReturnCodePointCount( + getWordIdFromTerminalPtNodePos(word1TerminalPtNodePos), MAX_WORD_LENGTH, + bigramWord1CodePoints); + const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); + const int 
rawBigramProbability = bigramEntry.hasHistoricalInfo() + ? ForgettingCurveUtils::decodeProbability( + bigramEntry.getHistoricalInfo(), mHeaderPolicy) + : bigramEntry.getProbability(); + const int probability = getBigramConditionalProbability(ptNodeParams.getProbability(), + ptNodeParams.representsBeginningOfSentence(), rawBigramProbability); + ngrams.emplace_back( + NgramContext(wordCodePoints.data(), wordCodePoints.size(), + ptNodeParams.representsBeginningOfSentence()), + CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(), + probability, *historicalInfo); + } + } + // Fetch shortcut information. + std::vector shortcuts; + int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); + if (shortcutPos != NOT_A_DICT_POS) { + int shortcutTarget[MAX_WORD_LENGTH]; + const ShortcutDictContent *const shortcutDictContent = + mBuffers->getShortcutDictContent(); + bool hasNext = true; + while (hasNext) { + int shortcutTargetLength = 0; + int shortcutProbability = NOT_A_PROBABILITY; + shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, + &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); + shortcuts.emplace_back( + CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(), + shortcutProbability); + } + } + const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), + ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(), + ptNodeParams.getProbability(), *historicalInfo, std::move(shortcuts)); + return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); +} + +int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount) { + *outCodePointCount = 0; + if (token == 0) { + mTerminalPtNodePositionsForIteratingWords.clear(); + DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( + &mTerminalPtNodePositionsForIteratingWords); + DynamicPtReadingHelper 
readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); + } + const int terminalPtNodePositionsVectorSize = + static_cast(mTerminalPtNodePositionsForIteratingWords.size()); + if (token < 0 || token >= terminalPtNodePositionsVectorSize) { + AKLOGE("Given token %d is invalid.", token); + return 0; + } + const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; + *outCodePointCount = getCodePointsAndReturnCodePointCount( + getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints); + const int nextToken = token + 1; + if (nextToken >= terminalPtNodePositionsVectorSize) { + // All words have been iterated. + mTerminalPtNodePositionsForIteratingWords.clear(); + return 0; + } + return nextToken; +} + +int Ver4PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const { + return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos; +} + +int Ver4PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const { + return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h new file mode 100644 index 000000000..bce5f6bea --- /dev/null +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
+ * Do not edit this file other than updating policy's interface.
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_patricia_trie_policy.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H
+#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
+#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
+#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
+#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
+#include "dictionary/utils/binary_dictionary_bigrams_iterator.h"
+#include "dictionary/utils/binary_dictionary_shortcut_iterator.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/entry_counters.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+} // namespace v402
+} // namespace backward
+class DicNode;
+namespace backward {
+namespace v402 {
+} // namespace v402
+} // namespace backward
+class DicNodeVector;
+namespace backward {
+namespace v402 {
+
+// Word id = Position of a PtNode that represents the word.
+// Max supported n-gram is bigram.
+//
+// Structure policy for the backward-compatible (v402) version 4 dictionary format. It owns the
+// dictionary buffers and wires together the reader, writer and helper objects that operate on
+// them. Only the interface is declared here; member function bodies other than the trivial
+// inline ones live in the corresponding .cpp file.
+class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
+ public:
+    // Moves `buffers` into mBuffers and builds every helper from it.
+    //
+    // Members are initialized in declaration order, not in the order written below, so this
+    // initializer list relies on the declaration order in the private section: mBuffers first,
+    // then mHeaderPolicy and mDictBuffer, then the policies and readers whose addresses are
+    // handed to mNodeWriter and mUpdatingHelper. Keep both orders in sync.
+    Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
+            : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()),
+              mDictBuffer(mBuffers->getWritableTrieBuffer()),
+              mBigramPolicy(mBuffers->getMutableBigramDictContent(),
+                      mBuffers->getTerminalPositionLookupTable(), mHeaderPolicy),
+              mShortcutPolicy(mBuffers->getMutableShortcutDictContent(),
+                      mBuffers->getTerminalPositionLookupTable()),
+              mNodeReader(mDictBuffer, mBuffers->getProbabilityDictContent(), mHeaderPolicy),
+              mPtNodeArrayReader(mDictBuffer),
+              mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader,
+                      &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy),
+              mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
+              mWritingHelper(mBuffers.get()),
+              mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()),
+              mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {};
+
+    // The root PtNode array is always at position 0 in this format.
+    virtual int getRootPosition() const {
+        return 0;
+    }
+
+    void createAndGetAllChildDicNodes(const DicNode *const dicNode,
+            DicNodeVector *const childDicNodes) const;
+
+    int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
+            int *const outCodePoints) const;
+
+    int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
+
+    const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
+            const int wordId, MultiBigramMap *const multiBigramMap) const;
+
+    int getProbability(const int unigramProbability, const int bigramProbability) const;
+
+    int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
+
+    void iterateNgramEntries(const WordIdArrayView prevWordIds,
+            NgramListener *const listener) const;
+
+    BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
+
+    // Returns the header policy obtained from the buffers at construction time.
+    const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
+        return mHeaderPolicy;
+    }
+
+    bool addUnigramEntry(const CodePointArrayView wordCodePoints,
+            const UnigramProperty *const unigramProperty);
+
+    bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
+
+    bool addNgramEntry(const NgramProperty *const ngramProperty);
+
+    bool removeNgramEntry(const NgramContext *const ngramContext,
+            const CodePointArrayView wordCodePoints);
+
+    bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+            const CodePointArrayView wordCodePoints, const bool isValidWord,
+            const HistoricalInfo historicalInfo);
+
+    bool flush(const char *const filePath);
+
+    bool flushWithGC(const char *const filePath);
+
+    bool needsToRunGC(const bool mindsBlockByGC) const;
+
+    void getProperty(const char *const query, const int queryLength, char *const outResult,
+            const int maxResultLength);
+
+    const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
+
+    int getNextWordAndNextToken(const int token, int *const outCodePoints,
+            int *const outCodePointCount);
+
+    // Reports the corruption flag; see mIsCorrupted.
+    bool isCorrupted() const {
+        return mIsCorrupted;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy);
+
+    // Query strings; presumably matched against the `query` argument of getProperty() —
+    // confirm in the .cpp file.
+    static const char *const UNIGRAM_COUNT_QUERY;
+    static const char *const BIGRAM_COUNT_QUERY;
+    static const char *const MAX_UNIGRAM_COUNT_QUERY;
+    static const char *const MAX_BIGRAM_COUNT_QUERY;
+    // When the dictionary size is near the maximum size, we have to refuse dynamic operations to
+    // prevent the dictionary from overflowing.
+    static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
+    static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
+    static const int DUMMY_PROBABILITY_FOR_VALID_WORDS;
+
+    // NOTE: the constructor's initializer list depends on the order of the declarations below.
+    const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers;
+    const HeaderPolicy *const mHeaderPolicy;
+    BufferWithExtendableBuffer *const mDictBuffer;
+    Ver4BigramListPolicy mBigramPolicy;
+    Ver4ShortcutListPolicy mShortcutPolicy;
+    Ver4PatriciaTrieNodeReader mNodeReader;
+    Ver4PtNodeArrayReader mPtNodeArrayReader;
+    Ver4PatriciaTrieNodeWriter mNodeWriter;
+    DynamicPtUpdatingHelper mUpdatingHelper;
+    Ver4PatriciaTrieWritingHelper mWritingHelper;
+    MutableEntryCounters mEntryCounters;
+    std::vector<int> mTerminalPtNodePositionsForIteratingWords;
+    // Declared mutable so that it can be updated from const member functions.
+    mutable bool mIsCorrupted;
+
+    int getBigramsPositionOfPtNode(const int ptNodePos) const;
+    int getShortcutPositionOfPtNode(const int ptNodePos) const;
+    int getWordIdFromTerminalPtNodePos(const int ptNodePos) const;
+    int getTerminalPtNodePosFromWordId(const int wordId) const;
+    const WordAttributes getWordAttributes(const int probability,
+            const PtNodeParams &ptNodeParams) const;
+    int getBigramConditionalProbability(const int prevWordUnigramProbability,
+            const bool isInBeginningOfSentenceContext, const int bigramProbability) const;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif // LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp
new file mode 100644
index 000000000..b8a4cf847
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h"
+
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Reads a terminal id from `buffer` at `*pos` by delegating to
+// ByteArrayUtils::readUint32AndAdvancePosition() and returns the value as an int.
+// `pos` is forwarded unchanged to that helper; going by its name it moves `*pos` past the
+// bytes it consumed (see byte_array_utils.h to confirm).
+/* static */ int Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(
+        const uint8_t *const buffer, int *pos) {
+    return ByteArrayUtils::readUint32AndAdvancePosition(buffer, pos);
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h
new file mode 100644
index 000000000..c3e736bdc
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_patricia_trie_reading_utils.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H
+#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+} // namespace v402
+} // namespace backward
+class BufferWithExtendableBuffer;
+namespace backward {
+namespace v402 {
+
+// Static-only helper for reading version 4 PtNode fields from a raw byte buffer.
+// Instances cannot be created (see DISALLOW_IMPLICIT_CONSTRUCTORS below).
+class Ver4PatriciaTrieReadingUtils {
+ public:
+    // Returns the terminal id read from `buffer` at `*pos`. `pos` is an in/out argument that is
+    // handed to the underlying byte reader.
+    static int getTerminalIdAndAdvancePosition(const uint8_t *const buffer,
+            int *const pos);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieReadingUtils);
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp
new file mode 100644
index 000000000..c0af9eae6
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!!
DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
+
+#include <cstring>
+#include <queue>
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
+#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/file_utils.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Writes the current buffers to `dictDirPath` without garbage collection.
+// The header is regenerated with `entryCounts` and with an extended region size equal to the
+// header's current value plus the used size of the trie's additional buffer. The last decayed
+// time is not updated. Returns false if the header cannot be written or the flush fails.
+bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
+        const EntryCounts &entryCounts) const {
+    const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
+    BufferWithExtendableBuffer headerBuffer(
+            BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+    const int extendedRegionSize = headerPolicy->getExtendedRegionSize() +
+            mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
+    if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
+            entryCounts, extendedRegionSize, &headerBuffer)) {
+        AKLOGE("Cannot write header structure to buffer. "
+                "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
+                "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram),
+                entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize);
+        return false;
+    }
+    return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
+}
+
+// Runs garbage collection into a freshly created set of buffers and writes those to
+// `dictDirPath`. The header is regenerated with the unigram/bigram counts reported by runGC(),
+// an extended region size of 0, and an updated last decayed time.
+// Returns false if GC, header generation or the flush fails.
+bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
+        const char *const dictDirPath) {
+    const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
+    Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
+            Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
+                    Ver4DictConstants::MAX_DICTIONARY_SIZE));
+    int unigramCount = 0;
+    int bigramCount = 0;
+    if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
+        return false;
+    }
+    BufferWithExtendableBuffer headerBuffer(
+            BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+    MutableEntryCounters entryCounters;
+    entryCounters.setNgramCount(NgramType::Unigram, unigramCount);
+    entryCounters.setNgramCount(NgramType::Bigram, bigramCount);
+    if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
+            entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) {
+        return false;
+    }
+    return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
+}
+
+// Garbage-collects the dictionary held in mBuffers into `buffersToWrite`.
+//
+// The steps, in order, are:
+//   1. Traverse the current trie to update unigram probabilities and mark useless PtNodes as
+//      deleted; for decaying dictionaries, truncate unigrams above the header's maximum.
+//   2. Traverse again to update bigram probabilities; for decaying dictionaries, truncate
+//      bigrams above the header's maximum.
+//   3. Copy valid PtNodes into `buffersToWrite`, recording old -> new positions.
+//   4. Re-assign terminal ids and run GC for the probability, bigram and shortcut contents.
+//   5. Traverse the new trie to rewrite position fields, then flags and terminal ids.
+//
+// `outBigramCount` is filled in by the bigram content GC and `outUnigramCount` by the position
+// update traversal; the local counts from steps 1 and 2 are only used for the limit checks.
+// Returns false as soon as any step fails. mBuffers is modified by steps 1 and 2.
+bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
+        const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
+        int *const outUnigramCount, int *const outBigramCount) {
+    Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
+            mBuffers->getProbabilityDictContent(), headerPolicy);
+    Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
+    Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
+            mBuffers->getTerminalPositionLookupTable(), headerPolicy);
+    Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
+            mBuffers->getTerminalPositionLookupTable());
+    Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
+            mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
+            &shortcutPolicy);
+
+    DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
+    readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+    DynamicPtGcEventListeners
+            ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+            traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
+                    &ptNodeWriter);
+    if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+            &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
+        return false;
+    }
+    const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+            .getValidUnigramCount();
+    const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram);
+    if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
+        if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
+            AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
+                    maxUnigramCount);
+            return false;
+        }
+    }
+
+    readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+    DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
+            traversePolicyToUpdateBigramProbability(&ptNodeWriter);
+    if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+            &traversePolicyToUpdateBigramProbability)) {
+        return false;
+    }
+    const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
+    const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram);
+    if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
+        if (!truncateBigrams(maxBigramCount)) {
+            AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
+            return false;
+        }
+    }
+
+    // Mapping from positions in mBuffer to positions in bufferToWrite.
+    PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
+    readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+    // This writer targets the new buffers but still reads through the readers and policies of
+    // the old dictionary, because the nodes being copied live in mBuffers.
+    Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
+            buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
+            &shortcutPolicy);
+    DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+            traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
+                    buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
+    if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+            &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
+        return false;
+    }
+
+    // Create policy instances for the GCed dictionary.
+    Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
+            buffersToWrite->getProbabilityDictContent(), headerPolicy);
+    Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
+    Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
+            buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
+    Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
+            buffersToWrite->getTerminalPositionLookupTable());
+    Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
+            buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
+            &newShortcutPolicy);
+    // Re-assign terminal IDs for valid terminal PtNodes.
+    TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
+    if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
+            &terminalIdMap)) {
+        return false;
+    }
+    // Run GC for probability dict content.
+    if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap,
+            mBuffers->getProbabilityDictContent())) {
+        return false;
+    }
+    // Run GC for bigram dict content.
+    if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap,
+            mBuffers->getBigramDictContent(), outBigramCount)) {
+        return false;
+    }
+    // Run GC for shortcut dict content.
+    if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
+            mBuffers->getShortcutDictContent())) {
+        return false;
+    }
+    DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
+    newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+    DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
+            traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
+    if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+            &traversePolicyToUpdateAllPositionFields)) {
+        return false;
+    }
+    newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+    TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+            traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
+    if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+            &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
+        return false;
+    }
+    *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
+    return true;
+}
+
+// Reduces the number of unigrams to at most `maxUnigramCount`.
+// Every terminal id with a valid PtNode position is pushed to a priority queue ordered by
+// DictProbabilityComparator; entries are then popped until the queue holds `maxUnigramCount`
+// elements and each popped PtNode is marked as "will become non-terminal". Popped PtNodes for
+// which representsNonWordInfo() is true are skipped without being marked.
+// Returns false if a PtNode cannot be marked.
+bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
+        const Ver4PatriciaTrieNodeReader *const ptNodeReader,
+        Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
+    const TerminalPositionLookupTable *const terminalPosLookupTable =
+            mBuffers->getTerminalPositionLookupTable();
+    const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
+    std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
+            priorityQueue;
+    for (int i = 0; i < nextTerminalId; ++i) {
+        const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i);
+        if (terminalPos == NOT_A_DICT_POS) {
+            continue;
+        }
+        const ProbabilityEntry probabilityEntry =
+                mBuffers->getProbabilityDictContent()->getProbabilityEntry(i);
+        // Entries with historical info carry an encoded probability that has to be decoded.
+        const int probability = probabilityEntry.hasHistoricalInfo() ?
+                ForgettingCurveUtils::decodeProbability(
+                        probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
+                probabilityEntry.getProbability();
+        priorityQueue.push(DictProbability(terminalPos, probability,
+                probabilityEntry.getHistoricalInfo()->getTimestamp()));
+    }
+
+    // Delete unigrams.
+    while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) {
+        const int ptNodePos = priorityQueue.top().getDictPos();
+        priorityQueue.pop();
+        const PtNodeParams ptNodeParams =
+                ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+        if (ptNodeParams.representsNonWordInfo()) {
+            continue;
+        }
+        if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) {
+            AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos);
+            return false;
+        }
+    }
+    return true;
+}
+
+// Reduces the number of valid bigram entries to at most `maxBigramCount`.
+// Walks the bigram list of every terminal id, pushes each valid entry (keyed by its position in
+// the bigram content) to a priority queue ordered by DictProbabilityComparator, then overwrites
+// the entries at the top of the queue with their invalidated form until the queue holds
+// `maxBigramCount` elements. Returns false if an entry cannot be written.
+bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
+    const TerminalPositionLookupTable *const terminalPosLookupTable =
+            mBuffers->getTerminalPositionLookupTable();
+    const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
+    std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
+            priorityQueue;
+    BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
+    for (int i = 0; i < nextTerminalId; ++i) {
+        const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
+        if (bigramListPos == NOT_A_DICT_POS) {
+            continue;
+        }
+        bool hasNext = true;
+        int readingPos = bigramListPos;
+        while (hasNext) {
+            // Remember where this entry starts; reading it advances readingPos.
+            const int entryPos = readingPos;
+            const BigramEntry bigramEntry =
+                    bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+            hasNext = bigramEntry.hasNext();
+            if (!bigramEntry.isValid()) {
+                continue;
+            }
+            const int probability = bigramEntry.hasHistoricalInfo() ?
+                    ForgettingCurveUtils::decodeProbability(
+                            bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
+                    bigramEntry.getProbability();
+            priorityQueue.push(DictProbability(entryPos, probability,
+                    bigramEntry.getHistoricalInfo()->getTimestamp()));
+        }
+    }
+
+    // Delete bigrams.
+    while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
+        const int entryPos = priorityQueue.top().getDictPos();
+        const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
+        const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
+        if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
+            AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
+            return false;
+        }
+        priorityQueue.pop();
+    }
+    return true;
+}
+
+// Traversal callback used in the last GC step. Non-terminal PtNodes are accepted unchanged.
+// For a terminal PtNode the new terminal id is looked up in mTerminalIdMap (a missing id fails
+// the traversal), written to the PtNode, and the "has bigrams / has shortcut targets" flags are
+// refreshed; the result of that last update is returned.
+bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+        ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+    if (!ptNodeParams->isTerminal()) {
+        return true;
+    }
+    TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
+            mTerminalIdMap->find(ptNodeParams->getTerminalId());
+    if (it == mTerminalIdMap->end()) {
+        AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
+                ptNodeParams->getTerminalId(), mTerminalIdMap->size());
+        return false;
+    }
+    // NOTE(review): a failed terminal id update is only logged and the traversal goes on,
+    // unlike the other failure paths in this file. Logic left as generated; confirm whether
+    // this should abort with `return false`.
+    if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
+        AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
+    }
+    return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams);
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h
new file mode 100644
index 000000000..f2b873826
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H
+#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/utils/entry_counters.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+} // namespace v402
+} // namespace backward
+class HeaderPolicy;
+namespace backward {
+namespace v402 {
+class Ver4DictBuffers;
+class Ver4PatriciaTrieNodeReader;
+class Ver4PatriciaTrieNodeWriter;
+
+// Writes a version 4 (v402) dictionary to files, with or without garbage collection.
+// The helper stores the Ver4DictBuffers pointer it is given and does not delete it.
+class Ver4PatriciaTrieWritingHelper {
+ public:
+    Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers)
+            : mBuffers(buffers) {}
+
+    // Writes the buffers as they are, with a header regenerated from `entryCounts`.
+    bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const;
+
+    // This method cannot be const because the original dictionary buffer will be updated to detect
+    // useless PtNodes during GC.
+    bool writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const dictDirPath);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper);
+
+    // Traversal listener that rewrites terminal ids (using `terminalIdMap`) and PtNode flags.
+    // Only onVisitingPtNode() does work; the other callbacks accept every event.
+    class TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+            : public DynamicPtReadingHelper::TraversingEventListener {
+     public:
+        TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(
+                Ver4PatriciaTrieNodeWriter *const ptNodeWriter,
+                const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap)
+                : mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {}
+
+        bool onAscend() { return true; }
+
+        bool onDescend(const int ptNodeArrayPos) { return true; }
+
+        bool onReadingPtNodeArrayTail() { return true; }
+
+        bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds);
+
+        Ver4PatriciaTrieNodeWriter *const mPtNodeWriter;
+        const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap;
+    };
+
+    // For truncateUnigrams() and truncateBigrams().
+    // Plain value holder: a dictionary position together with the probability and timestamp
+    // used to rank it.
+    class DictProbability {
+     public:
+        DictProbability(const int dictPos, const int probability, const int timestamp)
+                : mDictPos(dictPos), mProbability(probability), mTimestamp(timestamp) {}
+
+        int getDictPos() const {
+            return mDictPos;
+        }
+
+        int getProbability() const {
+            return mProbability;
+        }
+
+        int getTimestamp() const {
+            return mTimestamp;
+        }
+
+     private:
+        DISALLOW_DEFAULT_CONSTRUCTOR(DictProbability);
+
+        int mDictPos;
+        int mProbability;
+        int mTimestamp;
+    };
+
+    // For truncateUnigrams() and truncateBigrams().
+    // Ordering for std::priority_queue, whose top() is the greatest element under this
+    // relation. With the comparisons below, top() is the entry with the lowest probability;
+    // ties go to the larger timestamp, then to the smaller dictionary position.
+    // NOTE(review): on equal probability the entry with the larger timestamp surfaces (and is
+    // truncated) first — confirm this tie-break direction is intended.
+    class DictProbabilityComparator {
+     public:
+        bool operator()(const DictProbability &left, const DictProbability &right) {
+            if (left.getProbability() != right.getProbability()) {
+                return left.getProbability() > right.getProbability();
+            }
+            if (left.getTimestamp() != right.getTimestamp()) {
+                return left.getTimestamp() < right.getTimestamp();
+            }
+            return left.getDictPos() > right.getDictPos();
+        }
+
+     private:
+        DISALLOW_ASSIGNMENT_OPERATOR(DictProbabilityComparator);
+    };
+
+    // Runs GC from mBuffers into `buffersToWrite` and reports the resulting entry counts.
+    bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy,
+            Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
+            int *const outBigramCount);
+
+    // Reduces the number of unigrams to at most `maxUnigramCount`.
+    bool truncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader,
+            Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount);
+
+    // Reduces the number of valid bigram entries to at most `maxBigramCount`.
+    bool truncateBigrams(const int maxBigramCount);
+
+    Ver4DictBuffers *const mBuffers;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+
+#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H */
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp
new file mode 100644
index 000000000..d27d70816
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_pt_node_array_reader.cpp
+ */
+
+#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
+
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Reads the header of the PtNode array at `ptNodeArrayPos`.
+// On success stores the number of PtNodes in `*outPtNodeCount` and the position of the first
+// PtNode in `*outFirstPtNodePos`, then returns true. Returns false, leaving the outputs
+// untouched, when the position is outside [0, tail position) or the stored count is negative.
+bool Ver4PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
+        int *const outPtNodeCount, int *const outFirstPtNodePos) const {
+    if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mBuffer->getTailPosition()) {
+        // Reading invalid position because of a bug or a broken dictionary.
+        AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d",
+                ptNodeArrayPos, mBuffer->getTailPosition());
+        ASSERT(false);
+        return false;
+    }
+    const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodeArrayPos);
+    const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+    int readingPos = ptNodeArrayPos;
+    // Positions in the additional buffer are offset by the original buffer size; convert to an
+    // index local to the selected buffer before reading and back again afterwards.
+    if (usesAdditionalBuffer) {
+        readingPos -= mBuffer->getOriginalBufferSize();
+    }
+    const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
+            dictBuf, &readingPos);
+    if (usesAdditionalBuffer) {
+        readingPos += mBuffer->getOriginalBufferSize();
+    }
+    if (ptNodeCountInArray < 0) {
+        AKLOGE("Invalid PtNode count in an array: %d.", ptNodeCountInArray);
+        return false;
+    }
+    *outPtNodeCount = ptNodeCountInArray;
+    *outFirstPtNodePos = readingPos;
+    return true;
+}
+
+// Reads the forward link stored at `forwordLinkPos`.
+// On success stores the position of the next PtNode array in `*outNextPtNodeArrayPos`, or
+// NOT_A_DICT_POS when the stored offset is not a valid forward link, and returns true.
+// Returns false when `forwordLinkPos` is outside [0, tail position).
+bool Ver4PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos,
+        int *const outNextPtNodeArrayPos) const {
+    if (forwordLinkPos < 0 || forwordLinkPos >= mBuffer->getTailPosition()) {
+        // Reading invalid position because of a bug or a broken dictionary.
+        AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d",
+                forwordLinkPos, mBuffer->getTailPosition());
+        ASSERT(false);
+        return false;
+    }
+    const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(forwordLinkPos);
+    const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+    int readingPos = forwordLinkPos;
+    if (usesAdditionalBuffer) {
+        readingPos -= mBuffer->getOriginalBufferSize();
+    }
+    const int nextPtNodeArrayOffset =
+            DynamicPtReadingUtils::getForwardLinkPosition(dictBuf, readingPos);
+    if (DynamicPtReadingUtils::isValidForwardLinkPosition(nextPtNodeArrayOffset)) {
+        // The stored value is an offset relative to the position of the link itself.
+        *outNextPtNodeArrayPos = forwordLinkPos + nextPtNodeArrayOffset;
+    } else {
+        *outNextPtNodeArrayPos = NOT_A_DICT_POS;
+    }
+    return true;
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h
new file mode 100644
index 000000000..0039bf8fc
--- /dev/null
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/ver4_pt_node_array_reader.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H
+#define LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/pt_node_array_reader.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+} // namespace v402
+} // namespace backward
+class BufferWithExtendableBuffer;
+namespace backward {
+namespace v402 {
+
+// PtNodeArrayReader implementation for the backward-compatible v402 format. It reads PtNode
+// array headers and forward links from a BufferWithExtendableBuffer that it does not own.
+class Ver4PtNodeArrayReader : public PtNodeArrayReader {
+ public:
+    Ver4PtNodeArrayReader(const BufferWithExtendableBuffer *const buffer) : mBuffer(buffer) {};
+
+    // Outputs the PtNode count and the position of the first PtNode of the array at
+    // ptNodeArrayPos. Returns false when the position or the decoded count is invalid.
+    virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
+            int *const outPtNodeCount, int *const outFirstPtNodePos) const;
+    // Outputs the position of the next PtNode array linked from forwordLinkPos, or
+    // NOT_A_DICT_POS when there is no valid link. Returns false when the position is invalid.
+    virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos,
+            int *const outNextPtNodeArrayPos) const;
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(Ver4PtNodeArrayReader);
+
+    const BufferWithExtendableBuffer *const mBuffer;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H */
diff --git a/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
new file mode 100644
index 000000000..4470e8568
--- /dev/null
+++ b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/dictionary_structure_with_buffer_policy_factory.h"
+
+#include <climits>
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/v2/patricia_trie_policy.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_policy.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/file_utils.h"
+#include "dictionary/utils/format_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+// Creates a policy for a dictionary that already exists on disk.
+// A directory path is opened as a directory dictionary; any other path is opened as a
+// single-file dictionary, which cannot be updatable. Returns nullptr on failure.
+/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+        DictionaryStructureWithBufferPolicyFactory::newPolicyForExistingDictFile(
+                const char *const path, const int bufOffset, const int size,
+                const bool isUpdatable) {
+    if (FileUtils::existsDir(path)) {
+        // Given path represents a directory.
+        return newPolicyForDirectoryDict(path, isUpdatable);
+    } else {
+        if (isUpdatable) {
+            AKLOGE("One file dictionaries don't support updating. path: %s", path);
+            ASSERT(false);
+            return nullptr;
+        }
+        return newPolicyForFileDict(path, bufOffset, size);
+    }
+}
+
+// Creates a policy for a new, empty on-memory dictionary of the requested format version.
+// Only the version 4 family is handled; any other version is logged and yields nullptr.
+/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+        DictionaryStructureWithBufferPolicyFactory:: newPolicyForOnMemoryDict(
+                const int formatVersion, const std::vector<int> &locale,
+                const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) {
+    FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion);
+    switch (dictFormatVersion) {
+        case FormatUtils::VERSION_402: {
+            // v402 is served by the backward-compatibility implementation.
+            return newPolicyForOnMemoryV4Dict<backward::v402::Ver4DictConstants,
+                    backward::v402::Ver4DictBuffers,
+                    backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr,
+                    backward::v402::Ver4PatriciaTriePolicy>(
+                            dictFormatVersion, locale, attributeMap);
+        }
+        case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+        case FormatUtils::VERSION_403: {
+            return newPolicyForOnMemoryV4Dict<Ver4DictConstants, Ver4DictBuffers,
+                    Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>(
+                            dictFormatVersion, locale, attributeMap);
+        }
+        default:
+            AKLOGE("DICT: dictionary format %d is not supported for on memory dictionary",
+                    formatVersion);
+            break;
+    }
+    return nullptr;
+}
+
+// Builds empty version 4 dictionary buffers from a freshly created header, writes an empty
+// dictionary at root position 0 and wraps the buffers in StructurePolicy.
+// Returns nullptr when the empty dictionary cannot be written.
+template<class DictConstants, class DictBuffers, class DictBuffersPtr, class StructurePolicy>
+/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+        DictionaryStructureWithBufferPolicyFactory::newPolicyForOnMemoryV4Dict(
+                const FormatUtils::FORMAT_VERSION formatVersion,
+                const std::vector<int> &locale,
+                const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) {
+    HeaderPolicy headerPolicy(formatVersion, locale, attributeMap);
+    DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy,
+            DictConstants::MAX_DICT_EXTENDED_REGION_SIZE);
+    if (!DynamicPtWritingUtils::writeEmptyDictionary(
+            dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) {
+        AKLOGE("Empty ver4 dictionary structure cannot be created on memory.");
+        return nullptr;
+    }
+    return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
+            new StructurePolicy(std::move(dictBuffers)));
+}
+
+// Opens the header file inside the dictionary directory, detects the format version from it
+// and dispatches to the matching version 4 implementation.
+// Version 2xx and unknown formats are logged and yield nullptr.
+/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+        DictionaryStructureWithBufferPolicyFactory::newPolicyForDirectoryDict(
+                const char *const path, const bool isUpdatable) {
+    const int headerFilePathBufSize = PATH_MAX + 1 /* terminator */;
+    char headerFilePath[headerFilePathBufSize];
+    getHeaderFilePathInDictDir(path, headerFilePathBufSize, headerFilePath);
+    // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of
+    // MmappedBufferPtr if the instance has the responsibility.
+    MmappedBuffer::MmappedBufferPtr mmappedBuffer =
+            MmappedBuffer::openBuffer(headerFilePath, isUpdatable);
+    if (!mmappedBuffer) {
+        return nullptr;
+    }
+    const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::detectFormatVersion(
+            mmappedBuffer->getReadOnlyByteArrayView());
+    switch (formatVersion) {
+        case FormatUtils::VERSION_2:
+        case FormatUtils::VERSION_201:
+        case FormatUtils::VERSION_202:
+            AKLOGE("Given path is a directory but the format is version 2xx. path: %s", path);
+            break;
+        case FormatUtils::VERSION_402: {
+            return newPolicyForV4Dict<backward::v402::Ver4DictConstants,
+                    backward::v402::Ver4DictBuffers,
+                    backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr,
+                    backward::v402::Ver4PatriciaTriePolicy>(
+                            headerFilePath, formatVersion, std::move(mmappedBuffer));
+        }
+        case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+        case FormatUtils::VERSION_403: {
+            return newPolicyForV4Dict<Ver4DictConstants, Ver4DictBuffers,
+                    Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>(
+                            headerFilePath, formatVersion, std::move(mmappedBuffer));
+        }
+        default:
+            AKLOGE("DICT: dictionary format is unknown, bad magic number. path: %s", path);
+            break;
+    }
+    ASSERT(false);
+    return nullptr;
+}
+
+// Derives the dictionary path by stripping the header file extension from headerFilePath,
+// opens the version 4 buffers (taking over mmappedBuffer) and wraps them in StructurePolicy.
+// Returns nullptr when the path has no header extension or the buffers are not valid.
+template<class DictConstants, class DictBuffers, class DictBuffersPtr, class StructurePolicy>
+/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+        DictionaryStructureWithBufferPolicyFactory::newPolicyForV4Dict(
+                const char *const headerFilePath, const FormatUtils::FORMAT_VERSION formatVersion,
+                MmappedBuffer::MmappedBufferPtr &&mmappedBuffer) {
+    const int dictDirPathBufSize = strlen(headerFilePath) + 1 /* terminator */;
+    char dictPath[dictDirPathBufSize];
+    if (!FileUtils::getFilePathWithoutSuffix(headerFilePath,
+            DictConstants::HEADER_FILE_EXTENSION, dictDirPathBufSize, dictPath)) {
+        AKLOGE("Dictionary file name is not valid as a ver4 dictionary. header path: %s",
+                headerFilePath);
+        ASSERT(false);
+        return nullptr;
+    }
+    DictBuffersPtr dictBuffers =
+            DictBuffers::openVer4DictBuffers(dictPath, std::move(mmappedBuffer), formatVersion);
+    if (!dictBuffers || !dictBuffers->isValid()) {
+        AKLOGE("DICT: The dictionary doesn't satisfy ver4 format requirements. path: %s",
+                dictPath);
+        ASSERT(false);
+        return nullptr;
+    }
+    return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
+            new StructurePolicy(std::move(dictBuffers)));
+}
+
+// Opens a single-file dictionary read-only. Only format version 202 is accepted here; the
+// older version 2 formats, version 4 formats and unknown formats are logged and yield nullptr.
+/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+        DictionaryStructureWithBufferPolicyFactory::newPolicyForFileDict(
+                const char *const path, const int bufOffset, const int size) {
+    // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of
+    // MmappedBufferPtr if the instance has the responsibility.
+    MmappedBuffer::MmappedBufferPtr mmappedBuffer(
+            MmappedBuffer::openBuffer(path, bufOffset, size, false /* isUpdatable */));
+    if (!mmappedBuffer) {
+        return nullptr;
+    }
+    switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) {
+        case FormatUtils::VERSION_2:
+        case FormatUtils::VERSION_201:
+            AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
+            break;
+        case FormatUtils::VERSION_202:
+            return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
+                    new PatriciaTriePolicy(std::move(mmappedBuffer)));
+        case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+        case FormatUtils::VERSION_402:
+        case FormatUtils::VERSION_403:
+            AKLOGE("Given path is a file but the format is version 4. path: %s", path);
+            break;
+        default:
+            AKLOGE("DICT: dictionary format is unknown, bad magic number. path: %s", path);
+            break;
+    }
+    ASSERT(false);
+    return nullptr;
+}
+
+// Writes "<dictDirPath>/<basename of dictDirPath><header extension>" into outHeaderFilePath,
+// truncated by snprintf to outHeaderFileBufSize bytes.
+/* static */ void DictionaryStructureWithBufferPolicyFactory::getHeaderFilePathInDictDir(
+        const char *const dictDirPath, const int outHeaderFileBufSize,
+        char *const outHeaderFilePath) {
+    const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */;
+    char dictName[dictNameBufSize];
+    FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName);
+    snprintf(outHeaderFilePath, outHeaderFileBufSize, "%s/%s%s", dictDirPath,
+            dictName, Ver4DictConstants::HEADER_FILE_EXTENSION);
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h
new file mode 100644
index 000000000..b0c04c0b1
--- /dev/null
+++ b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H
+#define LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/utils/format_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
+
+namespace latinime {
+
+// Static-only factory that creates the DictionaryStructureWithBufferPolicy implementation
+// matching a dictionary's format version.
+class DictionaryStructureWithBufferPolicyFactory {
+ public:
+    // Creates a policy for a dictionary stored at path (a directory or a single file).
+    static DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+            newPolicyForExistingDictFile(const char *const path, const int bufOffset,
+                    const int size, const bool isUpdatable);
+
+    // Creates a policy for a new, empty dictionary that lives only in memory.
+    static DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+            newPolicyForOnMemoryDict(const int formatVersion, const std::vector<int> &locale,
+                    const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryStructureWithBufferPolicyFactory);
+
+    template<class DictConstants, class DictBuffers, class DictBuffersPtr, class StructurePolicy>
+    static DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+            newPolicyForOnMemoryV4Dict(const FormatUtils::FORMAT_VERSION formatVersion,
+                    const std::vector<int> &locale,
+                    const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap);
+
+    static DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+            newPolicyForDirectoryDict(const char *const path, const bool isUpdatable);
+
+    template<class DictConstants, class DictBuffers, class DictBuffersPtr, class StructurePolicy>
+    static DictionaryStructureWithBufferPolicy::StructurePolicyPtr newPolicyForV4Dict(
+            const char *const headerFilePath, const FormatUtils::FORMAT_VERSION formatVersion,
+            MmappedBuffer::MmappedBufferPtr &&mmappedBuffer);
+
+    static DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+            newPolicyForFileDict(const char *const path, const int bufOffset, const int size);
+
+    // Builds the path of the header file contained in the dictionary directory dirPath.
+    static void getHeaderFilePathInDictDir(const char *const dirPath,
+            const int outHeaderFileBufSize, char *const outHeaderFilePath);
+};
+} // namespace latinime
+#endif //
LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H
diff --git a/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp
new file mode 100644
index 000000000..64f9b6663
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
+
+#include "dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+// Bits 0x30 of the flags byte select how many bytes encode the target address offset.
+const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::MASK_ATTRIBUTE_ADDRESS_TYPE =
+        0x30;
+const BigramListReadWriteUtils::BigramFlags
+        BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
+const BigramListReadWriteUtils::BigramFlags
+        BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
+const BigramListReadWriteUtils::BigramFlags
+        BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
+// Set when the offset must be subtracted from, rather than added to, the origin position.
+const BigramListReadWriteUtils::BigramFlags
+        BigramListReadWriteUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
+// Flag for presence of more attributes
+const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::FLAG_ATTRIBUTE_HAS_NEXT =
+        0x80;
+// Mask for attribute probability, stored on 4 bits inside the flags byte.
+const BigramListReadWriteUtils::BigramFlags
+        BigramListReadWriteUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F;
+
+// Reads one bigram entry starting at *bigramEntryPos and advances the position past it.
+// outBigramFlags and outTargetPtNodePos are optional; a null pointer skips that output.
+// Returns false, without reading, when *bigramEntryPos is at or beyond the buffer size.
+/* static */ bool BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(
+        const ReadOnlyByteArrayView buffer, BigramFlags *const outBigramFlags,
+        int *const outTargetPtNodePos, int *const bigramEntryPos) {
+    if (static_cast<int>(buffer.size()) <= *bigramEntryPos) {
+        AKLOGE("Read invalid pos in getBigramEntryPropertiesAndAdvancePosition(). bufSize: %zd, "
+                "bigramEntryPos: %d.", buffer.size(), *bigramEntryPos);
+        return false;
+    }
+    const BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(),
+            bigramEntryPos);
+    if (outBigramFlags) {
+        *outBigramFlags = bigramFlags;
+    }
+    const int targetPos = getBigramAddressAndAdvancePosition(buffer, bigramFlags, bigramEntryPos);
+    if (outTargetPtNodePos) {
+        *outTargetPtNodePos = targetPos;
+    }
+    return true;
+}
+
+// Advances *bigramListPos past every entry of the bigram list that starts there, stopping
+// after the first entry whose flags do not have the "has next" bit set.
+// Returns false as soon as an entry cannot be read.
+/* static */ bool BigramListReadWriteUtils::skipExistingBigrams(const ReadOnlyByteArrayView buffer,
+        int *const bigramListPos) {
+    BigramFlags flags;
+    do {
+        if (!getBigramEntryPropertiesAndAdvancePosition(buffer, &flags,
+                nullptr /* outTargetPtNodePos */, bigramListPos)) {
+            return false;
+        }
+    } while(hasNext(flags));
+    return true;
+}
+
+// Decodes the target address that follows the flags byte and advances *pos past it.
+// The offset is 1, 2 or 3 bytes wide according to the address type bits; when none of the
+// three types matches, nothing is read and the offset stays 0. The result is relative to the
+// position at which the offset field starts.
+/* static */ int BigramListReadWriteUtils::getBigramAddressAndAdvancePosition(
+        const ReadOnlyByteArrayView buffer, const BigramFlags flags, int *const pos) {
+    int offset = 0;
+    const int origin = *pos;
+    switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
+        case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
+            offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), pos);
+            break;
+        case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
+            offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer.data(), pos);
+            break;
+        case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
+            offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer.data(), pos);
+            break;
+    }
+    if (isOffsetNegative(flags)) {
+        return origin - offset;
+    } else {
+        return origin + offset;
+    }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h
new file mode 100644
index 000000000..a0f7d5e83
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ *
Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H
+#define LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H
+
+#include <cstddef>
+#include <cstdint>
+
+#include "defines.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+// Static-only helpers for decoding bigram list entries. Each entry starts with a flags byte
+// that carries the probability, the address encoding and the "has next" marker.
+class BigramListReadWriteUtils {
+public:
+    typedef uint8_t BigramFlags;
+
+    static bool getBigramEntryPropertiesAndAdvancePosition(const ReadOnlyByteArrayView buffer,
+            BigramFlags *const outBigramFlags, int *const outTargetPtNodePos,
+            int *const bigramEntryPos);
+
+    // Returns the probability bits of the flags byte.
+    static AK_FORCE_INLINE int getProbabilityFromFlags(const BigramFlags flags) {
+        return flags & MASK_ATTRIBUTE_PROBABILITY;
+    }
+
+    // Returns whether another entry follows the one these flags belong to.
+    static AK_FORCE_INLINE bool hasNext(const BigramFlags flags) {
+        return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0;
+    }
+
+    // Bigrams reading methods
+    static bool skipExistingBigrams(const ReadOnlyByteArrayView buffer, int *const bigramListPos);
+
+private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadWriteUtils);
+
+    static const BigramFlags MASK_ATTRIBUTE_ADDRESS_TYPE;
+    static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
+    static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
+    static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
+    static const BigramFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE;
+    static const BigramFlags FLAG_ATTRIBUTE_HAS_NEXT;
+    static const BigramFlags MASK_ATTRIBUTE_PROBABILITY;
+
+    static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) {
+        return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0;
+    }
+
+    static int getBigramAddressAndAdvancePosition(const ReadOnlyByteArrayView buffer,
+            const BigramFlags flags, int *const pos);
+};
+} // namespace latinime
+#endif // LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp
new file mode 100644
index 000000000..b5e2e9dae
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h"
+
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+
+namespace latinime {
+
+// Updates the probability of the visited terminal PtNode, then either marks the PtNode as
+// deleted or counts it as kept in the entry on top of mValueStack. mChildrenValue holds the
+// number of kept PtNodes in the children array that was just left (see onAscend()).
+bool DynamicPtGcEventListeners
+        ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+                ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+    // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless
+    // children.
+    bool isUselessPtNode = !ptNodeParams->isTerminal();
+    if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) {
+        bool needsToKeepPtNode = true;
+        if (!mPtNodeWriter->updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
+                ptNodeParams, &needsToKeepPtNode)) {
+            AKLOGE("Cannot update PtNode probability or get needs to keep PtNode after GC.");
+            return false;
+        }
+        if (!needsToKeepPtNode) {
+            isUselessPtNode = true;
+        }
+    }
+    if (mChildrenValue > 0) {
+        isUselessPtNode = false;
+    } else if (ptNodeParams->isTerminal()) {
+        // Remove children as all children are useless.
+        if (!mPtNodeWriter->updateChildrenPosition(ptNodeParams,
+                NOT_A_DICT_POS /* newChildrenPosition */)) {
+            return false;
+        }
+    }
+    if (isUselessPtNode) {
+        // Current PtNode is no longer needed. Mark it as deleted.
+        if (!mPtNodeWriter->markPtNodeAsDeleted(ptNodeParams)) {
+            return false;
+        }
+    } else {
+        mValueStack.back() += 1;
+        if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) {
+            mValidUnigramCount += 1;
+        }
+    }
+    return true;
+}
+
+// Updates the bigram entries of every PtNode that is not deleted and accumulates the entry
+// count reported by the writer.
+bool DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
+        ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+    if (!ptNodeParams->isDeleted()) {
+        int bigramEntryCount = 0;
+        if (!mPtNodeWriter->updateAllBigramEntriesAndDeleteUselessEntries(ptNodeParams,
+                &bigramEntryCount)) {
+            return false;
+        }
+        mValidBigramEntryCount += bigramEntryCount;
+    }
+    return true;
+}
+
+// Writes dummy PtNode array size when the head of PtNode array is read.
+bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+        ::onDescend(const int ptNodeArrayPos) {
+    mValidPtNodeCount = 0;
+    int writingPos = mBufferToWrite->getTailPosition();
+    mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.insert(
+            PtNodeWriter::PtNodeArrayPositionRelocationMap::value_type(ptNodeArrayPos, writingPos));
+    // Writes dummy PtNode array size because arrays can have a forward link or needless PtNodes.
+    // This field will be updated later in onReadingPtNodeArrayTail() with actual PtNode count.
+    mPtNodeArraySizeFieldPos = writingPos;
+    return DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(
+            mBufferToWrite, 0 /* arraySize */, &writingPos);
+}
+
+// Write PtNode array terminal and actual PtNode array size.
+bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+        ::onReadingPtNodeArrayTail() {
+    int writingPos = mBufferToWrite->getTailPosition();
+    // Write PtNode array terminal.
+    if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(
+            mBufferToWrite, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
+        return false;
+    }
+    // Write actual PtNode array size.
+    if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(
+            mBufferToWrite, mValidPtNodeCount, &mPtNodeArraySizeFieldPos)) {
+        return false;
+    }
+    return true;
+}
+
+// Write valid PtNode to buffer and memorize mapping from the old position to the new position.
+bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+        ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+    if (ptNodeParams->isDeleted()) {
+        // Current PtNode is not written in new buffer because it has been deleted.
+        mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert(
+                PtNodeWriter::PtNodePositionRelocationMap::value_type(
+                        ptNodeParams->getHeadPos(), NOT_A_DICT_POS));
+        return true;
+    }
+    int writingPos = mBufferToWrite->getTailPosition();
+    mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert(
+            PtNodeWriter::PtNodePositionRelocationMap::value_type(
+                    ptNodeParams->getHeadPos(), writingPos));
+    mValidPtNodeCount++;
+    // Writes current PtNode.
+    return mPtNodeWriter->writePtNodeAndAdvancePosition(ptNodeParams, &writingPos);
+}
+
+// Rewrites the position fields of the visited PtNode using the relocation map and accumulates
+// the bigram count reported by the writer; every terminal PtNode counts as one unigram.
+bool DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
+        ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+    // Updates parent position.
+    int bigramCount = 0;
+    if (!mPtNodeWriter->updateAllPositionFields(ptNodeParams, mDictPositionRelocationMap,
+            &bigramCount)) {
+        return false;
+    }
+    mBigramCount += bigramCount;
+    if (ptNodeParams->isTerminal()) {
+        mUnigramCount++;
+    }
+    return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h
new file mode 100644
index 000000000..8c7ad965b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H
+#define LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+class PtNodeParams;
+
+class DynamicPtGcEventListeners {
+ public:
+    // Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or
+    // not and marks useless PtNodes as deleted. Such deleted PtNodes will be discarded in the GC.
+    // TODO: Concatenate non-terminal PtNodes.
+    class TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+            : public DynamicPtReadingHelper::TraversingEventListener {
+     public:
+        TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
+                PtNodeWriter *const ptNodeWriter)
+                : mPtNodeWriter(ptNodeWriter), mValueStack(), mChildrenValue(0),
+                  mValidUnigramCount(0) {}
+
+        ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
+
+        // Pops the counter of the PtNode array being left into mChildrenValue.
+        // Fails when there is no counter left to pop.
+        bool onAscend() {
+            if (mValueStack.empty()) {
+                return false;
+            }
+            mChildrenValue = mValueStack.back();
+            mValueStack.pop_back();
+            return true;
+        }
+
+        // Pushes a fresh counter for the PtNode array being entered.
+        bool onDescend(const int ptNodeArrayPos) {
+            mValueStack.push_back(0);
+            mChildrenValue = 0;
+            return true;
+        }
+
+        bool onReadingPtNodeArrayTail() { return true; }
+
+        bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+        int getValidUnigramCount() const {
+            return mValidUnigramCount;
+        }
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(
+                TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
+
+        PtNodeWriter *const mPtNodeWriter;
+        std::vector<int> mValueStack;
+        int mChildrenValue;
+        int mValidUnigramCount;
+    };
+
+    // TODO: Remove when we stop supporting v402 format.
+    // Updates all bigram entries that are held by valid PtNodes. This removes useless bigram
+    // entries.
+    class TraversePolicyToUpdateBigramProbability
+            : public DynamicPtReadingHelper::TraversingEventListener {
+     public:
+        TraversePolicyToUpdateBigramProbability(PtNodeWriter *const ptNodeWriter)
+                : mPtNodeWriter(ptNodeWriter), mValidBigramEntryCount(0) {}
+
+        bool onAscend() { return true; }
+
+        bool onDescend(const int ptNodeArrayPos) { return true; }
+
+        bool onReadingPtNodeArrayTail() { return true; }
+
+        bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+        int getValidBigramEntryCount() const {
+            return mValidBigramEntryCount;
+        }
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability);
+
+        PtNodeWriter *const mPtNodeWriter;
+        int mValidBigramEntryCount;
+    };
+
+    // Writes every PtNode that is not deleted to bufferToWrite and records, in
+    // dictPositionRelocationMap, the new position of each PtNode and PtNode array.
+    class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+            : public DynamicPtReadingHelper::TraversingEventListener {
+     public:
+        TraversePolicyToPlaceAndWriteValidPtNodesToBuffer(
+                PtNodeWriter *const ptNodeWriter, BufferWithExtendableBuffer *const bufferToWrite,
+                PtNodeWriter::DictPositionRelocationMap *const dictPositionRelocationMap)
+                : mPtNodeWriter(ptNodeWriter), mBufferToWrite(bufferToWrite),
+                  mDictPositionRelocationMap(dictPositionRelocationMap), mValidPtNodeCount(0),
+                  mPtNodeArraySizeFieldPos(NOT_A_DICT_POS) {};
+
+        bool onAscend() { return true; }
+
+        bool onDescend(const int ptNodeArrayPos);
+
+        bool onReadingPtNodeArrayTail();
+
+        bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToPlaceAndWriteValidPtNodesToBuffer);
+
+        PtNodeWriter *const mPtNodeWriter;
+        BufferWithExtendableBuffer *const mBufferToWrite;
+        PtNodeWriter::DictPositionRelocationMap *const mDictPositionRelocationMap;
+        int mValidPtNodeCount;
+        int mPtNodeArraySizeFieldPos;
+    };
+
+    // Updates the position fields of every visited PtNode according to
+    // dictPositionRelocationMap and counts the unigrams and bigrams seen on the way.
+    class TraversePolicyToUpdateAllPositionFields
+            : public DynamicPtReadingHelper::TraversingEventListener {
+     public:
+        TraversePolicyToUpdateAllPositionFields(PtNodeWriter *const ptNodeWriter,
+                const PtNodeWriter::DictPositionRelocationMap *const dictPositionRelocationMap)
+                : mPtNodeWriter(ptNodeWriter),
+                  mDictPositionRelocationMap(dictPositionRelocationMap), mUnigramCount(0),
+                  mBigramCount(0) {};
+
+        bool onAscend() { return true; }
+
+        bool onDescend(const int ptNodeArrayPos) { return true; }
+
+        bool onReadingPtNodeArrayTail() { return true; }
+
+        bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+        int getUnigramCount() const {
+            return mUnigramCount;
+        }
+
+        int getBigramCount() const {
+            return mBigramCount;
+        }
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields);
+
+        PtNodeWriter *const mPtNodeWriter;
+        const PtNodeWriter::DictPositionRelocationMap *const mDictPositionRelocationMap;
+        int mUnigramCount;
+        int mBigramCount;
+    };
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtGcEventListeners);
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp
new file mode 100644
index 000000000..294bc6ea9
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp
@@ -0,0 +1,321 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+
+#include "dictionary/structure/pt_common/pt_node_array_reader.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+// To avoid infinite loop caused by invalid or malicious forward links.
+const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
+const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
+const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH;
+
+// Records the head position of every visited PtNode that is terminal and not deleted.
+// Always returns true so that the traversal continues.
+bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode(
+        const PtNodeParams *const ptNodeParams) {
+    if (ptNodeParams->isTerminal() && !ptNodeParams->isDeleted()) {
+        mTerminalPositions->push_back(ptNodeParams->getHeadPos());
+    }
+    return true;
+}
+
+// Visits all PtNodes in post-order depth first manner.
+// For example, visits c -> b -> y -> x -> a for the following dictionary:
+// a _ b _ c
+//   \ x _ y
+//
+// Returns false as soon as a listener callback returns false. Otherwise returns whether the
+// helper finished without entering the error state.
+bool DynamicPtReadingHelper::traverseAllPtNodesInPostorderDepthFirstManner(
+        TraversingEventListener *const listener) {
+    bool alreadyVisitedChildren = false;
+    // Descend from the root to the root PtNode array.
+    if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
+        return false;
+    }
+    while (!isEnd()) {
+        const PtNodeParams ptNodeParams(getPtNodeParams());
+        if (!ptNodeParams.isValid()) {
+            break;
+        }
+        if (!alreadyVisitedChildren) {
+            if (ptNodeParams.hasChildren()) {
+                // Move to the first child. The current state is saved so that this PtNode can be
+                // visited after its children.
+                if (!listener->onDescend(ptNodeParams.getChildrenPos())) {
+                    return false;
+                }
+                pushReadingStateToStack();
+                readChildNode(ptNodeParams);
+            } else {
+                alreadyVisitedChildren = true;
+            }
+        } else {
+            if (!listener->onVisitingPtNode(&ptNodeParams)) {
+                return false;
+            }
+            readNextSiblingNode(ptNodeParams);
+            if (isEnd()) {
+                // All PtNodes in current linked PtNode arrays have been visited.
+                // Return to the parent.
+                if (!listener->onReadingPtNodeArrayTail()) {
+                    return false;
+                }
+                if (mReadingStateStack.empty()) {
+                    // Nothing to return to: the root PtNode array has been finished.
+                    break;
+                }
+                if (!listener->onAscend()) {
+                    return false;
+                }
+                popReadingStateFromStack();
+                alreadyVisitedChildren = true;
+            } else {
+                // Process sibling PtNode.
+                alreadyVisitedChildren = false;
+            }
+        }
+    }
+    // Ascend from the root PtNode array to the root.
+    if (!listener->onAscend()) {
+        return false;
+    }
+    return !isError();
+}
+
+// Visits all PtNodes in PtNode array level pre-order depth first manner, which is the same order
+// that PtNodes are written in the dictionary buffer.
+// For example, visits a -> b -> x -> c -> y for the following dictionary:
+// a _ b _ c
+//   \ x _ y
+//
+// Each PtNode array is walked twice: first every PtNode in it is reported to the listener, then
+// the reading state saved at the array head is restored and the array is walked again to descend
+// into each PtNode's children.
+// Returns false as soon as a listener callback returns false. Otherwise returns whether the
+// helper finished without entering the error state.
+bool DynamicPtReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+        TraversingEventListener *const listener) {
+    bool alreadyVisitedAllPtNodesInArray = false;
+    bool alreadyVisitedChildren = false;
+    // Descend from the root to the root PtNode array.
+    if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
+        return false;
+    }
+    if (isEnd()) {
+        // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array.
+        if (!listener->onReadingPtNodeArrayTail()) {
+            return false;
+        }
+    }
+    // Saved so that the root PtNode array can be re-read from its head for the second walk.
+    pushReadingStateToStack();
+    while (!isEnd()) {
+        const PtNodeParams ptNodeParams(getPtNodeParams());
+        if (!ptNodeParams.isValid()) {
+            break;
+        }
+        if (alreadyVisitedAllPtNodesInArray) {
+            if (alreadyVisitedChildren) {
+                // Move to next sibling PtNode's children.
+                readNextSiblingNode(ptNodeParams);
+                if (isEnd()) {
+                    // Return to the parent PTNode.
+                    if (!listener->onAscend()) {
+                        return false;
+                    }
+                    if (mReadingStateStack.empty()) {
+                        break;
+                    }
+                    popReadingStateFromStack();
+                    alreadyVisitedChildren = true;
+                    alreadyVisitedAllPtNodesInArray = true;
+                } else {
+                    alreadyVisitedChildren = false;
+                }
+            } else {
+                if (ptNodeParams.hasChildren()) {
+                    // Move to the first child.
+                    if (!listener->onDescend(ptNodeParams.getChildrenPos())) {
+                        return false;
+                    }
+                    pushReadingStateToStack();
+                    readChildNode(ptNodeParams);
+                    // Push state to return the head of PtNode array.
+                    pushReadingStateToStack();
+                    alreadyVisitedAllPtNodesInArray = false;
+                    alreadyVisitedChildren = false;
+                } else {
+                    alreadyVisitedChildren = true;
+                }
+            }
+        } else {
+            if (!listener->onVisitingPtNode(&ptNodeParams)) {
+                return false;
+            }
+            readNextSiblingNode(ptNodeParams);
+            if (isEnd()) {
+                if (!listener->onReadingPtNodeArrayTail()) {
+                    return false;
+                }
+                // Return to the head of current PtNode array.
+                popReadingStateFromStack();
+                alreadyVisitedAllPtNodesInArray = true;
+            }
+        }
+    }
+    popReadingStateFromStack();
+    // Ascend from the root PtNode array to the root.
+    if (!listener->onAscend()) {
+        return false;
+    }
+    return !isError();
+}
+
+// Writes the code points of the word that ends at the current (terminal) PtNode into
+// outCodePoints and returns how many were written. Returns 0 when the current PtNode is not a
+// valid terminal, when the word needs more than maxCodePointCount code points, or when an error
+// occurs while following parent links.
+int DynamicPtReadingHelper::getCodePointsAndReturnCodePointCount(const int maxCodePointCount,
+        int *const outCodePoints) {
+    if (maxCodePointCount <= 0) {
+        // No word fits; this also keeps the array below from being declared with a non-positive
+        // length.
+        return 0;
+    }
+    // This method traverses parent nodes from the terminal by following parent pointers; thus,
+    // node code points are stored in the buffer in the reverse order.
+    int reverseCodePoints[maxCodePointCount];
+    const PtNodeParams terminalPtNodeParams(getPtNodeParams());
+    // First, read the current node and make sure it is a valid terminal.
+    if (!isValidTerminalNode(terminalPtNodeParams)) {
+        // Node at the ptNodePos is not a valid terminal node.
+        return 0;
+    }
+    // Then, following parent node link to the dictionary root and fetch node code points.
+    int totalCodePointCount = 0;
+    while (!isEnd()) {
+        const PtNodeParams ptNodeParams(getPtNodeParams());
+        totalCodePointCount = getTotalCodePointCount(ptNodeParams);
+        if (!ptNodeParams.isValid() || totalCodePointCount > maxCodePointCount) {
+            // The ptNodePos is not a valid terminal node position in the dictionary.
+            return 0;
+        }
+        // Store node code points to buffer in the reverse order.
+        fetchMergedNodeCodePointsInReverseOrder(ptNodeParams, getPrevTotalCodePointCount(),
+                reverseCodePoints);
+        // Follow parent node toward the root node.
+        readParentNode(ptNodeParams);
+    }
+    if (isError()) {
+        // The node position or the dictionary is invalid.
+        return 0;
+    }
+    // Reverse the stored code points to output them.
+    for (int i = 0; i < totalCodePointCount; ++i) {
+        outCodePoints[i] = reverseCodePoints[totalCodePointCount - i - 1];
+    }
+    return totalCodePointCount;
+}
+
+// Searches downward from the current reading position for inWord (length code points) and
+// returns the head position of its terminal PtNode, or NOT_A_DICT_POS when no such terminal is
+// found. When forceLowerCaseSearch is true every code point of inWord is lower-cased first.
+int DynamicPtReadingHelper::getTerminalPtNodePositionOfWord(const int *const inWord,
+        const size_t length, const bool forceLowerCaseSearch) {
+    if (length == 0) {
+        // Nothing to look up; this also keeps the array below from being declared with zero
+        // length.
+        return NOT_A_DICT_POS;
+    }
+    int searchCodePoints[length];
+    for (size_t i = 0; i < length; ++i) {
+        searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i];
+    }
+    while (!isEnd()) {
+        const PtNodeParams ptNodeParams(getPtNodeParams());
+        const int matchedCodePointCount = getPrevTotalCodePointCount();
+        if (getTotalCodePointCount(ptNodeParams) > length
+                || !isMatchedCodePoint(ptNodeParams, 0 /* index */,
+                        searchCodePoints[matchedCodePointCount])) {
+            // Current node has too many code points or its first code point is different from
+            // target code point. Skip this node and read the next sibling node.
+            readNextSiblingNode(ptNodeParams);
+            continue;
+        }
+        // Check following merged node code points. The length check above guarantees that
+        // matchedCodePointCount + j stays inside searchCodePoints.
+        const int nodeCodePointCount = ptNodeParams.getCodePointCount();
+        for (int j = 1; j < nodeCodePointCount; ++j) {
+            if (!isMatchedCodePoint(ptNodeParams, j, searchCodePoints[matchedCodePointCount + j])) {
+                // Different code point is found. The given word is not included in the dictionary.
+                return NOT_A_DICT_POS;
+            }
+        }
+        // All characters are matched.
+        if (length == getTotalCodePointCount(ptNodeParams)) {
+            if (!ptNodeParams.isTerminal()) {
+                return NOT_A_DICT_POS;
+            }
+            // Terminal position is found.
+            return ptNodeParams.getHeadPos();
+        }
+        if (!ptNodeParams.hasChildren()) {
+            return NOT_A_DICT_POS;
+        }
+        // Advance to the children nodes.
+        readChildNode(ptNodeParams);
+    }
+    // If we already traversed the tree further than the word is long, there means
+    // there was no match (or we would have found it).
+    return NOT_A_DICT_POS;
+}
+
+// Read node array size and process empty node arrays. Nodes and arrays are counted up in this
+// method to avoid an infinite loop.
+// On any failure the helper enters the error state and the reading position becomes
+// NOT_A_DICT_POS.
+void DynamicPtReadingHelper::nextPtNodeArray() {
+    int ptNodeCountInArray = 0;
+    int firstPtNodePos = NOT_A_DICT_POS;
+    if (!mPtNodeArrayReader->readPtNodeArrayInfoAndReturnIfValid(
+            mReadingState.mPos, &ptNodeCountInArray, &firstPtNodePos)) {
+        mIsError = true;
+        mReadingState.mPos = NOT_A_DICT_POS;
+        return;
+    }
+    mReadingState.mPosOfThisPtNodeArrayHead = mReadingState.mPos;
+    mReadingState.mRemainingPtNodeCountInThisArray = ptNodeCountInArray;
+    mReadingState.mPos = firstPtNodePos;
+    // Count up nodes and node arrays to avoid infinite loop.
+    mReadingState.mTotalPtNodeIndexInThisArrayChain +=
+            mReadingState.mRemainingPtNodeCountInThisArray;
+    mReadingState.mPtNodeArrayIndexInThisArrayChain++;
+    if (mReadingState.mRemainingPtNodeCountInThisArray < 0
+            || mReadingState.mTotalPtNodeIndexInThisArrayChain
+                    > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP
+            || mReadingState.mPtNodeArrayIndexInThisArrayChain
+                    > MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) {
+        // Invalid dictionary.
+        // The first literal ends with ", " so the two adjacent literals do not run together.
+        AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d, "
+                "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d",
+                mReadingState.mRemainingPtNodeCountInThisArray,
+                mReadingState.mTotalPtNodeIndexInThisArrayChain,
+                MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP,
+                mReadingState.mPtNodeArrayIndexInThisArrayChain,
+                MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP);
+        ASSERT(false);
+        mIsError = true;
+        mReadingState.mPos = NOT_A_DICT_POS;
+        return;
+    }
+    if (mReadingState.mRemainingPtNodeCountInThisArray == 0) {
+        // Empty node array. Try following forward link.
+        followForwardLink();
+    }
+}
+
+// Follow the forward link and read the next node array if exists.
+// The position of the forward link field that was read is remembered in
+// mPosOfLastForwardLinkField.
+void DynamicPtReadingHelper::followForwardLink() {
+    int nextPtNodeArrayPos = NOT_A_DICT_POS;
+    if (!mPtNodeArrayReader->readForwardLinkAndReturnIfValid(
+            mReadingState.mPos, &nextPtNodeArrayPos)) {
+        mIsError = true;
+        mReadingState.mPos = NOT_A_DICT_POS;
+        return;
+    }
+    mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos;
+    if (nextPtNodeArrayPos != NOT_A_DICT_POS) {
+        // Follow the forward link.
+        mReadingState.mPos = nextPtNodeArrayPos;
+        nextPtNodeArray();
+    } else {
+        // All node arrays have been read.
+        mReadingState.mPos = NOT_A_DICT_POS;
+    }
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h
new file mode 100644
index 000000000..d8ddc7c2b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PT_READING_HELPER_H
+#define LATINIME_DYNAMIC_PT_READING_HELPER_H
+
+#include <cstddef>
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_reader.h"
+
+namespace latinime {
+
+class DictionaryShortcutsStructurePolicy;
+class PtNodeArrayReader;
+
+/*
+ * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and
+ * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop.
+ */
+class DynamicPtReadingHelper {
+ public:
+    // Callback interface used by the traverseAllPtNodes* methods. A callback that returns false
+    // makes the traversal stop and return false.
+    class TraversingEventListener {
+     public:
+        virtual ~TraversingEventListener() {}
+
+        // Returns whether the event handling was succeeded or not.
+        virtual bool onAscend() = 0;
+
+        // Returns whether the event handling was succeeded or not.
+        virtual bool onDescend(const int ptNodeArrayPos) = 0;
+
+        // Returns whether the event handling was succeeded or not.
+        virtual bool onReadingPtNodeArrayTail() = 0;
+
+        // Returns whether the event handling was succeeded or not.
+        virtual bool onVisitingPtNode(const PtNodeParams *const node) = 0;
+
+     protected:
+        TraversingEventListener() {}
+
+     private:
+        DISALLOW_COPY_AND_ASSIGN(TraversingEventListener);
+    };
+
+    // Listener that collects the head positions of visited terminal PtNodes into the vector
+    // given at construction. The vector is not owned by the listener.
+    class TraversePolicyToGetAllTerminalPtNodePositions : public TraversingEventListener {
+     public:
+        explicit TraversePolicyToGetAllTerminalPtNodePositions(
+                std::vector<int> *const terminalPositions)
+                : mTerminalPositions(terminalPositions) {}
+        bool onAscend() { return true; }
+        bool onDescend(const int ptNodeArrayPos) { return true; }
+        bool onReadingPtNodeArrayTail() { return true; }
+        bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToGetAllTerminalPtNodePositions);
+
+        std::vector<int> *const mTerminalPositions;
+    };
+
+    // Neither reader is owned by the helper; both pointers are only stored.
+    DynamicPtReadingHelper(const PtNodeReader *const ptNodeReader,
+            const PtNodeArrayReader *const ptNodeArrayReader)
+            : mIsError(false), mReadingState(), mPtNodeReader(ptNodeReader),
+              mPtNodeArrayReader(ptNodeArrayReader), mReadingStateStack() {}
+
+    ~DynamicPtReadingHelper() {}
+
+    AK_FORCE_INLINE bool isError() const {
+        return mIsError;
+    }
+
+    // True when there is no current PtNode to read.
+    AK_FORCE_INLINE bool isEnd() const {
+        return mReadingState.mPos == NOT_A_DICT_POS;
+    }
+
+    // Initialize reading state with the head position of a PtNode array.
+    // Passing NOT_A_DICT_POS only marks the helper as ended; the error flag and the other
+    // counters are left as they were.
+    AK_FORCE_INLINE void initWithPtNodeArrayPos(const int ptNodeArrayPos) {
+        if (ptNodeArrayPos == NOT_A_DICT_POS) {
+            mReadingState.mPos = NOT_A_DICT_POS;
+        } else {
+            mIsError = false;
+            mReadingState.mPos = ptNodeArrayPos;
+            mReadingState.mTotalCodePointCountSinceInitialization = 0;
+            mReadingState.mTotalPtNodeIndexInThisArrayChain = 0;
+            mReadingState.mPtNodeArrayIndexInThisArrayChain = 0;
+            mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS;
+            mReadingStateStack.clear();
+            nextPtNodeArray();
+        }
+    }
+
+    // Initialize reading state with the head position of a node.
+    // The node is treated as the only PtNode of its array, so the array head position is
+    // unknown (NOT_A_DICT_POS).
+    AK_FORCE_INLINE void initWithPtNodePos(const int ptNodePos) {
+        if (ptNodePos == NOT_A_DICT_POS) {
+            mReadingState.mPos = NOT_A_DICT_POS;
+        } else {
+            mIsError = false;
+            mReadingState.mPos = ptNodePos;
+            mReadingState.mRemainingPtNodeCountInThisArray = 1;
+            mReadingState.mTotalCodePointCountSinceInitialization = 0;
+            mReadingState.mTotalPtNodeIndexInThisArrayChain = 1;
+            mReadingState.mPtNodeArrayIndexInThisArrayChain = 1;
+            mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS;
+            mReadingState.mPosOfThisPtNodeArrayHead = NOT_A_DICT_POS;
+            mReadingStateStack.clear();
+        }
+    }
+
+    // Returns the params of the current PtNode, or default-constructed params at the end.
+    AK_FORCE_INLINE const PtNodeParams getPtNodeParams() const {
+        if (isEnd()) {
+            return PtNodeParams();
+        }
+        return mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(mReadingState.mPos);
+    }
+
+    AK_FORCE_INLINE bool isValidTerminalNode(const PtNodeParams &ptNodeParams) const {
+        return !isEnd() && !ptNodeParams.isDeleted() && ptNodeParams.isTerminal();
+    }
+
+    // index is not range checked; the caller must keep it below the node's code point count.
+    AK_FORCE_INLINE bool isMatchedCodePoint(const PtNodeParams &ptNodeParams, const int index,
+            const int codePoint) const {
+        return ptNodeParams.getCodePoints()[index] == codePoint;
+    }
+
+    // Return code point count exclude the last read node's code points.
+    AK_FORCE_INLINE size_t getPrevTotalCodePointCount() const {
+        return mReadingState.mTotalCodePointCountSinceInitialization;
+    }
+
+    // Return code point count include the last read node's code points.
+    AK_FORCE_INLINE size_t getTotalCodePointCount(const PtNodeParams &ptNodeParams) const {
+        return mReadingState.mTotalCodePointCountSinceInitialization
+                + ptNodeParams.getCodePointCount();
+    }
+
+    // Copies the node's code points, last one first, into outCodePoints starting at index.
+    // The destination is not range checked.
+    AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder(const PtNodeParams &ptNodeParams,
+            const int index, int *const outCodePoints) const {
+        const int nodeCodePointCount = ptNodeParams.getCodePointCount();
+        const int *const nodeCodePoints = ptNodeParams.getCodePoints();
+        for (int i = 0; i < nodeCodePointCount; ++i) {
+            outCodePoints[index + i] = nodeCodePoints[nodeCodePointCount - 1 - i];
+        }
+    }
+
+    // Moves to the next PtNode in the current array, following the forward link when the array
+    // has been exhausted.
+    AK_FORCE_INLINE void readNextSiblingNode(const PtNodeParams &ptNodeParams) {
+        mReadingState.mRemainingPtNodeCountInThisArray -= 1;
+        mReadingState.mPos = ptNodeParams.getSiblingNodePos();
+        if (mReadingState.mRemainingPtNodeCountInThisArray <= 0) {
+            // All nodes in the current node array have been read.
+            followForwardLink();
+        }
+    }
+
+    // Read the first child node of the current node.
+    // The loop-protection counters restart because a new chain of PtNode arrays begins.
+    AK_FORCE_INLINE void readChildNode(const PtNodeParams &ptNodeParams) {
+        if (ptNodeParams.hasChildren()) {
+            mReadingState.mTotalCodePointCountSinceInitialization +=
+                    ptNodeParams.getCodePointCount();
+            mReadingState.mTotalPtNodeIndexInThisArrayChain = 0;
+            mReadingState.mPtNodeArrayIndexInThisArrayChain = 0;
+            mReadingState.mPos = ptNodeParams.getChildrenPos();
+            mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS;
+            // Read children node array.
+            nextPtNodeArray();
+        } else {
+            mReadingState.mPos = NOT_A_DICT_POS;
+        }
+    }
+
+    // Read the parent node of the current node.
+    AK_FORCE_INLINE void readParentNode(const PtNodeParams &ptNodeParams) {
+        if (ptNodeParams.getParentPos() != NOT_A_DICT_POS) {
+            mReadingState.mTotalCodePointCountSinceInitialization +=
+                    ptNodeParams.getCodePointCount();
+            mReadingState.mTotalPtNodeIndexInThisArrayChain = 1;
+            mReadingState.mPtNodeArrayIndexInThisArrayChain = 1;
+            mReadingState.mRemainingPtNodeCountInThisArray = 1;
+            mReadingState.mPos = ptNodeParams.getParentPos();
+            mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS;
+            mReadingState.mPosOfThisPtNodeArrayHead = NOT_A_DICT_POS;
+        } else {
+            mReadingState.mPos = NOT_A_DICT_POS;
+        }
+    }
+
+    AK_FORCE_INLINE int getPosOfLastForwardLinkField() const {
+        return mReadingState.mPosOfLastForwardLinkField;
+    }
+
+    AK_FORCE_INLINE int getPosOfLastPtNodeArrayHead() const {
+        return mReadingState.mPosOfThisPtNodeArrayHead;
+    }
+
+    bool traverseAllPtNodesInPostorderDepthFirstManner(TraversingEventListener *const listener);
+
+    bool traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+            TraversingEventListener *const listener);
+
+    int getCodePointsAndReturnCodePointCount(const int maxCodePointCount, int *const outCodePoints);
+
+    int getTerminalPtNodePositionOfWord(const int *const inWord, const size_t length,
+            const bool forceLowerCaseSearch);
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(DynamicPtReadingHelper);
+
+    // This class encapsulates the reading state of a position in the dictionary. It points at a
+    // specific PtNode in the dictionary.
+    class PtNodeReadingState {
+     public:
+        // Note that copy constructor and assignment operator are used for this class to use
+        // std::vector.
+        PtNodeReadingState() : mPos(NOT_A_DICT_POS), mRemainingPtNodeCountInThisArray(0),
+                mTotalCodePointCountSinceInitialization(0), mTotalPtNodeIndexInThisArrayChain(0),
+                mPtNodeArrayIndexInThisArrayChain(0), mPosOfLastForwardLinkField(NOT_A_DICT_POS),
+                mPosOfThisPtNodeArrayHead(NOT_A_DICT_POS) {}
+
+        int mPos;
+        // Remaining node count in the current array.
+        int mRemainingPtNodeCountInThisArray;
+        size_t mTotalCodePointCountSinceInitialization;
+        // Counter of PtNodes used to avoid infinite loops caused by broken or malicious links.
+        int mTotalPtNodeIndexInThisArrayChain;
+        // Counter of PtNode arrays used to avoid infinite loops caused by cyclic links of empty
+        // PtNode arrays.
+        int mPtNodeArrayIndexInThisArrayChain;
+        int mPosOfLastForwardLinkField;
+        int mPosOfThisPtNodeArrayHead;
+    };
+
+    static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP;
+    static const int MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP;
+    static const size_t MAX_READING_STATE_STACK_SIZE;
+
+    // TODO: Introduce error code to track what caused the error.
+    bool mIsError;
+    PtNodeReadingState mReadingState;
+    const PtNodeReader *const mPtNodeReader;
+    const PtNodeArrayReader *const mPtNodeArrayReader;
+    std::vector<PtNodeReadingState> mReadingStateStack;
+
+    void nextPtNodeArray();
+
+    void followForwardLink();
+
+    // Saves the current reading state. When the stack has grown past
+    // MAX_READING_STATE_STACK_SIZE the helper enters the error state instead.
+    AK_FORCE_INLINE void pushReadingStateToStack() {
+        if (mReadingStateStack.size() > MAX_READING_STATE_STACK_SIZE) {
+            // %zu is the conversion for size_t; %zd is for the signed counterpart.
+            AKLOGI("Reading state stack overflow. Max size: %zu", MAX_READING_STATE_STACK_SIZE);
+            ASSERT(false);
+            mIsError = true;
+            mReadingState.mPos = NOT_A_DICT_POS;
+        } else {
+            mReadingStateStack.push_back(mReadingState);
+        }
+    }
+
+    // Restores the most recently saved reading state, or marks the helper as ended when nothing
+    // was saved.
+    AK_FORCE_INLINE void popReadingStateFromStack() {
+        if (mReadingStateStack.empty()) {
+            mReadingState.mPos = NOT_A_DICT_POS;
+        } else {
+            mReadingState = mReadingStateStack.back();
+            mReadingStateStack.pop_back();
+        }
+    }
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PT_READING_HELPER_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp
new file mode 100644
index 000000000..3eb55ed9b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+
+#include "defines.h"
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+// Node state patterns. Every pattern lies inside the two bits selected by MASK_MOVED.
+const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::MASK_MOVED = 0xC0;
+const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_NOT_MOVED = 0xC0;
+const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_MOVED = 0x40;
+const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_DELETED = 0x80;
+const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_WILL_BECOME_NON_TERMINAL = 0x00;
+
+// TODO: Make DICT_OFFSET_ZERO_OFFSET = 0.
+// Currently, DICT_OFFSET_INVALID is 0 in Java side but offset can be 0 during GC. So, the maximum
+// value of offsets, which is 0x7FFFFF is used to represent 0 offset.
+const int DynamicPtReadingUtils::DICT_OFFSET_INVALID = 0;
+const int DynamicPtReadingUtils::DICT_OFFSET_ZERO_OFFSET = 0x7FFFFF;
+
+// Reads the signed 24-bit forward link value stored at pos.
+/* static */ int DynamicPtReadingUtils::getForwardLinkPosition(const uint8_t *const buffer,
+        const int pos) {
+    // The reader advances the position it is given, so hand it a scratch copy.
+    int cursor = pos;
+    const int forwardLink = ByteArrayUtils::readSint24AndAdvancePosition(buffer, &cursor);
+    return forwardLink;
+}
+
+// Reads the signed 24-bit parent offset at *pos and advances *pos past it.
+/* static */ int DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition(
+        const uint8_t *const buffer, int *const pos) {
+    const int parentOffset = ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos);
+    return parentOffset;
+}
+
+// Converts a stored parent offset into a parent position relative to ptNodePos.
+/* static */ int DynamicPtReadingUtils::getParentPtNodePos(const int parentOffset,
+        const int ptNodePos) {
+    if (parentOffset == DICT_OFFSET_INVALID) {
+        // No parent.
+        return NOT_A_DICT_POS;
+    }
+    if (parentOffset == DICT_OFFSET_ZERO_OFFSET) {
+        // Sentinel standing for a real offset of zero.
+        return ptNodePos;
+    }
+    return ptNodePos + parentOffset;
+}
+
+// Reads the signed 24-bit children offset at *pos, advances *pos past it, and returns the
+// children position measured from where the field starts.
+/* static */ int DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition(
+        const uint8_t *const buffer, int *const pos) {
+    // Remember where the field starts before the read moves *pos.
+    const int fieldPos = *pos;
+    const int childrenOffset = ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos);
+    if (childrenOffset == DICT_OFFSET_INVALID) {
+        // The PtNode does not have children.
+        return NOT_A_DICT_POS;
+    }
+    if (childrenOffset == DICT_OFFSET_ZERO_OFFSET) {
+        // Sentinel standing for a real offset of zero.
+        return fieldPos;
+    }
+    return fieldPos + childrenOffset;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h
new file mode 100644
index 000000000..b13a075d5
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PT_READING_UTILS_H
+#define LATINIME_DYNAMIC_PT_READING_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+
+namespace latinime {
+
+// Static helpers for reading link fields and node state flags of dynamic patricia trie nodes.
+class DynamicPtReadingUtils {
+ public:
+    typedef uint8_t NodeFlags;
+
+    // Stored offset value meaning that there is no link.
+    static const int DICT_OFFSET_INVALID;
+    // Stored offset value standing for a real offset of zero.
+    static const int DICT_OFFSET_ZERO_OFFSET;
+
+    static int getForwardLinkPosition(const uint8_t *const buffer, const int pos);
+
+    // A forward link of 0 means that there is no next PtNode array.
+    static AK_FORCE_INLINE bool isValidForwardLinkPosition(const int forwardLinkAddress) {
+        return forwardLinkAddress != 0;
+    }
+
+    static int getParentPtNodePosOffsetAndAdvancePosition(const uint8_t *const buffer,
+            int *const pos);
+
+    static int getParentPtNodePos(const int parentOffset, const int ptNodePos);
+
+    static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+    /**
+     * Node Flags
+     *
+     * The node state is whatever pattern is stored in the bits selected by MASK_MOVED, so exactly
+     * one of the predicates below holds unless the bits hold the "not moved" pattern.
+     */
+    static AK_FORCE_INLINE bool isMoved(const NodeFlags flags) {
+        return FLAG_IS_MOVED == (MASK_MOVED & flags);
+    }
+
+    static AK_FORCE_INLINE bool isDeleted(const NodeFlags flags) {
+        return FLAG_IS_DELETED == (MASK_MOVED & flags);
+    }
+
+    static AK_FORCE_INLINE bool willBecomeNonTerminal(const NodeFlags flags) {
+        return FLAG_WILL_BECOME_NON_TERMINAL == (MASK_MOVED & flags);
+    }
+
+    // Returns originalFlags with the state bits replaced. Later assignments override earlier
+    // ones, so when several arguments are true the precedence is isDeleted, then isMoved, then
+    // willBecomeNonTerminal. When all three are false the "not moved" pattern is written.
+    static AK_FORCE_INLINE NodeFlags updateAndGetFlags(const NodeFlags originalFlags,
+            const bool isMoved, const bool isDeleted, const bool willBecomeNonTerminal) {
+        NodeFlags flags = originalFlags;
+        flags = willBecomeNonTerminal ?
+                ((flags & (~MASK_MOVED)) | FLAG_WILL_BECOME_NON_TERMINAL) : flags;
+        flags = isMoved ? ((flags & (~MASK_MOVED)) | FLAG_IS_MOVED) : flags;
+        flags = isDeleted ? ((flags & (~MASK_MOVED)) | FLAG_IS_DELETED) : flags;
+        flags = (!isMoved && !isDeleted && !willBecomeNonTerminal) ?
+                ((flags & (~MASK_MOVED)) | FLAG_IS_NOT_MOVED) : flags;
+        return flags;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtReadingUtils);
+
+    static const NodeFlags MASK_MOVED;
+    static const NodeFlags FLAG_IS_NOT_MOVED;
+    static const NodeFlags FLAG_IS_MOVED;
+    static const NodeFlags FLAG_IS_DELETED;
+    static const NodeFlags FLAG_WILL_BECOME_NON_TERMINAL;
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PT_READING_UTILS_H */
diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp
new file mode 100644
index 000000000..ccad345c8
--- /dev/null
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp
@@ -0,0 +1,299 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h" + +#include "dictionary/property/unigram_property.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/pt_common/pt_node_reader.h" +#include "dictionary/structure/pt_common/pt_node_writer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; + +bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readingHelper, + const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty, + bool *const outAddedNewUnigram) { + int parentPos = NOT_A_DICT_POS; + while (!readingHelper->isEnd()) { + const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); + if (!ptNodeParams.isValid()) { + break; + } + const size_t matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); + if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */, + wordCodePoints[matchedCodePointCount])) { + // The first code point is different from target code point. Skip this node and read + // the next sibling node. + readingHelper->readNextSiblingNode(ptNodeParams); + continue; + } + // Check following merged node code points. + const size_t nodeCodePointCount = ptNodeParams.getCodePointArrayView().size(); + for (size_t j = 1; j < nodeCodePointCount; ++j) { + const size_t nextIndex = matchedCodePointCount + j; + if (nextIndex >= wordCodePoints.size() + || !readingHelper->isMatchedCodePoint(ptNodeParams, j, + wordCodePoints[matchedCodePointCount + j])) { + *outAddedNewUnigram = true; + return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty, + wordCodePoints.skip(matchedCodePointCount)); + } + } + // All characters are matched. 
+ if (wordCodePoints.size() == readingHelper->getTotalCodePointCount(ptNodeParams)) { + return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram); + } + if (!ptNodeParams.hasChildren()) { + *outAddedNewUnigram = true; + return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty, + wordCodePoints.skip(readingHelper->getTotalCodePointCount(ptNodeParams))); + } + // Advance to the children nodes. + parentPos = ptNodeParams.getHeadPos(); + readingHelper->readChildNode(ptNodeParams); + } + if (readingHelper->isError()) { + // The dictionary is invalid. + return false; + } + int pos = readingHelper->getPosOfLastForwardLinkField(); + *outAddedNewUnigram = true; + return createAndInsertNodeIntoPtNodeArray(parentPos, + wordCodePoints.skip(readingHelper->getPrevTotalCodePointCount()), unigramProperty, + &pos); +} + +bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, + const int wordPos, const NgramProperty *const ngramProperty, + bool *const outAddedNewEntry) { + if (prevWordsPtNodePos.empty()) { + return false; + } + ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); + int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { + prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( + prevWordsPtNodePos[i]).getTerminalId(); + } + const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); + const int wordId = + mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); + return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry); +} + +bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, + const int wordPos) { + if (prevWordsPtNodePos.empty()) { + return false; + } + ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); + int 
prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { + prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( + prevWordsPtNodePos[i]).getTerminalId(); + } + const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); + const int wordId = + mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); + return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId); +} + +bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos, + const CodePointArrayView targetCodePoints, const int shortcutProbability) { + const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos)); + return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints.data(), + targetCodePoints.size(), shortcutProbability); +} + +bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, + const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty, + int *const forwardLinkFieldPos) { + const int newPtNodeArrayPos = mBuffer->getTailPosition(); + if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + newPtNodeArrayPos, forwardLinkFieldPos)) { + return false; + } + return createNewPtNodeArrayWithAChildPtNode(parentPos, ptNodeCodePoints, unigramProperty); +} + +bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, + const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { + if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) { + // Overwrites the probability. + *outAddedNewUnigram = false; + return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty); + } else { + // Make the node terminal and write the probability. 
+ *outAddedNewUnigram = true; + const int movedPos = mBuffer->getTailPosition(); + int writingPos = movedPos; + const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams, + unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), + true /* isTerminal */, originalPtNodeParams->getParentPos(), + originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability())); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, + unigramProperty, &writingPos)) { + return false; + } + if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) { + return false; + } + } + return true; +} + +bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode( + const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty, + const CodePointArrayView codePoints) { + const int newPtNodeArrayPos = mBuffer->getTailPosition(); + if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) { + return false; + } + return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints, + unigramProperty); +} + +bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( + const int parentPtNodePos, const CodePointArrayView ptNodeCodePoints, + const UnigramProperty *const unigramProperty) { + int writingPos = mBuffer->getTailPosition(); + if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, + 1 /* arraySize */, &writingPos)) { + return false; + } + const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( + unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), + true /* isTerminal */, parentPtNodePos, ptNodeCodePoints, + unigramProperty->getProbability())); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, + unigramProperty, &writingPos)) { + return false; + } + if 
(!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + return true; +} + +// Returns whether the dictionary update succeeded or not. +bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( + const PtNodeParams *const reallocatingPtNodeParams, const size_t overlappingCodePointCount, + const UnigramProperty *const unigramProperty, + const CodePointArrayView newPtNodeCodePoints) { + // When addsExtraChild is true, split the reallocating PtNode and add a new child. + // Reallocating PtNode: abcde, newNode: abcxy. + // abc (1st, not terminal) __ de (2nd) + // \_ xy (extra child, terminal) + // Otherwise, this method makes the 1st part terminal and writes the information in unigramProperty. + // Reallocating PtNode: abcde, newNode: abc. + // abc (1st, terminal) __ de (2nd) + const bool addsExtraChild = newPtNodeCodePoints.size() > overlappingCodePointCount; + const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); + int writingPos = firstPartOfReallocatedPtNodePos; + // Write the 1st part of the reallocating node. The children position will be updated later + // with the actual children position. 
+ const CodePointArrayView firstPtNodeCodePoints = + reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount); + if (addsExtraChild) { + const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( + false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */, + reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints, + NOT_A_PROBABILITY)); + if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { + return false; + } + } else { + const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( + unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), + true /* isTerminal */, reallocatingPtNodeParams->getParentPos(), + firstPtNodeCodePoints, unigramProperty->getProbability())); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, + unigramProperty, &writingPos)) { + return false; + } + } + const int actualChildrenPos = writingPos; + // Create new children PtNode array. + const size_t newPtNodeCount = addsExtraChild ? 2 : 1; + if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, + newPtNodeCount, &writingPos)) { + return false; + } + // Write the 2nd part of the reallocating node. 
+ const int secondPartOfReallocatedPtNodePos = writingPos; + const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams, + reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(), + reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos, + reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount), + reallocatingPtNodeParams->getProbability())); + if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) { + return false; + } + if (addsExtraChild) { + const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( + unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), + true /* isTerminal */, firstPartOfReallocatedPtNodePos, + newPtNodeCodePoints.skip(overlappingCodePointCount), + unigramProperty->getProbability())); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams, + unigramProperty, &writingPos)) { + return false; + } + } + if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + // Update original reallocating PtNode as moved. + if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos, + secondPartOfReallocatedPtNodePos)) { + return false; + } + // Load node info. Information of the 1st part will be fetched. + const PtNodeParams ptNodeParams( + mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos)); + // Update children position. 
+ return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos); +} + +const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams( + const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, + const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, + const CodePointArrayView codePoints, const int probability) const { + const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( + isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */, + false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */, + CHILDREN_POSITION_FIELD_SIZE); + return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability); +} + +const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord, + const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, + const CodePointArrayView codePoints, const int probability) const { + const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( + isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */, + false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */, + CHILDREN_POSITION_FIELD_SIZE); + return PtNodeParams(flags, parentPos, codePoints, probability); +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h new file mode 100644 index 000000000..e8cf98c39 --- /dev/null +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PT_UPDATING_HELPER_H +#define LATINIME_DYNAMIC_PT_UPDATING_HELPER_H + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class NgramProperty; +class BufferWithExtendableBuffer; +class DynamicPtReadingHelper; +class PtNodeReader; +class PtNodeWriter; +class UnigramProperty; + +class DynamicPtUpdatingHelper { + public: + DynamicPtUpdatingHelper(BufferWithExtendableBuffer *const buffer, + const PtNodeReader *const ptNodeReader, PtNodeWriter *const ptNodeWriter) + : mBuffer(buffer), mPtNodeReader(ptNodeReader), mPtNodeWriter(ptNodeWriter) {} + + ~DynamicPtUpdatingHelper() {} + + // Add a word to the dictionary. If the word already exists, update the probability. + bool addUnigramWord(DynamicPtReadingHelper *const readingHelper, + const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty, + bool *const outAddedNewUnigram); + + // TODO: Remove after stopping supporting v402. + // Add an n-gram entry. + bool addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos, + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); + + // TODO: Remove after stopping supporting v402. + // Remove an n-gram entry. + bool removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos); + + // Add a shortcut target. 
+ bool addShortcutTarget(const int wordPos, const CodePointArrayView targetCodePoints, + const int shortcutProbability); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtUpdatingHelper); + + static const int CHILDREN_POSITION_FIELD_SIZE; + + BufferWithExtendableBuffer *const mBuffer; + const PtNodeReader *const mPtNodeReader; + PtNodeWriter *const mPtNodeWriter; + + bool createAndInsertNodeIntoPtNodeArray(const int parentPos, + const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty, + int *const forwardLinkFieldPos); + + bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, + const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram); + + bool createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams *const parentPtNodeParams, + const UnigramProperty *const unigramProperty, + const CodePointArrayView remainingCodePoints); + + bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, + const CodePointArrayView ptNodeCodePoints, + const UnigramProperty *const unigramProperty); + + bool reallocatePtNodeAndAddNewPtNodes(const PtNodeParams *const reallocatingPtNodeParams, + const size_t overlappingCodePointCount, const UnigramProperty *const unigramProperty, + const CodePointArrayView newPtNodeCodePoints); + + const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams, + const bool isNotAWord, const bool isPossiblyOffensive, const bool isTerminal, + const int parentPos, const CodePointArrayView codePoints, const int probability) const; + + const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord, + const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, + const CodePointArrayView codePoints, const int probability) const; +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PT_UPDATING_HELPER_H */ diff --git a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp 
b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp new file mode 100644 index 000000000..ea760a538 --- /dev/null +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" + +#include <cstddef> +#include <cstdint> +#include <cstdlib> + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const size_t DynamicPtWritingUtils::MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD = 0x7F; +const size_t DynamicPtWritingUtils::MAX_PTNODE_ARRAY_SIZE = 0x7FFF; +const int DynamicPtWritingUtils::SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE = 1; +const int DynamicPtWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE = 2; +const int DynamicPtWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG = 0x8000; +const int DynamicPtWritingUtils::DICT_OFFSET_FIELD_SIZE = 3; +const int DynamicPtWritingUtils::MAX_DICT_OFFSET_VALUE = 0x7FFFFF; +const int DynamicPtWritingUtils::MIN_DICT_OFFSET_VALUE = -0x7FFFFF; +const int DynamicPtWritingUtils::DICT_OFFSET_NEGATIVE_FLAG = 0x800000; +const int DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE = 1; + +/* static */ bool DynamicPtWritingUtils::writeEmptyDictionary( + BufferWithExtendableBuffer *const buffer, const int rootPos) { + int writingPos = rootPos; + if 
(!writePtNodeArraySizeAndAdvancePosition(buffer, 0 
/* arraySize */, &writingPos)) { + return false; + } + return writeForwardLinkPositionAndAdvancePosition(buffer, NOT_A_DICT_POS /* forwardLinkPos */, + &writingPos); +} + +/* static */ bool DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int forwardLinkPos, + int *const forwardLinkFieldPos) { + return writeDictOffset(buffer, forwardLinkPos, (*forwardLinkFieldPos), forwardLinkFieldPos); +} + +/* static */ bool DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const size_t arraySize, + int *const arraySizeFieldPos) { + // Currently, all array size field to be created has LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE to + // simplify updating process. + // TODO: Use SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE for small arrays. + /*if (arraySize <= MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD) { + return buffer->writeUintAndAdvancePosition(arraySize, SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE, + arraySizeFieldPos); + } else */ + if (arraySize <= MAX_PTNODE_ARRAY_SIZE) { + uint32_t data = arraySize | LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG; + return buffer->writeUintAndAdvancePosition(data, LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE, + arraySizeFieldPos); + } else { + AKLOGI("PtNode array size cannot be written because arraySize is too large: %zd", + arraySize); + ASSERT(false); + return false; + } +} + +/* static */ bool DynamicPtWritingUtils::writeFlagsAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, + const DynamicPtReadingUtils::NodeFlags nodeFlags, int *const nodeFlagsFieldPos) { + return buffer->writeUintAndAdvancePosition(nodeFlags, NODE_FLAG_FIELD_SIZE, nodeFlagsFieldPos); +} + +// Note that parentOffset is offset from node's head position. 
+/* static */ bool DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int parentPos, const int basePos, + int *const parentPosFieldPos) { + return writeDictOffset(buffer, parentPos, basePos, parentPosFieldPos); +} + +/* static */ bool DynamicPtWritingUtils::writeCodePointsAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int *const codePoints, + const int codePointCount, int *const codePointFieldPos) { + if (codePointCount <= 0) { + AKLOGI("code points cannot be written because codePointCount is invalid: %d", + codePointCount); + ASSERT(false); + return false; + } + const bool hasMultipleCodePoints = codePointCount > 1; + return buffer->writeCodePointsAndAdvancePosition(codePoints, codePointCount, + hasMultipleCodePoints, codePointFieldPos); +} + +/* static */ bool DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int childrenPosition, + int *const childrenPositionFieldPos) { + return writeDictOffset(buffer, childrenPosition, (*childrenPositionFieldPos), + childrenPositionFieldPos); +} + +/* static */ bool DynamicPtWritingUtils::writeDictOffset(BufferWithExtendableBuffer *const buffer, + const int targetPos, const int basePos, int *const offsetFieldPos) { + int offset = targetPos - basePos; + if (targetPos == NOT_A_DICT_POS) { + offset = DynamicPtReadingUtils::DICT_OFFSET_INVALID; + } else if (offset == 0) { + offset = DynamicPtReadingUtils::DICT_OFFSET_ZERO_OFFSET; + } + if (offset > MAX_DICT_OFFSET_VALUE || offset < MIN_DICT_OFFSET_VALUE) { + AKLOGI("offset cannot be written because the offset is too large or too small: %d", + offset); + ASSERT(false); + return false; + } + uint32_t data = 0; + if (offset >= 0) { + data = offset; + } else { + data = abs(offset) | DICT_OFFSET_NEGATIVE_FLAG; + } + return buffer->writeUintAndAdvancePosition(data, DICT_OFFSET_FIELD_SIZE, offsetFieldPos); +} +} diff --git 
a/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h new file mode 100644 index 000000000..b4817af41 --- /dev/null +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PT_WRITING_UTILS_H +#define LATINIME_DYNAMIC_PT_WRITING_UTILS_H + +#include <cstddef> + +#include "defines.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class DynamicPtWritingUtils { + public: + static const int NODE_FLAG_FIELD_SIZE; + + static bool writeEmptyDictionary(BufferWithExtendableBuffer *const buffer, const int rootPos); + + static bool writeForwardLinkPositionAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int forwardLinkPos, + int *const forwardLinkFieldPos); + + static bool writePtNodeArraySizeAndAdvancePosition(BufferWithExtendableBuffer *const buffer, + const size_t arraySize, int *const arraySizeFieldPos); + + static bool writeFlags(BufferWithExtendableBuffer *const buffer, + const DynamicPtReadingUtils::NodeFlags nodeFlags, + const int nodeFlagsFieldPos) { + int writingPos = nodeFlagsFieldPos; + return writeFlagsAndAdvancePosition(buffer, nodeFlags, &writingPos); + } + + static bool 
writeFlagsAndAdvancePosition(BufferWithExtendableBuffer *const buffer, + const DynamicPtReadingUtils::NodeFlags nodeFlags, + int *const nodeFlagsFieldPos); + + static bool writeParentPosOffsetAndAdvancePosition(BufferWithExtendableBuffer *const buffer, + const int parentPosition, const int basePos, int *const parentPosFieldPos); + + static bool writeCodePointsAndAdvancePosition(BufferWithExtendableBuffer *const buffer, + const int *const codePoints, const int codePointCount, int *const codePointFieldPos); + + static bool writeChildrenPositionAndAdvancePosition(BufferWithExtendableBuffer *const buffer, + const int childrenPosition, int *const childrenPositionFieldPos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtWritingUtils); + + static const size_t MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD; + static const size_t MAX_PTNODE_ARRAY_SIZE; + static const int SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE; + static const int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE; + static const int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG; + static const int DICT_OFFSET_FIELD_SIZE; + static const int MAX_DICT_OFFSET_VALUE; + static const int MIN_DICT_OFFSET_VALUE; + static const int DICT_OFFSET_NEGATIVE_FLAG; + + static bool writeDictOffset(BufferWithExtendableBuffer *const buffer, const int targetPos, + const int basePos, int *const offsetFieldPos); +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PT_WRITING_UTILS_H */ diff --git a/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp new file mode 100644 index 000000000..e2807c492 --- /dev/null +++ b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" + +#include "defines.h" +#include "dictionary/interface/dictionary_bigrams_structure_policy.h" +#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" +#include "dictionary/utils/byte_array_utils.h" + +namespace latinime { + +typedef PatriciaTrieReadingUtils PtReadingUtils; + +const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0; + +// Flag for single/multiple char group +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20; +// Flag for terminal PtNodes +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10; +// Flag for shortcut targets presence +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08; +// Flag for bigram presence +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04; +// Flag for non-words (typically, shortcut only entries) +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02; +// Flag for possibly offensive words +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01; + +/* static */ int 
PtReadingUtils::getPtNodeArraySizeAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); + if (firstByte < 0x80) { + return firstByte; + } else { + return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition( + buffer, pos); + } +} + +/* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); +} + +/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer, + const int *const codePointTable, int *const pos) { + return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos); +} + +// Returns the number of read characters. +/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer, + const NodeFlags flags, const int maxLength, const int *const codePointTable, + int *const outBuffer, int *const pos) { + int length = 0; + if (hasMultipleChars(flags)) { + length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable, + outBuffer, pos); + } else { + const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos); + if (codePoint == NOT_A_CODE_POINT) { + // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is + // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR + // when the PtNode has a single code point. + length = 0; + AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x", + *pos - 1, codePoint, buffer[*pos - 1]); + ASSERT(false); + } else if (maxLength > 0) { + outBuffer[0] = codePoint; + length = 1; + } + } + return length; +} + +// Returns the number of skipped characters. 
+/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags, + const int maxLength, const int *const codePointTable, int *const pos) { + if (hasMultipleChars(flags)) { + return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos); + } else { + if (maxLength > 0) { + getCodePointAndAdvancePosition(buffer, codePointTable, pos); + return 1; + } else { + return 0; + } + } +} + +/* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer, + int *const pos) { + return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); +} + +/* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition( + const uint8_t *const buffer, const NodeFlags flags, int *const pos) { + const int base = *pos; + int offset = 0; + switch (MASK_CHILDREN_POSITION_TYPE & flags) { + case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE: + offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); + break; + case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES: + offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos); + break; + case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES: + offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos); + break; + default: + // If we come here, it means we asked for the children of a word with + // no children. 
+ return NOT_A_DICT_POS; + } + return base + offset; +} + +/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, + const DictionaryShortcutsStructurePolicy *const shortcutPolicy, + const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable, + NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, + int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, + int *const outBigramPos, int *const outSiblingPos) { + int readingPos = ptNodePos; + const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos); + *outFlags = flags; + *outCodePointCount = getCharsAndAdvancePosition( + dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos); + *outProbability = isTerminal(flags) ? + readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY; + *outChildrenPos = hasChildrenInFlags(flags) ? + readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS; + *outShortcutPos = NOT_A_DICT_POS; + if (hasShortcutTargets(flags)) { + *outShortcutPos = readingPos; + shortcutPolicy->skipAllShortcuts(&readingPos); + } + *outBigramPos = NOT_A_DICT_POS; + if (hasBigrams(flags)) { + *outBigramPos = readingPos; + bigramPolicy->skipAllBigrams(&readingPos); + } + *outSiblingPos = readingPos; +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h new file mode 100644 index 000000000..6a2bf5d3c --- /dev/null +++ b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PATRICIA_TRIE_READING_UTILS_H +#define LATINIME_PATRICIA_TRIE_READING_UTILS_H + +#include <cstdint> + +#include "defines.h" + +namespace latinime { + +class DictionaryShortcutsStructurePolicy; +class DictionaryBigramsStructurePolicy; + +class PatriciaTrieReadingUtils { + public: + typedef uint8_t NodeFlags; + + static int getPtNodeArraySizeAndAdvancePosition(const uint8_t *const buffer, int *const pos); + + static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos); + + static int getCodePointAndAdvancePosition(const uint8_t *const buffer, + const int *const codePointTable, int *const pos); + + // Returns the number of read characters. + static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags, + const int maxLength, const int *const codePointTable, int *const outBuffer, + int *const pos); + + // Returns the number of skipped characters. 
+ static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags, + const int maxLength, const int *const codePointTable, int *const pos); + + static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos); + + static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer, + const NodeFlags flags, int *const pos); + + /** + * Node Flags + */ + static AK_FORCE_INLINE bool isPossiblyOffensive(const NodeFlags flags) { + return (flags & FLAG_IS_POSSIBLY_OFFENSIVE) != 0; + } + + static AK_FORCE_INLINE bool isNotAWord(const NodeFlags flags) { + return (flags & FLAG_IS_NOT_A_WORD) != 0; + } + + static AK_FORCE_INLINE bool isTerminal(const NodeFlags flags) { + return (flags & FLAG_IS_TERMINAL) != 0; + } + + static AK_FORCE_INLINE bool hasShortcutTargets(const NodeFlags flags) { + return (flags & FLAG_HAS_SHORTCUT_TARGETS) != 0; + } + + static AK_FORCE_INLINE bool hasBigrams(const NodeFlags flags) { + return (flags & FLAG_HAS_BIGRAMS) != 0; + } + + static AK_FORCE_INLINE bool hasMultipleChars(const NodeFlags flags) { + return (flags & FLAG_HAS_MULTIPLE_CHARS) != 0; + } + + static AK_FORCE_INLINE bool hasChildrenInFlags(const NodeFlags flags) { + return FLAG_CHILDREN_POSITION_TYPE_NOPOSITION != (MASK_CHILDREN_POSITION_TYPE & flags); + } + + static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isPossiblyOffensive, + const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets, + const bool hasBigrams, const bool hasMultipleChars, + const int childrenPositionFieldSize) { + NodeFlags nodeFlags = 0; + nodeFlags = isPossiblyOffensive ? (nodeFlags | FLAG_IS_POSSIBLY_OFFENSIVE) : nodeFlags; + nodeFlags = isNotAWord ? (nodeFlags | FLAG_IS_NOT_A_WORD) : nodeFlags; + nodeFlags = isTerminal ? (nodeFlags | FLAG_IS_TERMINAL) : nodeFlags; + nodeFlags = hasShortcutTargets ? (nodeFlags | FLAG_HAS_SHORTCUT_TARGETS) : nodeFlags; + nodeFlags = hasBigrams ? 
(nodeFlags | FLAG_HAS_BIGRAMS) : nodeFlags; + nodeFlags = hasMultipleChars ? (nodeFlags | FLAG_HAS_MULTIPLE_CHARS) : nodeFlags; + if (childrenPositionFieldSize == 1) { + nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_ONEBYTE; + } else if (childrenPositionFieldSize == 2) { + nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_TWOBYTES; + } else if (childrenPositionFieldSize == 3) { + nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_THREEBYTES; + } else { + nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_NOPOSITION; + } + return nodeFlags; + } + + static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, + const DictionaryShortcutsStructurePolicy *const shortcutPolicy, + const DictionaryBigramsStructurePolicy *const bigramPolicy, + const int *const codePointTable, NodeFlags *const outFlags, + int *const outCodePointCount, int *const outCodePoint, int *const outProbability, + int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos, + int *const outSiblingPos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils); + + static const NodeFlags MASK_CHILDREN_POSITION_TYPE; + static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_NOPOSITION; + static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_ONEBYTE; + static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_TWOBYTES; + static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_THREEBYTES; + + static const NodeFlags FLAG_HAS_MULTIPLE_CHARS; + static const NodeFlags FLAG_IS_TERMINAL; + static const NodeFlags FLAG_HAS_SHORTCUT_TARGETS; + static const NodeFlags FLAG_HAS_BIGRAMS; + static const NodeFlags FLAG_IS_NOT_A_WORD; + static const NodeFlags FLAG_IS_POSSIBLY_OFFENSIVE; +}; +} // namespace latinime +#endif /* LATINIME_PATRICIA_TRIE_READING_UTILS_H */ diff --git a/native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h b/native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h new file mode 100644 index 000000000..6078d8285 --- /dev/null +++ 
b/native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PT_NODE_ARRAY_READER_H +#define LATINIME_PT_NODE_ARRAY_READER_H + +#include "defines.h" + +namespace latinime { + +// Interface class used to read PtNode array information. +class PtNodeArrayReader { + public: + virtual ~PtNodeArrayReader() {} + + // Returns whether the position is valid. + virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const = 0; + + // Returns whether the position is valid. *outNextPtNodeArrayPos is set to NOT_A_DICT_POS when + // the next array doesn't exist. 
+ virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const = 0; + + protected: + PtNodeArrayReader() {}; + + private: + DISALLOW_COPY_AND_ASSIGN(PtNodeArrayReader); +}; +} // namespace latinime +#endif /* LATINIME_PT_NODE_ARRAY_READER_H */ diff --git a/native/jni/src/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/dictionary/structure/pt_common/pt_node_params.h new file mode 100644 index 000000000..905deb1bc --- /dev/null +++ b/native/jni/src/dictionary/structure/pt_common/pt_node_params.h @@ -0,0 +1,262 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PT_NODE_PARAMS_H +#define LATINIME_PT_NODE_PARAMS_H + +#include <cstring> + +#include "defines.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "utils/char_utils.h" +#include "utils/int_array_view.h" + +namespace latinime { + +// This class holds the information of a PtNode. This class is immutable. +class PtNodeParams { + public: + // Invalid PtNode. 
+ PtNodeParams() : mHeadPos(NOT_A_DICT_POS), mFlags(0), mHasMovedFlag(false), + mParentPos(NOT_A_DICT_POS), mCodePointCount(0), mCodePoints(), + mTerminalIdFieldPos(NOT_A_DICT_POS), mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), + mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), + mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS), + mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS), + mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) {} + + PtNodeParams(const PtNodeParams& ptNodeParams) + : mHeadPos(ptNodeParams.mHeadPos), mFlags(ptNodeParams.mFlags), + mHasMovedFlag(ptNodeParams.mHasMovedFlag), mParentPos(ptNodeParams.mParentPos), + mCodePointCount(ptNodeParams.mCodePointCount), mCodePoints(), + mTerminalIdFieldPos(ptNodeParams.mTerminalIdFieldPos), + mTerminalId(ptNodeParams.mTerminalId), + mProbabilityFieldPos(ptNodeParams.mProbabilityFieldPos), + mProbability(ptNodeParams.mProbability), + mChildrenPosFieldPos(ptNodeParams.mChildrenPosFieldPos), + mChildrenPos(ptNodeParams.mChildrenPos), + mBigramLinkedNodePos(ptNodeParams.mBigramLinkedNodePos), + mShortcutPos(ptNodeParams.mShortcutPos), mBigramPos(ptNodeParams.mBigramPos), + mSiblingPos(ptNodeParams.mSiblingPos) { /* only the first mCodePointCount code points are copied */ + memcpy(mCodePoints, ptNodeParams.getCodePoints(), sizeof(int) * mCodePointCount); + } + + // PtNode read from a version 2 dictionary. 
+ PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags, + const int codePointCount, const int *const codePoints, const int probability, + const int childrenPos, const int shortcutPos, const int bigramPos, + const int siblingPos) + : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(false), mParentPos(NOT_A_DICT_POS), + mCodePointCount(codePointCount), mCodePoints(), mTerminalIdFieldPos(NOT_A_DICT_POS), + mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), + mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability), + mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(childrenPos), + mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(shortcutPos), + mBigramPos(bigramPos), mSiblingPos(siblingPos) { + memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount); + } + + // PtNode with a terminal id. The terminal id is also stored as the shortcut and bigram position. + PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags, + const int parentPos, const int codePointCount, const int *const codePoints, + const int terminalIdFieldPos, const int terminalId, const int probability, + const int childrenPosFieldPos, const int childrenPos, const int siblingPos) + : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos), + mCodePointCount(codePointCount), mCodePoints(), + mTerminalIdFieldPos(terminalIdFieldPos), mTerminalId(terminalId), + mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability), + mChildrenPosFieldPos(childrenPosFieldPos), mChildrenPos(childrenPos), + mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(terminalId), + mBigramPos(terminalId), mSiblingPos(siblingPos) { + memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount); + } + + // Construct new params by updating existing PtNode params. 
+ PtNodeParams(const PtNodeParams *const ptNodeParams, + const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos, + const CodePointArrayView codePoints, const int probability) + : mHeadPos(ptNodeParams->getHeadPos()), mFlags(flags), mHasMovedFlag(true), + mParentPos(parentPos), mCodePointCount(codePoints.size()), mCodePoints(), + mTerminalIdFieldPos(ptNodeParams->getTerminalIdFieldPos()), + mTerminalId(ptNodeParams->getTerminalId()), + mProbabilityFieldPos(ptNodeParams->getProbabilityFieldPos()), + mProbability(probability), + mChildrenPosFieldPos(ptNodeParams->getChildrenPosFieldPos()), + mChildrenPos(ptNodeParams->getChildrenPos()), + mBigramLinkedNodePos(ptNodeParams->getBigramLinkedNodePos()), + mShortcutPos(ptNodeParams->getShortcutPos()), + mBigramPos(ptNodeParams->getBigramsPos()), + mSiblingPos(ptNodeParams->getSiblingNodePos()) { + memcpy(mCodePoints, codePoints.data(), sizeof(int) * mCodePointCount); + } + + PtNodeParams(const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos, + const CodePointArrayView codePoints, const int probability) /* no source PtNode: all positions except the parent are NOT_A_DICT_POS */ + : mHeadPos(NOT_A_DICT_POS), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos), + mCodePointCount(codePoints.size()), mCodePoints(), + mTerminalIdFieldPos(NOT_A_DICT_POS), + mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), + mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability), + mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS), + mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS), + mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) { + memcpy(mCodePoints, codePoints.data(), sizeof(int) * mCodePointCount); + } + + AK_FORCE_INLINE bool isValid() const { /* valid iff there is at least one code point */ + return mCodePointCount > 0; + } + + // Head position of the PtNode + AK_FORCE_INLINE int getHeadPos() const { + return mHeadPos; + } + + // Flags. The deleted / will-become-non-terminal flags are reported only when mHasMovedFlag is set. + AK_FORCE_INLINE bool isDeleted() const { + return mHasMovedFlag && DynamicPtReadingUtils::isDeleted(mFlags); + } + + AK_FORCE_INLINE bool 
willBecomeNonTerminal() const { + return mHasMovedFlag && DynamicPtReadingUtils::willBecomeNonTerminal(mFlags); + } + + AK_FORCE_INLINE bool hasChildren() const { + return mChildrenPos != NOT_A_DICT_POS; + } + + AK_FORCE_INLINE bool isTerminal() const { + return PatriciaTrieReadingUtils::isTerminal(mFlags); + } + + AK_FORCE_INLINE bool isPossiblyOffensive() const { + return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags); + } + + AK_FORCE_INLINE bool isNotAWord() const { + return PatriciaTrieReadingUtils::isNotAWord(mFlags); + } + + AK_FORCE_INLINE bool hasBigrams() const { + return PatriciaTrieReadingUtils::hasBigrams(mFlags); + } + + AK_FORCE_INLINE bool hasShortcutTargets() const { + return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags); + } + + AK_FORCE_INLINE bool representsNonWordInfo() const { + return getCodePointCount() > 0 && !CharUtils::isInUnicodeSpace(getCodePoints()[0]) + && isNotAWord(); + } + + AK_FORCE_INLINE int representsBeginningOfSentence() const { + return getCodePointCount() > 0 && getCodePoints()[0] == CODE_POINT_BEGINNING_OF_SENTENCE + && isNotAWord(); + } + + // Parent node position + AK_FORCE_INLINE int getParentPos() const { + return mParentPos; + } + + AK_FORCE_INLINE const CodePointArrayView getCodePointArrayView() const { + return CodePointArrayView(mCodePoints, mCodePointCount); + } + + // TODO: Remove + // Number of code points + AK_FORCE_INLINE uint8_t getCodePointCount() const { + return mCodePointCount; + } + + // TODO: Remove + AK_FORCE_INLINE const int *getCodePoints() const { + return mCodePoints; + } + + // Terminal id + AK_FORCE_INLINE int getTerminalIdFieldPos() const { + return mTerminalIdFieldPos; + } + + AK_FORCE_INLINE int getTerminalId() const { + return mTerminalId; + } + + // Probability + AK_FORCE_INLINE int getProbabilityFieldPos() const { + return mProbabilityFieldPos; + } + + AK_FORCE_INLINE int getProbability() const { + return mProbability; + } + + // Children PtNode array position + AK_FORCE_INLINE 
int getChildrenPosFieldPos() const { + return mChildrenPosFieldPos; + } + + AK_FORCE_INLINE int getChildrenPos() const { + return mChildrenPos; + } + + // Bigram linked node position. + AK_FORCE_INLINE int getBigramLinkedNodePos() const { + return mBigramLinkedNodePos; + } + + // Shortcut list position + AK_FORCE_INLINE int getShortcutPos() const { + return mShortcutPos; + } + + // Bigrams position + AK_FORCE_INLINE int getBigramsPos() const { + return mBigramPos; + } + + // Sibling node position + AK_FORCE_INLINE int getSiblingNodePos() const { + return mSiblingPos; + } + + private: + // This class has a public copy constructor so that it can be used as a return value. + DISALLOW_ASSIGNMENT_OPERATOR(PtNodeParams); + + const int mHeadPos; + const PatriciaTrieReadingUtils::NodeFlags mFlags; + const bool mHasMovedFlag; + const int mParentPos; + const uint8_t mCodePointCount; + int mCodePoints[MAX_WORD_LENGTH]; + const int mTerminalIdFieldPos; + const int mTerminalId; + const int mProbabilityFieldPos; + const int mProbability; + const int mChildrenPosFieldPos; + const int mChildrenPos; + const int mBigramLinkedNodePos; + const int mShortcutPos; + const int mBigramPos; + const int mSiblingPos; +}; +} // namespace latinime +#endif /* LATINIME_PT_NODE_PARAMS_H */ diff --git a/native/jni/src/dictionary/structure/pt_common/pt_node_reader.h b/native/jni/src/dictionary/structure/pt_common/pt_node_reader.h new file mode 100644 index 000000000..15da19e0b --- /dev/null +++ b/native/jni/src/dictionary/structure/pt_common/pt_node_reader.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PT_NODE_READER_H +#define LATINIME_PT_NODE_READER_H + +#include "defines.h" + +#include "dictionary/structure/pt_common/pt_node_params.h" + +namespace latinime { + +// Interface class used to read PtNode information as a PtNodeParams for a given PtNode position. +class PtNodeReader { + public: + virtual ~PtNodeReader() {} + virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos( + const int ptNodePos) const = 0; + + protected: + PtNodeReader() {}; + + private: + DISALLOW_COPY_AND_ASSIGN(PtNodeReader); +}; +} // namespace latinime +#endif /* LATINIME_PT_NODE_READER_H */ diff --git a/native/jni/src/dictionary/structure/pt_common/pt_node_writer.h b/native/jni/src/dictionary/structure/pt_common/pt_node_writer.h new file mode 100644 index 000000000..e6cad25aa --- /dev/null +++ b/native/jni/src/dictionary/structure/pt_common/pt_node_writer.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_PT_NODE_WRITER_H +#define LATINIME_PT_NODE_WRITER_H + +#include <unordered_map> + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class NgramProperty; +class UnigramProperty; + +// Interface class used to write PtNode information. The relocation maps below are keyed and valued by int positions (presumably old position -> new position; TODO confirm). +class PtNodeWriter { + public: + typedef std::unordered_map<int, int> PtNodeArrayPositionRelocationMap; + typedef std::unordered_map<int, int> PtNodePositionRelocationMap; + struct DictPositionRelocationMap { + public: + DictPositionRelocationMap() + : mPtNodeArrayPositionRelocationMap(), mPtNodePositionRelocationMap() {} + + PtNodeArrayPositionRelocationMap mPtNodeArrayPositionRelocationMap; + PtNodePositionRelocationMap mPtNodePositionRelocationMap; + + private: + DISALLOW_COPY_AND_ASSIGN(DictPositionRelocationMap); + }; + + virtual ~PtNodeWriter() {} + + virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams) = 0; + + virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int movedPos, const int bigramLinkedNodePos) = 0; + + virtual bool markPtNodeAsWillBecomeNonTerminal( + const PtNodeParams *const toBeUpdatedPtNodeParams) = 0; + + virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty) = 0; + + virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + const PtNodeParams *const toBeUpdatedPtNodeParams, + bool *const outNeedsToKeepPtNode) = 0; + + virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newChildrenPosition) = 0; + + virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + int *const ptNodeWritingPos) = 0; + + virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) = 0; + + virtual bool 
addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) = 0; + + virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId) = 0; + + virtual bool updateAllBigramEntriesAndDeleteUselessEntries( + const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) = 0; + + virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams, + const DictPositionRelocationMap *const dictPositionRelocationMap, + int *const outBigramEntryCount) = 0; + + virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability) = 0; + + protected: + PtNodeWriter() {}; + + private: + DISALLOW_COPY_AND_ASSIGN(PtNodeWriter); +}; +} // namespace latinime +#endif /* LATINIME_PT_NODE_WRITER_H */ diff --git a/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp new file mode 100644 index 000000000..14428edd4 --- /dev/null +++ b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" + +#include "dictionary/utils/byte_array_utils.h" + +namespace latinime { + +// Flag for presence of more attributes +const ShortcutListReadingUtils::ShortcutFlags + ShortcutListReadingUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80; +// Mask for attribute probability, stored on 4 bits inside the flags byte. +const ShortcutListReadingUtils::ShortcutFlags + ShortcutListReadingUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; +const int ShortcutListReadingUtils::SHORTCUT_LIST_SIZE_FIELD_SIZE = 2; +// The numeric value of the shortcut probability that means 'whitelist'. +const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15; + +/* static */ ShortcutListReadingUtils::ShortcutFlags + ShortcutListReadingUtils::getFlagsAndForwardPointer(const ReadOnlyByteArrayView buffer, + int *const pos) { + return ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), pos); +} + +/* static */ int ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer( + const ReadOnlyByteArrayView buffer, int *const pos) { + // readUint16AndAdvancePosition() returns an offset *including* the uint16 field itself. + return ByteArrayUtils::readUint16AndAdvancePosition(buffer.data(), pos) + - SHORTCUT_LIST_SIZE_FIELD_SIZE; +} + +/* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer, + const int maxLength, int *const outWord, int *const pos) { + // TODO: Use codePointTable for shortcuts. 
+ return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength, + nullptr /* codePointTable */, outWord, pos); +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h new file mode 100644 index 000000000..71cb8cc2c --- /dev/null +++ b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SHORTCUT_LIST_READING_UTILS_H +#define LATINIME_SHORTCUT_LIST_READING_UTILS_H + +#include <cstdint> + +#include "defines.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class ShortcutListReadingUtils { + public: + typedef uint8_t ShortcutFlags; + + static ShortcutFlags getFlagsAndForwardPointer(const ReadOnlyByteArrayView buffer, + int *const pos); + + static AK_FORCE_INLINE int getProbabilityFromFlags(const ShortcutFlags flags) { + return flags & MASK_ATTRIBUTE_PROBABILITY; + } + + static AK_FORCE_INLINE bool hasNext(const ShortcutFlags flags) { + return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0; + } + + // This method returns the size of the shortcut list region excluding the shortcut list size + // field at the beginning. 
+ static int getShortcutListSizeAndForwardPointer(const ReadOnlyByteArrayView buffer, + int *const pos); + + static AK_FORCE_INLINE int getShortcutListSizeFieldSize() { + return SHORTCUT_LIST_SIZE_FIELD_SIZE; + } + + static AK_FORCE_INLINE void skipShortcuts(const ReadOnlyByteArrayView buffer, int *const pos) { + const int shortcutListSize = getShortcutListSizeAndForwardPointer(buffer, pos); + *pos += shortcutListSize; + } + + static AK_FORCE_INLINE bool isWhitelist(const ShortcutFlags flags) { + return getProbabilityFromFlags(flags) == WHITELIST_SHORTCUT_PROBABILITY; + } + + static int readShortcutTarget(const ReadOnlyByteArrayView buffer, const int maxLength, + int *const outWord, int *const pos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListReadingUtils); + + static const ShortcutFlags FLAG_ATTRIBUTE_HAS_NEXT; + static const ShortcutFlags MASK_ATTRIBUTE_PROBABILITY; + static const int SHORTCUT_LIST_SIZE_FIELD_SIZE; + static const int WHITELIST_SHORTCUT_PROBABILITY; +}; +} // namespace latinime +#endif // LATINIME_SHORTCUT_LIST_READING_UTILS_H diff --git a/native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h b/native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h new file mode 100644 index 000000000..25081fa04 --- /dev/null +++ b/native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BIGRAM_LIST_POLICY_H +#define LATINIME_BIGRAM_LIST_POLICY_H + +#include + +#include "defines.h" +#include "dictionary/interface/dictionary_bigrams_structure_policy.h" +#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class BigramListPolicy : public DictionaryBigramsStructurePolicy { + public: + BigramListPolicy(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {} + + ~BigramListPolicy() {} + + void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, + int *const pos) const { + BigramListReadWriteUtils::BigramFlags flags; + if (!BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(mBuffer, &flags, + outBigramPos, pos)) { /* read failed: log, then report no probability and no next entry */ + AKLOGE("Cannot read bigram entry. bufSize: %zd, pos: %d. ", mBuffer.size(), *pos); + *outProbability = NOT_A_PROBABILITY; + *outHasNext = false; + return; + } + *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(flags); + *outHasNext = BigramListReadWriteUtils::hasNext(flags); + } + + bool skipAllBigrams(int *const pos) const { + return BigramListReadWriteUtils::skipExistingBigrams(mBuffer, pos); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListPolicy); + + const ReadOnlyByteArrayView mBuffer; +}; +} // namespace latinime +#endif // LATINIME_BIGRAM_LIST_POLICY_H diff --git a/native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp new file mode 100644 index 000000000..4e8b96b08 --- /dev/null +++ b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp @@ -0,0 +1,526 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v2/patricia_trie_policy.h" + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/property/ngram_context.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/utils/binary_dictionary_bigrams_iterator.h" +#include "dictionary/utils/multi_bigram_map.h" +#include "dictionary/utils/probability_utils.h" +#include "utils/char_utils.h" + +namespace latinime { + +void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + int nextPos = dicNode->getChildrenPtNodeArrayPos(); + if (!isValidPos(nextPos)) { + AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %zd", + nextPos, mBuffer.size()); + mIsCorrupted = true; + ASSERT(false); + return; + } + const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + mBuffer.data(), &nextPos); + for (int i = 0; i < childCount; i++) { + if (!isValidPos(nextPos)) { + AKLOGE("Child PtNode position is invalid. 
pos: %d, dict size: %zd, childCount: %d / %d", + nextPos, mBuffer.size(), i, childCount); + mIsCorrupted = true; + ASSERT(false); + return; + } + nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes); + } +} + +int PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, + const int maxCodePointCount, int *const outCodePoints) const { + return getCodePointsAndProbabilityAndReturnCodePointCount(wordId, maxCodePointCount, + outCodePoints, nullptr /* outUnigramProbability */); +} +// This retrieves code points and the probability of the word by its id. +// Due to the fact that words are ordered in the dictionary in a strict breadth-first order, +// it is possible to check for this with advantageous complexity. For each PtNode array, we search +// for PtNodes with children and compare the children position with the position we look for. +// When we overshoot the position we look for, it means the word we look for is in the children +// of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a +// PtNode array with the last PtNode's children position still less than what we are searching for, +// we must descend the last PtNode's children (for example, if the word we are searching for starts +// with a z, it's the last PtNode of the root array, so all children addresses will be smaller +// than the position we look for, and we have to descend the z PtNode). +/* Parameters : + * wordId: Id of the word we are searching for. + * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size. + * outUnigramProbability: a pointer to an int to write the probability into. + * Return value : the code point count, or 0 if the word was not found. 
+ */ +// TODO: Split this function to be more readable +int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( + const int wordId, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const { + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); + int pos = getRootPosition(); + int wordPos = 0; + const int *const codePointTable = mHeaderPolicy.getCodePointTable(); + if (outUnigramProbability) { + *outUnigramProbability = NOT_A_PROBABILITY; + } + // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will + // only traverse PtNodes that are actually a part of the terminal we are searching, so each + // time we enter this loop we are one depth level further than last time. + // The only reason we count PtNodes is because we want to reduce the probability of infinite + // looping in case there is a bug. Since we know there is an upper bound to the depth we are + // supposed to traverse, it does not hurt to count iterations. + for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) { + int lastCandidatePtNodePos = 0; + // Let's loop through PtNodes in this PtNode array searching for either the terminal + // or one of its ascendants. + if (!isValidPos(pos)) { + AKLOGE("PtNode array position is invalid. pos: %d, dict size: %zd", + pos, mBuffer.size()); + mIsCorrupted = true; + ASSERT(false); + return 0; + } + for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + mBuffer.data(), &pos); ptNodeCount > 0; --ptNodeCount) { + const int startPos = pos; + if (!isValidPos(pos)) { + AKLOGE("PtNode position is invalid. 
pos: %d, dict size: %zd", pos, mBuffer.size()); + mIsCorrupted = true; + ASSERT(false); + return 0; + } + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos); + const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mBuffer.data(), codePointTable, &pos); + if (ptNodePos == startPos) { + // We found the position. Copy the rest of the code points in the buffer and return + // the length. + outCodePoints[wordPos] = character; + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mBuffer.data(), codePointTable, &pos); + // We count code points in order to avoid infinite loops if the file is broken + // or if there is some other bug + int charCount = maxCodePointCount; + while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { + outCodePoints[++wordPos] = nextChar; + nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mBuffer.data(), codePointTable, &pos); + } + } + if (outUnigramProbability) { + *outUnigramProbability = + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition( + mBuffer.data(), &pos); + } + return ++wordPos; + } + // We need to skip past this PtNode, so skip any remaining code points after the + // first and possibly the probability. + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH, + codePointTable, &pos); + } + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos); + } + // The fact that this PtNode has children is very important. Since we already know + // that this PtNode does not match, if it has no children we know it is irrelevant + // to what we are searching for. 
+ const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags); + // We will write in `found' whether we have passed the children position we are + // searching for. For example if we search for "beer", the children of b are less + // than the address we are searching for and the children of c are greater. When we + // come here for c, we realize this is too big, and that we should descend b. + bool found; + if (hasChildren) { + int currentPos = pos; + // Here comes the tricky part. First, read the children position. + const int childrenPos = PatriciaTrieReadingUtils + ::readChildrenPositionAndAdvancePosition(mBuffer.data(), flags, + ¤tPos); + if (childrenPos > ptNodePos) { + // If the children pos is greater than the position, it means the previous + // PtNode, which position is stored in lastCandidatePtNodePos, was the right + // one. + found = true; + } else if (1 >= ptNodeCount) { + // However if we are on the LAST PtNode of this array, and we have NOT shot the + // position we should descend THIS PtNode. So we trick the + // lastCandidatePtNodePos so that we will descend this PtNode, not the previous + // one. + lastCandidatePtNodePos = startPos; + found = true; + } else { + // Else, we should continue looking. + found = false; + } + } else { + // Even if we don't have children here, we could still be on the last PtNode of + // this array. If this is the case, we should descend the last PtNode that had + // children, and their position is already in lastCandidatePtNodePos. + found = (1 >= ptNodeCount); + } + + if (found) { + // Okay, we found the PtNode we should descend. Its position is in + // the lastCandidatePtNodePos variable, so we just re-read it. 
+ if (0 != lastCandidatePtNodePos) { + const PatriciaTrieReadingUtils::NodeFlags lastFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition( + mBuffer.data(), &lastCandidatePtNodePos); + const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mBuffer.data(), codePointTable, &lastCandidatePtNodePos); + // We copy all the characters in this PtNode to the buffer + outCodePoints[wordPos] = lastChar; + if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) { + int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mBuffer.data(), codePointTable, &lastCandidatePtNodePos); + int charCount = maxCodePointCount; + while (-1 != nextChar && --charCount > 0) { + outCodePoints[++wordPos] = nextChar; + nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mBuffer.data(), codePointTable, &lastCandidatePtNodePos); + } + } + ++wordPos; + // Now we only need to branch to the children address. Skip the probability if + // it's there, read pos, and break to resume the search at pos. + if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), + &lastCandidatePtNodePos); + } + pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mBuffer.data(), lastFlags, &lastCandidatePtNodePos); + break; + } else { + // Here is a little tricky part: we come here if we found out that all children + // addresses in this PtNode are bigger than the address we are searching for. + // Should we conclude the word is not in the dictionary? No! It could still be + // one of the remaining PtNodes in this array, so we have to keep looking in + // this array until we find it (or we realize it's not there either, in which + // case it's actually not in the dictionary). Pass the end of this PtNode, + // ready to start the next one. 
+ if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mBuffer.data(), flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + if (!mBigramListPolicy.skipAllBigrams(&pos)) { + AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), + pos); + mIsCorrupted = true; + ASSERT(false); + return 0; + } + } + } + } else { + // If we did not find it, we should record the last children address for the next + // iteration. + if (hasChildren) lastCandidatePtNodePos = startPos; + // Now skip the end of this PtNode (children pos and the attributes if any) so that + // our pos is after the end of this PtNode, at the start of the next one. + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mBuffer.data(), flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + if (!mBigramListPolicy.skipAllBigrams(&pos)) { + AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos); + mIsCorrupted = true; + ASSERT(false); + return 0; + } + } + } + + } + } + // If we have looked through all the PtNodes and found no match, the ptNodePos is + // not the position of a terminal in this dictionary. + return 0; +} + +// This function gets the position of the terminal PtNode of the exact matching word in the +// dictionary. If no match is found, it returns NOT_A_WORD_ID. 
+int PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, + const bool forceLowerCaseSearch) const { + DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), + wordCodePoints.size(), forceLowerCaseSearch); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in getWordId()."); + } + return getWordIdFromTerminalPtNodePos(ptNodePos); +} + +const WordAttributes PatriciaTriePolicy::getWordAttributesInContext( + const WordIdArrayView prevWordIds, const int wordId, + MultiBigramMap *const multiBigramMap) const { + if (wordId == NOT_A_WORD_ID) { + return WordAttributes(); + } + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); + const PtNodeParams ptNodeParams = + mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + if (multiBigramMap) { + const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */, + prevWordIds, wordId, ptNodeParams.getProbability()); + return getWordAttributes(probability, ptNodeParams); + } + if (!prevWordIds.empty()) { + const int bigramProbability = getProbabilityOfWord(prevWordIds, wordId); + if (bigramProbability != NOT_A_PROBABILITY) { + return getWordAttributes(bigramProbability, ptNodeParams); + } + } + return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY), + ptNodeParams); +} + +const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability, + const PtNodeParams &ptNodeParams) const { + return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), + ptNodeParams.isPossiblyOffensive()); +} + +int PatriciaTriePolicy::getProbability(const int unigramProbability, + const int bigramProbability) const { + // Due to space constraints, the probability for bigrams is approximate - the lower the 
unigram + // probability, the worse the precision. The theoritical maximum error in resulting probability + // is 8 - although in the practice it's never bigger than 3 or 4 in very bad cases. This means + // that sometimes, we'll see some bigrams interverted here, but it can't get too bad. + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (bigramProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } else { + return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, + bigramProbability); + } +} + +int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, + const int wordId) const { + if (wordId == NOT_A_WORD_ID) { + return NOT_A_PROBABILITY; + } + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); + const PtNodeParams ptNodeParams = + mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + if (ptNodeParams.isNotAWord()) { + // If this is not a word, it should behave as having no probability outside of the + // suggestion process (where it should be used for shortcuts). 
+ return NOT_A_PROBABILITY; + } + if (!prevWordIds.empty()) { + const int bigramsPosition = getBigramsPositionOfPtNode( + getTerminalPtNodePosFromWordId(prevWordIds[0])); + BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition); + while (bigramsIt.hasNext()) { + bigramsIt.next(); + if (bigramsIt.getBigramPos() == ptNodePos + && bigramsIt.getProbability() != NOT_A_PROBABILITY) { + return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability()); + } + } + return NOT_A_PROBABILITY; + } + return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); +} + +void PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, + NgramListener *const listener) const { + if (prevWordIds.empty()) { + return; + } + const int bigramsPosition = getBigramsPositionOfPtNode( + getTerminalPtNodePosFromWordId(prevWordIds[0])); + BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition); + while (bigramsIt.hasNext()) { + bigramsIt.next(); + listener->onVisitEntry(bigramsIt.getProbability(), + getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos())); + } +} + +BinaryDictionaryShortcutIterator PatriciaTriePolicy::getShortcutIterator(const int wordId) const { + const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId)); + return BinaryDictionaryShortcutIterator(&mShortcutListPolicy, shortcutPos); +} + +int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos(); +} + +int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getBigramsPos(); +} + +int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode, + const int 
ptNodePos, DicNodeVector *childDicNodes) const { + PatriciaTrieReadingUtils::NodeFlags flags; + int mergedNodeCodePointCount = 0; + int mergedNodeCodePoints[MAX_WORD_LENGTH]; + int probability = NOT_A_PROBABILITY; + int childrenPos = NOT_A_DICT_POS; + int shortcutPos = NOT_A_DICT_POS; + int bigramPos = NOT_A_DICT_POS; + int siblingPos = NOT_A_DICT_POS; + const int *const codePointTable = mHeaderPolicy.getCodePointTable(); + PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy, + &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount, + mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos, + &siblingPos); + // Skip PtNodes don't start with Unicode code point because they represent non-word information. + if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) { + const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID; + childDicNodes->pushLeavingChild(dicNode, childrenPos, wordId, + CodePointArrayView(mergedNodeCodePoints, mergedNodeCodePointCount)); + } + return siblingPos; +} + +const WordProperty PatriciaTriePolicy::getWordProperty( + const CodePointArrayView wordCodePoints) const { + const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + AKLOGE("getWordProperty was called for invalid word."); + return WordProperty(); + } + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); + const PtNodeParams ptNodeParams = + mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + // Fetch bigram information. + std::vector ngrams; + const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); + int bigramWord1CodePoints[MAX_WORD_LENGTH]; + BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos); + while (bigramsIt.hasNext()) { + // Fetch the next bigram information and forward the iterator. + bigramsIt.next(); + // Skip the entry if the entry has been deleted. 
This never happens for ver2 dicts. + if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) { + int word1Probability = NOT_A_PROBABILITY; + const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( + getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH, + bigramWord1CodePoints, &word1Probability); + const int probability = getProbability(word1Probability, bigramsIt.getProbability()); + ngrams.emplace_back( + NgramContext(wordCodePoints.data(), wordCodePoints.size(), + ptNodeParams.representsBeginningOfSentence()), + CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(), + probability, HistoricalInfo()); + } + } + // Fetch shortcut information. + std::vector shortcuts; + int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); + if (shortcutPos != NOT_A_DICT_POS) { + int shortcutTargetCodePoints[MAX_WORD_LENGTH]; + ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &shortcutPos); + bool hasNext = true; + while (hasNext) { + const ShortcutListReadingUtils::ShortcutFlags shortcutFlags = + ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, &shortcutPos); + hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags); + const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget( + mBuffer, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos); + const int shortcutProbability = + ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags); + shortcuts.emplace_back( + CodePointArrayView(shortcutTargetCodePoints, shortcutTargetLength).toVector(), + shortcutProbability); + } + } + const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), + ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(), + ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts)); + return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); +} + +int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const 
outCodePoints, + int *const outCodePointCount) { + *outCodePointCount = 0; + if (token == 0) { + // Start iterating the dictionary. + mTerminalPtNodePositionsForIteratingWords.clear(); + DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( + &mTerminalPtNodePositionsForIteratingWords); + DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); + } + const int terminalPtNodePositionsVectorSize = + static_cast(mTerminalPtNodePositionsForIteratingWords.size()); + if (token < 0 || token >= terminalPtNodePositionsVectorSize) { + AKLOGE("Given token %d is invalid.", token); + return 0; + } + const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; + *outCodePointCount = getCodePointsAndReturnCodePointCount( + getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints); + const int nextToken = token + 1; + if (nextToken >= terminalPtNodePositionsVectorSize) { + // All words have been iterated. + mTerminalPtNodePositionsForIteratingWords.clear(); + return 0; + } + return nextToken; +} + +int PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const { + return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos; +} + +int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const { + return wordId == NOT_A_WORD_ID ? 
NOT_A_DICT_POS : wordId; +} + +bool PatriciaTriePolicy::isValidPos(const int pos) const { + return pos >= 0 && pos < static_cast(mBuffer.size()); +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.h new file mode 100644 index 000000000..8edfa7d10 --- /dev/null +++ b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.h @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PATRICIA_TRIE_POLICY_H +#define LATINIME_PATRICIA_TRIE_POLICY_H + +#include +#include + +#include "defines.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/structure/v2/bigram/bigram_list_policy.h" +#include "dictionary/structure/v2/shortcut/shortcut_list_policy.h" +#include "dictionary/structure/v2/ver2_patricia_trie_node_reader.h" +#include "dictionary/structure/v2/ver2_pt_node_array_reader.h" +#include "dictionary/utils/format_utils.h" +#include "dictionary/utils/mmapped_buffer.h" +#include "utils/byte_array_view.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class DicNode; +class DicNodeVector; + +// Word id = Position of a PtNode that represents the word. +// Max supported n-gram is bigram. 
+class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { + public: + PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer) + : mMmappedBuffer(std::move(mmappedBuffer)), + mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(), + FormatUtils::detectFormatVersion(mMmappedBuffer->getReadOnlyByteArrayView())), + mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())), + mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer), + mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy, + mHeaderPolicy.getCodePointTable()), + mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(), + mIsCorrupted(false) {} + + AK_FORCE_INLINE int getRootPosition() const { + return 0; + } + + void createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const; + + int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, + int *const outCodePoints) const; + + int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; + + const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, + const int wordId, MultiBigramMap *const multiBigramMap) const; + + int getProbability(const int unigramProbability, const int bigramProbability) const; + + int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; + + void iterateNgramEntries(const WordIdArrayView prevWordIds, + NgramListener *const listener) const; + + BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; + + const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { + return &mHeaderPolicy; + } + + bool addUnigramEntry(const CodePointArrayView wordCodePoints, + const UnigramProperty *const unigramProperty) { + // This method should not be called for non-updatable dictionary. 
+ AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); + return false; + } + + bool removeUnigramEntry(const CodePointArrayView wordCodePoints) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); + return false; + } + + bool addNgramEntry(const NgramProperty *const ngramProperty) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); + return false; + } + + bool removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); + return false; + } + + bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints, const bool isValidWord, + const HistoricalInfo historicalInfo) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " + "dictionary."); + return false; + } + + bool flush(const char *const filePath) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: flush() is called for non-updatable dictionary."); + return false; + } + + bool flushWithGC(const char *const filePath) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return false; + } + + bool needsToRunGC(const bool mindsBlockByGC) const { + // This method should not be called for non-updatable dictionary. 
+ AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); + return false; + } + + void getProperty(const char *const query, const int queryLength, char *const outResult, + const int maxResultLength) { + // getProperty is not supported for this class. + if (maxResultLength > 0) { + outResult[0] = '\0'; + } + } + + const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; + + int getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount); + + bool isCorrupted() const { + return mIsCorrupted; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy); + + const MmappedBuffer::MmappedBufferPtr mMmappedBuffer; + const HeaderPolicy mHeaderPolicy; + const ReadOnlyByteArrayView mBuffer; + const BigramListPolicy mBigramListPolicy; + const ShortcutListPolicy mShortcutListPolicy; + const Ver2ParticiaTrieNodeReader mPtNodeReader; + const Ver2PtNodeArrayReader mPtNodeArrayReader; + std::vector mTerminalPtNodePositionsForIteratingWords; + mutable bool mIsCorrupted; + + int getCodePointsAndProbabilityAndReturnCodePointCount(const int wordId, + const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const; + int getShortcutPositionOfPtNode(const int ptNodePos) const; + int getBigramsPositionOfPtNode(const int ptNodePos) const; + int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos, + DicNodeVector *const childDicNodes) const; + int getWordIdFromTerminalPtNodePos(const int ptNodePos) const; + int getTerminalPtNodePosFromWordId(const int wordId) const; + const WordAttributes getWordAttributes(const int probability, + const PtNodeParams &ptNodeParams) const; + bool isValidPos(const int pos) const; +}; +} // namespace latinime +#endif // LATINIME_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h b/native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h new 
file mode 100644 index 000000000..995b1ed01 --- /dev/null +++ b/native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SHORTCUT_LIST_POLICY_H +#define LATINIME_SHORTCUT_LIST_POLICY_H + +#include + +#include "defines.h" +#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" +#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { + public: + explicit ShortcutListPolicy(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {} + + ~ShortcutListPolicy() {} + + int getStartPos(const int pos) const { + if (pos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + int listPos = pos; + ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &listPos); + return listPos; + } + + void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, + int *const pos) const { + const ShortcutListReadingUtils::ShortcutFlags flags = + ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, pos); + if (outHasNext) { + *outHasNext = ShortcutListReadingUtils::hasNext(flags); + } + if (outIsWhitelist) { + *outIsWhitelist = 
ShortcutListReadingUtils::isWhitelist(flags); + } + if (outCodePoint) { + *outCodePointCount = ShortcutListReadingUtils::readShortcutTarget( + mBuffer, maxCodePointCount, outCodePoint, pos); + } + } + + void skipAllShortcuts(int *const pos) const { + const int shortcutListSize = ShortcutListReadingUtils + ::getShortcutListSizeAndForwardPointer(mBuffer, pos); + *pos += shortcutListSize; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListPolicy); + + const ReadOnlyByteArrayView mBuffer; +}; +} // namespace latinime +#endif // LATINIME_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp new file mode 100644 index 000000000..cbb8ead81 --- /dev/null +++ b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v2/ver2_patricia_trie_node_reader.h" + +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" + +namespace latinime { + +const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNodePos( + const int ptNodePos) const { + if (ptNodePos < 0 || ptNodePos >= static_cast(mBuffer.size())) { + // Reading invalid position because of bug or broken dictionary. 
+ AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %zd", + ptNodePos, mBuffer.size()); + ASSERT(false); + return PtNodeParams(); + } + PatriciaTrieReadingUtils::NodeFlags flags; + int mergedNodeCodePointCount = 0; + int mergedNodeCodePoints[MAX_WORD_LENGTH]; + int probability = NOT_A_PROBABILITY; + int childrenPos = NOT_A_DICT_POS; + int shortcutPos = NOT_A_DICT_POS; + int bigramPos = NOT_A_DICT_POS; + int siblingPos = NOT_A_DICT_POS; + PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortcutPolicy, + mBigramPolicy, mCodePointTable, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, + &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); + if (mergedNodeCodePointCount <= 0) { + AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount); + ASSERT(false); + return PtNodeParams(); + } + return PtNodeParams(ptNodePos, flags, mergedNodeCodePointCount, mergedNodeCodePoints, + probability, childrenPos, shortcutPos, bigramPos, siblingPos); +} + +} diff --git a/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h new file mode 100644 index 000000000..dc87c7c68 --- /dev/null +++ b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H +#define LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H + +#include + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_reader.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class DictionaryBigramsStructurePolicy; +class DictionaryShortcutsStructurePolicy; + +class Ver2ParticiaTrieNodeReader : public PtNodeReader { + public: + Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer, + const DictionaryBigramsStructurePolicy *const bigramPolicy, + const DictionaryShortcutsStructurePolicy *const shortcutPolicy, + const int *const codePointTable) + : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy), + mCodePointTable(codePointTable) {} + + virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver2ParticiaTrieNodeReader); + + const ReadOnlyByteArrayView mBuffer; + const DictionaryBigramsStructurePolicy *const mBigramPolicy; + const DictionaryShortcutsStructurePolicy *const mShortcutPolicy; + const int *const mCodePointTable; +}; +} // namespace latinime +#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */ diff --git a/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp new file mode 100644 index 000000000..8b9b02df1 --- /dev/null +++ b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v2/ver2_pt_node_array_reader.h" + +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" + +namespace latinime { + +bool Ver2PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const { + if (ptNodeArrayPos < 0 || ptNodeArrayPos >= static_cast(mBuffer.size())) { + // Reading invalid position because of a bug or a broken dictionary. + AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %zd", + ptNodeArrayPos, mBuffer.size()); + ASSERT(false); + return false; + } + int readingPos = ptNodeArrayPos; + const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + mBuffer.data(), &readingPos); + *outPtNodeCount = ptNodeCountInArray; + *outFirstPtNodePos = readingPos; + return true; +} + +bool Ver2PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const { + if (forwordLinkPos < 0 || forwordLinkPos >= static_cast(mBuffer.size())) { + // Reading invalid position because of bug or broken dictionary. + AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %zd", + forwordLinkPos, mBuffer.size()); + ASSERT(false); + return false; + } + // Ver2 dicts don't have forward links. 
+ *outNextPtNodeArrayPos = NOT_A_DICT_POS; + return true; +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h new file mode 100644 index 000000000..32fa96d15 --- /dev/null +++ b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_VER2_PT_NODE_ARRAY_READER_H +#define LATINIME_VER2_PT_NODE_ARRAY_READER_H + +#include + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_array_reader.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class Ver2PtNodeArrayReader : public PtNodeArrayReader { + public: + Ver2PtNodeArrayReader(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {}; + + virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const; + virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const; + + private: + DISALLOW_COPY_AND_ASSIGN(Ver2PtNodeArrayReader); + + const ReadOnlyByteArrayView mBuffer; +}; +} // namespace latinime +#endif /* LATINIME_VER2_PT_NODE_ARRAY_READER_H */ diff --git a/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp new file mode 100644 index 000000000..165947f87 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/content/dynamic_language_model_probability_utils.h" + +namespace latinime { + +// Used to provide stable probabilities even if the user's input count is small. +const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNTS[] = {8192, 2, 2, 1}; + +// Encoded backoff weights. +// Note that we give positive values for trigrams and quadgrams that means the weight is more than +// 1. +// TODO: Apply backoff for main dictionaries and quit giving a positive backoff weight. +const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHTS[] = {-32, -4, 2, 8}; + +// This value is used to remove too old entries from the dictionary. +const int DynamicLanguageModelProbabilityUtils::DURATION_TO_DISCARD_ENTRY_IN_SECONDS = + 300 * 24 * 60 * 60; // 300 days + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h new file mode 100644 index 000000000..71824c954 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H +#define LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H + +#include + +#include "defines.h" +#include "dictionary/property/historical_info.h" +#include "utils/ngram_utils.h" +#include "utils/time_keeper.h" + +namespace latinime { + +class DynamicLanguageModelProbabilityUtils { + public: + static float computeRawProbabilityFromCounts(const int count, const int contextCount, + const NgramType ngramType) { + const int minCount = ASSUMED_MIN_COUNTS[static_cast(ngramType)]; + return static_cast(count) / static_cast(std::max(contextCount, minCount)); + } + + static float backoff(const int ngramProbability, const NgramType ngramType) { + const int probability = + ngramProbability + ENCODED_BACKOFF_WEIGHTS[static_cast(ngramType)]; + return std::min(std::max(probability, NOT_A_PROBABILITY), MAX_PROBABILITY); + } + + static int getDecayedProbability(const int probability, const HistoricalInfo historicalInfo) { + const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp(); + if (elapsedTime < 0) { + AKLOGE("The elapsed time is negatime value. Timestamp overflow?"); + return NOT_A_PROBABILITY; + } + // TODO: Improve this logic. + // We don't modify probability depending on the elapsed time. + return probability; + } + + static int shouldRemoveEntryDuringGC(const HistoricalInfo historicalInfo) { + // TODO: Improve this logic. + const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp(); + return elapsedTime > DURATION_TO_DISCARD_ENTRY_IN_SECONDS; + } + + static int getPriorityToPreventFromEviction(const HistoricalInfo historicalInfo) { + // TODO: Improve this logic. + // More recently input entries get higher priority. 
+ return historicalInfo.getTimestamp(); + } + +private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicLanguageModelProbabilityUtils); + + static_assert(MAX_PREV_WORD_COUNT_FOR_N_GRAM <= 3, "Max supported Ngram is Quadgram."); + + static const int ASSUMED_MIN_COUNTS[]; + static const int ENCODED_BACKOFF_WEIGHTS[]; + static const int DURATION_TO_DISCARD_ENTRY_IN_SECONDS; +}; + +} // namespace latinime +#endif /* LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H */ diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp new file mode 100644 index 000000000..c10e4906b --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp @@ -0,0 +1,478 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/content/language_model_dict_content.h" + +#include +#include + +#include "dictionary/structure/v4/content/dynamic_language_model_probability_utils.h" +#include "dictionary/utils/probability_utils.h" +#include "utils/ngram_utils.h" + +namespace latinime { + +const int LanguageModelDictContent::TRIE_MAP_BUFFER_INDEX = 0; +const int LanguageModelDictContent::GLOBAL_COUNTERS_BUFFER_INDEX = 1; + +bool LanguageModelDictContent::save(FILE *const file) const { + return mTrieMap.save(file) && mGlobalCounters.save(file); +} + +bool LanguageModelDictContent::runGC( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const LanguageModelDictContent *const originalContent) { + return runGCInner(terminalIdMap, originalContent->mTrieMap.getEntriesInRootLevel(), + 0 /* nextLevelBitmapEntryIndex */); +} + +const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds, + const int wordId, const bool mustMatchAllPrevWords, + const HeaderPolicy *const headerPolicy) const { + int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; + bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex(); + int maxPrevWordCount = 0; + for (size_t i = 0; i < prevWordIds.size(); ++i) { + const int nextBitmapEntryIndex = + mTrieMap.get(prevWordIds[i], bitmapEntryIndices[i]).mNextLevelBitmapEntryIndex; + if (nextBitmapEntryIndex == TrieMap::INVALID_INDEX) { + break; + } + maxPrevWordCount = i + 1; + bitmapEntryIndices[i + 1] = nextBitmapEntryIndex; + } + + const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); + if (mHasHistoricalInfo && unigramProbabilityEntry.getHistoricalInfo()->getCount() == 0) { + // The word should be treated as a invalid word. 
+ return WordAttributes(); + } + for (int i = maxPrevWordCount; i >= 0; --i) { + if (mustMatchAllPrevWords && prevWordIds.size() > static_cast(i)) { + break; + } + const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndices[i]); + if (!result.mIsValid) { + continue; + } + const ProbabilityEntry probabilityEntry = + ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); + int probability = NOT_A_PROBABILITY; + if (mHasHistoricalInfo) { + const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); + int contextCount = 0; + if (i == 0) { + // unigram + contextCount = mGlobalCounters.getTotalCount(); + } else { + const ProbabilityEntry prevWordProbabilityEntry = getNgramProbabilityEntry( + prevWordIds.skip(1 /* n */).limit(i - 1), prevWordIds[0]); + if (!prevWordProbabilityEntry.isValid()) { + continue; + } + if (prevWordProbabilityEntry.representsBeginningOfSentence() + && historicalInfo->getCount() == 1) { + // BoS ngram requires multiple contextCount. + continue; + } + contextCount = prevWordProbabilityEntry.getHistoricalInfo()->getCount(); + } + const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(i + 1); + const float rawProbability = + DynamicLanguageModelProbabilityUtils::computeRawProbabilityFromCounts( + historicalInfo->getCount(), contextCount, ngramType); + const int encodedRawProbability = + ProbabilityUtils::encodeRawProbability(rawProbability); + const int decayedProbability = + DynamicLanguageModelProbabilityUtils::getDecayedProbability( + encodedRawProbability, *historicalInfo); + probability = DynamicLanguageModelProbabilityUtils::backoff( + decayedProbability, ngramType); + } else { + probability = probabilityEntry.getProbability(); + } + // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in + // probabilityEntry. 
+ return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(), + unigramProbabilityEntry.isNotAWord(), + unigramProbabilityEntry.isPossiblyOffensive()); + } + // Cannot find the word. + return WordAttributes(); +} + +ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry( + const WordIdArrayView prevWordIds, const int wordId) const { + const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); + if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { + return ProbabilityEntry(); + } + const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); + if (!result.mIsValid) { + // Not found. + return ProbabilityEntry(); + } + return ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); +} + +bool LanguageModelDictContent::setNgramProbabilityEntry(const WordIdArrayView prevWordIds, + const int wordId, const ProbabilityEntry *const probabilityEntry) { + if (wordId == Ver4DictConstants::NOT_A_TERMINAL_ID) { + return false; + } + const int bitmapEntryIndex = createAndGetBitmapEntryIndex(prevWordIds); + if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { + return false; + } + return mTrieMap.put(wordId, probabilityEntry->encode(mHasHistoricalInfo), bitmapEntryIndex); +} + +bool LanguageModelDictContent::removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, + const int wordId) { + const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); + if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { + // Cannot find bitmap entry for the probability entry. The entry doesn't exist. 
+ return false; + } + return mTrieMap.remove(wordId, bitmapEntryIndex); +} + +LanguageModelDictContent::EntryRange LanguageModelDictContent::getProbabilityEntries( + const WordIdArrayView prevWordIds) const { + const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); + return EntryRange(mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex), mHasHistoricalInfo); +} + +std::vector + LanguageModelDictContent::exportAllNgramEntriesRelatedToWord( + const HeaderPolicy *const headerPolicy, const int wordId) const { + const TrieMap::Result result = mTrieMap.getRoot(wordId); + if (!result.mIsValid || result.mNextLevelBitmapEntryIndex == TrieMap::INVALID_INDEX) { + // The word doesn't have any related ngram entries. + return std::vector(); + } + std::vector prevWordIds = { wordId }; + std::vector entries; + exportAllNgramEntriesRelatedToWordInner(headerPolicy, result.mNextLevelBitmapEntryIndex, + &prevWordIds, &entries); + return entries; +} + +void LanguageModelDictContent::exportAllNgramEntriesRelatedToWordInner( + const HeaderPolicy *const headerPolicy, const int bitmapEntryIndex, + std::vector *const prevWordIds, + std::vector *const outBummpedFullEntryInfo) const { + for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { + const int wordId = entry.key(); + const ProbabilityEntry probabilityEntry = + ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); + if (probabilityEntry.isValid()) { + const WordAttributes wordAttributes = getWordAttributes( + WordIdArrayView(*prevWordIds), wordId, true /* mustMatchAllPrevWords */, + headerPolicy); + outBummpedFullEntryInfo->emplace_back(*prevWordIds, wordId, + wordAttributes, probabilityEntry); + } + if (entry.hasNextLevelMap()) { + prevWordIds->push_back(wordId); + exportAllNgramEntriesRelatedToWordInner(headerPolicy, + entry.getNextLevelBitmapEntryIndex(), prevWordIds, outBummpedFullEntryInfo); + prevWordIds->pop_back(); + } + } +} + +bool LanguageModelDictContent::truncateEntries(const 
EntryCounts ¤tEntryCounts, + const EntryCounts &maxEntryCounts, const HeaderPolicy *const headerPolicy, + MutableEntryCounters *const outEntryCounters) { + for (int prevWordCount = 0; prevWordCount <= MAX_PREV_WORD_COUNT_FOR_N_GRAM; ++prevWordCount) { + const int totalWordCount = prevWordCount + 1; + const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(totalWordCount); + if (currentEntryCounts.getNgramCount(ngramType) + <= maxEntryCounts.getNgramCount(ngramType)) { + outEntryCounters->setNgramCount(ngramType, + currentEntryCounts.getNgramCount(ngramType)); + continue; + } + int entryCount = 0; + if (!turncateEntriesInSpecifiedLevel(headerPolicy, + maxEntryCounts.getNgramCount(ngramType), prevWordCount, &entryCount)) { + return false; + } + outEntryCounters->setNgramCount(ngramType, entryCount); + } + return true; +} + +bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, + const int wordId, const bool isValid, const HistoricalInfo historicalInfo, + const HeaderPolicy *const headerPolicy, MutableEntryCounters *const entryCountersToUpdate) { + if (!mHasHistoricalInfo) { + AKLOGE("updateAllEntriesOnInputWord is called for dictionary without historical info."); + return false; + } + const ProbabilityEntry originalUnigramProbabilityEntry = getProbabilityEntry(wordId); + const ProbabilityEntry updatedUnigramProbabilityEntry = createUpdatedEntryFrom( + originalUnigramProbabilityEntry, isValid, historicalInfo, headerPolicy); + if (!setProbabilityEntry(wordId, &updatedUnigramProbabilityEntry)) { + return false; + } + mGlobalCounters.incrementTotalCount(); + mGlobalCounters.updateMaxValueOfCounters( + updatedUnigramProbabilityEntry.getHistoricalInfo()->getCount()); + for (size_t i = 0; i < prevWordIds.size(); ++i) { + if (prevWordIds[i] == NOT_A_WORD_ID) { + break; + } + // TODO: Optimize this code. 
+ const WordIdArrayView limitedPrevWordIds = prevWordIds.limit(i + 1); + const ProbabilityEntry originalNgramProbabilityEntry = getNgramProbabilityEntry( + limitedPrevWordIds, wordId); + const ProbabilityEntry updatedNgramProbabilityEntry = createUpdatedEntryFrom( + originalNgramProbabilityEntry, isValid, historicalInfo, headerPolicy); + if (!setNgramProbabilityEntry(limitedPrevWordIds, wordId, &updatedNgramProbabilityEntry)) { + return false; + } + mGlobalCounters.updateMaxValueOfCounters( + updatedNgramProbabilityEntry.getHistoricalInfo()->getCount()); + if (!originalNgramProbabilityEntry.isValid()) { + // (i + 2) words are used in total because the prevWords consists of (i + 1) words when + // looking at its i-th element. + entryCountersToUpdate->incrementNgramCount( + NgramUtils::getNgramTypeFromWordCount(i + 2)); + } + } + return true; +} + +const ProbabilityEntry LanguageModelDictContent::createUpdatedEntryFrom( + const ProbabilityEntry &originalProbabilityEntry, const bool isValid, + const HistoricalInfo historicalInfo, const HeaderPolicy *const headerPolicy) const { + const HistoricalInfo updatedHistoricalInfo = HistoricalInfo(historicalInfo.getTimestamp(), + 0 /* level */, originalProbabilityEntry.getHistoricalInfo()->getCount() + + historicalInfo.getCount()); + if (originalProbabilityEntry.isValid()) { + return ProbabilityEntry(originalProbabilityEntry.getFlags(), &updatedHistoricalInfo); + } else { + return ProbabilityEntry(0 /* flags */, &updatedHistoricalInfo); + } +} + +bool LanguageModelDictContent::runGCInner( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex) { + for (auto &entry : trieMapRange) { + const auto it = terminalIdMap->find(entry.key()); + if (it == terminalIdMap->end() || it->second == Ver4DictConstants::NOT_A_TERMINAL_ID) { + // The word has been removed. 
+ continue; + } + if (!mTrieMap.put(it->second, entry.value(), nextLevelBitmapEntryIndex)) { + return false; + } + if (entry.hasNextLevelMap()) { + if (!runGCInner(terminalIdMap, entry.getEntriesInNextLevel(), + mTrieMap.getNextLevelBitmapEntryIndex(it->second, nextLevelBitmapEntryIndex))) { + return false; + } + } + } + return true; +} + +int LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds) { + int lastBitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex(); + for (const int wordId : prevWordIds) { + const TrieMap::Result result = mTrieMap.get(wordId, lastBitmapEntryIndex); + if (result.mIsValid && result.mNextLevelBitmapEntryIndex != TrieMap::INVALID_INDEX) { + lastBitmapEntryIndex = result.mNextLevelBitmapEntryIndex; + continue; + } + if (!result.mIsValid) { + if (!mTrieMap.put(wordId, ProbabilityEntry().encode(mHasHistoricalInfo), + lastBitmapEntryIndex)) { + AKLOGE("Failed to update trie map. wordId: %d, lastBitmapEntryIndex %d", wordId, + lastBitmapEntryIndex); + return TrieMap::INVALID_INDEX; + } + } + lastBitmapEntryIndex = mTrieMap.getNextLevelBitmapEntryIndex(wordId, + lastBitmapEntryIndex); + } + return lastBitmapEntryIndex; +} + +int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWordIds) const { + int bitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex(); + for (const int wordId : prevWordIds) { + const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); + if (!result.mIsValid) { + return TrieMap::INVALID_INDEX; + } + bitmapEntryIndex = result.mNextLevelBitmapEntryIndex; + } + return bitmapEntryIndex; +} + +bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, + const int prevWordCount, const HeaderPolicy *const headerPolicy, + const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) { + for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { + if (prevWordCount > 
MAX_PREV_WORD_COUNT_FOR_N_GRAM) { + AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.", + prevWordCount, MAX_PREV_WORD_COUNT_FOR_N_GRAM); + return false; + } + const ProbabilityEntry probabilityEntry = + ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); + if (prevWordCount > 0 && probabilityEntry.isValid() + && !mTrieMap.getRoot(entry.key()).mIsValid) { + // The entry is related to a word that has been removed. Remove the entry. + if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { + return false; + } + continue; + } + if (mHasHistoricalInfo && probabilityEntry.isValid()) { + const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo(); + if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC( + *originalHistoricalInfo)) { + // Remove the entry. + if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { + return false; + } + continue; + } + if (needsToHalveCounters) { + const int updatedCount = originalHistoricalInfo->getCount() / 2; + if (updatedCount == 0) { + // Remove the entry. 
+ if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { + return false; + } + continue; + } + const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(), + originalHistoricalInfo->getLevel(), updatedCount); + const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(), + &historicalInfoToSave); + if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo), + bitmapEntryIndex)) { + return false; + } + } + } + outEntryCounters->incrementNgramCount( + NgramUtils::getNgramTypeFromWordCount(prevWordCount + 1)); + if (!entry.hasNextLevelMap()) { + continue; + } + if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(), + prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) { + return false; + } + } + return true; +} + +bool LanguageModelDictContent::turncateEntriesInSpecifiedLevel( + const HeaderPolicy *const headerPolicy, const int maxEntryCount, const int targetLevel, + int *const outEntryCount) { + std::vector prevWordIds; + std::vector entryInfoVector; + if (!getEntryInfo(headerPolicy, targetLevel, mTrieMap.getRootBitmapEntryIndex(), + &prevWordIds, &entryInfoVector)) { + return false; + } + if (static_cast(entryInfoVector.size()) <= maxEntryCount) { + *outEntryCount = static_cast(entryInfoVector.size()); + return true; + } + *outEntryCount = maxEntryCount; + const int entryCountToRemove = static_cast(entryInfoVector.size()) - maxEntryCount; + std::partial_sort(entryInfoVector.begin(), entryInfoVector.begin() + entryCountToRemove, + entryInfoVector.end(), + EntryInfoToTurncate::Comparator()); + for (int i = 0; i < entryCountToRemove; ++i) { + const EntryInfoToTurncate &entryInfo = entryInfoVector[i]; + if (!removeNgramProbabilityEntry( + WordIdArrayView(entryInfo.mPrevWordIds, entryInfo.mPrevWordCount), + entryInfo.mKey)) { + return false; + } + } + return true; +} + +bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPolicy, + const int targetLevel, const int 
bitmapEntryIndex, std::vector *const prevWordIds, + std::vector *const outEntryInfo) const { + const int prevWordCount = prevWordIds->size(); + for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { + if (prevWordCount < targetLevel) { + if (!entry.hasNextLevelMap()) { + continue; + } + prevWordIds->push_back(entry.key()); + if (!getEntryInfo(headerPolicy, targetLevel, entry.getNextLevelBitmapEntryIndex(), + prevWordIds, outEntryInfo)) { + return false; + } + prevWordIds->pop_back(); + continue; + } + const ProbabilityEntry probabilityEntry = + ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); + const int priority = mHasHistoricalInfo + ? DynamicLanguageModelProbabilityUtils::getPriorityToPreventFromEviction( + *probabilityEntry.getHistoricalInfo()) + : probabilityEntry.getProbability(); + outEntryInfo->emplace_back(priority, probabilityEntry.getHistoricalInfo()->getCount(), + entry.key(), targetLevel, prevWordIds->data()); + } + return true; +} + +bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()( + const EntryInfoToTurncate &left, const EntryInfoToTurncate &right) const { + if (left.mPriority != right.mPriority) { + return left.mPriority < right.mPriority; + } + if (left.mCount != right.mCount) { + return left.mCount < right.mCount; + } + if (left.mKey != right.mKey) { + return left.mKey < right.mKey; + } + if (left.mPrevWordCount != right.mPrevWordCount) { + return left.mPrevWordCount > right.mPrevWordCount; + } + for (int i = 0; i < left.mPrevWordCount; ++i) { + if (left.mPrevWordIds[i] != right.mPrevWordIds[i]) { + return left.mPrevWordIds[i] < right.mPrevWordIds[i]; + } + } + // left and rigth represent the same entry. 
+ return false; +} + +LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int priority, + const int count, const int key, const int prevWordCount, const int *const prevWordIds) + : mPriority(priority), mCount(count), mKey(key), mPrevWordCount(prevWordCount) { + memmove(mPrevWordIds, prevWordIds, mPrevWordCount * sizeof(mPrevWordIds[0])); +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h new file mode 100644 index 000000000..db8c6e12b --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H +#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H + +#include +#include + +#include "defines.h" +#include "dictionary/property/word_attributes.h" +#include "dictionary/structure/v4/content/language_model_dict_content_global_counters.h" +#include "dictionary/structure/v4/content/probability_entry.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/utils/entry_counters.h" +#include "dictionary/utils/trie_map.h" +#include "utils/byte_array_view.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class HeaderPolicy; + +/** + * Class representing language model. + * + * This class provides methods to get and store unigram/n-gram probability information and flags. + */ +class LanguageModelDictContent { + public: + // Pair of word id and probability entry used for iteration. + class WordIdAndProbabilityEntry { + public: + WordIdAndProbabilityEntry(const int wordId, const ProbabilityEntry &probabilityEntry) + : mWordId(wordId), mProbabilityEntry(probabilityEntry) {} + + int getWordId() const { return mWordId; } + const ProbabilityEntry getProbabilityEntry() const { return mProbabilityEntry; } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(WordIdAndProbabilityEntry); + DISALLOW_ASSIGNMENT_OPERATOR(WordIdAndProbabilityEntry); + + const int mWordId; + const ProbabilityEntry mProbabilityEntry; + }; + + // Iterator. 
+ class EntryIterator { + public: + EntryIterator(const TrieMap::TrieMapIterator &trieMapIterator, + const bool hasHistoricalInfo) + : mTrieMapIterator(trieMapIterator), mHasHistoricalInfo(hasHistoricalInfo) {} + + const WordIdAndProbabilityEntry operator*() const { + const TrieMap::TrieMapIterator::IterationResult &result = *mTrieMapIterator; + return WordIdAndProbabilityEntry( + result.key(), ProbabilityEntry::decode(result.value(), mHasHistoricalInfo)); + } + + bool operator!=(const EntryIterator &other) const { + return mTrieMapIterator != other.mTrieMapIterator; + } + + const EntryIterator &operator++() { + ++mTrieMapIterator; + return *this; + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(EntryIterator); + DISALLOW_ASSIGNMENT_OPERATOR(EntryIterator); + + TrieMap::TrieMapIterator mTrieMapIterator; + const bool mHasHistoricalInfo; + }; + + // Class represents range to use range base for loops. + class EntryRange { + public: + EntryRange(const TrieMap::TrieMapRange trieMapRange, const bool hasHistoricalInfo) + : mTrieMapRange(trieMapRange), mHasHistoricalInfo(hasHistoricalInfo) {} + + EntryIterator begin() const { + return EntryIterator(mTrieMapRange.begin(), mHasHistoricalInfo); + } + + EntryIterator end() const { + return EntryIterator(mTrieMapRange.end(), mHasHistoricalInfo); + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(EntryRange); + DISALLOW_ASSIGNMENT_OPERATOR(EntryRange); + + const TrieMap::TrieMapRange mTrieMapRange; + const bool mHasHistoricalInfo; + }; + + class DumppedFullEntryInfo { + public: + DumppedFullEntryInfo(std::vector &prevWordIds, const int targetWordId, + const WordAttributes &wordAttributes, const ProbabilityEntry &probabilityEntry) + : mPrevWordIds(prevWordIds), mTargetWordId(targetWordId), + mWordAttributes(wordAttributes), mProbabilityEntry(probabilityEntry) {} + + const WordIdArrayView getPrevWordIds() const { return WordIdArrayView(mPrevWordIds); } + int getTargetWordId() const { return mTargetWordId; } + const WordAttributes 
&getWordAttributes() const { return mWordAttributes; } + const ProbabilityEntry &getProbabilityEntry() const { return mProbabilityEntry; } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(DumppedFullEntryInfo); + + const std::vector<int> mPrevWordIds; + const int mTargetWordId; + const WordAttributes mWordAttributes; + const ProbabilityEntry mProbabilityEntry; + }; + + LanguageModelDictContent(const ReadWriteByteArrayView *const buffers, + const bool hasHistoricalInfo) + : mTrieMap(buffers[TRIE_MAP_BUFFER_INDEX]), + mGlobalCounters(buffers[GLOBAL_COUNTERS_BUFFER_INDEX]), + mHasHistoricalInfo(hasHistoricalInfo) {} + + explicit LanguageModelDictContent(const bool hasHistoricalInfo) + : mTrieMap(), mGlobalCounters(), mHasHistoricalInfo(hasHistoricalInfo) {} + + bool isNearSizeLimit() const { + return mTrieMap.isNearSizeLimit() || mGlobalCounters.needsToHalveCounters(); + } + + bool save(FILE *const file) const; + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const LanguageModelDictContent *const originalContent); + + const WordAttributes getWordAttributes(const WordIdArrayView prevWordIds, const int wordId, + const bool mustMatchAllPrevWords, const HeaderPolicy *const headerPolicy) const; + + ProbabilityEntry getProbabilityEntry(const int wordId) const { + return getNgramProbabilityEntry(WordIdArrayView(), wordId); + } + + bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) { + mGlobalCounters.addToTotalCount(probabilityEntry->getHistoricalInfo()->getCount()); + return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry); + } + + bool removeProbabilityEntry(const int wordId) { + return removeNgramProbabilityEntry(WordIdArrayView(), wordId); + } + + ProbabilityEntry getNgramProbabilityEntry(const WordIdArrayView prevWordIds, + const int wordId) const; + + bool setNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId, + const ProbabilityEntry *const
probabilityEntry); + + bool removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId); + + EntryRange getProbabilityEntries(const WordIdArrayView prevWordIds) const; + + std::vector<DumppedFullEntryInfo> exportAllNgramEntriesRelatedToWord( + const HeaderPolicy *const headerPolicy, const int wordId) const; + + bool updateAllProbabilityEntriesForGC(const HeaderPolicy *const headerPolicy, + MutableEntryCounters *const outEntryCounters) { + if (!updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(), + 0 /* prevWordCount */, headerPolicy, mGlobalCounters.needsToHalveCounters(), + outEntryCounters)) { + return false; + } + if (mGlobalCounters.needsToHalveCounters()) { + mGlobalCounters.halveCounters(); + } + return true; + } + + // entryCounts should be created by updateAllProbabilityEntries. + bool truncateEntries(const EntryCounts &currentEntryCounts, const EntryCounts &maxEntryCounts, + const HeaderPolicy *const headerPolicy, MutableEntryCounters *const outEntryCounters); + + bool updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, const int wordId, + const bool isValid, const HistoricalInfo historicalInfo, + const HeaderPolicy *const headerPolicy, + MutableEntryCounters *const entryCountersToUpdate); + + private: + DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContent); + + class EntryInfoToTurncate { + public: + class Comparator { + public: + bool operator()(const EntryInfoToTurncate &left, + const EntryInfoToTurncate &right) const; + private: + DISALLOW_ASSIGNMENT_OPERATOR(Comparator); + }; + + EntryInfoToTurncate(const int priority, const int count, const int key, + const int prevWordCount, const int *const prevWordIds); + + int mPriority; + // TODO: Remove.
+ int mCount; + int mKey; + int mPrevWordCount; + int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(EntryInfoToTurncate); + }; + + static const int TRIE_MAP_BUFFER_INDEX; + static const int GLOBAL_COUNTERS_BUFFER_INDEX; + + TrieMap mTrieMap; + LanguageModelDictContentGlobalCounters mGlobalCounters; + const bool mHasHistoricalInfo; + + bool runGCInner(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex); + int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds); + int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const; + bool updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount, + const HeaderPolicy *const headerPolicy, const bool needsToHalveCounters, + MutableEntryCounters *const outEntryCounters); + bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy, + const int maxEntryCount, const int targetLevel, int *const outEntryCount); + bool getEntryInfo(const HeaderPolicy *const headerPolicy, const int targetLevel, + const int bitmapEntryIndex, std::vector<int> *const prevWordIds, + std::vector<EntryInfoToTurncate> *const outEntryInfo) const; + const ProbabilityEntry createUpdatedEntryFrom(const ProbabilityEntry &originalProbabilityEntry, + const bool isValid, const HistoricalInfo historicalInfo, + const HeaderPolicy *const headerPolicy) const; + void exportAllNgramEntriesRelatedToWordInner(const HeaderPolicy *const headerPolicy, + const int bitmapEntryIndex, std::vector<int> *const prevWordIds, + std::vector<DumppedFullEntryInfo> *const outBummpedFullEntryInfo) const; +}; +} // namespace latinime +#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H */ diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp new file mode 100644 index
000000000..89cf0e306 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/content/language_model_dict_content_global_counters.h" + +#include <climits> + +#include "dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +const int LanguageModelDictContentGlobalCounters::COUNTER_VALUE_NEAR_LIMIT_THRESHOLD = + (1 << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) - 64; +const int LanguageModelDictContentGlobalCounters::TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD = 1 << 30; +const int LanguageModelDictContentGlobalCounters::COUNTER_SIZE_IN_BYTES = 4; +const int LanguageModelDictContentGlobalCounters::TOTAL_COUNT_INDEX = 0; +const int LanguageModelDictContentGlobalCounters::MAX_VALUE_OF_COUNTERS_INDEX = 1; + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h new file mode 100644 index 000000000..3f87c0ea0 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H +#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H + +#include <cstdio> + +#include "defines.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class LanguageModelDictContentGlobalCounters { + public: + explicit LanguageModelDictContentGlobalCounters(const ReadWriteByteArrayView buffer) + : mBuffer(buffer, 0 /* maxAdditionalBufferSize */), + mTotalCount(readValue(mBuffer, TOTAL_COUNT_INDEX)), + mMaxValueOfCounters(readValue(mBuffer, MAX_VALUE_OF_COUNTERS_INDEX)) {} + + LanguageModelDictContentGlobalCounters() + : mBuffer(0 /* maxAdditionalBufferSize */), mTotalCount(0), mMaxValueOfCounters(0) {} + + bool needsToHalveCounters() const { + return mMaxValueOfCounters >= COUNTER_VALUE_NEAR_LIMIT_THRESHOLD + || mTotalCount >= TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD; + } + + int getTotalCount() const { + return mTotalCount; + } + + bool save(FILE *const file) const { + BufferWithExtendableBuffer bufferToWrite( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + if (!bufferToWrite.writeUint(mTotalCount, COUNTER_SIZE_IN_BYTES, + TOTAL_COUNT_INDEX * COUNTER_SIZE_IN_BYTES)) { + return false; + } + if (!bufferToWrite.writeUint(mMaxValueOfCounters, COUNTER_SIZE_IN_BYTES, + MAX_VALUE_OF_COUNTERS_INDEX * COUNTER_SIZE_IN_BYTES)) { + return false; + } + return
DictFileWritingUtils::writeBufferToFileTail(file, &bufferToWrite); + } + + void incrementTotalCount() { + mTotalCount += 1; + } + + void addToTotalCount(const int count) { + mTotalCount += count; + } + + void updateMaxValueOfCounters(const int count) { + mMaxValueOfCounters = std::max(count, mMaxValueOfCounters); + } + + void halveCounters() { + mMaxValueOfCounters /= 2; + mTotalCount /= 2; + } + +private: + DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContentGlobalCounters); + + const static int COUNTER_VALUE_NEAR_LIMIT_THRESHOLD; + const static int TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD; + const static int COUNTER_SIZE_IN_BYTES; + const static int TOTAL_COUNT_INDEX; + const static int MAX_VALUE_OF_COUNTERS_INDEX; + + BufferWithExtendableBuffer mBuffer; + int mTotalCount; + int mMaxValueOfCounters; + + static int readValue(const BufferWithExtendableBuffer &buffer, const int index) { + const int pos = COUNTER_SIZE_IN_BYTES * index; + if (pos + COUNTER_SIZE_IN_BYTES > buffer.getTailPosition()) { + return 0; + } + return buffer.readUint(COUNTER_SIZE_IN_BYTES, pos); + } +}; +} // namespace latinime +#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H */ diff --git a/native/jni/src/dictionary/structure/v4/content/probability_entry.h b/native/jni/src/dictionary/structure/v4/content/probability_entry.h new file mode 100644 index 000000000..473354b90 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/probability_entry.h @@ -0,0 +1,176 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROBABILITY_ENTRY_H +#define LATINIME_PROBABILITY_ENTRY_H + +#include <climits> +#include <cstdint> + +#include "defines.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/property/ngram_property.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +class ProbabilityEntry { + public: + ProbabilityEntry(const ProbabilityEntry &probabilityEntry) + : mFlags(probabilityEntry.mFlags), mProbability(probabilityEntry.mProbability), + mHistoricalInfo(probabilityEntry.mHistoricalInfo) {} + + // Dummy entry + ProbabilityEntry() + : mFlags(Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY), mProbability(NOT_A_PROBABILITY), + mHistoricalInfo() {} + + // Entry without historical information + ProbabilityEntry(const int flags, const int probability) + : mFlags(flags), mProbability(probability), mHistoricalInfo() {} + + // Entry with historical information. + ProbabilityEntry(const int flags, const HistoricalInfo *const historicalInfo) + : mFlags(flags), mProbability(NOT_A_PROBABILITY), mHistoricalInfo(*historicalInfo) {} + + // Create from unigram property. + ProbabilityEntry(const UnigramProperty *const unigramProperty) + : mFlags(createFlags(unigramProperty->representsBeginningOfSentence(), + unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + unigramProperty->isPossiblyOffensive())), + mProbability(unigramProperty->getProbability()), + mHistoricalInfo(unigramProperty->getHistoricalInfo()) {} + + // Create from ngram property. + // TODO: Set flags.
+ ProbabilityEntry(const NgramProperty *const ngramProperty) + : mFlags(0), mProbability(ngramProperty->getProbability()), + mHistoricalInfo(ngramProperty->getHistoricalInfo()) {} + + bool isValid() const { + return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0; + } + + bool hasHistoricalInfo() const { + return mHistoricalInfo.isValid(); + } + + uint8_t getFlags() const { + return mFlags; + } + + int getProbability() const { + return mProbability; + } + + const HistoricalInfo *getHistoricalInfo() const { + return &mHistoricalInfo; + } + + bool representsBeginningOfSentence() const { + return (mFlags & Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE) != 0; + } + + bool isNotAWord() const { + return (mFlags & Ver4DictConstants::FLAG_NOT_A_WORD) != 0; + } + + bool isBlacklisted() const { + return (mFlags & Ver4DictConstants::FLAG_BLACKLISTED) != 0; + } + + bool isPossiblyOffensive() const { + return (mFlags & Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE) != 0; + } + + uint64_t encode(const bool hasHistoricalInfo) const { + uint64_t encodedEntry = static_cast(mFlags); + if (hasHistoricalInfo) { + encodedEntry = (encodedEntry << (Ver4DictConstants::TIME_STAMP_FIELD_SIZE * CHAR_BIT)) + | static_cast(mHistoricalInfo.getTimestamp()); + encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_LEVEL_FIELD_SIZE * CHAR_BIT)) + | static_cast(mHistoricalInfo.getLevel()); + encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) + | static_cast(mHistoricalInfo.getCount()); + } else { + encodedEntry = (encodedEntry << (Ver4DictConstants::PROBABILITY_SIZE * CHAR_BIT)) + | static_cast(mProbability); + } + return encodedEntry; + } + + static ProbabilityEntry decode(const uint64_t encodedEntry, const bool hasHistoricalInfo) { + if (hasHistoricalInfo) { + const int flags = readFromEncodedEntry(encodedEntry, + Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE, + Ver4DictConstants::TIME_STAMP_FIELD_SIZE + + 
Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE); + const int timestamp = readFromEncodedEntry(encodedEntry, + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE); + const int level = readFromEncodedEntry(encodedEntry, + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, + Ver4DictConstants::WORD_COUNT_FIELD_SIZE); + const int count = readFromEncodedEntry(encodedEntry, + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, 0 /* pos */); + const HistoricalInfo historicalInfo(timestamp, level, count); + return ProbabilityEntry(flags, &historicalInfo); + } else { + const int flags = readFromEncodedEntry(encodedEntry, + Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE, + Ver4DictConstants::PROBABILITY_SIZE); + const int probability = readFromEncodedEntry(encodedEntry, + Ver4DictConstants::PROBABILITY_SIZE, 0 /* pos */); + return ProbabilityEntry(flags, probability); + } + } + + private: + // Copy constructor is public to use this class as a type of return value. 
+ DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry); + + const uint8_t mFlags; + const int mProbability; + const HistoricalInfo mHistoricalInfo; + + static int readFromEncodedEntry(const uint64_t encodedEntry, const int size, const int pos) { + return static_cast<int>( + (encodedEntry >> (pos * CHAR_BIT)) & ((1ull << (size * CHAR_BIT)) - 1)); + } + + static uint8_t createFlags(const bool representsBeginningOfSentence, + const bool isNotAWord, const bool isBlacklisted, const bool isPossiblyOffensive) { + uint8_t flags = 0; + if (representsBeginningOfSentence) { + flags |= Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE; + } + if (isNotAWord) { + flags |= Ver4DictConstants::FLAG_NOT_A_WORD; + } + if (isBlacklisted) { + flags |= Ver4DictConstants::FLAG_BLACKLISTED; + } + if (isPossiblyOffensive) { + flags |= Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE; + } + return flags; + } +}; +} // namespace latinime +#endif /* LATINIME_PROBABILITY_ENTRY_H */ diff --git a/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp new file mode 100644 index 000000000..e3b419449 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include "dictionary/structure/v4/content/shortcut_dict_content.h" + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoint, int *const outCodePointCount, int *const outProbability, + bool *const outhasNext, int *const shortcutEntryPos) const { + const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer(); + if (*shortcutEntryPos < 0 || *shortcutEntryPos >= shortcutListBuffer->getTailPosition()) { + AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d", + *shortcutEntryPos, shortcutListBuffer->getTailPosition()); + ASSERT(false); + if (outhasNext) { + *outhasNext = false; + } + if (outCodePointCount) { + *outCodePointCount = 0; + } + return; + } + + const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); + if (outProbability) { + *outProbability = shortcutFlags & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK; + } + if (outhasNext) { + *outhasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; + } + if (outCodePoint && outCodePointCount) { + shortcutListBuffer->readCodePointsAndAdvancePosition( + maxCodePointCount, outCodePoint, outCodePointCount, shortcutEntryPos); + } +} + +int ShortcutDictContent::getShortcutListHeadPos(const int terminalId) const { + const SparseTable *const addressLookupTable = getAddressLookupTable(); + if (!addressLookupTable->contains(terminalId)) { + return NOT_A_DICT_POS; + } + return addressLookupTable->get(terminalId); +} + +bool ShortcutDictContent::runGC( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ShortcutDictContent *const originalShortcutDictContent) { + for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); + it != terminalIdMap->end(); ++it) { + const int 
originalShortcutListPos = + originalShortcutDictContent->getShortcutListHeadPos(it->first); + if (originalShortcutListPos == NOT_A_DICT_POS) { + continue; + } + const int shortcutListPos = getContentBuffer()->getTailPosition(); + // Copy shortcut list from original content. + if (!copyShortcutListFromDictContent(originalShortcutListPos, originalShortcutDictContent, + shortcutListPos)) { + AKLOGE("Cannot copy shortcut list during GC. original pos: %d, pos: %d", + originalShortcutListPos, shortcutListPos); + return false; + } + // Set shortcut list position to the lookup table. + if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) { + AKLOGE("Cannot set shortcut list position. terminal id: %d, pos: %d", + it->second, shortcutListPos); + return false; + } + } + return true; +} + +bool ShortcutDictContent::createNewShortcutList(const int terminalId) { + const int shortcutListListPos = getContentBuffer()->getTailPosition(); + return getUpdatableAddressLookupTable()->set(terminalId, shortcutListListPos); +} + +bool ShortcutDictContent::copyShortcutList(const int shortcutListPos, const int toPos) { + return copyShortcutListFromDictContent(shortcutListPos, this, toPos); +} + +bool ShortcutDictContent::copyShortcutListFromDictContent(const int shortcutListPos, + const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) { + bool hasNext = true; + int readingPos = shortcutListPos; + int writingPos = toPos; + int codePoints[MAX_WORD_LENGTH]; + while (hasNext) { + int probability = 0; + int codePointCount = 0; + sourceShortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, + codePoints, &codePointCount, &probability, &hasNext, &readingPos); + if (!writeShortcutEntryAndAdvancePosition(codePoints, codePointCount, probability, + hasNext, &writingPos)) { + AKLOGE("Cannot write shortcut entry to copy. 
pos: %d", writingPos); + return false; + } + } + return true; +} + +bool ShortcutDictContent::setProbability(const int probability, const int shortcutEntryPos) { + BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); + const int shortcutFlags = shortcutListBuffer->readUint( + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); + const bool hasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; + const int shortcutFlagsToWrite = createAndGetShortcutFlags(probability, hasNext); + return shortcutListBuffer->writeUint(shortcutFlagsToWrite, + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); +} + +bool ShortcutDictContent::writeShortcutEntryAndAdvancePosition(const int *const codePoint, + const int codePointCount, const int probability, const bool hasNext, + int *const shortcutEntryPos) { + BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); + const int shortcutFlags = createAndGetShortcutFlags(probability, hasNext); + if (!shortcutListBuffer->writeUintAndAdvancePosition(shortcutFlags, + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos)) { + AKLOGE("Cannot write shortcut flags. flags; %x, pos: %d", shortcutFlags, *shortcutEntryPos); + return false; + } + if (!shortcutListBuffer->writeCodePointsAndAdvancePosition(codePoint, codePointCount, + true /* writesTerminator */, shortcutEntryPos)) { + AKLOGE("Cannot write shortcut target code points. pos: %d", *shortcutEntryPos); + return false; + } + return true; +} + +// Find a shortcut entry that has specified target and return its position. 
+int ShortcutDictContent::findShortcutEntryAndGetPos(const int shortcutListPos, + const int *const targetCodePointsToFind, const int codePointCount) const { + bool hasNext = true; + int readingPos = shortcutListPos; + int targetCodePoints[MAX_WORD_LENGTH]; + while (hasNext) { + const int entryPos = readingPos; + int probability = 0; + int targetCodePointCount = 0; + getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, targetCodePoints, &targetCodePointCount, + &probability, &hasNext, &readingPos); + if (targetCodePointCount != codePointCount) { + continue; + } + bool matched = true; + for (int i = 0; i < codePointCount; ++i) { + if (targetCodePointsToFind[i] != targetCodePoints[i]) { + matched = false; + break; + } + } + if (matched) { + return entryPos; + } + } + return NOT_A_DICT_POS; +} + +int ShortcutDictContent::createAndGetShortcutFlags(const int probability, + const bool hasNext) const { + return (probability & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK) + | (hasNext ? Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK : 0); +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h new file mode 100644 index 000000000..27de4e79e --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SHORTCUT_DICT_CONTENT_H +#define LATINIME_SHORTCUT_DICT_CONTENT_H + +#include <cstdio> + +#include "defines.h" +#include "dictionary/structure/v4/content/sparse_table_dict_content.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +class ReadWriteByteArrayView; + +class ShortcutDictContent : public SparseTableDictContent { + public: + ShortcutDictContent(const ReadWriteByteArrayView *const buffers) + : SparseTableDictContent(buffers, Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} + + ShortcutDictContent() + : SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} + + void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, int *const outProbability, bool *const outhasNext, + const int shortcutEntryPos) { + int readingPos = shortcutEntryPos; + return getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint, + outCodePointCount, outProbability, outhasNext, &readingPos); + } + + void getShortcutEntryAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoint, int *const outCodePointCount, int *const outProbability, + bool *const outhasNext, int *const shortcutEntryPos) const; + + // Returns head position of shortcut list for a PtNode specified by terminalId.
+ int getShortcutListHeadPos(const int terminalId) const; + + bool flushToFile(FILE *const file) const { + return flush(file); + } + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ShortcutDictContent *const originalShortcutDictContent); + + bool createNewShortcutList(const int terminalId); + + bool copyShortcutList(const int shortcutListPos, const int toPos); + + bool setProbability(const int probability, const int shortcutEntryPos); + + bool writeShortcutEntry(const int *const codePoint, const int codePointCount, + const int probability, const bool hasNext, const int shortcutEntryPos) { + int writingPos = shortcutEntryPos; + return writeShortcutEntryAndAdvancePosition(codePoint, codePointCount, probability, + hasNext, &writingPos); + } + + bool writeShortcutEntryAndAdvancePosition(const int *const codePoint, + const int codePointCount, const int probability, const bool hasNext, + int *const shortcutEntryPos); + + int findShortcutEntryAndGetPos(const int shortcutListPos, + const int *const targetCodePointsToFind, const int codePointCount) const; + + private: + DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent); + + bool copyShortcutListFromDictContent(const int shortcutListPos, + const ShortcutDictContent *const sourceShortcutDictContent, const int toPos); + + int createAndGetShortcutFlags(const int probability, const bool hasNext) const; +}; +} // namespace latinime +#endif /* LATINIME_SHORTCUT_DICT_CONTENT_H */ diff --git a/native/jni/src/dictionary/structure/v4/content/single_dict_content.h b/native/jni/src/dictionary/structure/v4/content/single_dict_content.h new file mode 100644 index 000000000..6faa9a28b --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/single_dict_content.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SINGLE_DICT_CONTENT_H +#define LATINIME_SINGLE_DICT_CONTENT_H + +#include <cstdio> + +#include "defines.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class SingleDictContent { + public: + SingleDictContent(const ReadWriteByteArrayView buffer) + : mExpandableContentBuffer(buffer, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {} + + SingleDictContent() + : mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE) {} + + virtual ~SingleDictContent() {} + + bool isNearSizeLimit() const { + return mExpandableContentBuffer.isNearSizeLimit(); + } + + protected: + BufferWithExtendableBuffer *getWritableBuffer() { + return &mExpandableContentBuffer; + } + + const BufferWithExtendableBuffer *getBuffer() const { + return &mExpandableContentBuffer; + } + + bool flush(FILE *const file) const { + return DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableContentBuffer); + } + + private: + DISALLOW_COPY_AND_ASSIGN(SingleDictContent); + + BufferWithExtendableBuffer mExpandableContentBuffer; +}; +} // namespace latinime +#endif /* LATINIME_SINGLE_DICT_CONTENT_H */ diff --git a/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp new file mode 100644 index 000000000..685365f36 --- /dev/null +++
b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/content/sparse_table_dict_content.h" + +#include "dictionary/utils/dict_file_writing_utils.h" + +namespace latinime { + +const int SparseTableDictContent::LOOKUP_TABLE_BUFFER_INDEX = 0; +const int SparseTableDictContent::ADDRESS_TABLE_BUFFER_INDEX = 1; +const int SparseTableDictContent::CONTENT_BUFFER_INDEX = 2; + +bool SparseTableDictContent::flush(FILE *const file) const { + if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableLookupTableBuffer)) { + return false; + } + if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableAddressTableBuffer)) { + return false; + } + if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableContentBuffer)) { + return false; + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h new file mode 100644 index 000000000..6245abc8e --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this 
file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_SPARSE_TABLE_DICT_CONTENT_H
+#define LATINIME_SPARSE_TABLE_DICT_CONTENT_H
+
+#include <cstdio>
+
+#include "defines.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/sparse_table.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+// Base class for dictionary contents that are stored in three buffers: a lookup
+// table buffer and an address table buffer, which together back a SparseTable,
+// and a content buffer.
+// TODO: Support multiple contents.
+class SparseTableDictContent {
+ public:
+    // |buffers| must point to at least three views; they are picked up by
+    // LOOKUP_TABLE_BUFFER_INDEX, ADDRESS_TABLE_BUFFER_INDEX and
+    // CONTENT_BUFFER_INDEX. The block and data sizes are forwarded to the
+    // SparseTable.
+    AK_FORCE_INLINE SparseTableDictContent(const ReadWriteByteArrayView *const buffers,
+            const int sparseTableBlockSize, const int sparseTableDataSize)
+            : mExpandableLookupTableBuffer(buffers[LOOKUP_TABLE_BUFFER_INDEX],
+                      BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+              mExpandableAddressTableBuffer(buffers[ADDRESS_TABLE_BUFFER_INDEX],
+                      BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+              mExpandableContentBuffer(buffers[CONTENT_BUFFER_INDEX],
+                      BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+              mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer,
+                      sparseTableBlockSize, sparseTableDataSize) {}
+
+    // Creates a content that is not backed by existing buffers. Each buffer is
+    // constructed with Ver4DictConstants::MAX_DICTIONARY_SIZE.
+    SparseTableDictContent(const int sparseTableBlockSize, const int sparseTableDataSize)
+            : mExpandableLookupTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+              mExpandableAddressTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+              mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+              mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer,
+                      sparseTableBlockSize, sparseTableDataSize) {}
+
+    virtual ~SparseTableDictContent() {}
+
+    // True as soon as any one of the three buffers reports isNearSizeLimit().
+    bool isNearSizeLimit() const {
+        return mExpandableLookupTableBuffer.isNearSizeLimit()
+                || mExpandableAddressTableBuffer.isNearSizeLimit()
+                || mExpandableContentBuffer.isNearSizeLimit();
+    }
+
+ protected:
+    SparseTable *getUpdatableAddressLookupTable() {
+        return &mAddressLookupTable;
+    }
+
+    const SparseTable *getAddressLookupTable() const {
+        return &mAddressLookupTable;
+    }
+
+    BufferWithExtendableBuffer *getWritableContentBuffer() {
+        return &mExpandableContentBuffer;
+    }
+
+    const BufferWithExtendableBuffer *getContentBuffer() const {
+        return &mExpandableContentBuffer;
+    }
+
+    // Defined in sparse_table_dict_content.cpp.
+    bool flush(FILE *const file) const;
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTableDictContent);
+
+    static const int LOOKUP_TABLE_BUFFER_INDEX;
+    static const int ADDRESS_TABLE_BUFFER_INDEX;
+    static const int CONTENT_BUFFER_INDEX;
+
+    // Declaration order matters: mAddressLookupTable is constructed from the
+    // addresses of the two table buffers declared before it.
+    BufferWithExtendableBuffer mExpandableLookupTableBuffer;
+    BufferWithExtendableBuffer mExpandableAddressTableBuffer;
+    BufferWithExtendableBuffer mExpandableContentBuffer;
+    SparseTable mAddressLookupTable;
+};
+} // namespace latinime
+#endif /* LATINIME_SPARSE_TABLE_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp
new file mode 100644
index 000000000..5503151fd
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+// Returns the PtNode position stored for |terminalId|, or NOT_A_DICT_POS when
+// the id is outside [0, mSize) or its entry holds NOT_A_TERMINAL_ADDRESS.
+int TerminalPositionLookupTable::getTerminalPtNodePosition(const int terminalId) const {
+    if (terminalId < 0 || terminalId >= mSize) {
+        return NOT_A_DICT_POS;
+    }
+    const int terminalPos = getBuffer()->readUint(
+            Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId));
+    return (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) ?
+            NOT_A_DICT_POS : terminalPos;
+}
+
+// Stores |terminalPtNodePos| for |terminalId|, growing the table as needed.
+// NOT_A_DICT_POS is stored as NOT_A_TERMINAL_ADDRESS. Returns false for a
+// negative id or when a buffer write fails.
+bool TerminalPositionLookupTable::setTerminalPtNodePosition(
+        const int terminalId, const int terminalPtNodePos) {
+    if (terminalId < 0) {
+        return false;
+    }
+    // Pad the table with empty entries until |terminalId| is addressable.
+    while (terminalId >= mSize) {
+        // Write new entry.
+        if (!getWritableBuffer()->writeUint(Ver4DictConstants::NOT_A_TERMINAL_ADDRESS,
+                Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(mSize))) {
+            return false;
+        }
+        mSize++;
+    }
+    const int terminalPos = (terminalPtNodePos != NOT_A_DICT_POS) ?
+            terminalPtNodePos : Ver4DictConstants::NOT_A_TERMINAL_ADDRESS;
+    return getWritableBuffer()->writeUint(terminalPos,
+            Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId));
+}
+
+// Writes the first mSize entries of the table to the tail of |file|.
+bool TerminalPositionLookupTable::flushToFile(FILE *const file) const {
+    // If the used buffer size is smaller than the actual buffer size, regenerate the lookup
+    // table and write the new table to the file.
+    if (getEntryPos(mSize) < getBuffer()->getTailPosition()) {
+        TerminalPositionLookupTable lookupTableToWrite;
+        for (int i = 0; i < mSize; ++i) {
+            const int terminalPtNodePosition = getTerminalPtNodePosition(i);
+            if (!lookupTableToWrite.setTerminalPtNodePosition(i, terminalPtNodePosition)) {
+                AKLOGE("Cannot set terminal position to lookupTableToWrite."
+                        " terminalId: %d, position: %d", i, terminalPtNodePosition);
+                return false;
+            }
+        }
+        return lookupTableToWrite.flush(file);
+    } else {
+        // We can simply use this lookup table because the buffer size has not been
+        // changed.
+        return flush(file);
+    }
+}
+
+// Compacts the table in place by dropping every entry that holds
+// NOT_A_TERMINAL_ADDRESS, and records each surviving "old id -> new id" pair in
+// |terminalIdMap|. On success mSize becomes the number of surviving entries.
+// Returns false when a buffer write fails; mSize is left unchanged in that case.
+bool TerminalPositionLookupTable::runGCTerminalIds(TerminalIdMap *const terminalIdMap) {
+    int nextNewTerminalId = 0;
+    for (int i = 0; i < mSize; ++i) {
+        const int terminalPos = getBuffer()->readUint(
+                Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(i));
+        if (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) {
+            // This entry is garbage.
+            continue;
+        }
+        // Give a new terminal id to the entry. nextNewTerminalId never exceeds i, so
+        // this only overwrites entries that have already been read.
+        if (!getWritableBuffer()->writeUint(terminalPos,
+                Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE,
+                getEntryPos(nextNewTerminalId))) {
+            return false;
+        }
+        // Memorize the mapping from the old terminal id to the new terminal id.
+        terminalIdMap->insert(TerminalIdMap::value_type(i, nextNewTerminalId));
+        nextNewTerminalId++;
+    }
+    mSize = nextNewTerminalId;
+    return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h
new file mode 100644
index 000000000..f45ceb52d
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H
+#define LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H
+
+#include <cstdio>
+#include <unordered_map>
+
+#include "defines.h"
+#include "dictionary/structure/v4/content/single_dict_content.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+// Table that maps a terminal id to the position of its terminal PtNode. Entries
+// are fixed-width (Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE bytes)
+// and indexed by terminal id.
+class TerminalPositionLookupTable : public SingleDictContent {
+ public:
+    // Old terminal id -> new terminal id, filled in by runGCTerminalIds().
+    typedef std::unordered_map<int, int> TerminalIdMap;
+
+    // Wraps an existing table; the entry count is derived from the buffer's
+    // tail position.
+    TerminalPositionLookupTable(const ReadWriteByteArrayView buffer)
+            : SingleDictContent(buffer),
+              mSize(getBuffer()->getTailPosition()
+                      / Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE) {}
+
+    // Creates an empty table.
+    TerminalPositionLookupTable() : mSize(0) {}
+
+    int getTerminalPtNodePosition(const int terminalId) const;
+
+    bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos);
+
+    // The id right after the last entry currently in the table.
+    int getNextTerminalId() const {
+        return mSize;
+    }
+
+    bool flushToFile(FILE *const file) const;
+
+    bool runGCTerminalIds(TerminalIdMap *const terminalIdMap);
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable);
+
+    // Byte offset of the entry for |terminalId| inside the buffer.
+    int getEntryPos(const int terminalId) const {
+        return terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
+    }
+
+    // Number of entries in the table.
+    int mSize;
+};
+} // namespace latinime
+#endif // LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H
diff --git a/native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h
new file mode 100644
index 000000000..25ab22543
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_SHORTCUT_LIST_POLICY_H
+#define LATINIME_VER4_SHORTCUT_LIST_POLICY_H
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
+#include "dictionary/structure/v4/content/shortcut_dict_content.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+
+namespace latinime {
+
+// Shortcut policy for ver4 dictionaries. All reads and writes are delegated to
+// the ShortcutDictContent given at construction, which this class does not own.
+class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
+ public:
+    // |terminalPositionLookupTable| is accepted but not used by this class.
+    Ver4ShortcutListPolicy(ShortcutDictContent *const shortcutDictContent,
+            const TerminalPositionLookupTable *const terminalPositionLookupTable)
+            : mShortcutDictContent(shortcutDictContent) {}
+
+    ~Ver4ShortcutListPolicy() {}
+
+    int getStartPos(const int pos) const {
+        // The first shortcut entry is located at the head position of the shortcut list.
+        return pos;
+    }
+
+    // Reads the shortcut at |*pos| and advances |*pos|. |outIsWhitelist| may be
+    // null, in which case the whitelist flag is not reported.
+    void getNextShortcut(const int maxCodePointCount, int *const outCodePoint,
+            int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext,
+            int *const pos) const {
+        int probability = 0;
+        mShortcutDictContent->getShortcutEntryAndAdvancePosition(maxCodePointCount,
+                outCodePoint, outCodePointCount, &probability, outHasNext, pos);
+        if (outIsWhitelist) {
+            *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(probability);
+        }
+    }
+
+    void skipAllShortcuts(int *const pos) const {
+        // Do nothing because we don't need to skip shortcut lists in ver4 dictionaries.
+    }
+
+    // Adds the shortcut |codePoints| with |probability| to the list of
+    // |terminalId|, or overwrites the entry when the same target already exists.
+    // Returns false when the content cannot be written.
+    bool addNewShortcut(const int terminalId, const int *const codePoints, const int codePointCount,
+            const int probability) {
+        const int shortcutListPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+        if (shortcutListPos == NOT_A_DICT_POS) {
+            // Create shortcut list.
+            if (!mShortcutDictContent->createNewShortcutList(terminalId)) {
+                AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId);
+                return false;
+            }
+            const int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+            return mShortcutDictContent->writeShortcutEntry(codePoints, codePointCount, probability,
+                    false /* hasNext */, writingPos);
+        }
+        const int entryPos = mShortcutDictContent->findShortcutEntryAndGetPos(shortcutListPos,
+                codePoints, codePointCount);
+        if (entryPos == NOT_A_DICT_POS) {
+            // Add new entry to the shortcut list.
+            // Create new shortcut list.
+            if (!mShortcutDictContent->createNewShortcutList(terminalId)) {
+                AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId);
+                return false;
+            }
+            // The new entry becomes the head of the new list; the old list is then
+            // copied after it, hence hasNext is true.
+            int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+            if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints,
+                    codePointCount, probability, true /* hasNext */, &writingPos)) {
+                AKLOGE("Cannot write shortcut entry. terminal id: %d, pos: %d", terminalId,
+                        writingPos);
+                return false;
+            }
+            return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos);
+        }
+        // Overwrite existing entry.
+        // Only the hasNext flag of the existing entry is needed, so the other
+        // out-parameters are passed as null.
+        bool hasNext = false;
+        mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, nullptr /* outCodePoint */,
+                nullptr /* outCodePointCount */, nullptr /* probability */, &hasNext, entryPos);
+        if (!mShortcutDictContent->writeShortcutEntry(codePoints,
+                codePointCount, probability, hasNext, entryPos)) {
+            AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId,
+                    entryPos);
+            return false;
+        }
+        return true;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4ShortcutListPolicy);
+
+    ShortcutDictContent *const mShortcutDictContent;
+};
+} // namespace latinime
+#endif // LATINIME_VER4_SHORTCUT_LIST_POLICY_H
diff --git a/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp
new file mode 100644
index 000000000..b0a82839b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp
@@ -0,0 +1,194 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+
+#include <cerrno>
+#include <cstring>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <vector>
+
+#include "dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/file_utils.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+// Opens the body file that belongs to |dictPath| and splits it into content
+// buffers. The body is a sequence of records, each a 4-byte size followed by
+// that many bytes. Returns null when |headerBuffer| is null, the body cannot be
+// opened, or the body is malformed.
+/* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers(
+        const char *const dictPath, MmappedBuffer::MmappedBufferPtr &&headerBuffer,
+        const FormatUtils::FORMAT_VERSION formatVersion) {
+    if (!headerBuffer) {
+        ASSERT(false);
+        AKLOGE("The header buffer must be valid to open ver4 dict buffers.");
+        return Ver4DictBuffersPtr(nullptr);
+    }
+    // TODO: take only dictDirPath, and open both header and trie files in the constructor below
+    const bool isUpdatable = headerBuffer->isUpdatable();
+    MmappedBuffer::MmappedBufferPtr bodyBuffer = MmappedBuffer::openBuffer(dictPath,
+            Ver4DictConstants::BODY_FILE_EXTENSION, isUpdatable);
+    if (!bodyBuffer) {
+        return Ver4DictBuffersPtr(nullptr);
+    }
+    // Width of the size field read by readUint32AndAdvancePosition().
+    static const int BUFFER_SIZE_FIELD_SIZE = 4;
+    std::vector<ReadWriteByteArrayView> buffers;
+    const ReadWriteByteArrayView buffer = bodyBuffer->getReadWriteByteArrayView();
+    const int bodySize = static_cast<int>(buffer.size());
+    int position = 0;
+    while (position < bodySize) {
+        // Validate before touching the data: the size field itself must fit in the
+        // body, and the sub-buffer it announces must fit in what is left after it.
+        if (bodySize - position < BUFFER_SIZE_FIELD_SIZE) {
+            AKLOGE("The dict body file is corrupted.");
+            return Ver4DictBuffersPtr(nullptr);
+        }
+        const int bufferSize = ByteArrayUtils::readUint32AndAdvancePosition(
+                buffer.data(), &position);
+        if (bufferSize < 0 || position < 0 || bufferSize > bodySize - position) {
+            AKLOGE("The dict body file is corrupted.");
+            return Ver4DictBuffersPtr(nullptr);
+        }
+        buffers.push_back(buffer.subView(position, bufferSize));
+        position += bufferSize;
+    }
+    if (buffers.size() != Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE) {
+        AKLOGE("The dict body file is corrupted.");
+        return Ver4DictBuffersPtr(nullptr);
+    }
+    return Ver4DictBuffersPtr(new Ver4DictBuffers(std::move(headerBuffer), std::move(bodyBuffer),
+            formatVersion, buffers));
+}
+
+// Writes |headerBuffer| and all content buffers into a fresh temporary
+// directory, then replaces |dictDirPath| with it. Returns false on any failure.
+bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath,
+        const BufferWithExtendableBuffer *const headerBuffer) const {
+    // Create temporary directory.
+    const int tmpDirPathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictDirPath,
+            DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE);
+    char tmpDirPath[tmpDirPathBufSize];
+    FileUtils::getFilePathWithSuffix(dictDirPath,
+            DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE, tmpDirPathBufSize,
+            tmpDirPath);
+    if (FileUtils::existsDir(tmpDirPath)) {
+        if (!FileUtils::removeDirAndFiles(tmpDirPath)) {
+            AKLOGE("Existing directory %s cannot be removed.", tmpDirPath);
+            ASSERT(false);
+            return false;
+        }
+    }
+    // NOTE: umask() changes the file mode creation mask of the whole process and it
+    // is not restored afterwards.
+    umask(S_IWGRP | S_IWOTH);
+    if (mkdir(tmpDirPath, S_IRWXU) == -1) {
+        AKLOGE("Cannot create directory: %s. errno: %d.", tmpDirPath, errno);
+        return false;
+    }
+    // Get dictionary base path.
+    const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */;
+    char dictName[dictNameBufSize];
+    FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName);
+    const int dictPathBufSize = FileUtils::getFilePathBufSize(tmpDirPath, dictName);
+    char dictPath[dictPathBufSize];
+    FileUtils::getFilePath(tmpDirPath, dictName, dictPathBufSize, dictPath);
+
+    // Write header file.
+    if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath,
+            Ver4DictConstants::HEADER_FILE_EXTENSION, headerBuffer)) {
+        AKLOGE("Dictionary header file %s%s cannot be written.", tmpDirPath,
+                Ver4DictConstants::HEADER_FILE_EXTENSION);
+        return false;
+    }
+
+    // Write body file.
+    const int bodyFilePathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictPath,
+            Ver4DictConstants::BODY_FILE_EXTENSION);
+    char bodyFilePath[bodyFilePathBufSize];
+    FileUtils::getFilePathWithSuffix(dictPath, Ver4DictConstants::BODY_FILE_EXTENSION,
+            bodyFilePathBufSize, bodyFilePath);
+
+    const int fd = open(bodyFilePath, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+    if (fd == -1) {
+        AKLOGE("File %s cannot be opened. errno: %d", bodyFilePath, errno);
+        ASSERT(false);
+        return false;
+    }
+    FILE *const file = fdopen(fd, "wb");
+    if (!file) {
+        AKLOGE("fdopen failed for the file %s. errno: %d", bodyFilePath, errno);
+        // The descriptor is still ours when fdopen() fails, so release it here.
+        close(fd);
+        ASSERT(false);
+        return false;
+    }
+
+    const bool isWritten = flushDictBuffers(file);
+    // fclose() flushes buffered data, so a failure here means the body file may be
+    // incomplete. Always close, even when writing has already failed.
+    if (fclose(file) != 0) {
+        AKLOGE("File %s cannot be closed. errno: %d", bodyFilePath, errno);
+        return false;
+    }
+    if (!isWritten) {
+        return false;
+    }
+    // Remove existing dictionary.
+    if (!FileUtils::removeDirAndFiles(dictDirPath)) {
+        AKLOGE("Existing directory %s cannot be removed.", dictDirPath);
+        ASSERT(false);
+        return false;
+    }
+    // Rename temporary directory.
+    if (rename(tmpDirPath, dictDirPath) != 0) {
+        AKLOGE("%s cannot be renamed to %s", tmpDirPath, dictDirPath);
+        ASSERT(false);
+        return false;
+    }
+    return true;
+}
+
+// Appends every content buffer to |file|. The order here is the on-disk order of
+// the body file: trie, terminal position lookup table, language model, shortcut.
+bool Ver4DictBuffers::flushDictBuffers(FILE *const file) const {
+    // Write trie.
+    if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableTrieBuffer)) {
+        AKLOGE("Trie cannot be written.");
+        return false;
+    }
+    // Write terminal position lookup table.
+    if (!mTerminalPositionLookupTable.flushToFile(file)) {
+        AKLOGE("Terminal position lookup table cannot be written.");
+        return false;
+    }
+    // Write language model content.
+    if (!mLanguageModelDictContent.save(file)) {
+        AKLOGE("Language model dict content cannot be written.");
+        return false;
+    }
+    // Write shortcut dict content.
+    if (!mShortcutDictContent.flushToFile(file)) {
+        AKLOGE("Shortcut dict content cannot be written.");
+        return false;
+    }
+    return true;
+}
+
+// Builds the buffers of an existing dictionary from its mapped header and body.
+// |contentBuffers| holds the views produced by openVer4DictBuffers().
+Ver4DictBuffers::Ver4DictBuffers(MmappedBuffer::MmappedBufferPtr &&headerBuffer,
+        MmappedBuffer::MmappedBufferPtr &&bodyBuffer,
+        const FormatUtils::FORMAT_VERSION formatVersion,
+        const std::vector<ReadWriteByteArrayView> &contentBuffers)
+        : mHeaderBuffer(std::move(headerBuffer)), mDictBuffer(std::move(bodyBuffer)),
+          mHeaderPolicy(mHeaderBuffer->getReadOnlyByteArrayView().data(), formatVersion),
+          mExpandableHeaderBuffer(mHeaderBuffer->getReadWriteByteArrayView(),
+                  BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+          mExpandableTrieBuffer(contentBuffers[Ver4DictConstants::TRIE_BUFFER_INDEX],
+                  BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+          mTerminalPositionLookupTable(
+                  contentBuffers[Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX]),
+          mLanguageModelDictContent(&contentBuffers[Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX],
+                  mHeaderPolicy.hasHistoricalInfoOfWords()),
+          mShortcutDictContent(&contentBuffers[Ver4DictConstants::SHORTCUT_BUFFERS_INDEX]),
+          mIsUpdatable(mDictBuffer->isUpdatable()) {}
+
+// Builds empty, updatable buffers that are not backed by any mapped file.
+Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize)
+        : mHeaderBuffer(nullptr), mDictBuffer(nullptr), mHeaderPolicy(headerPolicy),
+          mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+          mExpandableTrieBuffer(maxTrieSize), mTerminalPositionLookupTable(),
+          mLanguageModelDictContent(headerPolicy->hasHistoricalInfoOfWords()),
+          mShortcutDictContent(), mIsUpdatable(true) {}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h
new file mode 100644
index 000000000..c8270c93c
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * 
Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_DICT_BUFFER_H
+#define LATINIME_VER4_DICT_BUFFER_H
+
+#include <cstdio>
+#include <memory>
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/v4/content/language_model_dict_content.h"
+#include "dictionary/structure/v4/content/shortcut_dict_content.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/mmapped_buffer.h"
+
+namespace latinime {
+
+// Owns every buffer of a ver4 dictionary: the header, the trie, the terminal
+// position lookup table, the language model content and the shortcut content.
+// Instances are created through openVer4DictBuffers() or
+// createVer4DictBuffers() only.
+class Ver4DictBuffers {
+ public:
+    typedef std::unique_ptr<Ver4DictBuffers> Ver4DictBuffersPtr;
+
+    // Opens the buffers of an existing dictionary. Returns null on failure.
+    static Ver4DictBuffersPtr openVer4DictBuffers(const char *const dictDirPath,
+            MmappedBuffer::MmappedBufferPtr &&headerBuffer,
+            const FormatUtils::FORMAT_VERSION formatVersion);
+
+    // Creates empty buffers that are not backed by any file.
+    static AK_FORCE_INLINE Ver4DictBuffersPtr createVer4DictBuffers(
+            const HeaderPolicy *const headerPolicy, const int maxTrieSize) {
+        return Ver4DictBuffersPtr(new Ver4DictBuffers(headerPolicy, maxTrieSize));
+    }
+
+    // Requires both mapped buffers, so this is false for instances made by
+    // createVer4DictBuffers(), which sets them to null.
+    AK_FORCE_INLINE bool isValid() const {
+        return mHeaderBuffer && mDictBuffer && mHeaderPolicy.isValid();
+    }
+
+    // True as soon as any one of the content buffers is near its size limit.
+    AK_FORCE_INLINE bool isNearSizeLimit() const {
+        return mExpandableTrieBuffer.isNearSizeLimit()
+                || mTerminalPositionLookupTable.isNearSizeLimit()
+                || mLanguageModelDictContent.isNearSizeLimit()
+                || mShortcutDictContent.isNearSizeLimit();
+    }
+
+    AK_FORCE_INLINE const HeaderPolicy *getHeaderPolicy() const {
+        return &mHeaderPolicy;
+    }
+
+    AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableHeaderBuffer() {
+        return &mExpandableHeaderBuffer;
+    }
+
+    AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableTrieBuffer() {
+        return &mExpandableTrieBuffer;
+    }
+
+    AK_FORCE_INLINE const BufferWithExtendableBuffer *getTrieBuffer() const {
+        return &mExpandableTrieBuffer;
+    }
+
+    AK_FORCE_INLINE TerminalPositionLookupTable *getMutableTerminalPositionLookupTable() {
+        return &mTerminalPositionLookupTable;
+    }
+
+    AK_FORCE_INLINE const TerminalPositionLookupTable *getTerminalPositionLookupTable() const {
+        return &mTerminalPositionLookupTable;
+    }
+
+    AK_FORCE_INLINE LanguageModelDictContent *getMutableLanguageModelDictContent() {
+        return &mLanguageModelDictContent;
+    }
+
+    AK_FORCE_INLINE const LanguageModelDictContent *getLanguageModelDictContent() const {
+        return &mLanguageModelDictContent;
+    }
+
+    AK_FORCE_INLINE ShortcutDictContent *getMutableShortcutDictContent() {
+        return &mShortcutDictContent;
+    }
+
+    AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const {
+        return &mShortcutDictContent;
+    }
+
+    AK_FORCE_INLINE bool isUpdatable() const {
+        return mIsUpdatable;
+    }
+
+    // Flushes the dictionary using this object's own header buffer.
+    bool flush(const char *const dictDirPath) const {
+        return flushHeaderAndDictBuffers(dictDirPath, &mExpandableHeaderBuffer);
+    }
+
+    // Flushes the dictionary using |headerBuffer| as the header.
+    bool flushHeaderAndDictBuffers(const char *const dictDirPath,
+            const BufferWithExtendableBuffer *const headerBuffer) const;
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(Ver4DictBuffers);
+
+    Ver4DictBuffers(MmappedBuffer::MmappedBufferPtr &&headerBuffer,
+            MmappedBuffer::MmappedBufferPtr &&bodyBuffer,
+            const FormatUtils::FORMAT_VERSION formatVersion,
+            const std::vector<ReadWriteByteArrayView> &contentBuffers);
+
+    Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize);
+
+    bool flushDictBuffers(FILE *const file) const;
+
+    // Declaration order matters: the constructors initialise later members from
+    // mHeaderBuffer, mDictBuffer and mHeaderPolicy.
+    const MmappedBuffer::MmappedBufferPtr mHeaderBuffer;
+    const MmappedBuffer::MmappedBufferPtr mDictBuffer;
+    const HeaderPolicy mHeaderPolicy;
+    BufferWithExtendableBuffer mExpandableHeaderBuffer;
+    BufferWithExtendableBuffer mExpandableTrieBuffer;
+    TerminalPositionLookupTable mTerminalPositionLookupTable;
+    LanguageModelDictContent mLanguageModelDictContent;
+    ShortcutDictContent mShortcutDictContent;
+    const bool mIsUpdatable;
+};
+} // namespace latinime
+#endif /* LATINIME_VER4_DICT_BUFFER_H */
diff --git a/native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp
new file mode 100644
index 000000000..fd6907824
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+
+namespace latinime {
+
+const char *const Ver4DictConstants::BODY_FILE_EXTENSION = ".body";
+const char *const Ver4DictConstants::HEADER_FILE_EXTENSION = ".header";
+
+// Version 4 dictionary size is implicitly limited to 8MB due to 3-byte offsets.
+const int Ver4DictConstants::MAX_DICTIONARY_SIZE = 8 * 1024 * 1024;
+// Extended region size, which is not GCed region size in dict file + additional buffer size, is
+// limited to 1MB to prevent from inefficient traversing.
+const int Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE = 1 * 1024 * 1024;
+
+// NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT for Trie and TerminalAddressLookupTable.
+// NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT for language model.
+// NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT for shortcut.
+// The NUM_OF_BUFFERS_FOR_* constants used below are defined at the end of this
+// file.
+const size_t Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE =
+        NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT * 2
+                + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT
+                + NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT;
+// Each index is the previous index plus the number of buffers the previous
+// content occupies, so the indices follow the order of the body file.
+const int Ver4DictConstants::TRIE_BUFFER_INDEX = 0;
+const int Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX =
+        TRIE_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT;
+const int Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX =
+        TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT;
+const int Ver4DictConstants::SHORTCUT_BUFFERS_INDEX =
+        LANGUAGE_MODEL_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT;
+
+const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1;
+const int Ver4DictConstants::PROBABILITY_SIZE = 1;
+const int Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE = 1;
+const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
+const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0;
+const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4;
+const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4;
+const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 0;
+const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 2;
+
+// Single-bit flags; each value occupies its own bit so they can be combined.
+const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1;
+const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2;
+const uint8_t Ver4DictConstants::FLAG_NOT_A_WORD = 0x4;
+const uint8_t Ver4DictConstants::FLAG_BLACKLISTED = 0x8;
+const uint8_t Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE = 0x10;
+
+const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
+const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;
+
+const int Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1;
+const int Ver4DictConstants::SHORTCUT_PROBABILITY_MASK = 0x0F;
+const int Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK = 0x80;
+
+const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT = 1;
+const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT = 3;
+const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT = 2;
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/ver4_dict_constants.h b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.h
new file mode 100644
index 000000000..13d7a5714
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_DICT_CONSTANTS_H
+#define LATINIME_VER4_DICT_CONSTANTS_H
+
+#include "defines.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace latinime {
+
+// TODO: Create PtConstants under the pt_common and move some constant values there.
+// Note that there are corresponding definitions in FormatSpec.java.
+class Ver4DictConstants {
+ public:
+    /* Suffixes appended to the dictionary path to name the body and header files. */
+    static const char *const BODY_FILE_EXTENSION;
+    static const char *const HEADER_FILE_EXTENSION;
+    static const int MAX_DICTIONARY_SIZE;
+    static const int MAX_DICT_EXTENDED_REGION_SIZE;
+
+    /* Layout of the content buffers inside the body file. */
+    static const size_t NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE;
+    static const int TRIE_BUFFER_INDEX;
+    static const int TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX;
+    static const int LANGUAGE_MODEL_BUFFER_INDEX;
+    /* NOTE(review): no definition of BIGRAM_BUFFERS_INDEX is visible in the v4 */
+    /* ver4_dict_constants.cpp of this patch - confirm whether it is still needed. */
+    static const int BIGRAM_BUFFERS_INDEX;
+    static const int SHORTCUT_BUFFERS_INDEX;
+
+    static const int NOT_A_TERMINAL_ID;
+    static const int PROBABILITY_SIZE;
+    static const int FLAGS_IN_LANGUAGE_MODEL_SIZE;
+    static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
+    static const int NOT_A_TERMINAL_ADDRESS;
+    static const int TERMINAL_ID_FIELD_SIZE;
+    static const int TIME_STAMP_FIELD_SIZE;
+    // TODO: Remove
+    static const int WORD_LEVEL_FIELD_SIZE;
+    static const int WORD_COUNT_FIELD_SIZE;
+    // Flags in probability entry.
+    static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
+    static const uint8_t FLAG_NOT_A_VALID_ENTRY;
+    static const uint8_t FLAG_NOT_A_WORD;
+    static const uint8_t FLAG_BLACKLISTED;
+    static const uint8_t FLAG_POSSIBLY_OFFENSIVE;
+
+    static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
+    static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;
+
+    static const int SHORTCUT_FLAGS_FIELD_SIZE;
+    static const int SHORTCUT_PROBABILITY_MASK;
+    static const int SHORTCUT_HAS_NEXT_MASK;
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4DictConstants);
+
+    /* How many body-file buffers each kind of content occupies. */
+    static const size_t NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT;
+    static const size_t NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT;
+    static const size_t NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT;
+};
+} // namespace latinime
+#endif /* LATINIME_VER4_DICT_CONSTANTS_H */
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
new file mode 100644
index
000000000..b38b03dcb --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" + +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/v4/content/language_model_dict_content.h" +#include "dictionary/structure/v4/content/probability_entry.h" +#include "dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { + +const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode( + const int ptNodePos, const int siblingNodePos) const { + if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) { + // Reading invalid position because of bug or broken dictionary. 
+ AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d", + ptNodePos, mBuffer->getTailPosition()); + ASSERT(false); + return PtNodeParams(); + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int pos = ptNodePos; + const int headPos = ptNodePos; + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const int parentPosOffset = + DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( + dictBuf, &pos); + const int parentPos = + DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); + int codePoints[MAX_WORD_LENGTH]; + // Code point table is not used for ver4 dictionaries. + const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( + dictBuf, flags, MAX_WORD_LENGTH, nullptr /* codePointTable */, codePoints, &pos); + int terminalIdFieldPos = NOT_A_DICT_POS; + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + terminalIdFieldPos = pos; + if (usesAdditionalBuffer) { + terminalIdFieldPos += mBuffer->getOriginalBufferSize(); + } + terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos); + } + int childrenPosFieldPos = pos; + if (usesAdditionalBuffer) { + childrenPosFieldPos += mBuffer->getOriginalBufferSize(); + } + int childrenPos = DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition( + dictBuf, &pos); + if (usesAdditionalBuffer && childrenPos != NOT_A_DICT_POS) { + childrenPos += mBuffer->getOriginalBufferSize(); + } + if (usesAdditionalBuffer) { + pos += mBuffer->getOriginalBufferSize(); + } + // Sibling position is the tail position of original PtNode. + int newSiblingNodePos = (siblingNodePos == NOT_A_DICT_POS) ? 
pos : siblingNodePos; + // Read destination node if the read node is a moved node. + if (DynamicPtReadingUtils::isMoved(flags)) { + // The destination position is stored at the same place as the parent position. + return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos); + } else { + return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints, + terminalIdFieldPos, terminalId, NOT_A_PROBABILITY, childrenPosFieldPos, childrenPos, + newSiblingNodePos); + } +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h new file mode 100644 index 000000000..4e5ae3a89 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H +#define LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_reader.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class HeaderPolicy; +class LanguageModelDictContent; + +/* + * This class is used for helping to read nodes of ver4 patricia trie. This class handles moved + * node and reads node attributes. 
+ */ +class Ver4PatriciaTrieNodeReader : public PtNodeReader { + public: + explicit Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer) + : mBuffer(buffer) {} + + ~Ver4PatriciaTrieNodeReader() {} + + virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const { + return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos, + NOT_A_DICT_POS /* siblingNodePos */); + } + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader); + + const BufferWithExtendableBuffer *const mBuffer; + + const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos, + const int siblingNodePos) const; +}; +} // namespace latinime +#endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H */ diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp new file mode 100644 index 000000000..d974b50f4 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -0,0 +1,354 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h" + +#include "dictionary/header/header_policy.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/v4/content/probability_entry.h" +#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/v4/ver4_dict_buffers.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { + +const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3; + +bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted( + const PtNodeParams *const toBeUpdatedPtNodeParams) { + int pos = toBeUpdatedPtNodeParams->getHeadPos(); + const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mTrieBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, + true /* isDeleted */, false /* willBecomeNonTerminal */); + int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); + // Update flags. + if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, + &writingPos)) { + return false; + } + if (toBeUpdatedPtNodeParams->isTerminal()) { + // The PtNode is a terminal. Delete entry from the terminal position lookup table. 
+ return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( + toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */); + } else { + return true; + } +} + +// TODO: Quit using bigramLinkedNodePos. +bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const int movedPos, const int bigramLinkedNodePos) { + int pos = toBeUpdatedPtNodeParams->getHeadPos(); + const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mTrieBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */, + false /* isDeleted */, false /* willBecomeNonTerminal */); + int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); + // Update flags. + if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, + &writingPos)) { + return false; + } + // Update moved position, which is stored in the parent offset field. + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( + mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { + return false; + } + if (toBeUpdatedPtNodeParams->hasChildren()) { + // Update children's parent position. 
+ mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos()); + while (!mReadingHelper.isEnd()) { + const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams()); + int parentOffsetFieldPos = childPtNodeParams.getHeadPos() + + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( + mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(), + &parentOffsetFieldPos)) { + // Parent offset cannot be written because of a bug or a broken dictionary; thus, + // we give up to update dictionary. + return false; + } + mReadingHelper.readNextSiblingNode(childPtNodeParams); + } + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal( + const PtNodeParams *const toBeUpdatedPtNodeParams) { + int pos = toBeUpdatedPtNodeParams->getHeadPos(); + const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mTrieBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, + false /* isDeleted */, true /* willBecomeNonTerminal */); + if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( + toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) { + AKLOGE("Cannot update terminal position lookup table. terminal id: %d", + toBeUpdatedPtNodeParams->getTerminalId()); + return false; + } + // Update flags. 
+ int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); + return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, + &writingPos); +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty) { + // Update probability and historical information. + // TODO: Update other information in the unigram property. + if (!toBeUpdatedPtNodeParams->isTerminal()) { + return false; + } + const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty); + return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntryOfUnigramProperty); +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) { + if (!toBeUpdatedPtNodeParams->isTerminal()) { + AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode."); + return false; + } + const ProbabilityEntry originalProbabilityEntry = + mBuffers->getLanguageModelDictContent()->getProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId()); + if (originalProbabilityEntry.isValid()) { + *outNeedsToKeepPtNode = true; + return true; + } + if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { + AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); + return false; + } + *outNeedsToKeepPtNode = false; + return true; +} + +bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition( + const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) { + int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos(); + return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, + newChildrenPosition, &childrenPosFieldPos); +} + +bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams 
*const toBeUpdatedPtNodeParams, + const int newTerminalId) { + return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, + toBeUpdatedPtNodeParams->getTerminalIdFieldPos()); +} + +bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition( + const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) { + return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */, + ptNodeWritingPos); +} + +bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( + const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty, + int *const ptNodeWritingPos) { + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId, + ptNodeWritingPos)) { + return false; + } + // Write probability. + ProbabilityEntry newProbabilityEntry; + const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty); + return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry( + terminalId, &probabilityEntryOfUnigramProperty); +} + +// TODO: Support counting ngram entries. +bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, + const NgramProperty *const ngramProperty, bool *const outAddedNewBigram) { + LanguageModelDictContent *const languageModelDictContent = + mBuffers->getMutableLanguageModelDictContent(); + const ProbabilityEntry probabilityEntry = + languageModelDictContent->getNgramProbabilityEntry(prevWordIds, wordId); + const ProbabilityEntry probabilityEntryOfNgramProperty(ngramProperty); + if (!languageModelDictContent->setNgramProbabilityEntry( + prevWordIds, wordId, &probabilityEntryOfNgramProperty)) { + AKLOGE("Cannot add new ngram entry. 
prevWordId[0]: %d, prevWordId.size(): %zd, wordId: %d", + prevWordIds[0], prevWordIds.size(), wordId); + return false; + } + if (!probabilityEntry.isValid() && outAddedNewBigram) { + *outAddedNewBigram = true; + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds, + const int wordId) { + LanguageModelDictContent *const languageModelDictContent = + mBuffers->getMutableLanguageModelDictContent(); + return languageModelDictContent->removeNgramProbabilityEntry(prevWordIds, wordId); +} + +// TODO: Remove when we stop supporting v402 format. +bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries( + const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) { + // Do nothing. + return true; +} + +bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const DictPositionRelocationMap *const dictPositionRelocationMap, + int *const outBigramEntryCount) { + int parentPos = toBeUpdatedPtNodeParams->getParentPos(); + if (parentPos != NOT_A_DICT_POS) { + PtNodeWriter::PtNodePositionRelocationMap::const_iterator it = + dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos); + if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) { + parentPos = it->second; + } + } + int writingPos = toBeUpdatedPtNodeParams->getHeadPos() + + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; + // Write updated parent offset. + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, + parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { + return false; + } + + // Updates children position. 
+ int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos(); + if (childrenPos != NOT_A_DICT_POS) { + PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it = + dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos); + if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) { + childrenPos = it->second; + } + } + if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) { + return false; + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability) { + if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(), + targetCodePoints, targetCodePointCount, shortcutProbability)) { + AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId()); + return false; + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( + const PtNodeParams *const ptNodeParams, int *const outTerminalId, + int *const ptNodeWritingPos) { + const int nodePos = *ptNodeWritingPos; + // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the + // PtNode writing. + if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, + 0 /* nodeFlags */, ptNodeWritingPos)) { + return false; + } + // Calculate a parent offset and write the offset. 
+ if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, + ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) { + return false; + } + // Write code points + if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer, + ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) { + return false; + } + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + if (!ptNodeParams->willBecomeNonTerminal()) { + if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) { + terminalId = ptNodeParams->getTerminalId(); + } else if (ptNodeParams->isTerminal()) { + // Write terminal information using a new terminal id. + // Get a new unused terminal id. + terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId(); + } + } + const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID; + if (isTerminal) { + // Update the lookup table. + if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( + terminalId, nodePos)) { + return false; + } + // Write terminal Id. + if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId, + Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) { + return false; + } + if (outTerminalId) { + *outTerminalId = terminalId; + } + } + // Write children position + if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, + ptNodeParams->getChildrenPos(), ptNodeWritingPos)) { + return false; + } + return updatePtNodeFlags(nodePos, isTerminal, + ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, const bool isTerminal, + const bool hasMultipleChars) { + // Create node flags and write them. 
+ PatriciaTrieReadingUtils::NodeFlags nodeFlags = + PatriciaTrieReadingUtils::createAndGetFlags(false /* isNotAWord */, + false /* isPossiblyOffensive */, isTerminal, false /* hasShortcutTargets */, + false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE); + if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) { + AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos); + return false; + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h new file mode 100644 index 000000000..55856110b --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H +#define LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H + +#include "defines.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_writer.h" +#include "dictionary/structure/v4/content/probability_entry.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class HeaderPolicy; +class Ver4DictBuffers; +class Ver4PatriciaTrieNodeReader; +class Ver4PtNodeArrayReader; +class Ver4ShortcutListPolicy; + +/* + * This class is used for helping to writes nodes of ver4 patricia trie. + */ +class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { + public: + Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer, + Ver4DictBuffers *const buffers, const PtNodeReader *const ptNodeReader, + const PtNodeArrayReader *const ptNodeArrayReader, + Ver4ShortcutListPolicy *const shortcutPolicy) + : mTrieBuffer(trieBuffer), mBuffers(buffers), + mReadingHelper(ptNodeReader, ptNodeArrayReader), mShortcutPolicy(shortcutPolicy) {} + + virtual ~Ver4PatriciaTrieNodeWriter() {} + + virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams); + + virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int movedPos, const int bigramLinkedNodePos); + + virtual bool markPtNodeAsWillBecomeNonTerminal( + const PtNodeParams *const toBeUpdatedPtNodeParams); + + virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty); + + virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode); + + virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newChildrenPosition); + + bool updateTerminalId(const PtNodeParams *const 
toBeUpdatedPtNodeParams, + const int newTerminalId); + + virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + int *const ptNodeWritingPos); + + virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos); + + virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); + + virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId); + + virtual bool updateAllBigramEntriesAndDeleteUselessEntries( + const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount); + + virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams, + const DictPositionRelocationMap *const dictPositionRelocationMap, + int *const outBigramEntryCount); + + virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability); + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter); + + bool writePtNodeAndGetTerminalIdAndAdvancePosition( + const PtNodeParams *const ptNodeParams, int *const outTerminalId, + int *const ptNodeWritingPos); + + bool updatePtNodeFlags(const int ptNodePos, const bool isTerminal, const bool hasMultipleChars); + + static const int CHILDREN_POSITION_FIELD_SIZE; + + BufferWithExtendableBuffer *const mTrieBuffer; + Ver4DictBuffers *const mBuffers; + DynamicPtReadingHelper mReadingHelper; + Ver4ShortcutListPolicy *const mShortcutPolicy; +}; +} // namespace latinime +#endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H */ diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp new file mode 100644 index 000000000..1dbec5545 --- /dev/null +++ 
b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -0,0 +1,603 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/ver4_patricia_trie_policy.h" + +#include +#include + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/property/ngram_context.h" +#include "dictionary/property/ngram_property.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/property/word_property.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "dictionary/utils/forgetting_curve_utils.h" +#include "dictionary/utils/multi_bigram_map.h" +#include "dictionary/utils/probability_utils.h" +#include "utils/ngram_utils.h" + +namespace latinime { + +// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and +// BinaryDictionaryDecayingTests. 
+const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; +const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; +const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = + Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; + +void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); + while (!readingHelper.isEnd()) { + const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); + if (!ptNodeParams.isValid()) { + break; + } + const bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); + const int wordId = isTerminal ? 
ptNodeParams.getTerminalId() : NOT_A_WORD_ID; + childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(), + wordId, ptNodeParams.getCodePointArrayView()); + readingHelper.readNextSiblingNode(ptNodeParams); + } + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + } +} + +int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, + const int maxCodePointCount, int *const outCodePoints) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + const int ptNodePos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + readingHelper.initWithPtNodePos(ptNodePos); + const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount( + maxCodePointCount, outCodePoints); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); + } + return codePointCount; +} + +int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, + const bool forceLowerCaseSearch) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), + wordCodePoints.size(), forceLowerCaseSearch); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + } + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_WORD_ID; + } + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + if (ptNodeParams.isDeleted()) { + return NOT_A_WORD_ID; + } + return ptNodeParams.getTerminalId(); +} + +const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( + const WordIdArrayView prevWordIds, const int wordId, + MultiBigramMap *const 
multiBigramMap) const { + if (wordId == NOT_A_WORD_ID) { + return WordAttributes(); + } + return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, + false /* mustMatchAllPrevWords */, mHeaderPolicy); +} + +int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, + const int wordId) const { + if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) { + return NOT_A_PROBABILITY; + } + const WordAttributes wordAttributes = + mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, + true /* mustMatchAllPrevWords */, mHeaderPolicy); + if (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()) { + return NOT_A_PROBABILITY; + } + return wordAttributes.getProbability(); +} + +BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator( + const int wordId) const { + const int shortcutPos = getShortcutPositionOfWord(wordId); + return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos); +} + +void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, + NgramListener *const listener) const { + if (prevWordIds.empty()) { + return; + } + const auto languageModelDictContent = mBuffers->getLanguageModelDictContent(); + for (size_t i = 1; i <= prevWordIds.size(); ++i) { + for (const auto entry : languageModelDictContent->getProbabilityEntries( + prevWordIds.limit(i))) { + const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry(); + if (!probabilityEntry.isValid()) { + continue; + } + int probability = NOT_A_PROBABILITY; + if (probabilityEntry.hasHistoricalInfo()) { + // TODO: Quit checking count here. + // If count <= 1, the word can be an invaild word. The actual probability should + // be checked using getWordAttributesInContext() in onVisitEntry(). + probability = probabilityEntry.getHistoricalInfo()->getCount() <= 1 ? 
+ NOT_A_PROBABILITY : 0; + } else { + probability = probabilityEntry.getProbability(); + } + listener->onVisitEntry(probability, entry.getWordId()); + } + } +} + +int Ver4PatriciaTriePolicy::getShortcutPositionOfWord(const int wordId) const { + if (wordId == NOT_A_WORD_ID) { + return NOT_A_DICT_POS; + } + const int ptNodePos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted()) { + return NOT_A_DICT_POS; + } + return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( + ptNodeParams.getTerminalId()); +} + +bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints, + const UnigramProperty *const unigramProperty) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (wordCodePoints.size() > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert to the dictionary, length: %zd", + wordCodePoints.size()); + return false; + } + for (const auto &shortcut : unigramProperty->getShortcuts()) { + if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", + shortcut.getTargetCodePoints()->size()); + return false; + } + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + bool addedNewUnigram = false; + int codePointsToAdd[MAX_WORD_LENGTH]; + int codePointCountToAdd = wordCodePoints.size(); + memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd); + if (unigramProperty->representsBeginningOfSentence()) { + codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, + codePointCountToAdd, MAX_WORD_LENGTH); + } + if (codePointCountToAdd <= 0) { + return false; + } + const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd); + if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, + &addedNewUnigram)) { + if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { + mEntryCounters.incrementNgramCount(NgramType::Unigram); + } + if (unigramProperty->getShortcuts().size() > 0) { + // Add shortcut target. 
+ const int wordId = getWordId(codePointArrayView, false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + AKLOGE("Cannot find word id to add shortcut target."); + return false; + } + const int wordPos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + for (const auto &shortcut : unigramProperty->getShortcuts()) { + if (!mUpdatingHelper.addShortcutTarget(wordPos, + CodePointArrayView(*shortcut.getTargetCodePoints()), + shortcut.getProbability())) { + AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " + "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), + shortcut.getProbability()); + return false; + } + } + } + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); + return false; + } + const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + return false; + } + const int ptNodePos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) { + AKLOGE("Cannot remove unigram. 
ptNodePos: %d", ptNodePos); + return false; + } + if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry(wordId)) { + return false; + } + if (!ptNodeParams.representsNonWordInfo()) { + mEntryCounters.decrementNgramCount(NgramType::Unigram); + } + return true; +} + +bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + const NgramContext *const ngramContext = ngramProperty->getNgramContext(); + if (!ngramContext->isValid()) { + AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary."); + return false; + } + if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert the ngram to the dictionary. 
" + "length: %zd", ngramProperty->getTargetCodePoints()->size()); + return false; + } + WordIdArray prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, + false /* tryLowerCaseSearch */); + if (prevWordIds.empty()) { + return false; + } + for (size_t i = 0; i < prevWordIds.size(); ++i) { + if (prevWordIds[i] != NOT_A_WORD_ID) { + continue; + } + if (!ngramContext->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) { + return false; + } + const UnigramProperty beginningOfSentenceUnigramProperty( + true /* representsBeginningOfSentence */, true /* isNotAWord */, + false /* isBlacklisted */, false /* isPossiblyOffensive */, + MAX_PROBABILITY /* probability */, HistoricalInfo()); + if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), + &beginningOfSentenceUnigramProperty)) { + AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); + return false; + } + // Refresh word ids. + ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); + } + const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()), + false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + return false; + } + bool addedNewEntry = false; + if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) { + if (addedNewEntry) { + mEntryCounters.incrementNgramCount( + NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); + } + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (!ngramContext->isValid()) { + AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary."); + return false; + } + if (wordCodePoints.size() > MAX_WORD_LENGTH) { + AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd", + wordCodePoints.size()); + } + WordIdArray prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, + false /* tryLowerCaseSerch */); + if (prevWordIds.empty() || prevWordIds.contains(NOT_A_WORD_ID)) { + return false; + } + const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + return false; + } + if (mNodeWriter.removeNgramEntry(prevWordIds, wordId)) { + mEntryCounters.decrementNgramCount( + NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( + const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints, + const bool isValidWord, const HistoricalInfo historicalInfo) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " + "dictionary."); + return false; + } + const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ? + false : isValidWord; + int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + // The word is not in the dictionary. 
+ const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, + false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */, + NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, + 0 /* count */)); + if (!addUnigramEntry(wordCodePoints, &unigramProperty)) { + AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext()."); + return false; + } + if (!isValidWord) { + return true; + } + wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); + } + + WordIdArray prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, + false /* tryLowerCaseSearch */); + if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { + if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { + const UnigramProperty beginningOfSentenceUnigramProperty( + true /* representsBeginningOfSentence */, + true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, + HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); + if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), + &beginningOfSentenceUnigramProperty)) { + AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext()."); + return false; + } + // Refresh word ids. + ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); + } + // Update entries for beginning of sentence. 
+ if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord( + prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isValid */, historicalInfo, + mHeaderPolicy, &mEntryCounters)) { + return false; + } + } + if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds, + wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) { + return false; + } + return true; +} + +// Writes the dictionary to filePath through the writing helper without running GC. Returns +// false for a non-updatable dictionary; marks the dictionary as corrupted when the write fails. +bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); + return false; + } + if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) { + AKLOGE("Cannot flush the dictionary to file."); + mIsCorrupted = true; + return false; + } + return true; +} + +// Same as flush(), but writes through writeToDictFileWithGC() starting from the root position. +bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return false; + } + if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { + AKLOGE("Cannot flush the dictionary to file with GC."); + mIsCorrupted = true; + return false; + } + return true; +} + +bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); + return false; + } + if (mBuffers->isNearSizeLimit()) { + // Additional buffer size is near the limit. + return true; + } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() + > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { + // Total extended region size of the trie exceeds the limit. + return true; + } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS + && mDictBuffer->getUsedAdditionalBufferSize() > 0) { + // Needs to reduce dictionary size.
+ return true; + } else if (mHeaderPolicy->isDecayingDict()) { + return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), + mHeaderPolicy); + } + return false; +} + +// Writes the decimal value answering a recognised count query into outResult (at most +// maxResultLength bytes via snprintf). outResult is left untouched for any other query. +void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, + char *const outResult, const int maxResultLength) { + const int compareLength = queryLength + 1 /* terminator */; + if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mEntryCounters.getNgramCount(NgramType::Unigram)); + } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram)); + } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { + // The cast target is int so the value matches the "%d" conversion. + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy->isDecayingDict() ? + ForgettingCurveUtils::getEntryCountHardLimit( + mHeaderPolicy->getMaxNgramCounts().getNgramCount( + NgramType::Unigram)) : + static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); + } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy->isDecayingDict() ? + ForgettingCurveUtils::getEntryCountHardLimit( + mHeaderPolicy->getMaxNgramCounts().getNgramCount( + NgramType::Bigram)) : + static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); + } +} + +const WordProperty Ver4PatriciaTriePolicy::getWordProperty( + const CodePointArrayView wordCodePoints) const { + const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + AKLOGE("getWordProperty is called for invalid word."); + return WordProperty(); + } + const LanguageModelDictContent *const languageModelDictContent = + mBuffers->getLanguageModelDictContent(); + // Fetch ngram information.
+ std::vector ngrams; + int ngramTargetCodePoints[MAX_WORD_LENGTH]; + int ngramPrevWordsCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; + int ngramPrevWordsCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + bool ngramPrevWordIsBeginningOfSentense[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + for (const auto entry : languageModelDictContent->exportAllNgramEntriesRelatedToWord( + mHeaderPolicy, wordId)) { + const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getTargetWordId(), + MAX_WORD_LENGTH, ngramTargetCodePoints); + const WordIdArrayView prevWordIds = entry.getPrevWordIds(); + for (size_t i = 0; i < prevWordIds.size(); ++i) { + ngramPrevWordsCodePointCount[i] = getCodePointsAndReturnCodePointCount(prevWordIds[i], + MAX_WORD_LENGTH, ngramPrevWordsCodePoints[i]); + ngramPrevWordIsBeginningOfSentense[i] = languageModelDictContent->getProbabilityEntry( + prevWordIds[i]).representsBeginningOfSentence(); + if (ngramPrevWordIsBeginningOfSentense[i]) { + ngramPrevWordsCodePointCount[i] = CharUtils::removeBeginningOfSentenceMarker( + ngramPrevWordsCodePoints[i], ngramPrevWordsCodePointCount[i]); + } + } + const NgramContext ngramContext(ngramPrevWordsCodePoints, ngramPrevWordsCodePointCount, + ngramPrevWordIsBeginningOfSentense, prevWordIds.size()); + const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry(); + const HistoricalInfo *const historicalInfo = ngramProbabilityEntry.getHistoricalInfo(); + // TODO: Output flags in WordAttributes. + ngrams.emplace_back(ngramContext, + CodePointArrayView(ngramTargetCodePoints, codePointCount).toVector(), + entry.getWordAttributes().getProbability(), *historicalInfo); + } + // Fetch shortcut information. 
+ std::vector shortcuts; + int shortcutPos = getShortcutPositionOfWord(wordId); + if (shortcutPos != NOT_A_DICT_POS) { + int shortcutTarget[MAX_WORD_LENGTH]; + const ShortcutDictContent *const shortcutDictContent = + mBuffers->getShortcutDictContent(); + bool hasNext = true; + while (hasNext) { + int shortcutTargetLength = 0; + int shortcutProbability = NOT_A_PROBABILITY; + shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, + &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); + shortcuts.emplace_back( + CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(), + shortcutProbability); + } + } + const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes( + WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy); + const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId); + const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); + const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(), + wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(), + wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(), + *historicalInfo, std::move(shortcuts)); + return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); +} + +int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount) { + *outCodePointCount = 0; + if (token == 0) { + mTerminalPtNodePositionsForIteratingWords.clear(); + DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( + &mTerminalPtNodePositionsForIteratingWords); + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); + } + const int terminalPtNodePositionsVectorSize = + 
static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); + // Tokens index into the collected terminal positions; anything out of range is rejected. + if (token < 0 || token >= terminalPtNodePositionsVectorSize) { + AKLOGE("Given token %d is invalid.", token); + return 0; + } + const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; + const PtNodeParams ptNodeParams = + mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos); + *outCodePointCount = getCodePointsAndReturnCodePointCount(ptNodeParams.getTerminalId(), + MAX_WORD_LENGTH, outCodePoints); + const int nextToken = token + 1; + if (nextToken >= terminalPtNodePositionsVectorSize) { + // All words have been iterated. + mTerminalPtNodePositionsForIteratingWords.clear(); + return 0; + } + return nextToken; +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h new file mode 100644 index 000000000..d130a4e78 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H +#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H + +#include + +#include "defines.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h" +#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" +#include "dictionary/structure/v4/ver4_dict_buffers.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h" +#include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" +#include "dictionary/structure/v4/ver4_pt_node_array_reader.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/entry_counters.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class DicNode; +class DicNodeVector; + +// Word id = Artificial id that is stored in the PtNode looked up by the word. 
+class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { + public: + Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers) + : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()), + mDictBuffer(mBuffers->getWritableTrieBuffer()), + mShortcutPolicy(mBuffers->getMutableShortcutDictContent(), + mBuffers->getTerminalPositionLookupTable()), + mNodeReader(mDictBuffer), mPtNodeArrayReader(mDictBuffer), + mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mPtNodeArrayReader, + &mShortcutPolicy), + mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), + mWritingHelper(mBuffers.get()), + mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()), + mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; + + AK_FORCE_INLINE int getRootPosition() const { + return 0; + } + + void createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const; + + int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, + int *const outCodePoints) const; + + int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; + + const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, + const int wordId, MultiBigramMap *const multiBigramMap) const; + + // TODO: Remove + int getProbability(const int unigramProbability, const int bigramProbability) const { + // Not used. 
+ return NOT_A_PROBABILITY; + } + + int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; + + void iterateNgramEntries(const WordIdArrayView prevWordIds, + NgramListener *const listener) const; + + BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; + + const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { + return mHeaderPolicy; + } + + bool addUnigramEntry(const CodePointArrayView wordCodePoints, + const UnigramProperty *const unigramProperty); + + bool removeUnigramEntry(const CodePointArrayView wordCodePoints); + + bool addNgramEntry(const NgramProperty *const ngramProperty); + + bool removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints); + + bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints, const bool isValidWord, + const HistoricalInfo historicalInfo); + + bool flush(const char *const filePath); + + bool flushWithGC(const char *const filePath); + + bool needsToRunGC(const bool mindsBlockByGC) const; + + void getProperty(const char *const query, const int queryLength, char *const outResult, + const int maxResultLength); + + const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; + + int getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount); + + bool isCorrupted() const { + return mIsCorrupted; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy); + + static const char *const UNIGRAM_COUNT_QUERY; + static const char *const BIGRAM_COUNT_QUERY; + static const char *const MAX_UNIGRAM_COUNT_QUERY; + static const char *const MAX_BIGRAM_COUNT_QUERY; + // When the dictionary size is near the maximum size, we have to refuse dynamic operations to + // prevent the dictionary from overflowing. 
+ static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; + static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; + + const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers; + const HeaderPolicy *const mHeaderPolicy; + BufferWithExtendableBuffer *const mDictBuffer; + Ver4ShortcutListPolicy mShortcutPolicy; + Ver4PatriciaTrieNodeReader mNodeReader; + Ver4PtNodeArrayReader mPtNodeArrayReader; + Ver4PatriciaTrieNodeWriter mNodeWriter; + DynamicPtUpdatingHelper mUpdatingHelper; + Ver4PatriciaTrieWritingHelper mWritingHelper; + MutableEntryCounters mEntryCounters; + std::vector mTerminalPtNodePositionsForIteratingWords; + mutable bool mIsCorrupted; + + int getShortcutPositionOfWord(const int wordId) const; +}; +} // namespace latinime +#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp new file mode 100644 index 000000000..ccb70cdd3 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" + +#include "dictionary/utils/byte_array_utils.h" + +namespace latinime { + +/* static */ int Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition( + const uint8_t *const buffer, int *pos) { + return ByteArrayUtils::readUint32AndAdvancePosition(buffer, pos); +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h new file mode 100644 index 000000000..466ff55d5 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H +#define LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H + +#include + +#include "defines.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class Ver4PatriciaTrieReadingUtils { + public: + static int getTerminalIdAndAdvancePosition(const uint8_t *const buffer, + int *const pos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieReadingUtils); +}; +} // namespace latinime +#endif /* LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H */ diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp new file mode 100644 index 000000000..6dfdf4d31 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" + +#include +#include + +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" +#include "dictionary/structure/v4/ver4_dict_buffers.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h" +#include "dictionary/structure/v4/ver4_pt_node_array_reader.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/file_utils.h" +#include "dictionary/utils/forgetting_curve_utils.h" +#include "utils/ngram_utils.h" + +namespace latinime { + +bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, + const EntryCounts &entryCounts) const { + const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); + BufferWithExtendableBuffer headerBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); + if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, + entryCounts, extendedRegionSize, &headerBuffer)) { + AKLOGE("Cannot write header structure to buffer. 
" + "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, trigramCount: %d," + "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram), + entryCounts.getNgramCount(NgramType::Bigram), + entryCounts.getNgramCount(NgramType::Trigram), + extendedRegionSize); + return false; + } + return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); +} + +bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, + const char *const dictDirPath) { + const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); + Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( + Ver4DictBuffers::createVer4DictBuffers(headerPolicy, + Ver4DictConstants::MAX_DICTIONARY_SIZE)); + MutableEntryCounters entryCounters; + if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &entryCounters)) { + return false; + } + BufferWithExtendableBuffer headerBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, + entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) { + return false; + } + return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); +} + +bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, + const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, + MutableEntryCounters *const outEntryCounters) { + Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer()); + Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); + Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), + mBuffers->getTerminalPositionLookupTable()); + Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), + mBuffers, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy); + + if (!mBuffers->getMutableLanguageModelDictContent()->updateAllProbabilityEntriesForGC( + headerPolicy, outEntryCounters)) { + 
AKLOGE("Failed to update probabilities in language model dict content."); + return false; + } + if (headerPolicy->isDecayingDict()) { + const EntryCounts &maxEntryCounts = headerPolicy->getMaxNgramCounts(); + if (!mBuffers->getMutableLanguageModelDictContent()->truncateEntries( + outEntryCounters->getEntryCounts(), maxEntryCounts, headerPolicy, + outEntryCounters)) { + AKLOGE("Failed to truncate entries in language model dict content."); + return false; + } + } + + DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + &ptNodeWriter); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { + return false; + } + + // Mapping from positions in mBuffer to positions in bufferToWrite. + PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), + buffersToWrite, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy); + DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, + buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); + if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { + return false; + } + + // Create policy instances for the GCed dictionary. 
+ Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer()); + Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); + Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), + buffersToWrite->getTerminalPositionLookupTable()); + Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), + buffersToWrite, &newPtNodeReader, &newPtNodeArrayreader, + &newShortcutPolicy); + // Re-assign terminal IDs for valid terminal PtNodes. + TerminalPositionLookupTable::TerminalIdMap terminalIdMap; + if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds( + &terminalIdMap)) { + return false; + } + // Run GC for language model dict content. + if (!buffersToWrite->getMutableLanguageModelDictContent()->runGC(&terminalIdMap, + mBuffers->getLanguageModelDictContent())) { + return false; + } + // Run GC for shortcut dict content. + if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap, + mBuffers->getShortcutDictContent())) { + return false; + } + DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader); + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields + traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap); + if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToUpdateAllPositionFields)) { + return false; + } + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap); + if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) { + return false; + } + return true; +} + +bool 
Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + if (!ptNodeParams->isTerminal()) { + return true; + } + TerminalPositionLookupTable::TerminalIdMap::const_iterator it = + mTerminalIdMap->find(ptNodeParams->getTerminalId()); + if (it == mTerminalIdMap->end()) { + AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", + ptNodeParams->getTerminalId(), mTerminalIdMap->size()); + return false; + } + if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { + AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second); + return false; + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h new file mode 100644 index 000000000..68dd1caa2 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H +#define LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H + +#include "defines.h" +#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/utils/entry_counters.h" + +namespace latinime { + +class HeaderPolicy; +class Ver4DictBuffers; +class Ver4PatriciaTrieNodeReader; +class Ver4PatriciaTrieNodeWriter; + +class Ver4PatriciaTrieWritingHelper { + public: + Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers) + : mBuffers(buffers) {} + + bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const; + + // This method cannot be const because the original dictionary buffer will be updated to detect + // useless PtNodes during GC. + bool writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const dictDirPath); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper); + + class TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + : public DynamicPtReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds( + Ver4PatriciaTrieNodeWriter *const ptNodeWriter, + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap) + : mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {} + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos) { return true; } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds); + + Ver4PatriciaTrieNodeWriter *const mPtNodeWriter; + const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap; + }; + + bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, + Ver4DictBuffers *const buffersToWrite, MutableEntryCounters *const 
outEntryCounters); + + Ver4DictBuffers *const mBuffers; +}; +} // namespace latinime + +#endif /* LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp new file mode 100644 index 000000000..63d0b4ad5 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/ver4_pt_node_array_reader.h" + +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +bool Ver4PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const { + if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mBuffer->getTailPosition()) { + // Reading invalid position because of a bug or a broken dictionary. 
+ AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d", + ptNodeArrayPos, mBuffer->getTailPosition()); + ASSERT(false); + return false; + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodeArrayPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int readingPos = ptNodeArrayPos; + if (usesAdditionalBuffer) { + readingPos -= mBuffer->getOriginalBufferSize(); + } + const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + dictBuf, &readingPos); + if (usesAdditionalBuffer) { + readingPos += mBuffer->getOriginalBufferSize(); + } + if (ptNodeCountInArray < 0) { + AKLOGE("Invalid PtNode count in an array: %d.", ptNodeCountInArray); + return false; + } + *outPtNodeCount = ptNodeCountInArray; + *outFirstPtNodePos = readingPos; + return true; +} + +bool Ver4PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const { + if (forwordLinkPos < 0 || forwordLinkPos >= mBuffer->getTailPosition()) { + // Reading invalid position because of bug or broken dictionary. 
+ AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d", + forwordLinkPos, mBuffer->getTailPosition()); + ASSERT(false); + return false; + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(forwordLinkPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int readingPos = forwordLinkPos; + if (usesAdditionalBuffer) { + readingPos -= mBuffer->getOriginalBufferSize(); + } + const int nextPtNodeArrayOffset = + DynamicPtReadingUtils::getForwardLinkPosition(dictBuf, readingPos); + if (DynamicPtReadingUtils::isValidForwardLinkPosition(nextPtNodeArrayOffset)) { + *outNextPtNodeArrayPos = forwordLinkPos + nextPtNodeArrayOffset; + } else { + *outNextPtNodeArrayPos = NOT_A_DICT_POS; + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h new file mode 100644 index 000000000..ccb760bc1 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_VER4_PT_NODE_ARRAY_READER_H +#define LATINIME_VER4_PT_NODE_ARRAY_READER_H + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_array_reader.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class Ver4PtNodeArrayReader : public PtNodeArrayReader { + public: + Ver4PtNodeArrayReader(const BufferWithExtendableBuffer *const buffer) : mBuffer(buffer) {}; + + virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const; + virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const; + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PtNodeArrayReader); + + const BufferWithExtendableBuffer *const mBuffer; +}; +} // namespace latinime +#endif /* LATINIME_VER4_PT_NODE_ARRAY_READER_H */ diff --git a/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h b/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h new file mode 100644 index 000000000..8a614730b --- /dev/null +++ b/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H +#define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H + +#include "defines.h" +#include "dictionary/interface/dictionary_bigrams_structure_policy.h" + +namespace latinime { + +class BinaryDictionaryBigramsIterator { + public: + // Empty iterator. + BinaryDictionaryBigramsIterator() + : mBigramsStructurePolicy(nullptr), mPos(NOT_A_DICT_POS), + mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), mHasNext(false) {} + + BinaryDictionaryBigramsIterator( + const DictionaryBigramsStructurePolicy *const bigramsStructurePolicy, const int pos) + : mBigramsStructurePolicy(bigramsStructurePolicy), mPos(pos), + mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), + mHasNext(pos != NOT_A_DICT_POS) {} + + BinaryDictionaryBigramsIterator(BinaryDictionaryBigramsIterator &&bigramsIterator) + : mBigramsStructurePolicy(bigramsIterator.mBigramsStructurePolicy), + mPos(bigramsIterator.mPos), mBigramPos(bigramsIterator.mBigramPos), + mProbability(bigramsIterator.mProbability), mHasNext(bigramsIterator.mHasNext) {} + + AK_FORCE_INLINE bool hasNext() const { + return mHasNext; + } + + AK_FORCE_INLINE void next() { + mBigramsStructurePolicy->getNextBigram(&mBigramPos, &mProbability, &mHasNext, &mPos); + } + + AK_FORCE_INLINE int getProbability() const { + return mProbability; + } + + AK_FORCE_INLINE int getBigramPos() const { + return mBigramPos; + } + + private: + DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryBigramsIterator); + + const DictionaryBigramsStructurePolicy *const mBigramsStructurePolicy; + int mPos; + int mBigramPos; + int mProbability; + bool mHasNext; +}; +} // namespace latinime +#endif // LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H diff --git a/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h b/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h new file mode 100644 index 000000000..a4ddd58c2 --- /dev/null +++ 
b/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H +#define LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H + +#include "defines.h" +#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" + +namespace latinime { + +class BinaryDictionaryShortcutIterator { + public: + BinaryDictionaryShortcutIterator( + const DictionaryShortcutsStructurePolicy *const shortcutStructurePolicy, + const int shortcutPos) + : mShortcutStructurePolicy(shortcutStructurePolicy), + mPos(shortcutStructurePolicy->getStartPos(shortcutPos)), + mHasNextShortcutTarget(shortcutPos != NOT_A_DICT_POS) {} + + BinaryDictionaryShortcutIterator(const BinaryDictionaryShortcutIterator &&shortcutIterator) + : mShortcutStructurePolicy(shortcutIterator.mShortcutStructurePolicy), + mPos(shortcutIterator.mPos), + mHasNextShortcutTarget(shortcutIterator.mHasNextShortcutTarget) {} + + AK_FORCE_INLINE bool hasNextShortcutTarget() const { + return mHasNextShortcutTarget; + } + + // Gets the shortcut target itself as an int string and put it to outTarget, put its length + // to outTargetLength, put whether it is whitelist to outIsWhitelist. 
+ AK_FORCE_INLINE void nextShortcutTarget( + const int maxDepth, int *const outTarget, int *const outTargetLength, + bool *const outIsWhitelist) { + mShortcutStructurePolicy->getNextShortcut(maxDepth, outTarget, outTargetLength, + outIsWhitelist, &mHasNextShortcutTarget, &mPos); + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(BinaryDictionaryShortcutIterator); + DISALLOW_ASSIGNMENT_OPERATOR(BinaryDictionaryShortcutIterator); + + const DictionaryShortcutsStructurePolicy *const mShortcutStructurePolicy; + int mPos; + bool mHasNextShortcutTarget; +}; +} // namespace latinime +#endif // LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H diff --git a/native/jni/src/dictionary/utils/bloom_filter.h b/native/jni/src/dictionary/utils/bloom_filter.h new file mode 100644 index 000000000..1e60f49ed --- /dev/null +++ b/native/jni/src/dictionary/utils/bloom_filter.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BLOOM_FILTER_H +#define LATINIME_BLOOM_FILTER_H + +#include + +#include "defines.h" + +namespace latinime { + +// This bloom filter is used for optimizing bigram retrieval. 
+// Execution times with previous word "this" are as follows: +// without bloom filter (use only hash_map): +// Total 147792.34 (sum of others 147771.57) +// with bloom filter: +// Total 145900.64 (sum of others 145874.30) +// always read binary dictionary: +// Total 148603.14 (sum of others 148579.90) +class BloomFilter { + public: + BloomFilter() : mFilter() {} + + AK_FORCE_INLINE void setInFilter(const int position) { + mFilter.set(getIndex(position)); + } + + AK_FORCE_INLINE bool isInFilter(const int position) const { + return mFilter.test(getIndex(position)); + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(BloomFilter); + + AK_FORCE_INLINE size_t getIndex(const int position) const { + return static_cast(position) % BIGRAM_FILTER_MODULO; + } + + // Size, in bits, of the bloom filter index for bigrams + // The probability of false positive is (1 - e ** (-kn/m))**k, + // where k is the number of hash functions, n the number of bigrams, and m the number of + // bits we can test. + // At the moment 100 is the maximum number of bigrams for a word with the current main + // dictionaries, so n = 100. 1024 buckets give us m = 1024. + // With 1 hash function, our false positive rate is about 9.3%, which should be enough for + // our uses since we are only using this to increase average performance. For the record, + // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%, + // and m = 4096 gives 2.4%. + // This is assigned here because it is used for bitset size. + // 1021 is the largest prime under 1024. 
+ static const size_t BIGRAM_FILTER_MODULO = 1021; + std::bitset mFilter; +}; +} // namespace latinime +#endif // LATINIME_BLOOM_FILTER_H diff --git a/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp new file mode 100644 index 000000000..217569651 --- /dev/null +++ b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const size_t BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE = 1024 * 1024; +const int BufferWithExtendableBuffer::NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE = 90; +// TODO: Needs to allocate larger memory corresponding to the current vector size. +const size_t BufferWithExtendableBuffer::EXTEND_ADDITIONAL_BUFFER_SIZE_STEP = 128 * 1024; + +uint32_t BufferWithExtendableBuffer::readUint(const int size, const int pos) const { + const bool readingPosIsInAdditionalBuffer = isInAdditionalBuffer(pos); + const int posInBuffer = readingPosIsInAdditionalBuffer ? 
pos - mOriginalBuffer.size() : pos; + return ByteArrayUtils::readUint(getBuffer(readingPosIsInAdditionalBuffer), size, posInBuffer); +} + +uint32_t BufferWithExtendableBuffer::readUintAndAdvancePosition(const int size, + int *const pos) const { + const uint32_t value = readUint(size, *pos); + *pos += size; + return value; +} + +void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoints, int *outCodePointCount, int *const pos) const { + const bool readingPosIsInAdditionalBuffer = isInAdditionalBuffer(*pos); + if (readingPosIsInAdditionalBuffer) { + *pos -= mOriginalBuffer.size(); + } + // Code point table is not used for dynamic format. + *outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition( + getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, + nullptr /* codePointTable */, outCodePoints, pos); + if (readingPosIsInAdditionalBuffer) { + *pos += mOriginalBuffer.size(); + } +} + +bool BufferWithExtendableBuffer::extend(const int size) { + return checkAndPrepareWriting(getTailPosition(), size); +} + +bool BufferWithExtendableBuffer::writeUint(const uint32_t data, const int size, const int pos) { + int writingPos = pos; + return writeUintAndAdvancePosition(data, size, &writingPos); +} + +bool BufferWithExtendableBuffer::writeUintAndAdvancePosition(const uint32_t data, const int size, + int *const pos) { + if (!(size >= 1 && size <= 4)) { + AKLOGI("writeUintAndAdvancePosition() is called with invalid size: %d", size); + ASSERT(false); + return false; + } + if (!checkAndPrepareWriting(*pos, size)) { + return false; + } + const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos); + uint8_t *const buffer = + usesAdditionalBuffer ? 
mAdditionalBuffer.data() : mOriginalBuffer.data(); + if (usesAdditionalBuffer) { + *pos -= mOriginalBuffer.size(); + } + ByteArrayUtils::writeUintAndAdvancePosition(buffer, data, size, pos); + if (usesAdditionalBuffer) { + *pos += mOriginalBuffer.size(); + } + return true; +} + +bool BufferWithExtendableBuffer::writeCodePointsAndAdvancePosition(const int *const codePoints, + const int codePointCount, const bool writesTerminator, int *const pos) { + const size_t size = ByteArrayUtils::calculateRequiredByteCountToStoreCodePoints( + codePoints, codePointCount, writesTerminator); + if (!checkAndPrepareWriting(*pos, size)) { + return false; + } + const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos); + uint8_t *const buffer = + usesAdditionalBuffer ? mAdditionalBuffer.data() : mOriginalBuffer.data(); + if (usesAdditionalBuffer) { + *pos -= mOriginalBuffer.size(); + } + ByteArrayUtils::writeCodePointsAndAdvancePosition(buffer, codePoints, codePointCount, + writesTerminator, pos); + if (usesAdditionalBuffer) { + *pos += mOriginalBuffer.size(); + } + return true; +} + +bool BufferWithExtendableBuffer::extendBuffer(const size_t size) { + const size_t extendSize = std::max(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP, size); + const size_t sizeAfterExtending = + std::min(mAdditionalBuffer.size() + extendSize, mMaxAdditionalBufferSize); + if (sizeAfterExtending < mAdditionalBuffer.size() + size) { + return false; + } + mAdditionalBuffer.resize(sizeAfterExtending); + return true; +} + +bool BufferWithExtendableBuffer::checkAndPrepareWriting(const int pos, const int size) { + if (pos < 0 || size < 0) { + // Invalid position or size. + return false; + } + const size_t totalRequiredSize = static_cast(pos + size); + if (!isInAdditionalBuffer(pos)) { + // Here don't need to care about the additional buffer. + if (mOriginalBuffer.size() < totalRequiredSize) { + // Violate the boundary. + return false; + } + // The buffer has sufficient capacity. 
+ return true; + } + // Hereafter, pos is in the additional buffer. + const size_t tailPosition = static_cast(getTailPosition()); + if (totalRequiredSize <= tailPosition) { + // The buffer has sufficient capacity. + return true; + } + if (static_cast(pos) != tailPosition) { + // The additional buffer must be extended from the tail position. + return false; + } + const size_t extendSize = totalRequiredSize - + std::min(mAdditionalBuffer.size() + mOriginalBuffer.size(), totalRequiredSize); + if (extendSize > 0 && !extendBuffer(extendSize)) { + // Failed to extend the buffer. + return false; + } + mUsedAdditionalBufferSize += size; + return true; +} + +bool BufferWithExtendableBuffer::copy(const BufferWithExtendableBuffer *const sourceBuffer) { + int copyingPos = 0; + const int tailPos = sourceBuffer->getTailPosition(); + const int maxDataChunkSize = sizeof(uint32_t); + while (copyingPos < tailPos) { + const int remainingSize = tailPos - copyingPos; + const int copyingSize = (remainingSize >= maxDataChunkSize) ? + maxDataChunkSize : remainingSize; + const uint32_t data = sourceBuffer->readUint(copyingSize, copyingPos); + if (!writeUint(data, copyingSize, copyingPos)) { + return false; + } + copyingPos += copyingSize; + } + return true; +} + +} diff --git a/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h new file mode 100644 index 000000000..0a141d4db --- /dev/null +++ b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H +#define LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H + +#include +#include +#include + +#include "defines.h" +#include "dictionary/utils/byte_array_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +// This is used as a buffer that can be extended for updatable dictionaries. +// To optimize performance, raw pointer is directly used for reading buffer. The position has to be +// adjusted to access additional buffer. On the other hand, this class does not provide writable +// raw pointer but provides several methods that handle boundary checking for writing data. +class BufferWithExtendableBuffer { + public: + static const size_t DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE; + + BufferWithExtendableBuffer(const ReadWriteByteArrayView originalBuffer, + const int maxAdditionalBufferSize) + : mOriginalBuffer(originalBuffer), mAdditionalBuffer(), mUsedAdditionalBufferSize(0), + mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} + + // Without original buffer. + BufferWithExtendableBuffer(const int maxAdditionalBufferSize) + : mOriginalBuffer(), mAdditionalBuffer(), mUsedAdditionalBufferSize(0), + mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} + + AK_FORCE_INLINE int getTailPosition() const { + return mOriginalBuffer.size() + mUsedAdditionalBufferSize; + } + + AK_FORCE_INLINE int getUsedAdditionalBufferSize() const { + return mUsedAdditionalBufferSize; + } + + /** + * For reading. 
+ */ + AK_FORCE_INLINE bool isInAdditionalBuffer(const int position) const { + return position >= static_cast(mOriginalBuffer.size()); + } + + // TODO: Resolve the issue that the address can be changed when the vector is resized. + // CAVEAT!: Be careful about array out of bound access with buffers + AK_FORCE_INLINE const uint8_t *getBuffer(const bool usesAdditionalBuffer) const { + if (usesAdditionalBuffer) { + return mAdditionalBuffer.data(); + } else { + return mOriginalBuffer.data(); + } + } + + uint32_t readUint(const int size, const int pos) const; + + uint32_t readUintAndAdvancePosition(const int size, int *const pos) const; + + void readCodePointsAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoints, int *outCodePointCount, int *const pos) const; + + AK_FORCE_INLINE int getOriginalBufferSize() const { + return mOriginalBuffer.size(); + } + + AK_FORCE_INLINE bool isNearSizeLimit() const { + return mAdditionalBuffer.size() >= ((mMaxAdditionalBufferSize + * NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE) / 100); + } + + bool extend(const int size); + + /** + * For writing. + * + * Writing is allowed for original buffer, already written region of additional buffer and the + * tail of additional buffer. 
+ */ + bool writeUint(const uint32_t data, const int size, const int pos); + + bool writeUintAndAdvancePosition(const uint32_t data, const int size, int *const pos); + + bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount, + const bool writesTerminator, int *const pos); + + bool copy(const BufferWithExtendableBuffer *const sourceBuffer); + + private: + DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer); + + static const int NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE; + static const size_t EXTEND_ADDITIONAL_BUFFER_SIZE_STEP; + + const ReadWriteByteArrayView mOriginalBuffer; + std::vector mAdditionalBuffer; + int mUsedAdditionalBufferSize; + const size_t mMaxAdditionalBufferSize; + + // Return if the buffer is successfully extended or not. + bool extendBuffer(const size_t size); + + // Returns if it is possible to write size-bytes from pos. When pos is at the tail position of + // the additional buffer, try extending the buffer. + bool checkAndPrepareWriting(const int pos, const int size); +}; +} +#endif /* LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H */ diff --git a/native/jni/src/dictionary/utils/byte_array_utils.cpp b/native/jni/src/dictionary/utils/byte_array_utils.cpp new file mode 100644 index 000000000..d38f08217 --- /dev/null +++ b/native/jni/src/dictionary/utils/byte_array_utils.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20;
+const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF;
+const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F;
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/byte_array_utils.h b/native/jni/src/dictionary/utils/byte_array_utils.h
new file mode 100644
index 000000000..abb979050
--- /dev/null
+++ b/native/jni/src/dictionary/utils/byte_array_utils.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_BYTE_ARRAY_UTILS_H
+#define LATINIME_BYTE_ARRAY_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Utility methods for reading and writing byte arrays.
+ *
+ * None of the methods checks bounds; the caller must guarantee that the accessed range lies
+ * inside the buffer.
+ */
+class ByteArrayUtils {
+ public:
+    /**
+     * Integer writing
+     *
+     * Each method writes an integer of the corresponding size in a big endian manner.
+     */
+    // Writes the low `size` bytes of data at *pos and advances *pos by size. Nothing is written
+    // when size is outside 1..4 (debug builds assert).
+    static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer,
+            const uint32_t data, const int size, int *const pos) {
+        // size must be in 1 to 4.
+        ASSERT(size >= 1 && size <= 4);
+        switch (size) {
+            case 1:
+                ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos);
+                return;
+            case 2:
+                ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos);
+                return;
+            case 3:
+                ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos);
+                return;
+            case 4:
+                ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos);
+                return;
+            default:
+                break;
+        }
+    }
+
+    /**
+     * Integer reading
+     *
+     * Each method reads an integer of the corresponding size in a big endian manner.
+     */
+    static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) {
+        // The top byte is widened to uint32_t before shifting: a uint8_t operand is promoted to
+        // int, and shifting a value >= 0x80 left by 24 bits would move a bit into the sign bit.
+        return (static_cast<uint32_t>(buffer[pos]) << 24) ^ (buffer[pos + 1] << 16)
+                ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3];
+    }
+
+    static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) {
+        return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2];
+    }
+
+    static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) {
+        return (buffer[pos] << 8) ^ buffer[pos + 1];
+    }
+
+    static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) {
+        return buffer[pos];
+    }
+
+    static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint32_t value = readUint32(buffer, *pos);
+        *pos += 4;
+        return value;
+    }
+
+    // Reads a 3-byte sign-and-magnitude integer: when the top bit of the first byte is set, the
+    // remaining 23 bits are the magnitude of a negative value. Advances *pos by 3.
+    static AK_FORCE_INLINE int readSint24AndAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint8_t value = readUint8(buffer, *pos);
+        if (value < 0x80) {
+            return readUint24AndAdvancePosition(buffer, pos);
+        } else {
+            (*pos)++;
+            return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos));
+        }
+    }
+
+    static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint32_t value = readUint24(buffer, *pos);
+        *pos += 3;
+        return value;
+    }
+
+    static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        const uint16_t value = readUint16(buffer, *pos);
+        *pos += 2;
+        return value;
+    }
+
+    static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition(
+            const uint8_t *const buffer, int *const pos) {
+        return buffer[(*pos)++];
+    }
+
+    // Reads a `size`-byte unsigned integer at pos. Returns 0 when size is outside 1..4 (debug
+    // builds assert).
+    static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer,
+            const int size, const int pos) {
+        // size must be in 1 to 4.
+        ASSERT(size >= 1 && size <= 4);
+        switch (size) {
+            case 1:
+                return ByteArrayUtils::readUint8(buffer, pos);
+            case 2:
+                return ByteArrayUtils::readUint16(buffer, pos);
+            case 3:
+                return ByteArrayUtils::readUint24(buffer, pos);
+            case 4:
+                return ByteArrayUtils::readUint32(buffer, pos);
+            default:
+                return 0;
+        }
+    }
+
+    /**
+     * Code Point Reading
+     *
+     * 1 byte = bbbbbbbb match
+     * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
+     * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
+     *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
+     *       00011111 would be outside unicode.
+     * else: iso-latin-1 code
+     * This allows for the whole unicode range to be encoded, including chars outside of
+     * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
+     * characters which should never happen anyway (and still work, but take 3 bytes).
+     */
+    static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
+        int p = pos;
+        return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p);
+    }
+
+    // Returns NOT_A_CODE_POINT when the terminator is read. In every case *pos is advanced past
+    // the bytes that were consumed (1 or 3).
+    static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
+            const uint8_t *const buffer, const int *const codePointTable, int *const pos) {
+        /*
+         * codePointTable is an array to convert the most frequent characters in this dictionary to
+         * 1 byte code points. It is only made of the original code points of the most frequent
+         * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters.
+         * The original code points are restored by picking the code points at the indices of the
+         * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte.
+         */
+        const uint8_t firstByte = readUint8(buffer, *pos);
+        if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
+            if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
+                *pos += 1;
+                return NOT_A_CODE_POINT;
+            } else {
+                return readUint24AndAdvancePosition(buffer, pos);
+            }
+        } else {
+            *pos += 1;
+            if (codePointTable) {
+                return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE];
+            }
+            return firstByte;
+        }
+    }
+
+    /**
+     * String (array of code points) Reading
+     *
+     * Reads code points until the terminator is found.
+     */
+    // Returns the length of the string.
+    // At most maxLength code points are stored in outBuffer. When the stored string is longer
+    // than maxLength, reading stops after one further code point has already been consumed, so
+    // *pos is then left inside the string rather than behind its terminator.
+    static int readStringAndAdvancePosition(const uint8_t *const buffer,
+            const int maxLength, const int *const codePointTable, int *const outBuffer,
+            int *const pos) {
+        int length = 0;
+        int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
+        while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
+            outBuffer[length++] = codePoint;
+            codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
+        }
+        return length;
+    }
+
+    // Advances the position and returns the length of the string.
+    // As with readStringAndAdvancePosition(), skipping stops early once maxLength code points
+    // have been counted, even if the terminator has not been reached.
+    static int advancePositionToBehindString(
+            const uint8_t *const buffer, const int maxLength, int *const pos) {
+        int length = 0;
+        int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
+        while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
+            codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
+            length++;
+        }
+        return length;
+    }
+
+    /**
+     * String (array of code points) Writing
+     */
+    // Writes code points until codePointCount is reached or NOT_A_CODE_POINT / the terminator
+    // value is found in the input, then optionally appends the terminator byte.
+    static void writeCodePointsAndAdvancePosition(uint8_t *const buffer,
+            const int *const codePoints, const int codePointCount, const bool writesTerminator,
+            int *const pos) {
+        for (int i = 0; i < codePointCount; ++i) {
+            const int codePoint = codePoints[i];
+            if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
+                break;
+            } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
+                    || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
+                // three bytes character.
+                writeUint24AndAdvancePosition(buffer, codePoint, pos);
+            } else {
+                // one byte character.
+                writeUint8AndAdvancePosition(buffer, codePoint, pos);
+            }
+        }
+        if (writesTerminator) {
+            writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos);
+        }
+    }
+
+    // Returns the number of bytes writeCodePointsAndAdvancePosition() emits for the same
+    // arguments.
+    static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints,
+            const int codePointCount, const bool writesTerminator) {
+        int byteCount = 0;
+        for (int i = 0; i < codePointCount; ++i) {
+            const int codePoint = codePoints[i];
+            if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
+                break;
+            } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
+                    || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
+                // three bytes character.
+                byteCount += 3;
+            } else {
+                // one byte character.
+                byteCount += 1;
+            }
+        }
+        if (writesTerminator) {
+            // The terminator is one byte.
+            byteCount += 1;
+        }
+        return byteCount;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils);
+
+    static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE;
+    static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE;
+    static const uint8_t CHARACTER_ARRAY_TERMINATOR;
+
+    static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer,
+            const uint32_t data, int *const pos) {
+        buffer[(*pos)++] = (data >> 24) & 0xFF;
+        buffer[(*pos)++] = (data >> 16) & 0xFF;
+        buffer[(*pos)++] = (data >> 8) & 0xFF;
+        buffer[(*pos)++] = data & 0xFF;
+    }
+
+    static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer,
+            const uint32_t data, int *const pos) {
+        buffer[(*pos)++] = (data >> 16) & 0xFF;
+        buffer[(*pos)++] = (data >> 8) & 0xFF;
+        buffer[(*pos)++] = data & 0xFF;
+    }
+
+    static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer,
+            const uint16_t data, int *const pos) {
+        buffer[(*pos)++] = (data >> 8) & 0xFF;
+        buffer[(*pos)++] = data & 0xFF;
+    }
+
+    static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer,
+            const uint8_t data, int *const pos) {
+        buffer[(*pos)++] = data & 0xFF;
+    }
+};
+} // namespace latinime
+#endif /* LATINIME_BYTE_ARRAY_UTILS_H */
diff --git a/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp
new file mode 100644
index 000000000..033a758ba
--- /dev/null
+++ b/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/dict_file_writing_utils.h"
+
+#include <cstdio>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/entry_counters.h"
+#include "dictionary/utils/file_utils.h"
+#include "dictionary/utils/format_utils.h"
+#include "utils/time_keeper.h"
+
+namespace latinime {
+
+const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = ".tmp";
+// Enough size to describe buffer size.
+const int DictFileWritingUtils::SIZE_OF_BUFFER_SIZE_FIELD = 4;
+
+// Creates an empty dictionary of the requested format version at filePath.
+// Returns false when the version is not supported or the dictionary cannot be created.
+/* static */ bool DictFileWritingUtils::createEmptyDictFile(const char *const filePath,
+        const int dictVersion, const std::vector<int> localeAsCodePointVector,
+        const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) {
+    TimeKeeper::setCurrentTime();
+    const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion);
+    // NOTE(review): the explicit template argument lists of the two calls below were lost when
+    // this patch text was extracted; they are reconstructed from the v402 / v4 headers included
+    // by this file. Confirm against the tree.
+    switch (formatVersion) {
+        case FormatUtils::VERSION_402:
+            return createEmptyV4DictFile<backward::v402::Ver4DictConstants,
+                    backward::v402::Ver4DictBuffers,
+                    backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr>(
+                            filePath, localeAsCodePointVector, attributeMap, formatVersion);
+        case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+        case FormatUtils::VERSION_403:
+            return createEmptyV4DictFile<Ver4DictConstants, Ver4DictBuffers,
+                    Ver4DictBuffers::Ver4DictBuffersPtr>(
+                            filePath, localeAsCodePointVector, attributeMap, formatVersion);
+        default:
+            AKLOGE("Cannot create dictionary %s because format version %d is not supported.",
+                    filePath, dictVersion);
+            return false;
+    }
+}
+
+// Builds an empty ver4 dictionary in memory (header plus an empty trie) and flushes it to
+// dirPath. The template parameters select the buffer and constant types of the target format.
+template<class DictConstants, class DictBuffers, class DictBuffersPtr>
+/* static */ bool DictFileWritingUtils::createEmptyV4DictFile(const char *const dirPath,
+        const std::vector<int> localeAsCodePointVector,
+        const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap,
+        const FormatUtils::FORMAT_VERSION formatVersion) {
+    HeaderPolicy headerPolicy(formatVersion, localeAsCodePointVector, attributeMap);
+    DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy,
+            DictConstants::MAX_DICT_EXTENDED_REGION_SIZE);
+    headerPolicy.fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
+            EntryCounts(), 0 /* extendedRegionSize */, dictBuffers->getWritableHeaderBuffer());
+    if (!DynamicPtWritingUtils::writeEmptyDictionary(
+            dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) {
+        AKLOGE("Empty ver4 dictionary structure cannot be created on memory.");
+        return false;
+    }
+    return dictBuffers->flush(dirPath);
+}
+
+// Writes buffer to a new file whose path is basePath followed by suffix.
+/* static */ bool DictFileWritingUtils::flushBufferToFileWithSuffix(const char *const basePath,
+        const char *const suffix, const BufferWithExtendableBuffer *const buffer) {
+    const int filePathBufSize = FileUtils::getFilePathWithSuffixBufSize(basePath, suffix);
+    char filePath[filePathBufSize];
+    FileUtils::getFilePathWithSuffix(basePath, suffix, filePathBufSize, filePath);
+    return flushBufferToFile(filePath, buffer);
+}
+
+// Appends buffer to file, preceded by its size as a SIZE_OF_BUFFER_SIZE_FIELD-byte big endian
+// integer.
+/* static */ bool DictFileWritingUtils::writeBufferToFileTail(FILE *const file,
+        const BufferWithExtendableBuffer *const buffer) {
+    uint8_t bufferSize[SIZE_OF_BUFFER_SIZE_FIELD];
+    int writingPos = 0;
+    ByteArrayUtils::writeUintAndAdvancePosition(bufferSize, buffer->getTailPosition(),
+            SIZE_OF_BUFFER_SIZE_FIELD, &writingPos);
+    if (fwrite(bufferSize, SIZE_OF_BUFFER_SIZE_FIELD, 1 /* count */, file) < 1) {
+        return false;
+    }
+    return writeBufferToFile(file, buffer);
+}
+
+// Creates filePath (which must not exist yet) and writes buffer into it. On any failure after
+// the file has been created, the partial file is removed and false is returned.
+/* static */ bool DictFileWritingUtils::flushBufferToFile(const char *const filePath,
+        const BufferWithExtendableBuffer *const buffer) {
+    const int fd = open(filePath, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+    if (fd == -1) {
+        AKLOGE("File %s cannot be opened. errno: %d", filePath, errno);
+        ASSERT(false);
+        return false;
+    }
+    FILE *const file = fdopen(fd, "wb");
+    if (!file) {
+        AKLOGE("fdopen failed for the file %s. errno: %d", filePath, errno);
+        // fdopen() did not take ownership of fd, so it has to be released here. The empty file
+        // created by open() is removed as well; otherwise a later O_EXCL open would fail.
+        close(fd);
+        remove(filePath);
+        ASSERT(false);
+        return false;
+    }
+    if (!writeBufferToFile(file, buffer)) {
+        fclose(file);
+        remove(filePath);
+        AKLOGE("Buffer cannot be written to the file %s. size: %d", filePath,
+                buffer->getTailPosition());
+        ASSERT(false);
+        return false;
+    }
+    // Data still held in the stdio buffer is written out by fclose(), so a failure here means
+    // that the file on disk is incomplete.
+    if (fclose(file) != 0) {
+        AKLOGE("Closing the file %s failed. errno: %d", filePath, errno);
+        remove(filePath);
+        ASSERT(false);
+        return false;
+    }
+    return true;
+}
+
+// Returns whether the writing succeeded or not.
+// Writes the original part of buffer and then its used additional part to file, skipping a
+// part whose size is not positive. Returns false as soon as one fwrite() call fails.
+/* static */ bool DictFileWritingUtils::writeBufferToFile(FILE *const file,
+        const BufferWithExtendableBuffer *const buffer) {
+    const int originalPartSize = buffer->getOriginalBufferSize();
+    if (originalPartSize > 0) {
+        const uint8_t *const originalPart = buffer->getBuffer(false /* usesAdditionalBuffer */);
+        if (fwrite(originalPart, originalPartSize, 1, file) < 1) {
+            return false;
+        }
+    }
+    const int additionalPartSize = buffer->getUsedAdditionalBufferSize();
+    if (additionalPartSize <= 0) {
+        // Nothing was written beyond the original buffer.
+        return true;
+    }
+    const uint8_t *const additionalPart = buffer->getBuffer(true /* usesAdditionalBuffer */);
+    return fwrite(additionalPart, additionalPartSize, 1, file) >= 1;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/dict_file_writing_utils.h b/native/jni/src/dictionary/utils/dict_file_writing_utils.h
new file mode 100644
index 000000000..102a89da4
--- /dev/null
+++ b/native/jni/src/dictionary/utils/dict_file_writing_utils.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICT_FILE_WRITING_UTILS_H
+#define LATINIME_DICT_FILE_WRITING_UTILS_H
+
+#include <cstdio>
+
+#include "defines.h"
+#include "dictionary/header/header_read_write_utils.h"
+#include "dictionary/utils/format_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+// Static helpers for creating empty dictionaries and for writing dictionary buffers to files.
+class DictFileWritingUtils {
+ public:
+    static const char *const TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE;
+
+    // Creates an empty dictionary of format dictVersion at filePath. Returns false when the
+    // version is not supported or the dictionary cannot be written.
+    static bool createEmptyDictFile(const char *const filePath, const int dictVersion,
+            const std::vector<int> localeAsCodePointVector,
+            const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap);
+
+    // Writes buffer to a new file named basePath + suffix.
+    static bool flushBufferToFileWithSuffix(const char *const basePath, const char *const suffix,
+            const BufferWithExtendableBuffer *const buffer);
+
+    // Appends the size of buffer followed by its content to an already opened file.
+    static bool writeBufferToFileTail(FILE *const file,
+            const BufferWithExtendableBuffer *const buffer);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DictFileWritingUtils);
+
+    static const int SIZE_OF_BUFFER_SIZE_FIELD;
+
+    // NOTE(review): dict_file_writing_utils.cpp in this patch contains no definition of this
+    // function; confirm whether the declaration is still needed.
+    static bool createEmptyV401DictFile(const char *const filePath,
+            const std::vector<int> localeAsCodePointVector,
+            const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap,
+            const FormatUtils::FORMAT_VERSION formatVersion);
+
+    // NOTE(review): the template parameter list was lost when this patch text was extracted; it
+    // is reconstructed from the names the definition uses. Confirm the order against the tree.
+    template<class DictConstants, class DictBuffers, class DictBuffersPtr>
+    static bool createEmptyV4DictFile(const char *const filePath,
+            const std::vector<int> localeAsCodePointVector,
+            const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap,
+            const FormatUtils::FORMAT_VERSION formatVersion);
+
+    // Creates filePath and writes buffer into it.
+    static bool flushBufferToFile(const char *const filePath,
+            const BufferWithExtendableBuffer *const buffer);
+
+    // Writes the original and the additional part of buffer to an already opened file.
+    static bool writeBufferToFile(FILE *const file,
+            const BufferWithExtendableBuffer *const buffer);
+};
+} // namespace latinime
+#endif /* LATINIME_DICT_FILE_WRITING_UTILS_H */
diff --git a/native/jni/src/dictionary/utils/entry_counters.h b/native/jni/src/dictionary/utils/entry_counters.h
new file mode 100644
index 000000000..5e443026e
--- /dev/null
+++ b/native/jni/src/dictionary/utils/entry_counters.h
@@
-0,0 +1,89 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_ENTRY_COUNTERS_H
+#define LATINIME_ENTRY_COUNTERS_H
+
+#include <array>
+
+#include "defines.h"
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+// Copyable but immutable
+// Holds one entry count per n-gram type, indexed by the integer value of NgramType.
+class EntryCounts final {
+ public:
+    // Value-initialization zeroes every element, whatever the array size is.
+    EntryCounts() : mEntryCounts() {}
+
+    explicit EntryCounts(const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &counters)
+            : mEntryCounts(counters) {}
+
+    int getNgramCount(const NgramType ngramType) const {
+        return mEntryCounts[static_cast<int>(ngramType)];
+    }
+
+    const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &getCountArray() const {
+        return mEntryCounts;
+    }
+
+ private:
+    DISALLOW_ASSIGNMENT_OPERATOR(EntryCounts);
+
+    // Counts from Unigram (0-th element) to (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram
+    // (MAX_PREV_WORD_COUNT_FOR_N_GRAM-th element)
+    const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> mEntryCounts;
+};
+
+// Mutable counterpart of EntryCounts: the counters can be incremented, decremented or set, and
+// an immutable snapshot is obtained with getEntryCounts().
+class MutableEntryCounters final {
+ public:
+    MutableEntryCounters() {
+        mEntryCounters.fill(0);
+    }
+
+    explicit MutableEntryCounters(
+            const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &counters)
+            : mEntryCounters(counters) {}
+
+    // Returns a copy of the current counter values.
+    const EntryCounts getEntryCounts() const {
+        return EntryCounts(mEntryCounters);
+    }
+
+    void incrementNgramCount(const NgramType ngramType) {
+        ++mEntryCounters[static_cast<int>(ngramType)];
+    }
+
+    void decrementNgramCount(const NgramType ngramType) {
+        --mEntryCounters[static_cast<int>(ngramType)];
+    }
+
+    int getNgramCount(const NgramType ngramType) const {
+        return mEntryCounters[static_cast<int>(ngramType)];
+    }
+
+    void setNgramCount(const NgramType ngramType, const int count) {
+        mEntryCounters[static_cast<int>(ngramType)] = count;
+    }
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(MutableEntryCounters);
+
+    // Counters from Unigram (0-th element) to (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram
+    // (MAX_PREV_WORD_COUNT_FOR_N_GRAM-th element)
+    std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> mEntryCounters;
+};
+} // namespace latinime
+#endif /* LATINIME_ENTRY_COUNTERS_H */
diff --git a/native/jni/src/dictionary/utils/file_utils.cpp b/native/jni/src/dictionary/utils/file_utils.cpp
new file mode 100644
index 000000000..bb392fb32
--- /dev/null
+++ b/native/jni/src/dictionary/utils/file_utils.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/file_utils.h"
+
+#include <cstdio>
+#include <cstring>
+#include <dirent.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+namespace latinime {
+
+// Returns -1 on error.
+/* static */ int FileUtils::getFileSize(const char *const filePath) {
+    const int fd = open(filePath, O_RDONLY);
+    if (fd == -1) {
+        return -1;
+    }
+    struct stat statBuf;
+    if (fstat(fd, &statBuf) != 0) {
+        close(fd);
+        return -1;
+    }
+    close(fd);
+    return static_cast<int>(statBuf.st_size);
+}
+
+// Returns whether dirPath can be opened as a directory.
+/* static */ bool FileUtils::existsDir(const char *const dirPath) {
+    DIR *const dir = opendir(dirPath);
+    if (dir == nullptr) {
+        return false;
+    }
+    closedir(dir);
+    return true;
+}
+
+// Remove a directory and all files in the directory.
+/* static */ bool FileUtils::removeDirAndFiles(const char *const dirPath) {
+    return removeDirAndFiles(dirPath, 5 /* maxTries */);
+}
+
+// Remove a directory and all files in the directory, retrying up to maxTries times when the
+// directory itself cannot be removed. Sub-directories are skipped, not removed.
+// A directory that cannot be opened is reported as success.
+/* static */ bool FileUtils::removeDirAndFiles(const char *const dirPath, const int maxTries) {
+    DIR *const dir = opendir(dirPath);
+    if (dir == nullptr) {
+        AKLOGE("Cannot open dir %s.", dirPath);
+        return true;
+    }
+    struct dirent *dirent;
+    while ((dirent = readdir(dir)) != nullptr) {
+        if (dirent->d_type == DT_DIR) {
+            continue;
+        }
+        if (strcmp(dirent->d_name, ".") == 0 || strcmp(dirent->d_name, "..") == 0) {
+            continue;
+        }
+        const int filePathBufSize = getFilePathBufSize(dirPath, dirent->d_name);
+        char filePath[filePathBufSize];
+        getFilePath(dirPath, dirent->d_name, filePathBufSize, filePath);
+        if (remove(filePath) != 0) {
+            AKLOGE("Cannot remove file %s.", filePath);
+            closedir(dir);
+            return false;
+        }
+    }
+    closedir(dir);
+    if (remove(dirPath) != 0) {
+        if (maxTries > 0) {
+            // On NFS, deleting files sometimes creates new files. I'm not sure what the
+            // correct way of dealing with this is, but for the time being, this seems to work.
+            // The result of the retry is returned so that a failure of the last attempt is
+            // reported to the caller instead of being turned into success.
+            return removeDirAndFiles(dirPath, maxTries - 1);
+        } else {
+            AKLOGE("Cannot remove directory %s.", dirPath);
+            return false;
+        }
+    }
+    return true;
+}
+
+// Returns the buffer size needed to hold filePath followed by suffix, including the terminator.
+/* static */ int FileUtils::getFilePathWithSuffixBufSize(const char *const filePath,
+        const char *const suffix) {
+    return strlen(filePath) + strlen(suffix) + 1 /* terminator */;
+}
+
+// Writes filePath followed by suffix to outFilePath, truncated to filePathBufSize.
+/* static */ void FileUtils::getFilePathWithSuffix(const char *const filePath,
+        const char *const suffix, const int filePathBufSize, char *const outFilePath) {
+    snprintf(outFilePath, filePathBufSize, "%s%s", filePath, suffix);
+}
+
+// Returns the buffer size needed to hold "dirPath/fileName", including the terminator.
+/* static */ int FileUtils::getFilePathBufSize(const char *const dirPath,
+        const char *const fileName) {
+    return strlen(dirPath) + 1 /* '/' */ + strlen(fileName) + 1 /* terminator */;
+}
+
+// Writes "dirPath/fileName" to outFilePath, truncated to filePathBufSize.
+/* static */ void FileUtils::getFilePath(const char *const dirPath, const char *const fileName,
+        const int filePathBufSize, char *const outFilePath) {
+    snprintf(outFilePath, filePathBufSize, "%s/%s", dirPath, fileName);
+}
+
+// Writes filePath with the trailing suffix removed to outDirPath. Returns false, leaving
+// outDirPath untouched, when filePath is not longer than suffix, when it does not end with
+// suffix, or when the output buffer is too small.
+/* static */ bool FileUtils::getFilePathWithoutSuffix(const char *const filePath,
+        const char *const suffix, const int outDirPathBufSize, char *const outDirPath) {
+    const int filePathLength = strlen(filePath);
+    const int suffixLength = strlen(suffix);
+    if (filePathLength <= suffixLength) {
+        AKLOGE("File path length (%s:%d) is shorter that suffix length (%s:%d).",
+                filePath, filePathLength, suffix, suffixLength);
+        return false;
+    }
+    const int resultFilePathLength = filePathLength - suffixLength;
+    if (outDirPathBufSize <= resultFilePathLength) {
+        AKLOGE("outDirPathBufSize is too small. filePath: %s, suffix: %s, outDirPathBufSize: %d",
+                filePath, suffix, outDirPathBufSize);
+        return false;
+    }
+    if (strncmp(filePath + resultFilePathLength, suffix, suffixLength) != 0) {
+        AKLOGE("File Path %s does not have %s as a suffix", filePath, suffix);
+        return false;
+    }
+    snprintf(outDirPath, resultFilePathLength + 1 /* terminator */, "%s", filePath);
+    return true;
+}
+
+// Writes the part of filePath that precedes its last '/' to outDirPath.
+// outDirPath is left untouched when filePath contains no '/' or when the buffer is too small.
+/* static */ void FileUtils::getDirPath(const char *const filePath, const int outDirPathBufSize,
+        char *const outDirPath) {
+    for (int i = strlen(filePath) - 1; i >= 0; --i) {
+        if (filePath[i] == '/') {
+            if (i >= outDirPathBufSize) {
+                AKLOGE("outDirPathBufSize is too small. filePath: %s, outDirPathBufSize: %d",
+                        filePath, outDirPathBufSize);
+                ASSERT(false);
+                return;
+            }
+            snprintf(outDirPath, i + 1 /* terminator */, "%s", filePath);
+            return;
+        }
+    }
+}
+
+// Writes the result of basename() for filePath to outName. outName is left untouched when the
+// buffer is too small.
+/* static */ void FileUtils::getBasename(const char *const filePath,
+        const int outNameBufSize, char *const outName) {
+    // basename() is called on a private copy of the path rather than on filePath itself.
+    const int filePathBufSize = strlen(filePath) + 1 /* terminator */;
+    char filePathBuf[filePathBufSize];
+    snprintf(filePathBuf, filePathBufSize, "%s", filePath);
+    const char *const baseName = basename(filePathBuf);
+    const int baseNameLength = strlen(baseName);
+    if (baseNameLength >= outNameBufSize) {
+        AKLOGE("outNameBufSize is too small. filePath: %s, outNameBufSize: %d",
+                filePath, outNameBufSize);
+        return;
+    }
+    snprintf(outName, baseNameLength + 1 /* terminator */, "%s", baseName);
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/utils/file_utils.h b/native/jni/src/dictionary/utils/file_utils.h
new file mode 100644
index 000000000..4f1b93a6a
--- /dev/null
+++ b/native/jni/src/dictionary/utils/file_utils.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_FILE_UTILS_H
+#define LATINIME_FILE_UTILS_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// Static helpers for file and directory handling and for building path strings in
+// caller-provided buffers.
+class FileUtils {
+ public:
+    // Returns -1 on error.
+    static int getFileSize(const char *const filePath);
+
+    // Returns whether dirPath can be opened as a directory.
+    static bool existsDir(const char *const dirPath);
+
+    // Remove a directory and all files in the directory.
+    static bool removeDirAndFiles(const char *const dirPath);
+
+    // Returns the buffer size, including the terminator, required by getFilePathWithSuffix().
+    static int getFilePathWithSuffixBufSize(const char *const filePath, const char *const suffix);
+
+    // Writes filePath followed by suffix to outFilePath.
+    static void getFilePathWithSuffix(const char *const filePath, const char *const suffix,
+            const int filePathBufSize, char *const outFilePath);
+
+    // Returns the buffer size, including the terminator, required by getFilePath().
+    static int getFilePathBufSize(const char *const dirPath, const char *const fileName);
+
+    // Writes "dirPath/fileName" to outFilePath.
+    static void getFilePath(const char *const dirPath, const char *const fileName,
+            const int filePathBufSize, char *const outFilePath);
+
+    // Returns whether the filePath has the suffix. On success, filePath without the suffix is
+    // written to outDirPath.
+    static bool getFilePathWithoutSuffix(const char *const filePath, const char *const suffix,
+            const int dirPathBufSize, char *const outDirPath);
+
+    // Writes the part of filePath before its last '/' to outDirPath.
+    static void getDirPath(const char *const filePath, const int dirPathBufSize,
+            char *const outDirPath);
+
+    // Writes the base name of filePath to outName.
+    static void getBasename(const char *const filePath, const int outNameBufSize,
+            char *const outName);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(FileUtils);
+
+    // Same as the public overload, retrying up to maxTries times.
+    static bool removeDirAndFiles(const char *const dirPath, const int maxTries);
+};
+} // namespace latinime
+#endif /* LATINIME_FILE_UTILS_H */
diff --git a/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp
new file mode 100644
index 000000000..d79ed911b
--- /dev/null
+++ b/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "dictionary/utils/forgetting_curve_utils.h" + +#include +#include +#include + +#include "dictionary/header/header_policy.h" +#include "dictionary/utils/probability_utils.h" +#include "utils/time_keeper.h" + +namespace latinime { + +const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8; +const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; + +const int ForgettingCurveUtils::MAX_LEVEL = 15; +const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 2; +const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 31; +const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 30; +const int ForgettingCurveUtils::OCCURRENCES_TO_RAISE_THE_LEVEL = 1; +// TODO: Evaluate whether this should be 7.5 days. +// 15 days +const int ForgettingCurveUtils::DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS = 15 * 24 * 60 * 60; + +const float ForgettingCurveUtils::ENTRY_COUNT_HARD_LIMIT_WEIGHT = 1.2; + +const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; + +// TODO: Revise the logic to decide the initial probability depending on the given probability. +/* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo( + const HistoricalInfo *const originalHistoricalInfo, const int newProbability, + const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy) { + const int timestamp = newHistoricalInfo->getTimestamp(); + if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) { + // Add entry as a valid word. 
+ const int level = clampToVisibleEntryLevelRange(newHistoricalInfo->getLevel()); + const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy); + return HistoricalInfo(timestamp, level, count); + } else if (!originalHistoricalInfo->isValid() + || originalHistoricalInfo->getLevel() < newHistoricalInfo->getLevel() + || (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel() + && originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) { + // Initial information. + int count = newHistoricalInfo->getCount(); + if (count >= OCCURRENCES_TO_RAISE_THE_LEVEL) { + const int level = clampToValidLevelRange(newHistoricalInfo->getLevel() + 1); + return HistoricalInfo(timestamp, level, 0 /* count */); + } + const int level = clampToValidLevelRange(newHistoricalInfo->getLevel()); + return HistoricalInfo(timestamp, level, clampToValidCountRange(count, headerPolicy)); + } else { + const int updatedCount = originalHistoricalInfo->getCount() + 1; + if (updatedCount >= OCCURRENCES_TO_RAISE_THE_LEVEL) { + // The count exceeds the max value the level can be incremented. + if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) { + // The level is already max. + return HistoricalInfo(timestamp, + originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount()); + } else { + // Raise the level. 
+ return HistoricalInfo(timestamp, + originalHistoricalInfo->getLevel() + 1, 0 /* count */); + } + } else { + return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), updatedCount); + } + } +} + +/* static */ int ForgettingCurveUtils::decodeProbability( + const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) { + const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimestamp(), + DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS); + return sProbabilityTable.getProbability( + headerPolicy->getForgettingCurveProbabilityValuesTableId(), + clampToValidLevelRange(historicalInfo->getLevel()), + clampToValidTimeStepCountRange(elapsedTimeStepCount)); +} + +/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo, + const HeaderPolicy *const headerPolicy) { + return historicalInfo->getLevel() > 0 + || getElapsedTimeStepCount(historicalInfo->getTimestamp(), + DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS) + < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; +} + +/* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave( + const HistoricalInfo *const originalHistoricalInfo, + const HeaderPolicy *const headerPolicy) { + if (originalHistoricalInfo->getTimestamp() == NOT_A_TIMESTAMP) { + return HistoricalInfo(); + } + const int durationToLevelDownInSeconds = DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS; + const int elapsedTimeStep = getElapsedTimeStepCount( + originalHistoricalInfo->getTimestamp(), durationToLevelDownInSeconds); + if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) { + // No need to update historical info. + return *originalHistoricalInfo; + } + // Lower the level. + const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1); + const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ? 
+ originalHistoricalInfo->getLevel() : maxLevelDownAmonut; + const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimestamp() + + levelDownAmount * durationToLevelDownInSeconds; + return HistoricalInfo(adjustedTimestampInSeconds, + originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */); +} + +/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, + const EntryCounts &entryCounts, const HeaderPolicy *const headerPolicy) { + const EntryCounts &maxNgramCounts = headerPolicy->getMaxNgramCounts(); + for (const auto ngramType : AllNgramTypes::ASCENDING) { + if (entryCounts.getNgramCount(ngramType) + >= getEntryCountHardLimit(maxNgramCounts.getNgramCount(ngramType))) { + // The entry count for this n-gram type reached the hard limit. + return true; + } + } + if (mindsBlockByDecay) { + return false; + } + if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS + < TimeKeeper::peekCurrentTime()) { + // Time to decay. + return true; + } + return false; +} + +// See comments in ProbabilityUtils::backoff(). +/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) { + // See TODO comments in ForgettingCurveUtils::getProbability(). 
+ return unigramProbability; +} + +/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp, + const int durationToLevelDownInSeconds) { + const int elapsedTimeInSeconds = TimeKeeper::peekCurrentTime() - timestamp; + const int timeStepDurationInSeconds = + durationToLevelDownInSeconds / (MAX_ELAPSED_TIME_STEP_COUNT + 1); + return elapsedTimeInSeconds / timeStepDurationInSeconds; +} + +/* static */ int ForgettingCurveUtils::clampToVisibleEntryLevelRange(const int level) { + return std::min(std::max(level, MIN_VISIBLE_LEVEL), MAX_LEVEL); +} + +/* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count, + const HeaderPolicy *const headerPolicy) { + return std::min(std::max(count, 0), OCCURRENCES_TO_RAISE_THE_LEVEL - 1); +} + +/* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) { + return std::min(std::max(level, 0), MAX_LEVEL); +} + +/* static */ int ForgettingCurveUtils::clampToValidTimeStepCountRange(const int timeStepCount) { + return std::min(std::max(timeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT); +} + +const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4; +const int ForgettingCurveUtils::ProbabilityTable::WEAK_PROBABILITY_TABLE_ID = 0; +const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = 1; +const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2; +const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3; +const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127; +const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 8; +const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 9; +const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 10; + + +ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() { + mTables.resize(PROBABILITY_TABLE_COUNT); + for (int tableId = 0; tableId < 
PROBABILITY_TABLE_COUNT; ++tableId) { + mTables[tableId].resize(MAX_LEVEL + 1); + for (int level = 0; level <= MAX_LEVEL; ++level) { + mTables[tableId][level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1); + const float initialProbability = getBaseProbabilityForLevel(tableId, level); + const float endProbability = getBaseProbabilityForLevel(tableId, level - 1); + for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; + ++timeStepCount) { + if (level < MIN_VISIBLE_LEVEL) { + mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY; + continue; + } + const float probability = initialProbability + * powf(initialProbability / endProbability, + -1.0f * static_cast(timeStepCount) + / static_cast(MAX_ELAPSED_TIME_STEP_COUNT + 1)); + mTables[tableId][level][timeStepCount] = + std::min(std::max(static_cast(probability), 1), MAX_PROBABILITY); + } + } + } +} + +/* static */ int ForgettingCurveUtils::ProbabilityTable::getBaseProbabilityForLevel( + const int tableId, const int level) { + if (tableId == WEAK_PROBABILITY_TABLE_ID) { + // Max probability is 127. + return static_cast(WEAK_MAX_PROBABILITY / (1 << (MAX_LEVEL - level))); + } else if (tableId == MODEST_PROBABILITY_TABLE_ID) { + // Max probability is 128. + return static_cast(MODEST_BASE_PROBABILITY * (level + 1)); + } else if (tableId == STRONG_PROBABILITY_TABLE_ID) { + // Max probability is 140. + return static_cast(STRONG_BASE_PROBABILITY * (level + 1)); + } else if (tableId == AGGRESSIVE_PROBABILITY_TABLE_ID) { + // Max probability is 160. 
+ return static_cast(AGGRESSIVE_BASE_PROBABILITY * (level + 1)); + } else { + return NOT_A_PROBABILITY; + } +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/dictionary/utils/forgetting_curve_utils.h new file mode 100644 index 000000000..ddaac7e3b --- /dev/null +++ b/native/jni/src/dictionary/utils/forgetting_curve_utils.h @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_FORGETTING_CURVE_UTILS_H +#define LATINIME_FORGETTING_CURVE_UTILS_H + +#include + +#include "defines.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/utils/entry_counters.h" + +namespace latinime { + +class HeaderPolicy; + +class ForgettingCurveUtils { + public: + static const HistoricalInfo createUpdatedHistoricalInfo( + const HistoricalInfo *const originalHistoricalInfo, const int newProbability, + const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy); + + static const HistoricalInfo createHistoricalInfoToSave( + const HistoricalInfo *const originalHistoricalInfo, + const HeaderPolicy *const headerPolicy); + + static int decodeProbability(const HistoricalInfo *const historicalInfo, + const HeaderPolicy *const headerPolicy); + + static bool needsToKeep(const HistoricalInfo *const historicalInfo, + const HeaderPolicy *const headerPolicy); + + static bool needsToDecay(const bool mindsBlockByDecay, const EntryCounts &entryCounters, + const HeaderPolicy *const headerPolicy); + + // TODO: Improve probability computation method and remove this. 
+ static int getProbabilityBiasForNgram(const int n) { + return (n - 1) * MULTIPLIER_TWO_IN_PROBABILITY_SCALE; + } + + AK_FORCE_INLINE static int getEntryCountHardLimit(const int maxEntryCount) { + return static_cast(static_cast(maxEntryCount) + * ENTRY_COUNT_HARD_LIMIT_WEIGHT); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils); + + class ProbabilityTable { + public: + ProbabilityTable(); + + int getProbability(const int tableId, const int level, + const int elapsedTimeStepCount) const { + return mTables[tableId][level][elapsedTimeStepCount]; + } + + private: + DISALLOW_COPY_AND_ASSIGN(ProbabilityTable); + + static const int PROBABILITY_TABLE_COUNT; + static const int WEAK_PROBABILITY_TABLE_ID; + static const int MODEST_PROBABILITY_TABLE_ID; + static const int STRONG_PROBABILITY_TABLE_ID; + static const int AGGRESSIVE_PROBABILITY_TABLE_ID; + + static const int WEAK_MAX_PROBABILITY; + static const int MODEST_BASE_PROBABILITY; + static const int STRONG_BASE_PROBABILITY; + static const int AGGRESSIVE_BASE_PROBABILITY; + + std::vector>> mTables; + + static int getBaseProbabilityForLevel(const int tableId, const int level); + }; + + static const int MULTIPLIER_TWO_IN_PROBABILITY_SCALE; + static const int DECAY_INTERVAL_SECONDS; + + static const int MAX_LEVEL; + static const int MIN_VISIBLE_LEVEL; + static const int MAX_ELAPSED_TIME_STEP_COUNT; + static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; + static const int OCCURRENCES_TO_RAISE_THE_LEVEL; + static const int DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS; + + static const float ENTRY_COUNT_HARD_LIMIT_WEIGHT; + + static const ProbabilityTable sProbabilityTable; + + static int backoff(const int unigramProbability); + static int getElapsedTimeStepCount(const int timestamp, const int durationToLevelDown); + static int clampToVisibleEntryLevelRange(const int level); + static int clampToValidLevelRange(const int level); + static int clampToValidCountRange(const int count, const 
HeaderPolicy *const headerPolicy); + static int clampToValidTimeStepCountRange(const int timeStepCount); +}; +} // namespace latinime +#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */ diff --git a/native/jni/src/dictionary/utils/format_utils.cpp b/native/jni/src/dictionary/utils/format_utils.cpp new file mode 100644 index 000000000..cef3b094c --- /dev/null +++ b/native/jni/src/dictionary/utils/format_utils.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/utils/format_utils.h" + +#include "dictionary/utils/byte_array_utils.h" + +namespace latinime { + +const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE; + +// Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12 +const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; + +/* static */ FormatUtils::FORMAT_VERSION FormatUtils::getFormatVersion(const int formatVersion) { + switch (formatVersion) { + case VERSION_2: + case VERSION_201: + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + return UNKNOWN_VERSION; + case VERSION_202: + return VERSION_202; + case VERSION_4_ONLY_FOR_TESTING: + return VERSION_4_ONLY_FOR_TESTING; + case VERSION_402: + return VERSION_402; + case VERSION_403: + return VERSION_403; + default: + return UNKNOWN_VERSION; + } +} +/* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion( + const ReadOnlyByteArrayView dictBuffer) { + // The magic number is stored big-endian. + // If the dictionary is smaller than DICTIONARY_MINIMUM_SIZE (12 bytes), we can't read the + // whole fixed-size header, so we don't understand this format. + if (dictBuffer.size() < DICTIONARY_MINIMUM_SIZE) { + return UNKNOWN_VERSION; + } + const uint32_t magicNumber = ByteArrayUtils::readUint32(dictBuffer.data(), 0); + switch (magicNumber) { + case MAGIC_NUMBER: + // The layout of the header is as follows: + // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE + // Dictionary format version number (2 bytes) + // Options (2 bytes) + // Header size (4 bytes) : integer, big endian + // Conceptually this converts the hardcoded value of the bytes in the file into + // the symbolic value we use in the code. But we want the constants to be the + // same so we use them for both here. 
+ return getFormatVersion(ByteArrayUtils::readUint16(dictBuffer.data(), 4)); + default: + return UNKNOWN_VERSION; + } +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/format_utils.h b/native/jni/src/dictionary/utils/format_utils.h new file mode 100644 index 000000000..1616efcce --- /dev/null +++ b/native/jni/src/dictionary/utils/format_utils.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_FORMAT_UTILS_H +#define LATINIME_FORMAT_UTILS_H + +#include + +#include "defines.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +/** + * Methods to handle binary dictionary format version. + */ +class FormatUtils { + public: + enum FORMAT_VERSION { + // These MUST have the same values as the relevant constants in FormatSpec.java. + // TODO: Remove VERSION_2 and VERSION_201 when we: + // * Confirm that old versions of LatinIME download old-format dictionaries + // * We no longer need the corresponding constants on the Java side for dicttool + VERSION_2 = 2, + VERSION_201 = 201, + VERSION_202 = 202, + VERSION_4_ONLY_FOR_TESTING = 399, + VERSION_402 = 402, + VERSION_403 = 403, + UNKNOWN_VERSION = -1 + }; + + // 32 bit magic number is stored at the beginning of the dictionary header to reject + // unsupported or obsolete dictionary formats. 
+ static const uint32_t MAGIC_NUMBER; + + static FORMAT_VERSION getFormatVersion(const int formatVersion); + static FORMAT_VERSION detectFormatVersion(const ReadOnlyByteArrayView dictBuffer); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils); + + static const size_t DICTIONARY_MINIMUM_SIZE; +}; +} // namespace latinime +#endif /* LATINIME_FORMAT_UTILS_H */ diff --git a/native/jni/src/dictionary/utils/mmapped_buffer.cpp b/native/jni/src/dictionary/utils/mmapped_buffer.cpp new file mode 100644 index 000000000..c5259de6d --- /dev/null +++ b/native/jni/src/dictionary/utils/mmapped_buffer.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/mmapped_buffer.h" + +#include +#include +#include +#include +#include +#include + +#include "dictionary/utils/file_utils.h" + +namespace latinime { + +/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( + const char *const path, const int bufferOffset, const int bufferSize, + const bool isUpdatable) { + const int mmapFd = open(path, O_RDONLY); + if (mmapFd < 0) { + AKLOGE("DICT: Can't open the source. path=%s errno=%d", path, errno); + return nullptr; + } + const int pagesize = sysconf(_SC_PAGESIZE); + const int offset = bufferOffset % pagesize; + int alignedOffset = bufferOffset - offset; + int alignedSize = bufferSize + offset; + const int protMode = isUpdatable ? 
PROT_READ | PROT_WRITE : PROT_READ; + void *const mmappedBuffer = mmap(0, alignedSize, protMode, MAP_PRIVATE, mmapFd, + alignedOffset); + if (mmappedBuffer == MAP_FAILED) { + AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno); + close(mmapFd); + return nullptr; + } + uint8_t *const buffer = static_cast(mmappedBuffer) + offset; + if (!buffer) { + AKLOGE("DICT: buffer is null"); + close(mmapFd); + return nullptr; + } + return MmappedBufferPtr(new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize, + mmapFd, isUpdatable)); +} + +/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( + const char *const path, const bool isUpdatable) { + const int fileSize = FileUtils::getFileSize(path); + if (fileSize == -1) { + return nullptr; + } else if (fileSize == 0) { + return MmappedBufferPtr(new MmappedBuffer(isUpdatable)); + } else { + return openBuffer(path, 0 /* bufferOffset */, fileSize, isUpdatable); + } +} + +/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( + const char *const dirPath, const char *const fileName, const bool isUpdatable) { + const int filePathBufferSize = PATH_MAX + 1 /* terminator */; + char filePath[filePathBufferSize]; + const int filePathLength = snprintf(filePath, filePathBufferSize, "%s%s", dirPath, + fileName); + if (filePathLength >= filePathBufferSize) { + return nullptr; + } + return openBuffer(filePath, isUpdatable); +} + +MmappedBuffer::~MmappedBuffer() { + if (mAlignedSize == 0) { + return; + } + int ret = munmap(mMmappedBuffer, mAlignedSize); + if (ret != 0) { + AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); + } + ret = close(mMmapFd); + if (ret != 0) { + AKLOGE("DICT: Failure in close. 
ret=%d errno=%d", ret, errno); + } +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/mmapped_buffer.h b/native/jni/src/dictionary/utils/mmapped_buffer.h new file mode 100644 index 000000000..e25310373 --- /dev/null +++ b/native/jni/src/dictionary/utils/mmapped_buffer.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_MMAPPED_BUFFER_H +#define LATINIME_MMAPPED_BUFFER_H + +#include +#include + +#include "defines.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class MmappedBuffer { + public: + typedef std::unique_ptr MmappedBufferPtr; + + static MmappedBufferPtr openBuffer(const char *const path, + const int bufferOffset, const int bufferSize, const bool isUpdatable); + + // Mmap entire file. 
+ static MmappedBufferPtr openBuffer(const char *const path, const bool isUpdatable); + + static MmappedBufferPtr openBuffer(const char *const dirPath, const char *const fileName, + const bool isUpdatable); + + ~MmappedBuffer(); + + ReadWriteByteArrayView getReadWriteByteArrayView() const { + return mByteArrayView; + } + + ReadOnlyByteArrayView getReadOnlyByteArrayView() const { + return mByteArrayView.getReadOnlyView(); + } + + AK_FORCE_INLINE bool isUpdatable() const { + return mIsUpdatable; + } + + private: + AK_FORCE_INLINE MmappedBuffer(uint8_t *const buffer, const int bufferSize, + void *const mmappedBuffer, const int alignedSize, const int mmapFd, + const bool isUpdatable) + : mByteArrayView(buffer, bufferSize), mMmappedBuffer(mmappedBuffer), + mAlignedSize(alignedSize), mMmapFd(mmapFd), mIsUpdatable(isUpdatable) {} + + // Empty file. We have to handle an empty file as a valid part of a dictionary. + AK_FORCE_INLINE MmappedBuffer(const bool isUpdatable) + : mByteArrayView(), mMmappedBuffer(nullptr), mAlignedSize(0), + mMmapFd(0), mIsUpdatable(isUpdatable) {} + + DISALLOW_IMPLICIT_CONSTRUCTORS(MmappedBuffer); + + const ReadWriteByteArrayView mByteArrayView; + void *const mMmappedBuffer; + const int mAlignedSize; + const int mMmapFd; + const bool mIsUpdatable; +}; +} +#endif /* LATINIME_MMAPPED_BUFFER_H */ diff --git a/native/jni/src/dictionary/utils/multi_bigram_map.cpp b/native/jni/src/dictionary/utils/multi_bigram_map.cpp new file mode 100644 index 000000000..e730fff8e --- /dev/null +++ b/native/jni/src/dictionary/utils/multi_bigram_map.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/multi_bigram_map.h" + +#include +#include + +namespace latinime { + +// Max number of bigram maps (previous word contexts) to be cached. Increasing this number +// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory +// usage. Also, there are diminishing returns since the most frequently used bigrams are +// typically near the beginning of the input and are thus the first ones to be cached. Note +// that these bigrams are reset for each new composing word. +const size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25; + +// Most common previous word contexts currently have 100 bigrams +const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100; + +// Look up the bigram probability for the given word pair from the cached bigram maps. +// Also caches the bigrams if there is space remaining and they have not been cached already. 
+int MultiBigramMap::getBigramProbability( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds, const int nextWordId, + const int unigramProbability) { + if (prevWordIds.empty() || prevWordIds[0] == NOT_A_WORD_ID) { + return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); + } + const auto mapPosition = mBigramMaps.find(prevWordIds[0]); + if (mapPosition != mBigramMaps.end()) { + return mapPosition->second.getBigramProbability(structurePolicy, nextWordId, + unigramProbability); + } + if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { + addBigramsForWord(structurePolicy, prevWordIds); + return mBigramMaps[prevWordIds[0]].getBigramProbability(structurePolicy, + nextWordId, unigramProbability); + } + return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordIds, + nextWordId, unigramProbability); +} + +void MultiBigramMap::BigramMap::init( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds) { + structurePolicy->iterateNgramEntries(prevWordIds, this /* listener */); +} + +int MultiBigramMap::BigramMap::getBigramProbability( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const int nextWordId, const int unigramProbability) const { + int bigramProbability = NOT_A_PROBABILITY; + if (mBloomFilter.isInFilter(nextWordId)) { + const auto bigramProbabilityIt = mBigramMap.find(nextWordId); + if (bigramProbabilityIt != mBigramMap.end()) { + bigramProbability = bigramProbabilityIt->second; + } + } + return structurePolicy->getProbability(unigramProbability, bigramProbability); +} + +void MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability, const int targetWordId) { + if (targetWordId == NOT_A_WORD_ID) { + return; + } + mBigramMap[targetWordId] = ngramProbability; + mBloomFilter.setInFilter(targetWordId); +} + +void MultiBigramMap::addBigramsForWord( + const 
DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds) { + mBigramMaps[prevWordIds[0]].init(structurePolicy, prevWordIds); +} + +int MultiBigramMap::readBigramProbabilityFromBinaryDictionary( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability) { + const int bigramProbability = structurePolicy->getProbabilityOfWord(prevWordIds, nextWordId); + if (bigramProbability != NOT_A_PROBABILITY) { + return bigramProbability; + } + return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/multi_bigram_map.h b/native/jni/src/dictionary/utils/multi_bigram_map.h new file mode 100644 index 000000000..6f23d98bc --- /dev/null +++ b/native/jni/src/dictionary/utils/multi_bigram_map.h @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_MULTI_BIGRAM_MAP_H +#define LATINIME_MULTI_BIGRAM_MAP_H + +#include +#include + +#include "defines.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/utils/binary_dictionary_bigrams_iterator.h" +#include "dictionary/utils/bloom_filter.h" +#include "utils/int_array_view.h" + +namespace latinime { + +// Class for caching bigram maps for multiple previous word contexts. This is useful since the +// algorithm needs to look up the set of bigrams for every word pair that occurs in every +// multi-word suggestion. +class MultiBigramMap { + public: + MultiBigramMap() : mBigramMaps() {} + ~MultiBigramMap() {} + + // Look up the bigram probability for the given word pair from the cached bigram maps. + // Also caches the bigrams if there is space remaining and they have not been cached already. + int getBigramProbability(const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability); + + void clear() { + mBigramMaps.clear(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(MultiBigramMap); + + class BigramMap : public NgramListener { + public: + BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP), mBloomFilter() {} + // Copy constructor needed for std::unordered_map. 
+ BigramMap(const BigramMap &bigramMap) + : mBigramMap(bigramMap.mBigramMap), mBloomFilter(bigramMap.mBloomFilter) {} + virtual ~BigramMap() {} + + void init(const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds); + int getBigramProbability( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const int nextWordId, const int unigramProbability) const; + virtual void onVisitEntry(const int ngramProbability, const int targetWordId); + + private: + static const int DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP; + std::unordered_map mBigramMap; + BloomFilter mBloomFilter; + }; + + void addBigramsForWord(const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds); + + int readBigramProbabilityFromBinaryDictionary( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability); + + static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP; + std::unordered_map mBigramMaps; +}; +} // namespace latinime +#endif // LATINIME_MULTI_BIGRAM_MAP_H diff --git a/native/jni/src/dictionary/utils/probability_utils.cpp b/native/jni/src/dictionary/utils/probability_utils.cpp new file mode 100644 index 000000000..426a0e783 --- /dev/null +++ b/native/jni/src/dictionary/utils/probability_utils.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/probability_utils.h" + +namespace latinime { + +const float ProbabilityUtils::PROBABILITY_ENCODING_SCALER = 8.58923700372f; + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/probability_utils.h b/native/jni/src/dictionary/utils/probability_utils.h new file mode 100644 index 000000000..2050af1e9 --- /dev/null +++ b/native/jni/src/dictionary/utils/probability_utils.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROBABILITY_UTILS_H +#define LATINIME_PROBABILITY_UTILS_H + +#include +#include + +#include "defines.h" + +namespace latinime { + +// TODO: Quit using bigram probability to indicate the delta. +class ProbabilityUtils { + public: + static AK_FORCE_INLINE int backoff(const int unigramProbability) { + return unigramProbability; + // For some reason, applying the backoff weight gives bad results in tests. To apply the + // backoff weight, we divide the probability by 2, which in our storing format means + // decreasing the score by 8. + // TODO: figure out what's wrong with this. + // return unigramProbability > 8 ? + // unigramProbability - 8 : (0 == unigramProbability ? 
0 : 8); + } + + static AK_FORCE_INLINE int computeProbabilityForBigram( + const int unigramProbability, const int bigramProbability) { + // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want + // the unigram probability to be the median value of the 17th step from the top. A value of + // 0 for the bigram probability represents the middle of the 16th step from the top, + // while a value of 15 represents the middle of the top step. + // See makedict.BinaryDictEncoder#makeBigramFlags for details. + const float stepSize = static_cast(MAX_PROBABILITY - unigramProbability) + / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); + return unigramProbability + + static_cast(static_cast(bigramProbability + 1) * stepSize); + } + + // Encode probability using the same way as we are doing for main dictionaries. + static AK_FORCE_INLINE int encodeRawProbability(const float rawProbability) { + const float probability = static_cast(MAX_PROBABILITY) + + log2f(rawProbability) * PROBABILITY_ENCODING_SCALER; + if (probability < 0.0f) { + return 0; + } + return std::min(static_cast(probability + 0.5f), MAX_PROBABILITY); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils); + + static const float PROBABILITY_ENCODING_SCALER; +}; +} +#endif /* LATINIME_PROBABILITY_UTILS_H */ diff --git a/native/jni/src/dictionary/utils/sparse_table.cpp b/native/jni/src/dictionary/utils/sparse_table.cpp new file mode 100644 index 000000000..029329fab --- /dev/null +++ b/native/jni/src/dictionary/utils/sparse_table.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/sparse_table.h" + +namespace latinime { + +const int SparseTable::NOT_EXIST = -1; +const int SparseTable::INDEX_SIZE = 4; + +bool SparseTable::contains(const int id) const { + const int readingPos = getPosInIndexTable(id); + if (id < 0 || mIndexTableBuffer->getTailPosition() <= readingPos) { + return false; + } + const int index = mIndexTableBuffer->readUint(INDEX_SIZE, readingPos); + return index != NOT_EXIST; +} + +uint32_t SparseTable::get(const int id) const { + const int indexTableReadingPos = getPosInIndexTable(id); + const int index = mIndexTableBuffer->readUint(INDEX_SIZE, indexTableReadingPos); + const int contentTableReadingPos = getPosInContentTable(id, index); + if (contentTableReadingPos < 0 + || contentTableReadingPos >= mContentTableBuffer->getTailPosition()) { + AKLOGE("contentTableReadingPos(%d) is invalid. id: %d, index: %d", + contentTableReadingPos, id, index); + return NOT_A_DICT_POS; + } + const int contentValue = mContentTableBuffer->readUint(mDataSize, contentTableReadingPos); + return contentValue == NOT_EXIST ? NOT_A_DICT_POS : contentValue; +} + +bool SparseTable::set(const int id, const uint32_t value) { + const int posInIndexTable = getPosInIndexTable(id); + // Extends the index table if needed. + int tailPos = mIndexTableBuffer->getTailPosition(); + while (tailPos <= posInIndexTable) { + if (!mIndexTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, INDEX_SIZE, &tailPos)) { + AKLOGE("cannot extend index table. 
tailPos: %d to: %d", tailPos, posInIndexTable); + return false; + } + } + if (contains(id)) { + // The entry is already in the content table. + const int index = mIndexTableBuffer->readUint(INDEX_SIZE, posInIndexTable); + if (!mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index))) { + AKLOGE("cannot update value %d. pos: %d, tailPos: %d, mDataSize: %d", value, + getPosInContentTable(id, index), mContentTableBuffer->getTailPosition(), + mDataSize); + return false; + } + return true; + } + // The entry is not in the content table. + // Create new entry in the content table. + const int index = getIndexFromContentTablePos(mContentTableBuffer->getTailPosition()); + if (!mIndexTableBuffer->writeUint(index, INDEX_SIZE, posInIndexTable)) { + AKLOGE("cannot write index %d. pos %d", index, posInIndexTable); + return false; + } + // Write a new block that containing the entry to be set. + int writingPos = getPosInContentTable(0 /* id */, index); + for (int i = 0; i < mBlockSize; ++i) { + if (!mContentTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, mDataSize, + &writingPos)) { + AKLOGE("cannot write content table to extend. 
writingPos: %d, tailPos: %d, " + "mDataSize: %d", writingPos, mContentTableBuffer->getTailPosition(), mDataSize); + return false; + } + } + return mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index)); +} + +int SparseTable::getIndexFromContentTablePos(const int contentTablePos) const { + return contentTablePos / mDataSize / mBlockSize; +} + +int SparseTable::getPosInIndexTable(const int id) const { + return (id / mBlockSize) * INDEX_SIZE; +} + +int SparseTable::getPosInContentTable(const int id, const int index) const { + const int offset = id % mBlockSize; + return (index * mBlockSize + offset) * mDataSize; +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/sparse_table.h b/native/jni/src/dictionary/utils/sparse_table.h new file mode 100644 index 000000000..bd1190e8b --- /dev/null +++ b/native/jni/src/dictionary/utils/sparse_table.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SPARSE_TABLE_H +#define LATINIME_SPARSE_TABLE_H + +#include + +#include "defines.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +// TODO: Support multiple content buffers. 
+/**
+ * Table that maps int ids to uint32_t values using two buffers: an index table holding one
+ * entry per block of blockSize consecutive ids, and a content table holding the blocks
+ * themselves, each slot being dataSize bytes.
+ */
+class SparseTable {
+ public:
+    // Neither buffer is copied; the table reads and writes through the given pointers.
+    SparseTable(BufferWithExtendableBuffer *const indexTableBuffer,
+            BufferWithExtendableBuffer *const contentTableBuffer, const int blockSize,
+            const int dataSize)
+            : mIndexTableBuffer(indexTableBuffer), mContentTableBuffer(contentTableBuffer),
+              mBlockSize(blockSize), mDataSize(dataSize) {}
+
+    // Returns whether the block that would hold id exists.
+    bool contains(const int id) const;
+
+    // Returns the value stored for id.
+    uint32_t get(const int id) const;
+
+    // Stores value for id. Returns false when a buffer write fails.
+    bool set(const int id, const uint32_t value);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTable);
+
+    int getIndexFromContentTablePos(const int contentTablePos) const;
+
+    int getPosInIndexTable(const int id) const;
+
+    int getPosInContentTable(const int id, const int index) const;
+
+    // Marker value meaning "no block" / "no value".
+    static const int NOT_EXIST;
+    // Size in bytes of one index table entry.
+    static const int INDEX_SIZE;
+
+    BufferWithExtendableBuffer *const mIndexTableBuffer;
+    BufferWithExtendableBuffer *const mContentTableBuffer;
+    // Number of ids covered by one block.
+    const int mBlockSize;
+    // Size in bytes of one value slot in the content table.
+    const int mDataSize;
+};
+} // namespace latinime
+#endif /* LATINIME_SPARSE_TABLE_H */
diff --git a/native/jni/src/dictionary/utils/trie_map.cpp b/native/jni/src/dictionary/utils/trie_map.cpp
new file mode 100644
index 000000000..0bef8c702
--- /dev/null
+++ b/native/jni/src/dictionary/utils/trie_map.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/trie_map.h"
+
+#include "dictionary/utils/dict_file_writing_utils.h"
+
+namespace latinime {
+
+// Returned wherever an entry or table index could not be found or allocated.
+const int TrieMap::INVALID_INDEX = -1;
+// Sizes in bytes of the two fields of an entry; ENTRY_SIZE is their sum (7 bytes).
+const int TrieMap::FIELD0_SIZE = 4;
+const int TrieMap::FIELD1_SIZE = 3;
+const int TrieMap::ENTRY_SIZE = FIELD0_SIZE + FIELD1_SIZE;
+// Bit 22 of FIELD1 marks a terminal entry holding an inline value in the low 22 bits.
+const uint32_t TrieMap::VALUE_FLAG = 0x400000;
+const uint32_t TrieMap::VALUE_MASK = 0x3FFFFF;
+// The all-ones inline value is reserved to mark an invalidated terminal entry.
+const uint32_t TrieMap::INVALID_VALUE_IN_KEY_VALUE_ENTRY = VALUE_MASK;
+// Bit 23 of FIELD1 marks a terminal entry whose low 23 bits are a value entry index.
+const uint32_t TrieMap::TERMINAL_LINK_FLAG = 0x800000;
+const uint32_t TrieMap::TERMINAL_LINK_MASK = 0x7FFFFF;
+// Each trie level consumes 5 bits of the hashed key, giving at most 32 entries per table.
+const int TrieMap::NUM_OF_BITS_USED_FOR_ONE_LEVEL = 5;
+const uint32_t TrieMap::LABEL_MASK = 0x1F;
+const int TrieMap::MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL = 1 << NUM_OF_BITS_USED_FOR_ONE_LEVEL;
+const int TrieMap::ROOT_BITMAP_ENTRY_INDEX = 0;
+// Byte offset of entry 0; the bytes before it hold the empty table links.
+const int TrieMap::ROOT_BITMAP_ENTRY_POS = MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL * FIELD0_SIZE;
+const TrieMap::Entry TrieMap::EMPTY_BITMAP_ENTRY = TrieMap::Entry(0, 0);
+const int TrieMap::TERMINAL_LINKED_ENTRY_COUNT = 2; // Value entry and bitmap entry.
+const uint64_t TrieMap::MAX_VALUE = + (static_cast(1) << ((FIELD0_SIZE + FIELD1_SIZE) * CHAR_BIT)) - 1; +const int TrieMap::MAX_BUFFER_SIZE = TERMINAL_LINK_MASK * ENTRY_SIZE; + +TrieMap::TrieMap() : mBuffer(MAX_BUFFER_SIZE) { + mBuffer.extend(ROOT_BITMAP_ENTRY_POS); + writeEntry(EMPTY_BITMAP_ENTRY, ROOT_BITMAP_ENTRY_INDEX); +} + +TrieMap::TrieMap(const ReadWriteByteArrayView buffer) + : mBuffer(buffer, BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {} + +void TrieMap::dump(const int from, const int to) const { + AKLOGI("BufSize: %d", mBuffer.getTailPosition()); + for (int i = from; i < to; ++i) { + AKLOGI("Entry[%d]: %x, %x", i, readField0(i), readField1(i)); + } + int unusedRegionSize = 0; + for (int i = 1; i <= MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL; ++i) { + int index = readEmptyTableLink(i); + while (index != ROOT_BITMAP_ENTRY_INDEX) { + index = readField0(index); + unusedRegionSize += i; + } + } + AKLOGI("Unused Size: %d", unusedRegionSize); +} + +int TrieMap::getNextLevelBitmapEntryIndex(const int key, const int bitmapEntryIndex) { + const Entry bitmapEntry = readEntry(bitmapEntryIndex); + const uint32_t unsignedKey = static_cast(key); + const int terminalEntryIndex = getTerminalEntryIndex( + unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntry, 0 /* level */); + if (terminalEntryIndex == INVALID_INDEX) { + // Not found. + return INVALID_INDEX; + } + const Entry terminalEntry = readEntry(terminalEntryIndex); + if (terminalEntry.hasTerminalLink()) { + return terminalEntry.getValueEntryIndex() + 1; + } + // Create a value entry and a bitmap entry. 
+ const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT); + if (valueEntryIndex == INVALID_INDEX) { + return INVALID_INDEX; + } + if (!writeEntry(Entry(0, terminalEntry.getValue()), valueEntryIndex)) { + return INVALID_INDEX; + } + if (!writeEntry(EMPTY_BITMAP_ENTRY, valueEntryIndex + 1)) { + return INVALID_INDEX; + } + if (!writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, terminalEntryIndex)) { + return INVALID_INDEX; + } + return valueEntryIndex + 1; +} + +const TrieMap::Result TrieMap::get(const int key, const int bitmapEntryIndex) const { + const uint32_t unsignedKey = static_cast(key); + return getInternal(unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntryIndex, + 0 /* level */); +} + +bool TrieMap::put(const int key, const uint64_t value, const int bitmapEntryIndex) { + if (value > MAX_VALUE) { + return false; + } + const uint32_t unsignedKey = static_cast(key); + return putInternal(unsignedKey, value, getBitShuffledKey(unsignedKey), bitmapEntryIndex, + readEntry(bitmapEntryIndex), 0 /* level */); +} + +bool TrieMap::save(FILE *const file) const { + return DictFileWritingUtils::writeBufferToFileTail(file, &mBuffer); +} + +bool TrieMap::remove(const int key, const int bitmapEntryIndex) { + const Entry bitmapEntry = readEntry(bitmapEntryIndex); + const uint32_t unsignedKey = static_cast(key); + const int terminalEntryIndex = getTerminalEntryIndex( + unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntry, 0 /* level */); + if (terminalEntryIndex == INVALID_INDEX) { + // Not found. 
+ return false; + } + const Entry terminalEntry = readEntry(terminalEntryIndex); + if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , terminalEntryIndex)) { + return false; + } + if (terminalEntry.hasTerminalLink()) { + const Entry nextLevelBitmapEntry = readEntry(terminalEntry.getValueEntryIndex() + 1); + if (!freeTable(terminalEntry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) { + return false; + } + if (!removeInner(nextLevelBitmapEntry)){ + return false; + } + } + return true; +} + +/** + * Iterate next entry in a certain level. + * + * @param iterationState the iteration state that will be read and updated in this method. + * @param outKey the output key + * @return Result instance. mIsValid is false when all entries are iterated. + */ +const TrieMap::Result TrieMap::iterateNext(std::vector *const iterationState, + int *const outKey) const { + while (!iterationState->empty()) { + TableIterationState &state = iterationState->back(); + if (state.mTableSize <= state.mCurrentIndex) { + // Move to parent. + iterationState->pop_back(); + } else { + const int entryIndex = state.mTableIndex + state.mCurrentIndex; + state.mCurrentIndex += 1; + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Move to child. + iterationState->emplace_back(popCount(entry.getBitmap()), entry.getTableIndex()); + } else if (entry.isValidTerminalEntry()) { + if (outKey) { + *outKey = entry.getKey(); + } + if (!entry.hasTerminalLink()) { + return Result(entry.getValue(), true, INVALID_INDEX); + } + const int valueEntryIndex = entry.getValueEntryIndex(); + const Entry valueEntry = readEntry(valueEntryIndex); + return Result(valueEntry.getValueOfValueEntry(), true, valueEntryIndex + 1); + } + } + } + // Visited all entries. + return Result(0, false, INVALID_INDEX); +} + +/** + * Shuffle bits of the key in the fixed order. + * + * This method is used as a hash function. This returns different values for different inputs. 
+ */ +uint32_t TrieMap::getBitShuffledKey(const uint32_t key) const { + uint32_t shuffledKey = 0; + for (int i = 0; i < 4; ++i) { + const uint32_t keyPiece = (key >> (i * 8)) & 0xFF; + shuffledKey ^= ((keyPiece ^ (keyPiece << 7) ^ (keyPiece << 14) ^ (keyPiece << 21)) + & 0x11111111) << i; + } + return shuffledKey; +} + +bool TrieMap::writeValue(const uint64_t value, const int terminalEntryIndex) { + if (value < VALUE_MASK) { + // Write value into the terminal entry. + return writeField1(value | VALUE_FLAG, terminalEntryIndex); + } + // Create value entry and write value. + const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT); + if (valueEntryIndex == INVALID_INDEX) { + return false; + } + if (!writeEntry(Entry(value >> (FIELD1_SIZE * CHAR_BIT), value), valueEntryIndex)) { + return false; + } + if (!writeEntry(EMPTY_BITMAP_ENTRY, valueEntryIndex + 1)) { + return false; + } + return writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, terminalEntryIndex); +} + +bool TrieMap::updateValue(const Entry &terminalEntry, const uint64_t value, + const int terminalEntryIndex) { + if (!terminalEntry.hasTerminalLink()) { + return writeValue(value, terminalEntryIndex); + } + const int valueEntryIndex = terminalEntry.getValueEntryIndex(); + return writeEntry(Entry(value >> (FIELD1_SIZE * CHAR_BIT), value), valueEntryIndex); +} + +bool TrieMap::freeTable(const int tableIndex, const int entryCount) { + if (!writeField0(readEmptyTableLink(entryCount), tableIndex)) { + return false; + } + return writeEmptyTableLink(tableIndex, entryCount); +} + +/** + * Allocate table with entryCount-entries. Reuse freed table if possible. + */ +int TrieMap::allocateTable(const int entryCount) { + if (entryCount > 0 && entryCount <= MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL) { + const int tableIndex = readEmptyTableLink(entryCount); + if (tableIndex > 0) { + if (!writeEmptyTableLink(readField0(tableIndex), entryCount)) { + return INVALID_INDEX; + } + // Reuse the table. 
+ return tableIndex; + } + } + // Allocate memory space at tail position of the buffer. + const int mapIndex = getTailEntryIndex(); + if (!mBuffer.extend(entryCount * ENTRY_SIZE)) { + return INVALID_INDEX; + } + return mapIndex; +} + +int TrieMap::getTerminalEntryIndex(const uint32_t key, const uint32_t hashedKey, + const Entry &bitmapEntry, const int level) const { + const int label = getLabel(hashedKey, level); + if (!exists(bitmapEntry.getBitmap(), label)) { + return INVALID_INDEX; + } + const int entryIndex = bitmapEntry.getTableIndex() + popCount(bitmapEntry.getBitmap(), label); + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Move to the next level. + return getTerminalEntryIndex(key, hashedKey, entry, level + 1); + } + if (!entry.isValidTerminalEntry()) { + return INVALID_INDEX; + } + if (entry.getKey() == key) { + // Terminal entry is found. + return entryIndex; + } + return INVALID_INDEX; +} + +/** + * Get Result corresponding to the key. + * + * @param key the key. + * @param hashedKey the hashed key. + * @param bitmapEntryIndex the index of bitmap entry + * @param level current level + * @return Result instance corresponding to the key. mIsValid indicates whether the key is in the + * map. + */ +const TrieMap::Result TrieMap::getInternal(const uint32_t key, const uint32_t hashedKey, + const int bitmapEntryIndex, const int level) const { + const int terminalEntryIndex = getTerminalEntryIndex(key, hashedKey, + readEntry(bitmapEntryIndex), level); + if (terminalEntryIndex == INVALID_INDEX) { + // Not found. 
+ return Result(0, false, INVALID_INDEX); + } + const Entry terminalEntry = readEntry(terminalEntryIndex); + if (!terminalEntry.hasTerminalLink()) { + return Result(terminalEntry.getValue(), true, INVALID_INDEX); + } + const int valueEntryIndex = terminalEntry.getValueEntryIndex(); + const Entry valueEntry = readEntry(valueEntryIndex); + return Result(valueEntry.getValueOfValueEntry(), true, valueEntryIndex + 1); +} + +/** + * Put key to value mapping to the map. + * + * @param key the key. + * @param value the value + * @param hashedKey the hashed key. + * @param bitmapEntryIndex the index of bitmap entry + * @param bitmapEntry the bitmap entry + * @param level current level + * @return whether the key-value has been correctly inserted to the map or not. + */ +bool TrieMap::putInternal(const uint32_t key, const uint64_t value, const uint32_t hashedKey, + const int bitmapEntryIndex, const Entry &bitmapEntry, const int level) { + const int label = getLabel(hashedKey, level); + const uint32_t bitmap = bitmapEntry.getBitmap(); + const int mapIndex = bitmapEntry.getTableIndex(); + if (!exists(bitmap, label)) { + // Current map doesn't contain the label. + return addNewEntryByExpandingTable(key, value, mapIndex, bitmap, bitmapEntryIndex, label); + } + const int entryIndex = mapIndex + popCount(bitmap, label); + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Bitmap entry is found. Go to the next level. + return putInternal(key, value, hashedKey, entryIndex, entry, level + 1); + } + if (!entry.isValidTerminalEntry()) { + // Overwrite invalid terminal entry. + return writeTerminalEntry(key, value, entryIndex); + } + if (entry.getKey() == key) { + // Terminal entry for the key is found. Update the value. + return updateValue(entry, value, entryIndex); + } + // Conflict with the existing key. 
+ return addNewEntryByResolvingConflict(key, value, hashedKey, entry, entryIndex, level); +} + +/** + * Resolve a conflict in the current level and add new entry. + * + * @param key the key + * @param value the value + * @param hashedKey the hashed key + * @param conflictedEntry the existing conflicted entry + * @param conflictedEntryIndex the index of existing conflicted entry + * @param level current level + * @return whether the key-value has been correctly inserted to the map or not. + */ +bool TrieMap::addNewEntryByResolvingConflict(const uint32_t key, const uint64_t value, + const uint32_t hashedKey, const Entry &conflictedEntry, const int conflictedEntryIndex, + const int level) { + const int conflictedKeyNextLabel = + getLabel(getBitShuffledKey(conflictedEntry.getKey()), level + 1); + const int nextLabel = getLabel(hashedKey, level + 1); + if (conflictedKeyNextLabel == nextLabel) { + // Conflicted again in the next level. + const int newTableIndex = allocateTable(1 /* entryCount */); + if (newTableIndex == INVALID_INDEX) { + return false; + } + if (!writeEntry(conflictedEntry, newTableIndex)) { + return false; + } + const Entry newBitmapEntry(setExist(0 /* bitmap */, nextLabel), newTableIndex); + if (!writeEntry(newBitmapEntry, conflictedEntryIndex)) { + return false; + } + return putInternal(key, value, hashedKey, conflictedEntryIndex, newBitmapEntry, level + 1); + } + // The conflict has been resolved. Create a table that contains 2 entries. 
+ const int newTableIndex = allocateTable(2 /* entryCount */); + if (newTableIndex == INVALID_INDEX) { + return false; + } + if (nextLabel < conflictedKeyNextLabel) { + if (!writeTerminalEntry(key, value, newTableIndex)) { + return false; + } + if (!writeEntry(conflictedEntry, newTableIndex + 1)) { + return false; + } + } else { // nextLabel > conflictedKeyNextLabel + if (!writeEntry(conflictedEntry, newTableIndex)) { + return false; + } + if (!writeTerminalEntry(key, value, newTableIndex + 1)) { + return false; + } + } + const uint32_t updatedBitmap = + setExist(setExist(0 /* bitmap */, nextLabel), conflictedKeyNextLabel); + return writeEntry(Entry(updatedBitmap, newTableIndex), conflictedEntryIndex); +} + +/** + * Add new entry to the existing table. + */ +bool TrieMap::addNewEntryByExpandingTable(const uint32_t key, const uint64_t value, + const int tableIndex, const uint32_t bitmap, const int bitmapEntryIndex, const int label) { + // Current map doesn't contain the label. + const int entryCount = popCount(bitmap); + const int newTableIndex = allocateTable(entryCount + 1); + if (newTableIndex == INVALID_INDEX) { + return false; + } + const int newEntryIndexInTable = popCount(bitmap, label); + // Copy from existing table to the new table. + for (int i = 0; i < entryCount; ++i) { + if (!copyEntry(tableIndex + i, newTableIndex + i + (i >= newEntryIndexInTable ? 1 : 0))) { + return false; + } + } + // Write new terminal entry. + if (!writeTerminalEntry(key, value, newTableIndex + newEntryIndexInTable)) { + return false; + } + // Update bitmap. + if (!writeEntry(Entry(setExist(bitmap, label), newTableIndex), bitmapEntryIndex)) { + return false; + } + if (entryCount > 0) { + return freeTable(tableIndex, entryCount); + } + return true; +} + +bool TrieMap::removeInner(const Entry &bitmapEntry) { + const int tableSize = popCount(bitmapEntry.getBitmap()); + if (tableSize <= 0) { + // The table is empty. No need to remove any entries. 
+ return true; + } + for (int i = 0; i < tableSize; ++i) { + const int entryIndex = bitmapEntry.getTableIndex() + i; + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Delete next bitmap entry recursively. + if (!removeInner(entry)) { + return false; + } + } else { + // Invalidate terminal entry just in case. + if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , entryIndex)) { + return false; + } + if (entry.hasTerminalLink()) { + const Entry nextLevelBitmapEntry = readEntry(entry.getValueEntryIndex() + 1); + if (!freeTable(entry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) { + return false; + } + if (!removeInner(nextLevelBitmapEntry)) { + return false; + } + } + } + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/trie_map.h b/native/jni/src/dictionary/utils/trie_map.h new file mode 100644 index 000000000..5fc6c2690 --- /dev/null +++ b/native/jni/src/dictionary/utils/trie_map.h @@ -0,0 +1,399 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TRIE_MAP_H +#define LATINIME_TRIE_MAP_H + +#include +#include +#include +#include + +#include "defines.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +/** + * Trie map derived from Phil Bagwell's Hash Array Mapped Trie. + * key is int and value is uint64_t. 
+ * This supports multiple level map. Terminal entries can have a bitmap for the next level map. + * This doesn't support root map resizing. + */ +class TrieMap { + public: + struct Result { + const uint64_t mValue; + const bool mIsValid; + const int mNextLevelBitmapEntryIndex; + + Result(const uint64_t value, const bool isValid, const int nextLevelBitmapEntryIndex) + : mValue(value), mIsValid(isValid), + mNextLevelBitmapEntryIndex(nextLevelBitmapEntryIndex) {} + }; + + /** + * Struct to record iteration state in a table. + */ + struct TableIterationState { + int mTableSize; + int mTableIndex; + int mCurrentIndex; + + TableIterationState(const int tableSize, const int tableIndex) + : mTableSize(tableSize), mTableIndex(tableIndex), mCurrentIndex(0) {} + }; + + class TrieMapRange; + class TrieMapIterator { + public: + class IterationResult { + public: + IterationResult(const TrieMap *const trieMap, const int key, const uint64_t value, + const int nextLeveBitmapEntryIndex) + : mTrieMap(trieMap), mKey(key), mValue(value), + mNextLevelBitmapEntryIndex(nextLeveBitmapEntryIndex) {} + + const TrieMapRange getEntriesInNextLevel() const { + return TrieMapRange(mTrieMap, mNextLevelBitmapEntryIndex); + } + + bool hasNextLevelMap() const { + return mNextLevelBitmapEntryIndex != INVALID_INDEX; + } + + AK_FORCE_INLINE int key() const { + return mKey; + } + + AK_FORCE_INLINE uint64_t value() const { + return mValue; + } + + AK_FORCE_INLINE int getNextLevelBitmapEntryIndex() const { + return mNextLevelBitmapEntryIndex; + } + + private: + const TrieMap *const mTrieMap; + const int mKey; + const uint64_t mValue; + const int mNextLevelBitmapEntryIndex; + }; + + TrieMapIterator(const TrieMap *const trieMap, const int bitmapEntryIndex) + : mTrieMap(trieMap), mStateStack(), mBaseBitmapEntryIndex(bitmapEntryIndex), + mKey(0), mValue(0), mIsValid(false), mNextLevelBitmapEntryIndex(INVALID_INDEX) { + if (!trieMap || mBaseBitmapEntryIndex == INVALID_INDEX) { + return; + } + const Entry 
bitmapEntry = mTrieMap->readEntry(mBaseBitmapEntryIndex); + mStateStack.emplace_back( + mTrieMap->popCount(bitmapEntry.getBitmap()), bitmapEntry.getTableIndex()); + this->operator++(); + } + + const IterationResult operator*() const { + return IterationResult(mTrieMap, mKey, mValue, mNextLevelBitmapEntryIndex); + } + + bool operator!=(const TrieMapIterator &other) const { + // Caveat: This works only for for loops. + return mIsValid || other.mIsValid; + } + + const TrieMapIterator &operator++() { + const Result result = mTrieMap->iterateNext(&mStateStack, &mKey); + mValue = result.mValue; + mIsValid = result.mIsValid; + mNextLevelBitmapEntryIndex = result.mNextLevelBitmapEntryIndex; + return *this; + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(TrieMapIterator); + DISALLOW_ASSIGNMENT_OPERATOR(TrieMapIterator); + + const TrieMap *const mTrieMap; + std::vector mStateStack; + const int mBaseBitmapEntryIndex; + int mKey; + uint64_t mValue; + bool mIsValid; + int mNextLevelBitmapEntryIndex; + }; + + /** + * Class to support iterating entries in TrieMap by range base for loops. + */ + class TrieMapRange { + public: + TrieMapRange(const TrieMap *const trieMap, const int bitmapEntryIndex) + : mTrieMap(trieMap), mBaseBitmapEntryIndex(bitmapEntryIndex) {}; + + TrieMapIterator begin() const { + return TrieMapIterator(mTrieMap, mBaseBitmapEntryIndex); + } + + const TrieMapIterator end() const { + return TrieMapIterator(nullptr, INVALID_INDEX); + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(TrieMapRange); + DISALLOW_ASSIGNMENT_OPERATOR(TrieMapRange); + + const TrieMap *const mTrieMap; + const int mBaseBitmapEntryIndex; + }; + + static const int INVALID_INDEX; + static const uint64_t MAX_VALUE; + + TrieMap(); + // Construct TrieMap using existing data in the memory region written by save(). 
+ TrieMap(const ReadWriteByteArrayView buffer); + void dump(const int from = 0, const int to = 0) const; + + bool isNearSizeLimit() const { + return mBuffer.isNearSizeLimit(); + } + + int getRootBitmapEntryIndex() const { + return ROOT_BITMAP_ENTRY_INDEX; + } + + // Returns bitmapEntryIndex. Create the next level map if it doesn't exist. + int getNextLevelBitmapEntryIndex(const int key) { + return getNextLevelBitmapEntryIndex(key, ROOT_BITMAP_ENTRY_INDEX); + } + + int getNextLevelBitmapEntryIndex(const int key, const int bitmapEntryIndex); + + const Result getRoot(const int key) const { + return get(key, ROOT_BITMAP_ENTRY_INDEX); + } + + const Result get(const int key, const int bitmapEntryIndex) const; + + bool putRoot(const int key, const uint64_t value) { + return put(key, value, ROOT_BITMAP_ENTRY_INDEX); + } + + bool put(const int key, const uint64_t value, const int bitmapEntryIndex); + + const TrieMapRange getEntriesInRootLevel() const { + return getEntriesInSpecifiedLevel(ROOT_BITMAP_ENTRY_INDEX); + } + + const TrieMapRange getEntriesInSpecifiedLevel(const int bitmapEntryIndex) const { + return TrieMapRange(this, bitmapEntryIndex); + } + + bool save(FILE *const file) const; + + bool remove(const int key, const int bitmapEntryIndex); + + private: + DISALLOW_COPY_AND_ASSIGN(TrieMap); + + /** + * Struct represents an entry. + * + * Entry is one of these entry types. All entries are fixed size and have 2 fields FIELD_0 and + * FIELD_1. + * 1. bitmap entry. bitmap entry contains bitmap and the link to hash table. + * FIELD_0(bitmap) FIELD_1(LINK_TO_HASH_TABLE) + * 2. terminal entry. terminal entry contains hashed key and value or terminal link. terminal + * entry have terminal link when the value is not fit to FIELD_1 or there is a next level map + * for the key. + * FIELD_0(hashed key) (FIELD_1(VALUE_FLAG VALUE) | FIELD_1(TERMINAL_LINK_FLAG TERMINAL_LINK)) + * 3. value entry. value entry represents a value. 
Upper order bytes are stored in FIELD_0 and + * lower order bytes are stored in FIELD_1. + * FIELD_0(value (upper order bytes)) FIELD_1(value (lower order bytes)) + */ + struct Entry { + const uint32_t mData0; + const uint32_t mData1; + + Entry(const uint32_t data0, const uint32_t data1) : mData0(data0), mData1(data1) {} + + AK_FORCE_INLINE bool isBitmapEntry() const { + return (mData1 & VALUE_FLAG) == 0 && (mData1 & TERMINAL_LINK_FLAG) == 0; + } + + AK_FORCE_INLINE bool hasTerminalLink() const { + return (mData1 & TERMINAL_LINK_FLAG) != 0; + } + + // For terminal entry. + AK_FORCE_INLINE uint32_t getKey() const { + return mData0; + } + + // For terminal entry. + AK_FORCE_INLINE uint32_t getValue() const { + return mData1 & VALUE_MASK; + } + + // For terminal entry. + AK_FORCE_INLINE bool isValidTerminalEntry() const { + return hasTerminalLink() || ((mData1 & VALUE_MASK) != INVALID_VALUE_IN_KEY_VALUE_ENTRY); + } + + // For terminal entry. + AK_FORCE_INLINE uint32_t getValueEntryIndex() const { + return mData1 & TERMINAL_LINK_MASK; + } + + // For bitmap entry. + AK_FORCE_INLINE uint32_t getBitmap() const { + return mData0; + } + + // For bitmap entry. + AK_FORCE_INLINE int getTableIndex() const { + return static_cast(mData1); + } + + // For value entry. 
+ AK_FORCE_INLINE uint64_t getValueOfValueEntry() const { + return ((static_cast(mData0) << (FIELD1_SIZE * CHAR_BIT)) ^ mData1); + } + }; + + BufferWithExtendableBuffer mBuffer; + + static const int FIELD0_SIZE; + static const int FIELD1_SIZE; + static const int ENTRY_SIZE; + static const uint32_t VALUE_FLAG; + static const uint32_t VALUE_MASK; + static const uint32_t INVALID_VALUE_IN_KEY_VALUE_ENTRY; + static const uint32_t TERMINAL_LINK_FLAG; + static const uint32_t TERMINAL_LINK_MASK; + static const int NUM_OF_BITS_USED_FOR_ONE_LEVEL; + static const uint32_t LABEL_MASK; + static const int MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL; + static const int ROOT_BITMAP_ENTRY_INDEX; + static const int ROOT_BITMAP_ENTRY_POS; + static const Entry EMPTY_BITMAP_ENTRY; + static const int TERMINAL_LINKED_ENTRY_COUNT; + static const int MAX_BUFFER_SIZE; + + uint32_t getBitShuffledKey(const uint32_t key) const; + bool writeValue(const uint64_t value, const int terminalEntryIndex); + bool updateValue(const Entry &terminalEntry, const uint64_t value, + const int terminalEntryIndex); + bool freeTable(const int tableIndex, const int entryCount); + int allocateTable(const int entryCount); + int getTerminalEntryIndex(const uint32_t key, const uint32_t hashedKey, + const Entry &bitmapEntry, const int level) const; + const Result getInternal(const uint32_t key, const uint32_t hashedKey, + const int bitmapEntryIndex, const int level) const; + bool putInternal(const uint32_t key, const uint64_t value, const uint32_t hashedKey, + const int bitmapEntryIndex, const Entry &bitmapEntry, const int level); + bool addNewEntryByResolvingConflict(const uint32_t key, const uint64_t value, + const uint32_t hashedKey, const Entry &conflictedEntry, const int conflictedEntryIndex, + const int level); + bool addNewEntryByExpandingTable(const uint32_t key, const uint64_t value, + const int tableIndex, const uint32_t bitmap, const int bitmapEntryIndex, + const int label); + const Result iterateNext(std::vector 
*const iterationState, + int *const outKey) const; + + AK_FORCE_INLINE const Entry readEntry(const int entryIndex) const { + return Entry(readField0(entryIndex), readField1(entryIndex)); + } + + // Returns whether an entry for the index is existing by testing if the index-th bit in the + // bitmap is set or not. + AK_FORCE_INLINE bool exists(const uint32_t bitmap, const int index) const { + return (bitmap & (1 << index)) != 0; + } + + // Set index-th bit in the bitmap. + AK_FORCE_INLINE uint32_t setExist(const uint32_t bitmap, const int index) const { + return bitmap | (1 << index); + } + + // Count set bits before index in the bitmap. + AK_FORCE_INLINE int popCount(const uint32_t bitmap, const int index) const { + return popCount(bitmap & ((1 << index) - 1)); + } + + // Count set bits in the bitmap. + AK_FORCE_INLINE int popCount(const uint32_t bitmap) const { + return __builtin_popcount(bitmap); + // int v = bitmap - ((bitmap >> 1) & 0x55555555); + // v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + // return (((v + (v >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; + } + + AK_FORCE_INLINE int getLabel(const uint32_t hashedKey, const int level) const { + return (hashedKey >> (level * NUM_OF_BITS_USED_FOR_ONE_LEVEL)) & LABEL_MASK; + } + + AK_FORCE_INLINE uint32_t readField0(const int entryIndex) const { + return mBuffer.readUint(FIELD0_SIZE, ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE); + } + + AK_FORCE_INLINE uint32_t readField1(const int entryIndex) const { + return mBuffer.readUint(FIELD1_SIZE, + ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE + FIELD0_SIZE); + } + + AK_FORCE_INLINE int readEmptyTableLink(const int entryCount) const { + return mBuffer.readUint(FIELD1_SIZE, (entryCount - 1) * FIELD1_SIZE); + } + + AK_FORCE_INLINE bool writeEmptyTableLink(const int tableIndex, const int entryCount) { + return mBuffer.writeUint(tableIndex, FIELD1_SIZE, (entryCount - 1) * FIELD1_SIZE); + } + + AK_FORCE_INLINE bool writeField0(const uint32_t data, const int 
entryIndex) { + return mBuffer.writeUint(data, FIELD0_SIZE, + ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE); + } + + AK_FORCE_INLINE bool writeField1(const uint32_t data, const int entryIndex) { + return mBuffer.writeUint(data, FIELD1_SIZE, + ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE + FIELD0_SIZE); + } + + AK_FORCE_INLINE bool writeEntry(const Entry &entry, const int entryIndex) { + return writeField0(entry.mData0, entryIndex) && writeField1(entry.mData1, entryIndex); + } + + AK_FORCE_INLINE bool writeTerminalEntry(const uint32_t key, const uint64_t value, + const int entryIndex) { + return writeField0(key, entryIndex) && writeValue(value, entryIndex); + } + + AK_FORCE_INLINE bool copyEntry(const int originalEntryIndex, const int newEntryIndex) { + return writeEntry(readEntry(originalEntryIndex), newEntryIndex); + } + + AK_FORCE_INLINE int getTailEntryIndex() const { + return (mBuffer.getTailPosition() - ROOT_BITMAP_ENTRY_POS) / ENTRY_SIZE; + } + + bool removeInner(const Entry &bitmapEntry); +}; + +} // namespace latinime +#endif /* LATINIME_TRIE_MAP_H */ diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index ea438922f..a20252cd2 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -16,9 +16,9 @@ #include "suggest/core/dicnode/dic_node_utils.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" namespace latinime { diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h deleted file mode 100644 index 178b06554..000000000 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h +++ 
/dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H -#define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H - -#include "defines.h" -#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" - -namespace latinime { - -class BinaryDictionaryBigramsIterator { - public: - // Empty iterator. - BinaryDictionaryBigramsIterator() - : mBigramsStructurePolicy(nullptr), mPos(NOT_A_DICT_POS), - mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), mHasNext(false) {} - - BinaryDictionaryBigramsIterator( - const DictionaryBigramsStructurePolicy *const bigramsStructurePolicy, const int pos) - : mBigramsStructurePolicy(bigramsStructurePolicy), mPos(pos), - mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), - mHasNext(pos != NOT_A_DICT_POS) {} - - BinaryDictionaryBigramsIterator(BinaryDictionaryBigramsIterator &&bigramsIterator) - : mBigramsStructurePolicy(bigramsIterator.mBigramsStructurePolicy), - mPos(bigramsIterator.mPos), mBigramPos(bigramsIterator.mBigramPos), - mProbability(bigramsIterator.mProbability), mHasNext(bigramsIterator.mHasNext) {} - - AK_FORCE_INLINE bool hasNext() const { - return mHasNext; - } - - AK_FORCE_INLINE void next() { - mBigramsStructurePolicy->getNextBigram(&mBigramPos, &mProbability, &mHasNext, &mPos); - } - - AK_FORCE_INLINE int getProbability() const { - return 
mProbability; - } - - AK_FORCE_INLINE int getBigramPos() const { - return mBigramPos; - } - - private: - DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryBigramsIterator); - - const DictionaryBigramsStructurePolicy *const mBigramsStructurePolicy; - int mPos; - int mBigramPos; - int mProbability; - bool mHasNext; -}; -} // namespace latinime -#endif // LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h deleted file mode 100644 index ee1606b6a..000000000 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (C) 2012 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H -#define LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H - -#include "defines.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" - -namespace latinime { - -class BinaryDictionaryShortcutIterator { - public: - BinaryDictionaryShortcutIterator( - const DictionaryShortcutsStructurePolicy *const shortcutStructurePolicy, - const int shortcutPos) - : mShortcutStructurePolicy(shortcutStructurePolicy), - mPos(shortcutStructurePolicy->getStartPos(shortcutPos)), - mHasNextShortcutTarget(shortcutPos != NOT_A_DICT_POS) {} - - BinaryDictionaryShortcutIterator(const BinaryDictionaryShortcutIterator &&shortcutIterator) - : mShortcutStructurePolicy(shortcutIterator.mShortcutStructurePolicy), - mPos(shortcutIterator.mPos), - mHasNextShortcutTarget(shortcutIterator.mHasNextShortcutTarget) {} - - AK_FORCE_INLINE bool hasNextShortcutTarget() const { - return mHasNextShortcutTarget; - } - - // Gets the shortcut target itself as an int string and put it to outTarget, put its length - // to outTargetLength, put whether it is whitelist to outIsWhitelist. 
- AK_FORCE_INLINE void nextShortcutTarget( - const int maxDepth, int *const outTarget, int *const outTargetLength, - bool *const outIsWhitelist) { - mShortcutStructurePolicy->getNextShortcut(maxDepth, outTarget, outTargetLength, - outIsWhitelist, &mHasNextShortcutTarget, &mPos); - } - - private: - DISALLOW_DEFAULT_CONSTRUCTOR(BinaryDictionaryShortcutIterator); - DISALLOW_ASSIGNMENT_OPERATOR(BinaryDictionaryShortcutIterator); - - const DictionaryShortcutsStructurePolicy *const mShortcutStructurePolicy; - int mPos; - bool mHasNextShortcutTarget; -}; -} // namespace latinime -#endif // LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.h b/native/jni/src/suggest/core/dictionary/bloom_filter.h deleted file mode 100644 index 1e60f49ed..000000000 --- a/native/jni/src/suggest/core/dictionary/bloom_filter.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (C) 2012 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_BLOOM_FILTER_H -#define LATINIME_BLOOM_FILTER_H - -#include - -#include "defines.h" - -namespace latinime { - -// This bloom filter is used for optimizing bigram retrieval. 
-// Execution times with previous word "this" are as follows: -// without bloom filter (use only hash_map): -// Total 147792.34 (sum of others 147771.57) -// with bloom filter: -// Total 145900.64 (sum of others 145874.30) -// always read binary dictionary: -// Total 148603.14 (sum of others 148579.90) -class BloomFilter { - public: - BloomFilter() : mFilter() {} - - AK_FORCE_INLINE void setInFilter(const int position) { - mFilter.set(getIndex(position)); - } - - AK_FORCE_INLINE bool isInFilter(const int position) const { - return mFilter.test(getIndex(position)); - } - - private: - DISALLOW_ASSIGNMENT_OPERATOR(BloomFilter); - - AK_FORCE_INLINE size_t getIndex(const int position) const { - return static_cast(position) % BIGRAM_FILTER_MODULO; - } - - // Size, in bits, of the bloom filter index for bigrams - // The probability of false positive is (1 - e ** (-kn/m))**k, - // where k is the number of hash functions, n the number of bigrams, and m the number of - // bits we can test. - // At the moment 100 is the maximum number of bigrams for a word with the current main - // dictionaries, so n = 100. 1024 buckets give us m = 1024. - // With 1 hash function, our false positive rate is about 9.3%, which should be enough for - // our uses since we are only using this to increase average performance. For the record, - // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%, - // and m = 4096 gives 2.4%. - // This is assigned here because it is used for bitset size. - // 1021 is the largest prime under 1024. 
- static const size_t BIGRAM_FILTER_MODULO = 1021; - std::bitset mFilter; -}; -} // namespace latinime -#endif // LATINIME_BLOOM_FILTER_H diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index 6a5df9d95..5c9a1392e 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -19,11 +19,11 @@ #include "suggest/core/dictionary/dictionary.h" #include "defines.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/property/ngram_context.h" #include "suggest/core/dictionary/dictionary_utils.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/core/result/suggestion_results.h" #include "suggest/core/session/dic_traverse_session.h" -#include "suggest/core/session/ngram_context.h" #include "suggest/core/suggest.h" #include "suggest/core/suggest_options.h" #include "suggest/policyimpl/gesture/gesture_suggest_policy_factory.h" diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index b1774371b..9e224ebfb 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -21,11 +21,11 @@ #include "defines.h" #include "jni.h" -#include "suggest/core/dictionary/ngram_listener.h" -#include "suggest/core/dictionary/property/historical_info.h" -#include "suggest/core/dictionary/property/word_property.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/property/word_property.h" #include 
"suggest/core/suggest_interface.h" #include "utils/int_array_view.h" diff --git a/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp b/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp index 9573c37bc..7de550026 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp @@ -16,13 +16,13 @@ #include "suggest/core/dictionary/dictionary_utils.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/property/ngram_context.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_priority_queue.h" #include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/digraph_utils.h" -#include "suggest/core/session/ngram_context.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "utils/int_array_view.h" namespace latinime { diff --git a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp index bb2ce5012..4d68f620f 100644 --- a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp @@ -19,7 +19,7 @@ #include #include "defines.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" #include "utils/char_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp b/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp deleted file mode 100644 index 761f51ec8..000000000 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the 
License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/core/dictionary/multi_bigram_map.h" - -#include -#include - -namespace latinime { - -// Max number of bigram maps (previous word contexts) to be cached. Increasing this number -// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory -// usage. Also, there are diminishing returns since the most frequently used bigrams are -// typically near the beginning of the input and are thus the first ones to be cached. Note -// that these bigrams are reset for each new composing word. -const size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25; - -// Most common previous word contexts currently have 100 bigrams -const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100; - -// Look up the bigram probability for the given word pair from the cached bigram maps. -// Also caches the bigrams if there is space remaining and they have not been cached already. 
-int MultiBigramMap::getBigramProbability( - const DictionaryStructureWithBufferPolicy *const structurePolicy, - const WordIdArrayView prevWordIds, const int nextWordId, - const int unigramProbability) { - if (prevWordIds.empty() || prevWordIds[0] == NOT_A_WORD_ID) { - return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); - } - const auto mapPosition = mBigramMaps.find(prevWordIds[0]); - if (mapPosition != mBigramMaps.end()) { - return mapPosition->second.getBigramProbability(structurePolicy, nextWordId, - unigramProbability); - } - if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { - addBigramsForWord(structurePolicy, prevWordIds); - return mBigramMaps[prevWordIds[0]].getBigramProbability(structurePolicy, - nextWordId, unigramProbability); - } - return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordIds, - nextWordId, unigramProbability); -} - -void MultiBigramMap::BigramMap::init( - const DictionaryStructureWithBufferPolicy *const structurePolicy, - const WordIdArrayView prevWordIds) { - structurePolicy->iterateNgramEntries(prevWordIds, this /* listener */); -} - -int MultiBigramMap::BigramMap::getBigramProbability( - const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int nextWordId, const int unigramProbability) const { - int bigramProbability = NOT_A_PROBABILITY; - if (mBloomFilter.isInFilter(nextWordId)) { - const auto bigramProbabilityIt = mBigramMap.find(nextWordId); - if (bigramProbabilityIt != mBigramMap.end()) { - bigramProbability = bigramProbabilityIt->second; - } - } - return structurePolicy->getProbability(unigramProbability, bigramProbability); -} - -void MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability, const int targetWordId) { - if (targetWordId == NOT_A_WORD_ID) { - return; - } - mBigramMap[targetWordId] = ngramProbability; - mBloomFilter.setInFilter(targetWordId); -} - -void MultiBigramMap::addBigramsForWord( - const 
DictionaryStructureWithBufferPolicy *const structurePolicy, - const WordIdArrayView prevWordIds) { - mBigramMaps[prevWordIds[0]].init(structurePolicy, prevWordIds); -} - -int MultiBigramMap::readBigramProbabilityFromBinaryDictionary( - const DictionaryStructureWithBufferPolicy *const structurePolicy, - const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability) { - const int bigramProbability = structurePolicy->getProbabilityOfWord(prevWordIds, nextWordId); - if (bigramProbability != NOT_A_PROBABILITY) { - return bigramProbability; - } - return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h deleted file mode 100644 index d2eb5cc32..000000000 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_MULTI_BIGRAM_MAP_H -#define LATINIME_MULTI_BIGRAM_MAP_H - -#include -#include - -#include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" -#include "suggest/core/dictionary/bloom_filter.h" -#include "suggest/core/dictionary/ngram_listener.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "utils/int_array_view.h" - -namespace latinime { - -// Class for caching bigram maps for multiple previous word contexts. This is useful since the -// algorithm needs to look up the set of bigrams for every word pair that occurs in every -// multi-word suggestion. -class MultiBigramMap { - public: - MultiBigramMap() : mBigramMaps() {} - ~MultiBigramMap() {} - - // Look up the bigram probability for the given word pair from the cached bigram maps. - // Also caches the bigrams if there is space remaining and they have not been cached already. - int getBigramProbability(const DictionaryStructureWithBufferPolicy *const structurePolicy, - const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability); - - void clear() { - mBigramMaps.clear(); - } - - private: - DISALLOW_COPY_AND_ASSIGN(MultiBigramMap); - - class BigramMap : public NgramListener { - public: - BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP), mBloomFilter() {} - // Copy constructor needed for std::unordered_map. 
- BigramMap(const BigramMap &bigramMap) - : mBigramMap(bigramMap.mBigramMap), mBloomFilter(bigramMap.mBloomFilter) {} - virtual ~BigramMap() {} - - void init(const DictionaryStructureWithBufferPolicy *const structurePolicy, - const WordIdArrayView prevWordIds); - int getBigramProbability( - const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int nextWordId, const int unigramProbability) const; - virtual void onVisitEntry(const int ngramProbability, const int targetWordId); - - private: - static const int DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP; - std::unordered_map mBigramMap; - BloomFilter mBloomFilter; - }; - - void addBigramsForWord(const DictionaryStructureWithBufferPolicy *const structurePolicy, - const WordIdArrayView prevWordIds); - - int readBigramProbabilityFromBinaryDictionary( - const DictionaryStructureWithBufferPolicy *const structurePolicy, - const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability); - - static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP; - std::unordered_map mBigramMaps; -}; -} // namespace latinime -#endif // LATINIME_MULTI_BIGRAM_MAP_H diff --git a/native/jni/src/suggest/core/dictionary/ngram_listener.h b/native/jni/src/suggest/core/dictionary/ngram_listener.h deleted file mode 100644 index 2eb5e9fd1..000000000 --- a/native/jni/src/suggest/core/dictionary/ngram_listener.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_NGRAM_LISTENER_H -#define LATINIME_NGRAM_LISTENER_H - -#include "defines.h" - -namespace latinime { - -/** - * Interface to iterate ngram entries. - */ -class NgramListener { - public: - // ngramProbability is always 0 for v403 decaying dictionary. - // TODO: Remove ngramProbability. - virtual void onVisitEntry(const int ngramProbability, const int targetWordId) = 0; - virtual ~NgramListener() {}; - - protected: - NgramListener() {} - - private: - DISALLOW_COPY_AND_ASSIGN(NgramListener); - -}; -} // namespace latinime -#endif /* LATINIME_NGRAM_LISTENER_H */ diff --git a/native/jni/src/suggest/core/dictionary/property/historical_info.h b/native/jni/src/suggest/core/dictionary/property/historical_info.h deleted file mode 100644 index e5ce1ea25..000000000 --- a/native/jni/src/suggest/core/dictionary/property/historical_info.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_HISTORICAL_INFO_H -#define LATINIME_HISTORICAL_INFO_H - -#include "defines.h" - -namespace latinime { - -class HistoricalInfo { - public: - // Invalid historical info. 
- HistoricalInfo() - : mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0) {} - - HistoricalInfo(const int timestamp, const int level, const int count) - : mTimestamp(timestamp), mLevel(level), mCount(count) {} - - bool isValid() const { - return mTimestamp != NOT_A_TIMESTAMP; - } - - int getTimestamp() const { - return mTimestamp; - } - - // TODO: Remove - int getLevel() const { - return mLevel; - } - - int getCount() const { - return mCount; - } - - private: - // Default copy constructor is used for using in std::vector. - DISALLOW_ASSIGNMENT_OPERATOR(HistoricalInfo); - - const int mTimestamp; - const int mLevel; - const int mCount; -}; -} // namespace latinime -#endif /* LATINIME_HISTORICAL_INFO_H */ diff --git a/native/jni/src/suggest/core/dictionary/property/ngram_property.h b/native/jni/src/suggest/core/dictionary/property/ngram_property.h deleted file mode 100644 index e67b4da31..000000000 --- a/native/jni/src/suggest/core/dictionary/property/ngram_property.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_NGRAM_PROPERTY_H -#define LATINIME_NGRAM_PROPERTY_H - -#include - -#include "defines.h" -#include "suggest/core/dictionary/property/historical_info.h" -#include "suggest/core/session/ngram_context.h" - -namespace latinime { - -class NgramProperty { - public: - NgramProperty(const NgramContext &ngramContext, const std::vector &&targetCodePoints, - const int probability, const HistoricalInfo historicalInfo) - : mNgramContext(ngramContext), mTargetCodePoints(std::move(targetCodePoints)), - mProbability(probability), mHistoricalInfo(historicalInfo) {} - - const NgramContext *getNgramContext() const { - return &mNgramContext; - } - - const std::vector *getTargetCodePoints() const { - return &mTargetCodePoints; - } - - int getProbability() const { - return mProbability; - } - - const HistoricalInfo getHistoricalInfo() const { - return mHistoricalInfo; - } - - private: - // Default copy constructor is used for using in std::vector. - DISALLOW_DEFAULT_CONSTRUCTOR(NgramProperty); - DISALLOW_ASSIGNMENT_OPERATOR(NgramProperty); - - const NgramContext mNgramContext; - const std::vector mTargetCodePoints; - const int mProbability; - const HistoricalInfo mHistoricalInfo; -}; -} // namespace latinime -#endif // LATINIME_NGRAM_PROPERTY_H diff --git a/native/jni/src/suggest/core/dictionary/property/unigram_property.h b/native/jni/src/suggest/core/dictionary/property/unigram_property.h deleted file mode 100644 index f194f979a..000000000 --- a/native/jni/src/suggest/core/dictionary/property/unigram_property.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_UNIGRAM_PROPERTY_H -#define LATINIME_UNIGRAM_PROPERTY_H - -#include - -#include "defines.h" -#include "suggest/core/dictionary/property/historical_info.h" - -namespace latinime { - -class UnigramProperty { - public: - class ShortcutProperty { - public: - ShortcutProperty(const std::vector &&targetCodePoints, const int probability) - : mTargetCodePoints(std::move(targetCodePoints)), - mProbability(probability) {} - - const std::vector *getTargetCodePoints() const { - return &mTargetCodePoints; - } - - int getProbability() const { - return mProbability; - } - - private: - // Default copy constructor is used for using in std::vector. 
- DISALLOW_DEFAULT_CONSTRUCTOR(ShortcutProperty); - - const std::vector mTargetCodePoints; - const int mProbability; - }; - - UnigramProperty() - : mRepresentsBeginningOfSentence(false), mIsNotAWord(false), - mIsBlacklisted(false), mIsPossiblyOffensive(false), mProbability(NOT_A_PROBABILITY), - mHistoricalInfo(), mShortcuts() {} - - // In contexts which do not support the Blacklisted flag (v2, v4<403) - UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord, - const bool isPossiblyOffensive, const int probability, - const HistoricalInfo historicalInfo, const std::vector &&shortcuts) - : mRepresentsBeginningOfSentence(representsBeginningOfSentence), - mIsNotAWord(isNotAWord), mIsBlacklisted(false), - mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability), - mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {} - - // Without shortcuts, in contexts which do not support the Blacklisted flag (v2, v4<403) - UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord, - const bool isPossiblyOffensive, const int probability, - const HistoricalInfo historicalInfo) - : mRepresentsBeginningOfSentence(representsBeginningOfSentence), - mIsNotAWord(isNotAWord), mIsBlacklisted(false), - mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability), - mHistoricalInfo(historicalInfo), mShortcuts() {} - - // In contexts which DO support the Blacklisted flag (v403) - UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord, - const bool isBlacklisted, const bool isPossiblyOffensive, const int probability, - const HistoricalInfo historicalInfo, const std::vector &&shortcuts) - : mRepresentsBeginningOfSentence(representsBeginningOfSentence), - mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), - mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability), - mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {} - - // Without shortcuts, in contexts which 
DO support the Blacklisted flag (v403) - UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord, - const bool isBlacklisted, const bool isPossiblyOffensive, const int probability, - const HistoricalInfo historicalInfo) - : mRepresentsBeginningOfSentence(representsBeginningOfSentence), - mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), - mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability), - mHistoricalInfo(historicalInfo), mShortcuts() {} - - bool representsBeginningOfSentence() const { - return mRepresentsBeginningOfSentence; - } - - bool isNotAWord() const { - return mIsNotAWord; - } - - bool isPossiblyOffensive() const { - return mIsPossiblyOffensive; - } - - bool isBlacklisted() const { - return mIsBlacklisted; - } - - bool hasShortcuts() const { - return !mShortcuts.empty(); - } - - int getProbability() const { - return mProbability; - } - - const HistoricalInfo getHistoricalInfo() const { - return mHistoricalInfo; - } - - const std::vector &getShortcuts() const { - return mShortcuts; - } - - private: - // Default copy constructor is used for using as a return value. - DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty); - - const bool mRepresentsBeginningOfSentence; - const bool mIsNotAWord; - const bool mIsBlacklisted; - const bool mIsPossiblyOffensive; - const int mProbability; - const HistoricalInfo mHistoricalInfo; - const std::vector mShortcuts; -}; -} // namespace latinime -#endif // LATINIME_UNIGRAM_PROPERTY_H diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.h b/native/jni/src/suggest/core/dictionary/property/word_property.h deleted file mode 100644 index 9efc7f304..000000000 --- a/native/jni/src/suggest/core/dictionary/property/word_property.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_WORD_PROPERTY_H -#define LATINIME_WORD_PROPERTY_H - -#include - -#include "defines.h" -#include "suggest/core/dictionary/property/ngram_property.h" -#include "suggest/core/dictionary/property/unigram_property.h" -#include "utils/int_array_view.h" - -namespace latinime { - -// This class is used for returning information belonging to a word to java side. -class WordProperty { - public: - // Default constructor is used to create an instance that indicates an invalid word. - WordProperty() - : mCodePoints(), mUnigramProperty(), mNgrams() {} - - WordProperty(const std::vector &&codePoints, const UnigramProperty &unigramProperty, - const std::vector &ngrams) - : mCodePoints(std::move(codePoints)), mUnigramProperty(unigramProperty), - mNgrams(ngrams) {} - - const CodePointArrayView getCodePoints() const { - return CodePointArrayView(mCodePoints); - } - - const UnigramProperty &getUnigramProperty() const { - return mUnigramProperty; - } - - const std::vector &getNgramProperties() const { - return mNgrams; - } - - private: - // Default copy constructor is used for using as a return value. 
- DISALLOW_ASSIGNMENT_OPERATOR(WordProperty); - - const std::vector mCodePoints; - const UnigramProperty mUnigramProperty; - const std::vector mNgrams; -}; -} // namespace latinime -#endif // LATINIME_WORD_PROPERTY_H diff --git a/native/jni/src/suggest/core/dictionary/word_attributes.h b/native/jni/src/suggest/core/dictionary/word_attributes.h deleted file mode 100644 index 5351e7d7d..000000000 --- a/native/jni/src/suggest/core/dictionary/word_attributes.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_WORD_ATTRIBUTES_H -#define LATINIME_WORD_ATTRIBUTES_H - -#include "defines.h" - -class WordAttributes { - public: - // Invalid word attributes. - WordAttributes() - : mProbability(NOT_A_PROBABILITY), mIsBlacklisted(false), mIsNotAWord(false), - mIsPossiblyOffensive(false) {} - - WordAttributes(const int probability, const bool isBlacklisted, const bool isNotAWord, - const bool isPossiblyOffensive) - : mProbability(probability), mIsBlacklisted(isBlacklisted), mIsNotAWord(isNotAWord), - mIsPossiblyOffensive(isPossiblyOffensive) {} - - int getProbability() const { - return mProbability; - } - - bool isBlacklisted() const { - return mIsBlacklisted; - } - - bool isNotAWord() const { - return mIsNotAWord; - } - - // Whether or not a word is possibly offensive. 
- // * Static dictionaries =v203 will set this based on the IS_POSSIBLY_OFFENSIVE PtNode flag. - // * Dynamic dictionaries >=v403 will set this based on the IS_POSSIBLY_OFFENSIVE language model - // flag (the PtNode flag IS_BLACKLISTED is ignored and kept as zero) - // - // See the ::getWordAttributes function for each of these dictionary policies for more details. - bool isPossiblyOffensive() const { - return mIsPossiblyOffensive; - } - - private: - DISALLOW_ASSIGNMENT_OPERATOR(WordAttributes); - - int mProbability; - bool mIsBlacklisted; - bool mIsNotAWord; - bool mIsPossiblyOffensive; -}; - - // namespace -#endif /* LATINIME_WORD_ATTRIBUTES_H */ diff --git a/native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h deleted file mode 100644 index aa0d068aa..000000000 --- a/native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H -#define LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H - -#include "defines.h" - -namespace latinime { - -/* - * This class abstracts structure of bigrams. 
- */ -class DictionaryBigramsStructurePolicy { - public: - virtual ~DictionaryBigramsStructurePolicy() {} - - virtual void getNextBigram(int *const outBigramPos, int *const outProbability, - bool *const outHasNext, int *const pos) const = 0; - virtual bool skipAllBigrams(int *const pos) const = 0; - - protected: - DictionaryBigramsStructurePolicy() {} - - private: - DISALLOW_COPY_AND_ASSIGN(DictionaryBigramsStructurePolicy); -}; -} // namespace latinime -#endif /* LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H */ diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h deleted file mode 100644 index 6da390e55..000000000 --- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H -#define LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H - -#include -#include - -#include "defines.h" - -namespace latinime { - -/* - * This class abstracts structure of dictionaries. - * Implement this policy to support additional dictionaries. 
- */ -class DictionaryHeaderStructurePolicy { - public: - typedef std::map, std::vector> AttributeMap; - - virtual ~DictionaryHeaderStructurePolicy() {} - - virtual int getFormatVersionNumber() const = 0; - - virtual int getSize() const = 0; - - virtual const AttributeMap *getAttributeMap() const = 0; - - virtual bool requiresGermanUmlautProcessing() const = 0; - - virtual float getMultiWordCostMultiplier() const = 0; - - virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue, - int outValueSize) const = 0; - - virtual bool shouldBoostExactMatches() const = 0; - - virtual const std::vector *getLocale() const = 0; - - virtual bool supportsBeginningOfSentence() const = 0; - - protected: - DictionaryHeaderStructurePolicy() {} - - private: - DISALLOW_COPY_AND_ASSIGN(DictionaryHeaderStructurePolicy); -}; -} // namespace latinime -#endif /* LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H */ diff --git a/native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h deleted file mode 100644 index 40b6c2de1..000000000 --- a/native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H -#define LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H - -#include "defines.h" - -namespace latinime { - -/* - * This class abstracts structure of shortcuts. - */ -class DictionaryShortcutsStructurePolicy { - public: - virtual ~DictionaryShortcutsStructurePolicy() {} - - virtual int getStartPos(const int pos) const = 0; - - virtual void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, - int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, - int *const pos) const = 0; - - virtual void skipAllShortcuts(int *const pos) const = 0; - - protected: - DictionaryShortcutsStructurePolicy() {} - - private: - DISALLOW_COPY_AND_ASSIGN(DictionaryShortcutsStructurePolicy); -}; -} // namespace latinime -#endif /* LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H */ diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h deleted file mode 100644 index 33a0fbc19..000000000 --- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_DICTIONARY_STRUCTURE_POLICY_H -#define LATINIME_DICTIONARY_STRUCTURE_POLICY_H - -#include - -#include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" -#include "suggest/core/dictionary/property/historical_info.h" -#include "suggest/core/dictionary/property/word_property.h" -#include "suggest/core/dictionary/word_attributes.h" -#include "utils/int_array_view.h" - -namespace latinime { - -class DicNode; -class DicNodeVector; -class DictionaryHeaderStructurePolicy; -class MultiBigramMap; -class NgramListener; -class NgramContext; -class UnigramProperty; - -/* - * This class abstracts the structure of dictionaries. - * Implement this policy to support additional dictionaries. - */ -class DictionaryStructureWithBufferPolicy { - public: - typedef std::unique_ptr StructurePolicyPtr; - - virtual ~DictionaryStructureWithBufferPolicy() {} - - virtual int getRootPosition() const = 0; - - virtual void createAndGetAllChildDicNodes(const DicNode *const dicNode, - DicNodeVector *const childDicNodes) const = 0; - - virtual int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, - int *const outCodePoints) const = 0; - - virtual int getWordId(const CodePointArrayView wordCodePoints, - const bool forceLowerCaseSearch) const = 0; - - virtual const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, - const int wordId, MultiBigramMap *const multiBigramMap) const = 0; - - // TODO: Remove - virtual int getProbability(const int unigramProbability, const int bigramProbability) const = 0; - - virtual int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const = 0; - - virtual void iterateNgramEntries(const WordIdArrayView prevWordIds, - NgramListener *const listener) const = 0; - - virtual BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const = 0; - - virtual const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() 
const = 0; - - // Returns whether the update was success or not. - virtual bool addUnigramEntry(const CodePointArrayView wordCodePoints, - const UnigramProperty *const unigramProperty) = 0; - - // Returns whether the update was success or not. - virtual bool removeUnigramEntry(const CodePointArrayView wordCodePoints) = 0; - - // Returns whether the update was success or not. - virtual bool addNgramEntry(const NgramProperty *const ngramProperty) = 0; - - // Returns whether the update was success or not. - virtual bool removeNgramEntry(const NgramContext *const ngramContext, - const CodePointArrayView wordCodePoints) = 0; - - // Returns whether the update was success or not. - virtual bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, - const CodePointArrayView wordCodePoints, const bool isValidWord, - const HistoricalInfo historicalInfo) = 0; - - // Returns whether the flush was success or not. - virtual bool flush(const char *const filePath) = 0; - - // Returns whether the GC and flush were success or not. - virtual bool flushWithGC(const char *const filePath) = 0; - - virtual bool needsToRunGC(const bool mindsBlockByGC) const = 0; - - // Currently, this method is used only for testing. You may want to consider creating new - // dedicated method instead of this if you want to use this in the production. - virtual void getProperty(const char *const query, const int queryLength, char *const outResult, - const int maxResultLength) = 0; - - virtual const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const = 0; - - // Method to iterate all words in the dictionary. - // The returned token has to be used to get the next word. If token is 0, this method newly - // starts iterating the dictionary. 
- virtual int getNextWordAndNextToken(const int token, int *const outCodePoints, - int *const outCodePointCount) = 0; - - virtual bool isCorrupted() const = 0; - - protected: - DictionaryStructureWithBufferPolicy() {} - - private: - DISALLOW_COPY_AND_ASSIGN(DictionaryStructureWithBufferPolicy); -}; -} // namespace latinime -#endif /* LATINIME_DICTIONARY_STRUCTURE_POLICY_H */ diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp index 1aff72952..7c37241de 100644 --- a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp +++ b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp @@ -19,9 +19,9 @@ #include #include +#include "dictionary/utils/binary_dictionary_shortcut_iterator.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_utils.h" -#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" #include "suggest/core/dictionary/error_type_utils.h" #include "suggest/core/policy/scoring.h" #include "suggest/core/result/suggestion_results.h" diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.h b/native/jni/src/suggest/core/result/suggestions_output_utils.h index eca1f78b2..bcb75a483 100644 --- a/native/jni/src/suggest/core/result/suggestions_output_utils.h +++ b/native/jni/src/suggest/core/result/suggestions_output_utils.h @@ -18,7 +18,7 @@ #define LATINIME_SUGGESTIONS_OUTPUT_UTILS #include "defines.h" -#include "suggest/core/dictionary/word_attributes.h" +#include "dictionary/property/word_attributes.h" namespace latinime { diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.cpp b/native/jni/src/suggest/core/session/dic_traverse_session.cpp index 52dc2f86c..d7dd5a02d 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.cpp +++ b/native/jni/src/suggest/core/session/dic_traverse_session.cpp @@ -17,10 +17,10 @@ #include 
"suggest/core/session/dic_traverse_session.h" #include "defines.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/property/ngram_context.h" #include "suggest/core/dictionary/dictionary.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/core/session/ngram_context.h" namespace latinime { diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.h b/native/jni/src/suggest/core/session/dic_traverse_session.h index bc53167f0..f5fcfddcd 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.h +++ b/native/jni/src/suggest/core/session/dic_traverse_session.h @@ -20,9 +20,9 @@ #include #include "defines.h" +#include "dictionary/utils/multi_bigram_map.h" #include "jni.h" #include "suggest/core/dicnode/dic_nodes_cache.h" -#include "suggest/core/dictionary/multi_bigram_map.h" #include "suggest/core/layout/proximity_info_state.h" #include "utils/int_array_view.h" diff --git a/native/jni/src/suggest/core/session/ngram_context.cpp b/native/jni/src/suggest/core/session/ngram_context.cpp deleted file mode 100644 index 17ef9ae60..000000000 --- a/native/jni/src/suggest/core/session/ngram_context.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/core/session/ngram_context.h" - -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "utils/char_utils.h" - -namespace latinime { - -NgramContext::NgramContext() : mPrevWordCount(0) {} - -NgramContext::NgramContext(const NgramContext &ngramContext) - : mPrevWordCount(ngramContext.mPrevWordCount) { - for (size_t i = 0; i < mPrevWordCount; ++i) { - mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i]; - memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i], - sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]); - mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i]; - } -} - -NgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH], - const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence, - const size_t prevWordCount) - : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) { - clear(); - for (size_t i = 0; i < mPrevWordCount; ++i) { - if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) { - continue; - } - memmove(mPrevWordCodePoints[i], prevWordCodePoints[i], - sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]); - mPrevWordCodePointCount[i] = prevWordCodePointCount[i]; - mIsBeginningOfSentence[i] = isBeginningOfSentence[i]; - } -} - -NgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount, - const bool isBeginningOfSentence) : mPrevWordCount(1) { - clear(); - if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) { - return; - } - memmove(mPrevWordCodePoints[0], prevWordCodePoints, - sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount); - mPrevWordCodePointCount[0] = prevWordCodePointCount; - mIsBeginningOfSentence[0] = isBeginningOfSentence; -} - -bool NgramContext::isValid() const { - 
if (mPrevWordCodePointCount[0] > 0) { - return true; - } - if (mIsBeginningOfSentence[0]) { - return true; - } - return false; -} - -const CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const { - if (n <= 0 || n > mPrevWordCount) { - return CodePointArrayView(); - } - return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]); -} - -bool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const { - if (n <= 0 || n > mPrevWordCount) { - return false; - } - return mIsBeginningOfSentence[n - 1]; -} - -/* static */ int NgramContext::getWordId( - const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, - const int *const wordCodePoints, const int wordCodePointCount, - const bool isBeginningOfSentence, const bool tryLowerCaseSearch) { - if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { - return NOT_A_WORD_ID; - } - int codePoints[MAX_WORD_LENGTH]; - int codePointCount = wordCodePointCount; - memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); - if (isBeginningOfSentence) { - codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount, - MAX_WORD_LENGTH); - if (codePointCount <= 0) { - return NOT_A_WORD_ID; - } - } - const CodePointArrayView codePointArrayView(codePoints, codePointCount); - const int wordId = dictStructurePolicy->getWordId(codePointArrayView, - false /* forceLowerCaseSearch */); - if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) { - // Return the id when when the word was found or doesn't try lower case search. - return wordId; - } - // Check bigrams for lower-cased previous word if original was not found. Useful for - // auto-capitalized words like "The [current_word]". 
- return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */); -} - -void NgramContext::clear() { - for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { - mPrevWordCodePointCount[i] = 0; - mIsBeginningOfSentence[i] = false; - } -} -} // namespace latinime diff --git a/native/jni/src/suggest/core/session/ngram_context.h b/native/jni/src/suggest/core/session/ngram_context.h deleted file mode 100644 index 9b36199c9..000000000 --- a/native/jni/src/suggest/core/session/ngram_context.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_NGRAM_CONTEXT_H -#define LATINIME_NGRAM_CONTEXT_H - -#include - -#include "defines.h" -#include "utils/int_array_view.h" - -namespace latinime { - -class DictionaryStructureWithBufferPolicy; - -class NgramContext { - public: - // No prev word information. - NgramContext(); - // Copy constructor to use this class with std::vector and use this class as a return value. - NgramContext(const NgramContext &ngramContext); - // Construct from previous words. - NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH], - const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence, - const size_t prevWordCount); - // Construct from a previous word. 
- NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount, - const bool isBeginningOfSentence); - - size_t getPrevWordCount() const { - return mPrevWordCount; - } - bool isValid() const; - - template - const WordIdArrayView getPrevWordIds( - const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, - WordIdArray *const prevWordIdBuffer, const bool tryLowerCaseSearch) const { - for (size_t i = 0; i < std::min(mPrevWordCount, N); ++i) { - prevWordIdBuffer->at(i) = getWordId(dictStructurePolicy, mPrevWordCodePoints[i], - mPrevWordCodePointCount[i], mIsBeginningOfSentence[i], tryLowerCaseSearch); - } - return WordIdArrayView::fromArray(*prevWordIdBuffer).limit(mPrevWordCount); - } - - // n is 1-indexed. - const CodePointArrayView getNthPrevWordCodePoints(const size_t n) const; - // n is 1-indexed. - bool isNthPrevWordBeginningOfSentence(const size_t n) const; - - private: - DISALLOW_ASSIGNMENT_OPERATOR(NgramContext); - - static int getWordId(const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, - const int *const wordCodePoints, const int wordCodePointCount, - const bool isBeginningOfSentence, const bool tryLowerCaseSearch); - void clear(); - - const size_t mPrevWordCount; - int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; - int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; -}; -} // namespace latinime -#endif // LATINIME_NGRAM_CONTEXT_H diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp index e5e9b46bf..52fa5a5db 100644 --- a/native/jni/src/suggest/core/suggest.cpp +++ b/native/jni/src/suggest/core/suggest.cpp @@ -16,14 +16,14 @@ #include "suggest/core/suggest.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/property/word_attributes.h" #include "suggest/core/dicnode/dic_node.h" #include 
"suggest/core/dicnode/dic_node_priority_queue.h" #include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/digraph_utils.h" -#include "suggest/core/dictionary/word_attributes.h" #include "suggest/core/layout/proximity_info.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/policy/traversal.h" #include "suggest/core/policy/weighting.h" #include "suggest/core/result/suggestions_output_utils.h" diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp deleted file mode 100644 index c93f31017..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/header/header_policy.h" - -#include - -#include "utils/ngram_utils.h" - -namespace latinime { - -// Note that these are corresponding definitions in Java side in DictionaryHeader. -const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE"; -const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = - "REQUIRES_GERMAN_UMLAUT_PROCESSING"; -// TODO: Change attribute string to "IS_DECAYING_DICT". 
-const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; -const char *const HeaderPolicy::DATE_KEY = "date"; -const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; -const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] = - {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT", "QUADGRAM_COUNT"}; -const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] = - {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT", - "MAX_QUADGRAM_ENTRY_COUNT"}; -const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000, 30000}; -const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; -// Historical info is information that is needed to support decaying such as timestamp, level and -// count. -const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO"; -const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration -const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = - "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; - -const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; -const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; -const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3; - -// Used for logging. Question mark is used to indicate that the key is not found. -void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue, - int outValueSize) const { - if (outValueSize <= 0) return; - if (outValueSize == 1) { - outValue[0] = '\0'; - return; - } - std::vector keyCodePointVector; - HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector); - DictionaryHeaderStructurePolicy::AttributeMap::const_iterator it = - mAttributeMap.find(keyCodePointVector); - if (it == mAttributeMap.end()) { - // The key was not found. 
- outValue[0] = '?'; - outValue[1] = '\0'; - return; - } - const int terminalIndex = std::min(static_cast(it->second.size()), outValueSize - 1); - for (int i = 0; i < terminalIndex; ++i) { - outValue[i] = it->second[i]; - } - outValue[terminalIndex] = '\0'; -} - -const std::vector HeaderPolicy::readLocale() const { - return HeaderReadWriteUtils::readCodePointVectorAttributeValue(&mAttributeMap, LOCALE_KEY); -} - -float HeaderPolicy::readMultipleWordCostMultiplier() const { - const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE); - if (demotionRate <= 0) { - return static_cast(MAX_VALUE_FOR_WEIGHTING); - } - return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast(demotionRate); -} - -bool HeaderPolicy::readRequiresGermanUmlautProcessing() const { - return HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, - REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false); -} - -bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, - const EntryCounts &entryCounts, const int extendedRegionSize, - BufferWithExtendableBuffer *const outBuffer) const { - int writingPos = 0; - DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap); - fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite); - if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion, - &writingPos)) { - return false; - } - if (!HeaderReadWriteUtils::writeDictionaryFlags(outBuffer, mDictionaryFlags, - &writingPos)) { - return false; - } - // Temporarily writes a dummy header size. - int headerSizeFieldPos = writingPos; - if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, 0 /* size */, - &writingPos)) { - return false; - } - if (!HeaderReadWriteUtils::writeHeaderAttributes(outBuffer, &attributeMapToWrite, - &writingPos)) { - return false; - } - // Writes the actual header size. 
- if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, writingPos, - &headerSizeFieldPos)) { - return false; - } - return true; -} - -namespace { - -int getIndexFromNgramType(const NgramType ngramType) { - return static_cast(ngramType); -} - -} // namespace - -void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, - const EntryCounts &entryCounts, const int extendedRegionSize, - DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const { - for (const auto ngramType : AllNgramTypes::ASCENDING) { - HeaderReadWriteUtils::setIntAttribute(outAttributeMap, - NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], - entryCounts.getNgramCount(ngramType)); - } - HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY, - extendedRegionSize); - // Set the current time as the generation time. - HeaderReadWriteUtils::setIntAttribute(outAttributeMap, DATE_KEY, - TimeKeeper::peekCurrentTime()); - HeaderReadWriteUtils::setCodePointVectorAttribute(outAttributeMap, LOCALE_KEY, mLocale); - if (updatesLastDecayedTime) { - // Set current time as the last updated time. 
- HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_DECAYED_TIME_KEY, - TimeKeeper::peekCurrentTime()); - } -} - -/* static */ DictionaryHeaderStructurePolicy::AttributeMap - HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) { - DictionaryHeaderStructurePolicy::AttributeMap attributeMap; - HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap); - return attributeMap; -} - -/* static */ const EntryCounts HeaderPolicy::readNgramCounts() const { - MutableEntryCounters entryCounters; - for (const auto ngramType : AllNgramTypes::ASCENDING) { - const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */); - entryCounters.setNgramCount(ngramType, entryCount); - } - return entryCounters.getEntryCounts(); -} - -/* static */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const { - MutableEntryCounters entryCounters; - for (const auto ngramType : AllNgramTypes::ASCENDING) { - const int index = getIndexFromNgramType(ngramType); - const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]); - entryCounters.setNgramCount(ngramType, maxEntryCount); - } - return entryCounters.getEntryCounts(); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h deleted file mode 100644 index f76931baa..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_HEADER_POLICY_H -#define LATINIME_HEADER_POLICY_H - -#include - -#include "defines.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" -#include "suggest/policyimpl/dictionary/utils/entry_counters.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" -#include "utils/char_utils.h" -#include "utils/time_keeper.h" - -namespace latinime { - -class HeaderPolicy : public DictionaryHeaderStructurePolicy { - public: - // Reads information from existing dictionary buffer. 
- HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion) - : mDictFormatVersion(formatVersion), - mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)), - mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), - mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), - mLocale(readLocale()), - mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), - mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), - mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, - IS_DECAYING_DICT_KEY, false /* defaultValue */)), - mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), - mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), - mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), - mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), - mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( - &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), - mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, - DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), - mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} - - // Constructs header information using an attribute map. 
- HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, - const std::vector &locale, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) - : mDictFormatVersion(dictFormatVersion), - mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( - attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale), - mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), - mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), - mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, - IS_DECAYING_DICT_KEY, false /* defaultValue */)), - mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), - mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), - mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), - mExtendedRegionSize(0), - mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( - &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), - mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, - DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), - mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} - - // Copy header information - HeaderPolicy(const HeaderPolicy *const headerPolicy) - : mDictFormatVersion(headerPolicy->mDictFormatVersion), - mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize), - mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale), - mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier), - mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing), - mIsDecayingDict(headerPolicy->mIsDecayingDict), - mDate(headerPolicy->mDate), 
mLastDecayedTime(headerPolicy->mLastDecayedTime), - mNgramCounts(headerPolicy->mNgramCounts), - mMaxNgramCounts(headerPolicy->mMaxNgramCounts), - mExtendedRegionSize(headerPolicy->mExtendedRegionSize), - mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords), - mForgettingCurveProbabilityValuesTableId( - headerPolicy->mForgettingCurveProbabilityValuesTableId), - mCodePointTable(headerPolicy->mCodePointTable) {} - - // Temporary dummy header. - HeaderPolicy() - : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), - mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), - mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), - mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(), - mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), - mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {} - - ~HeaderPolicy() {} - - virtual int getFormatVersionNumber() const { - // Conceptually this converts the symbolic value we use in the code into the - // hardcoded of the bytes in the file. But we want the constants to be the - // same so we use them for both here. - switch (mDictFormatVersion) { - case FormatUtils::VERSION_2: - case FormatUtils::VERSION_201: - AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); - return FormatUtils::UNKNOWN_VERSION; - case FormatUtils::VERSION_202: - return FormatUtils::VERSION_202; - case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - return FormatUtils::VERSION_4_ONLY_FOR_TESTING; - case FormatUtils::VERSION_402: - return FormatUtils::VERSION_402; - case FormatUtils::VERSION_403: - return FormatUtils::VERSION_403; - default: - return FormatUtils::UNKNOWN_VERSION; - } - } - - AK_FORCE_INLINE bool isValid() const { - // Decaying dictionary must have historical information. 
- if (!mIsDecayingDict) { - return true; - } - if (mHasHistoricalInfoOfWords) { - return true; - } else { - return false; - } - } - - AK_FORCE_INLINE int getSize() const { - return mSize; - } - - AK_FORCE_INLINE float getMultiWordCostMultiplier() const { - return mMultiWordCostMultiplier; - } - - AK_FORCE_INLINE bool isDecayingDict() const { - return mIsDecayingDict; - } - - AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { - return mRequiresGermanUmlautProcessing; - } - - AK_FORCE_INLINE int getDate() const { - return mDate; - } - - AK_FORCE_INLINE int getLastDecayedTime() const { - return mLastDecayedTime; - } - - AK_FORCE_INLINE const EntryCounts &getNgramCounts() const { - return mNgramCounts; - } - - AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const { - return mMaxNgramCounts; - } - - AK_FORCE_INLINE int getExtendedRegionSize() const { - return mExtendedRegionSize; - } - - AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const { - return mHasHistoricalInfoOfWords; - } - - AK_FORCE_INLINE bool shouldBoostExactMatches() const { - // TODO: Investigate better ways to handle exact matches for personalized dictionaries. 
- return !isDecayingDict(); - } - - const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const { - return &mAttributeMap; - } - - AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const { - return mForgettingCurveProbabilityValuesTableId; - } - - void readHeaderValueOrQuestionMark(const char *const key, - int *outValue, int outValueSize) const; - - bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, - const EntryCounts &entryCounts, const int extendedRegionSize, - BufferWithExtendableBuffer *const outBuffer) const; - - void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts, - const int extendedRegionSize, - DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const; - - AK_FORCE_INLINE const std::vector *getLocale() const { - return &mLocale; - } - - bool supportsBeginningOfSentence() const { - return mDictFormatVersion >= FormatUtils::VERSION_402; - } - - const int *getCodePointTable() const { - return mCodePointTable; - } - - private: - DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); - - static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; - static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; - static const char *const IS_DECAYING_DICT_KEY; - static const char *const DATE_KEY; - static const char *const LAST_DECAYED_TIME_KEY; - static const char *const NGRAM_COUNT_KEYS[]; - static const char *const MAX_NGRAM_COUNT_KEYS[]; - static const int DEFAULT_MAX_NGRAM_COUNTS[]; - static const char *const EXTENDED_REGION_SIZE_KEY; - static const char *const HAS_HISTORICAL_INFO_KEY; - static const char *const LOCALE_KEY; - static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; - static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; - static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; - static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; - static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; - static const 
int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; - - const FormatUtils::FORMAT_VERSION mDictFormatVersion; - const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; - const int mSize; - DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap; - const std::vector mLocale; - const float mMultiWordCostMultiplier; - const bool mRequiresGermanUmlautProcessing; - const bool mIsDecayingDict; - const int mDate; - const int mLastDecayedTime; - const EntryCounts mNgramCounts; - const EntryCounts mMaxNgramCounts; - const int mExtendedRegionSize; - const bool mHasHistoricalInfoOfWords; - const int mForgettingCurveProbabilityValuesTableId; - const int *const mCodePointTable; - - const std::vector readLocale() const; - float readMultipleWordCostMultiplier() const; - bool readRequiresGermanUmlautProcessing() const; - const EntryCounts readNgramCounts() const; - const EntryCounts readMaxNgramCounts() const; - static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes( - const uint8_t *const dictBuf); -}; -} // namespace latinime -#endif /* LATINIME_HEADER_POLICY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp deleted file mode 100644 index 19ed0d468..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" - -#include -#include -#include -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" - -namespace latinime { - -// Number of base-10 digits in the largest integer + 1 to leave room for a zero terminator. -// As such, this is the maximum number of characters will be needed to represent an int as a -// string, including the terminator; this is used as the size of a string buffer large enough to -// hold any value that is intended to fit in an integer, e.g. in the code that reads the header -// of the binary dictionary where a {key,value} string pair scheme is used. -const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11; - -const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256; -const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048; - -const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4; -const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2; -const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2; -const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4; -const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable"; - -const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0; - -typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; - -/* static */ int HeaderReadWriteUtils::getHeaderSize(const uint8_t *const dictBuf) { - // See the format of the header in the comment in - // BinaryDictionaryFormatUtils::detectFormatVersion() - return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE - + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE); -} - -/* static */ HeaderReadWriteUtils::DictionaryFlags - HeaderReadWriteUtils::getFlags(const uint8_t 
*const dictBuf) { - return ByteArrayUtils::readUint16(dictBuf, - HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE); -} - -/* static */ HeaderReadWriteUtils::DictionaryFlags - HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( - const AttributeMap *const attributeMap) { - return NO_FLAGS; -} - -/* static */ void HeaderReadWriteUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf, - AttributeMap *const headerAttributes) { - const int headerSize = getHeaderSize(dictBuf); - int pos = getHeaderOptionsPosition(); - if (pos == NOT_A_DICT_POS) { - // The header doesn't have header options. - return; - } - int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH]; - std::unique_ptr valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]); - while (pos < headerSize) { - // The values in the header don't use the code point table for their encoding. - const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, - MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos); - std::vector key; - key.insert(key.end(), keyBuffer, keyBuffer + keyLength); - const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, - MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos); - std::vector value; - value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength); - headerAttributes->insert(AttributeMap::value_type(key, value)); - } -} - -/* static */ const int *HeaderReadWriteUtils::readCodePointTable( - AttributeMap *const headerAttributes) { - AttributeMap::key_type keyVector; - insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector); - AttributeMap::const_iterator it = headerAttributes->find(keyVector); - if (it == headerAttributes->end()) { - return nullptr; - } - return it->second.data(); -} - -/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion( - BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version, - int *const writingPos) { - if 
(!buffer->writeUintAndAdvancePosition(FormatUtils::MAGIC_NUMBER, HEADER_MAGIC_NUMBER_SIZE, - writingPos)) { - return false; - } - switch (version) { - case FormatUtils::VERSION_2: - case FormatUtils::VERSION_201: - case FormatUtils::VERSION_202: - // None of the static dictionaries (v2x) support writing - return false; - case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_402: - case FormatUtils::VERSION_403: - return buffer->writeUintAndAdvancePosition(version /* data */, - HEADER_DICTIONARY_VERSION_SIZE, writingPos); - default: - return false; - } -} - -/* static */ bool HeaderReadWriteUtils::writeDictionaryFlags( - BufferWithExtendableBuffer *const buffer, const DictionaryFlags flags, - int *const writingPos) { - return buffer->writeUintAndAdvancePosition(flags, HEADER_FLAG_SIZE, writingPos); -} - -/* static */ bool HeaderReadWriteUtils::writeDictionaryHeaderSize( - BufferWithExtendableBuffer *const buffer, const int size, int *const writingPos) { - return buffer->writeUintAndAdvancePosition(size, HEADER_SIZE_FIELD_SIZE, writingPos); -} - -/* static */ bool HeaderReadWriteUtils::writeHeaderAttributes( - BufferWithExtendableBuffer *const buffer, const AttributeMap *const headerAttributes, - int *const writingPos) { - for (AttributeMap::const_iterator it = headerAttributes->begin(); - it != headerAttributes->end(); ++it) { - if (it->first.empty() || it->second.empty()) { - continue; - } - // Write a key. - if (!buffer->writeCodePointsAndAdvancePosition(&(it->first.at(0)), it->first.size(), - true /* writesTerminator */, writingPos)) { - return false; - } - // Write a value. 
- if (!buffer->writeCodePointsAndAdvancePosition(&(it->second.at(0)), it->second.size(), - true /* writesTerminator */, writingPos)) { - return false; - } - } - return true; -} - -/* static */ void HeaderReadWriteUtils::setCodePointVectorAttribute( - AttributeMap *const headerAttributes, const char *const key, - const std::vector &value) { - AttributeMap::key_type keyVector; - insertCharactersIntoVector(key, &keyVector); - (*headerAttributes)[keyVector] = value; -} - -/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes, - const char *const key, const bool value) { - setIntAttribute(headerAttributes, key, value ? 1 : 0); -} - -/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes, - const char *const key, const int value) { - AttributeMap::key_type keyVector; - insertCharactersIntoVector(key, &keyVector); - setIntAttributeInner(headerAttributes, &keyVector, value); -} - -/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes, - const AttributeMap::key_type *const key, const int value) { - AttributeMap::mapped_type valueVector; - char charBuf[LARGEST_INT_DIGIT_COUNT]; - snprintf(charBuf, sizeof(charBuf), "%d", value); - insertCharactersIntoVector(charBuf, &valueVector); - (*headerAttributes)[*key] = valueVector; -} - -/* static */ const std::vector HeaderReadWriteUtils::readCodePointVectorAttributeValue( - const AttributeMap *const headerAttributes, const char *const key) { - AttributeMap::key_type keyVector; - insertCharactersIntoVector(key, &keyVector); - AttributeMap::const_iterator it = headerAttributes->find(keyVector); - if (it == headerAttributes->end()) { - return std::vector(); - } else { - return it->second; - } -} - -/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue( - const AttributeMap *const headerAttributes, const char *const key, - const bool defaultValue) { - const int intDefaultValue = defaultValue ? 
1 : 0; - const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue); - return intValue != 0; -} - -/* static */ int HeaderReadWriteUtils::readIntAttributeValue( - const AttributeMap *const headerAttributes, const char *const key, - const int defaultValue) { - AttributeMap::key_type keyVector; - insertCharactersIntoVector(key, &keyVector); - return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue); -} - -/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner( - const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, - const int defaultValue) { - AttributeMap::const_iterator it = headerAttributes->find(*key); - if (it != headerAttributes->end()) { - int value = 0; - bool isNegative = false; - for (size_t i = 0; i < it->second.size(); ++i) { - if (i == 0 && it->second.at(i) == '-') { - isNegative = true; - } else { - if (!isdigit(it->second.at(i))) { - // If not a number. - return defaultValue; - } - value *= 10; - value += it->second.at(i) - '0'; - } - } - return isNegative ? -value : value; - } - return defaultValue; -} - -/* static */ void HeaderReadWriteUtils::insertCharactersIntoVector(const char *const characters, - std::vector *const vector) { - for (int i = 0; characters[i]; ++i) { - vector->push_back(characters[i]); - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h deleted file mode 100644 index 5dd91b26c..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_HEADER_READ_WRITE_UTILS_H -#define LATINIME_HEADER_READ_WRITE_UTILS_H - -#include - -#include "defines.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" - -namespace latinime { - -class BufferWithExtendableBuffer; - -class HeaderReadWriteUtils { - public: - typedef uint16_t DictionaryFlags; - - static int getHeaderSize(const uint8_t *const dictBuf); - - static DictionaryFlags getFlags(const uint8_t *const dictBuf); - - static AK_FORCE_INLINE int getHeaderOptionsPosition() { - return HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE - + HEADER_SIZE_FIELD_SIZE; - } - - static DictionaryFlags createAndGetDictionaryFlagsUsingAttributeMap( - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); - - static void fetchAllHeaderAttributes(const uint8_t *const dictBuf, - DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes); - - static const int *readCodePointTable( - DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes); - - static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer, - const FormatUtils::FORMAT_VERSION version, int *const writingPos); - - static bool writeDictionaryFlags(BufferWithExtendableBuffer *const buffer, - const DictionaryFlags flags, int *const writingPos); - - static bool writeDictionaryHeaderSize(BufferWithExtendableBuffer *const buffer, - const int size, int *const writingPos); - - static bool 
writeHeaderAttributes(BufferWithExtendableBuffer *const buffer, - const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, - int *const writingPos); - - /** - * Methods for header attributes. - */ - static void setCodePointVectorAttribute( - DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, - const char *const key, const std::vector &value); - - static void setBoolAttribute( - DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, - const char *const key, const bool value); - - static void setIntAttribute( - DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, - const char *const key, const int value); - - static const std::vector readCodePointVectorAttributeValue( - const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, - const char *const key); - - static bool readBoolAttributeValue( - const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, - const char *const key, const bool defaultValue); - - static int readIntAttributeValue( - const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, - const char *const key, const int defaultValue); - - static void insertCharactersIntoVector(const char *const characters, - DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils); - - static const int LARGEST_INT_DIGIT_COUNT; - static const int MAX_ATTRIBUTE_KEY_LENGTH; - static const int MAX_ATTRIBUTE_VALUE_LENGTH; - - static const int HEADER_MAGIC_NUMBER_SIZE; - static const int HEADER_DICTIONARY_VERSION_SIZE; - static const int HEADER_FLAG_SIZE; - static const int HEADER_SIZE_FIELD_SIZE; - - static const char *const CODE_POINT_TABLE_KEY; - - // Value for the "flags" field. It's unused at the moment. 
- static const DictionaryFlags NO_FLAGS; - - static void setIntAttributeInner( - DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, - const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key, - const int value); - - static int readIntAttributeValueInner( - const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, - const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key, - const int defaultValue); -}; -} -#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt deleted file mode 100644 index 9e29e836c..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt +++ /dev/null @@ -1 +0,0 @@ -Files under this directory have been auto generated. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp deleted file mode 100644 index bc0f47f79..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! 
DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! - * Do not edit this file other than updating policy's interface. - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" - -#include "suggest/core/dictionary/property/ngram_property.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" - -namespace latinime { -namespace backward { -namespace v402 { - -void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability, - bool *const outHasNext, int *const bigramEntryPos) const { - const BigramEntry bigramEntry = - mBigramDictContent->getBigramEntryAndAdvancePosition(bigramEntryPos); - if (outBigramPos) { - // Lookup target PtNode position. - *outBigramPos = mTerminalPositionLookupTable->getTerminalPtNodePosition( - bigramEntry.getTargetTerminalId()); - } - if (outProbability) { - if (bigramEntry.hasHistoricalInfo()) { - *outProbability = - ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo(), - mHeaderPolicy); - } else { - *outProbability = bigramEntry.getProbability(); - } - } - if (outHasNext) { - *outHasNext = bigramEntry.hasNext(); - } -} - -bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId, - const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) { - // 1. The word has no bigrams yet. - // 2. 
The word has bigrams, and there is the target in the list. - // 3. The word has bigrams, and there is an invalid entry that can be reclaimed. - // 4. The word has bigrams. We have to append new bigram entry to the list. - // 5. Same as 4, but the list is the last entry of the content file. - if (outAddedNewEntry) { - *outAddedNewEntry = false; - } - const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); - if (bigramListPos == NOT_A_DICT_POS) { - // Case 1. PtNode that doesn't have a bigram list. - // Create new bigram list. - if (!mBigramDictContent->createNewBigramList(terminalId)) { - return false; - } - const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, - newTargetTerminalId); - const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry, - ngramProperty); - // Write an entry. - const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); - if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) { - return false; - } - if (outAddedNewEntry) { - *outAddedNewEntry = true; - } - return true; - } - - int tailEntryPos = NOT_A_DICT_POS; - const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos, - &tailEntryPos); - if (tailEntryPos != NOT_A_DICT_POS || entryPosToUpdate == NOT_A_DICT_POS) { - // Case 4, 5. - // Add new entry to the bigram list. - if (tailEntryPos == NOT_A_DICT_POS) { - // Case 4. Create new bigram list. - if (!mBigramDictContent->createNewBigramList(terminalId)) { - return false; - } - const int destPos = mBigramDictContent->getBigramListHeadPos(terminalId); - // Copy existing bigram list. - if (!mBigramDictContent->copyBigramList(bigramListPos, destPos, &tailEntryPos)) { - return false; - } - } - // Write new entry at the tail position of the bigram content. 
- const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, - newTargetTerminalId); - const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( - &newBigramEntry, ngramProperty); - if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) { - return false; - } - // Update has next flag of the tail entry. - if (!updateHasNextFlag(true /* hasNext */, tailEntryPos)) { - return false; - } - if (outAddedNewEntry) { - *outAddedNewEntry = true; - } - return true; - } - - // Case 2. Overwrite the existing entry. Case 3. Reclaim and reuse the existing invalid entry. - const BigramEntry originalBigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate); - if (!originalBigramEntry.isValid()) { - // Case 3. Reuse the existing invalid entry. outAddedNewEntry is false when an existing - // entry is updated. - if (outAddedNewEntry) { - *outAddedNewEntry = true; - } - } - const BigramEntry updatedBigramEntry = - originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId); - const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( - &updatedBigramEntry, ngramProperty); - return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate); -} - -bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) { - const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); - if (bigramListPos == NOT_A_DICT_POS) { - // Bigram list doesn't exist. - return false; - } - const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos, - nullptr /* outTailEntryPos */); - if (entryPosToUpdate == NOT_A_DICT_POS) { - // Bigram entry doesn't exist. - return false; - } - const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate); - if (targetTerminalId != bigramEntry.getTargetTerminalId()) { - // Bigram entry doesn't exist. 
- return false; - } - // Remove bigram entry by marking it as invalid entry and overwriting the original entry. - const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); - return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPosToUpdate); -} - -bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId, - int *const outBigramCount) { - const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); - if (bigramListPos == NOT_A_DICT_POS) { - // Bigram list doesn't exist. - return true; - } - bool hasNext = true; - int readingPos = bigramListPos; - while (hasNext) { - const int entryPos = readingPos; - const BigramEntry bigramEntry = - mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); - hasNext = bigramEntry.hasNext(); - if (!bigramEntry.isValid()) { - continue; - } - const int targetPtNodePos = mTerminalPositionLookupTable->getTerminalPtNodePosition( - bigramEntry.getTargetTerminalId()); - if (targetPtNodePos == NOT_A_DICT_POS) { - // Invalidate bigram entry. - const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); - if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { - return false; - } - } else if (bigramEntry.hasHistoricalInfo()) { - const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( - bigramEntry.getHistoricalInfo(), mHeaderPolicy); - if (ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy)) { - const BigramEntry updatedBigramEntry = - bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo); - if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { - return false; - } - *outBigramCount += 1; - } else { - // Remove entry. 
- const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); - if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { - return false; - } - } - } else { - *outBigramCount += 1; - } - } - return true; -} - -int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) { - const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); - if (bigramListPos == NOT_A_DICT_POS) { - // Bigram list doesn't exist. - return 0; - } - int bigramCount = 0; - bool hasNext = true; - int readingPos = bigramListPos; - while (hasNext) { - const BigramEntry bigramEntry = - mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); - hasNext = bigramEntry.hasNext(); - if (bigramEntry.isValid()) { - bigramCount++; - } - } - return bigramCount; -} - -int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, - const int bigramListPos, int *const outTailEntryPos) const { - if (outTailEntryPos) { - *outTailEntryPos = NOT_A_DICT_POS; - } - bool hasNext = true; - int invalidEntryPos = NOT_A_DICT_POS; - int readingPos = bigramListPos; - while (hasNext) { - const int entryPos = readingPos; - const BigramEntry bigramEntry = - mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); - hasNext = bigramEntry.hasNext(); - if (bigramEntry.getTargetTerminalId() == targetTerminalIdToFind) { - // Entry with same target is found. - return entryPos; - } else if (!bigramEntry.isValid()) { - // Invalid entry that can be reused is found. - invalidEntryPos = entryPos; - } - if (!hasNext && mBigramDictContent->isContentTailPos(readingPos)) { - if (outTailEntryPos) { - *outTailEntryPos = entryPos; - } - } - } - return invalidEntryPos; -} - -const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom( - const BigramEntry *const originalBigramEntry, - const NgramProperty *const ngramProperty) const { - // TODO: Consolidate historical info and probability. 
- if (mHeaderPolicy->hasHistoricalInfoOfWords()) { - const HistoricalInfo &historicalInfoForUpdate = ngramProperty->getHistoricalInfo(); - const HistoricalInfo updatedHistoricalInfo = - ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalBigramEntry->getHistoricalInfo(), ngramProperty->getProbability(), - &historicalInfoForUpdate, mHeaderPolicy); - return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo); - } else { - return originalBigramEntry->updateProbabilityAndGetEntry(ngramProperty->getProbability()); - } -} - -bool Ver4BigramListPolicy::updateHasNextFlag(const bool hasNext, const int bigramEntryPos) { - const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(bigramEntryPos); - const BigramEntry updatedBigramEntry = bigramEntry.updateHasNextAndGetEntry(hasNext); - return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, bigramEntryPos); -} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h deleted file mode 100644 index aac6f5470..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! - * Do not edit this file other than updating policy's interface. - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h - */ - -#ifndef LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H -#define LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H - -#include "defines.h" -#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h" - -namespace latinime { -namespace backward { -namespace v402 { - -class BigramDictContent; -} // namespace v402 -} // namespace backward -class NgramProperty; -namespace backward { -namespace v402 { -} // namespace v402 -} // namespace backward -class HeaderPolicy; -namespace backward { -namespace v402 { -class TerminalPositionLookupTable; - -class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { - public: - Ver4BigramListPolicy(BigramDictContent *const bigramDictContent, - const TerminalPositionLookupTable *const terminalPositionLookupTable, - const HeaderPolicy *const headerPolicy) - : mBigramDictContent(bigramDictContent), - mTerminalPositionLookupTable(terminalPositionLookupTable), - mHeaderPolicy(headerPolicy) {} - - void getNextBigram(int *const outBigramPos, int *const outProbability, - bool *const outHasNext, int *const bigramEntryPos) const; - - bool skipAllBigrams(int *const pos) const { - // Do nothing because we don't need to skip bigram lists in ver4 dictionaries. 
- return true; - } - - bool addNewEntry(const int terminalId, const int newTargetTerminalId, - const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); - - bool removeEntry(const int terminalId, const int targetTerminalId); - - bool updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId, - int *const outBigramCount); - - int getBigramEntryConut(const int terminalId); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy); - - int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos, - int *const outTailEntryPos) const; - - const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry, - const NgramProperty *const ngramProperty) const; - - bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos); - - BigramDictContent *const mBigramDictContent; - const TerminalPositionLookupTable *const mTerminalPositionLookupTable; - const HeaderPolicy *const mHeaderPolicy; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp deleted file mode 100644 index 15ac88319..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h" - -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { -namespace backward { -namespace v402 { - -const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( - int *const bigramEntryPos) const { - const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer(); - const int bigramEntryTailPos = (*bigramEntryPos) + getBigramEntrySize(); - if (*bigramEntryPos < 0 || bigramEntryTailPos > bigramListBuffer->getTailPosition()) { - AKLOGE("Invalid bigram entry position. 
bigramEntryPos: %d, bigramEntryTailPos: %d, " - "bufSize: %d", *bigramEntryPos, bigramEntryTailPos, - bigramListBuffer->getTailPosition()); - ASSERT(false); - return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY, - Ver4DictConstants::NOT_A_TERMINAL_ID); - } - const int bigramFlags = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, bigramEntryPos); - const bool hasNext = (bigramFlags & Ver4DictConstants::BIGRAM_HAS_NEXT_MASK) != 0; - int probability = NOT_A_PROBABILITY; - int timestamp = NOT_A_TIMESTAMP; - int level = 0; - int count = 0; - if (mHasHistoricalInfo) { - timestamp = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::TIME_STAMP_FIELD_SIZE, bigramEntryPos); - level = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, bigramEntryPos); - count = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::WORD_COUNT_FIELD_SIZE, bigramEntryPos); - } else { - probability = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::PROBABILITY_SIZE, bigramEntryPos); - } - const int encodedTargetTerminalId = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, bigramEntryPos); - const int targetTerminalId = - (encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ? - Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId; - if (mHasHistoricalInfo) { - // Hack for better migration. 
- count += level; - const HistoricalInfo historicalInfo(timestamp, level, count); - return BigramEntry(hasNext, probability, &historicalInfo, targetTerminalId); - } else { - return BigramEntry(hasNext, probability, targetTerminalId); - } -} - -bool BigramDictContent::writeBigramEntryAndAdvancePosition( - const BigramEntry *const bigramEntryToWrite, int *const entryWritingPos) { - BufferWithExtendableBuffer *const bigramListBuffer = getWritableContentBuffer(); - const int bigramFlags = createAndGetBigramFlags(bigramEntryToWrite->hasNext()); - if (!bigramListBuffer->writeUintAndAdvancePosition(bigramFlags, - Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram flags. pos: %d, flags: %x", *entryWritingPos, bigramFlags); - return false; - } - if (mHasHistoricalInfo) { - const HistoricalInfo *const historicalInfo = bigramEntryToWrite->getHistoricalInfo(); - if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getTimestamp(), - Ver4DictConstants::TIME_STAMP_FIELD_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram timestamps. pos: %d, timestamp: %d", *entryWritingPos, - historicalInfo->getTimestamp()); - return false; - } - if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getLevel(), - Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram level. pos: %d, level: %d", *entryWritingPos, - historicalInfo->getLevel()); - return false; - } - if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getCount(), - Ver4DictConstants::WORD_COUNT_FIELD_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram count. pos: %d, count: %d", *entryWritingPos, - historicalInfo->getCount()); - return false; - } - } else { - if (!bigramListBuffer->writeUintAndAdvancePosition(bigramEntryToWrite->getProbability(), - Ver4DictConstants::PROBABILITY_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram probability. 
pos: %d, probability: %d", *entryWritingPos, - bigramEntryToWrite->getProbability()); - return false; - } - } - const int targetTerminalIdToWrite = - (bigramEntryToWrite->getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) ? - Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID : - bigramEntryToWrite->getTargetTerminalId(); - if (!bigramListBuffer->writeUintAndAdvancePosition(targetTerminalIdToWrite, - Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram target terminal id. pos: %d, target terminal id: %d", - *entryWritingPos, bigramEntryToWrite->getTargetTerminalId()); - return false; - } - return true; -} - -bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos, - int *const outTailEntryPos) { - int readingPos = bigramListPos; - int writingPos = toPos; - bool hasNext = true; - while (hasNext) { - const BigramEntry bigramEntry = getBigramEntryAndAdvancePosition(&readingPos); - hasNext = bigramEntry.hasNext(); - if (!hasNext) { - *outTailEntryPos = writingPos; - } - if (!writeBigramEntryAndAdvancePosition(&bigramEntry, &writingPos)) { - AKLOGE("Cannot write bigram entry to copy. pos: %d", writingPos); - return false; - } - } - return true; -} - -bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const BigramDictContent *const originalBigramDictContent, - int *const outBigramEntryCount) { - for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); - it != terminalIdMap->end(); ++it) { - const int originalBigramListPos = - originalBigramDictContent->getBigramListHeadPos(it->first); - if (originalBigramListPos == NOT_A_DICT_POS) { - // This terminal does not have a bigram list. - continue; - } - const int bigramListPos = getContentBuffer()->getTailPosition(); - int bigramEntryCount = 0; - // Copy bigram list with GC from original content. 
- if (!runGCBigramList(originalBigramListPos, originalBigramDictContent, bigramListPos, - terminalIdMap, &bigramEntryCount)) { - AKLOGE("Cannot complete GC for the bigram list. original pos: %d, pos: %d", - originalBigramListPos, bigramListPos); - return false; - } - if (bigramEntryCount == 0) { - // All bigram entries are useless. This terminal does not have a bigram list. - continue; - } - *outBigramEntryCount += bigramEntryCount; - // Set bigram list position to the lookup table. - if (!getUpdatableAddressLookupTable()->set(it->second, bigramListPos)) { - AKLOGE("Cannot set bigram list position. terminal id: %d, pos: %d", - it->second, bigramListPos); - return false; - } - } - return true; -} - -// Returns whether GC for the bigram list was succeeded or not. -bool BigramDictContent::runGCBigramList(const int bigramListPos, - const BigramDictContent *const sourceBigramDictContent, const int toPos, - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - int *const outEntrycount) { - bool hasNext = true; - int readingPos = bigramListPos; - int writingPos = toPos; - int lastEntryPos = NOT_A_DICT_POS; - while (hasNext) { - const BigramEntry originalBigramEntry = - sourceBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); - hasNext = originalBigramEntry.hasNext(); - if (originalBigramEntry.getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) { - continue; - } - TerminalPositionLookupTable::TerminalIdMap::const_iterator it = - terminalIdMap->find(originalBigramEntry.getTargetTerminalId()); - if (it == terminalIdMap->end()) { - // Target word has been removed. - continue; - } - lastEntryPos = hasNext ? writingPos : NOT_A_DICT_POS; - const BigramEntry updatedBigramEntry = - originalBigramEntry.updateTargetTerminalIdAndGetEntry(it->second); - if (!writeBigramEntryAndAdvancePosition(&updatedBigramEntry, &writingPos)) { - AKLOGE("Cannot write bigram entry to run GC. 
pos: %d", writingPos); - return false; - } - *outEntrycount += 1; - } - if (lastEntryPos != NOT_A_DICT_POS) { - // Update has next flag in the last written entry. - const BigramEntry bigramEntry = getBigramEntry(lastEntryPos).updateHasNextAndGetEntry( - false /* hasNext */); - if (!writeBigramEntry(&bigramEntry, lastEntryPos)) { - AKLOGE("Cannot write bigram entry to set hasNext flag after GC. pos: %d", writingPos); - return false; - } - } - return true; -} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h deleted file mode 100644 index b554e5676..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h - */ - -#ifndef LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H -#define LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" - -namespace latinime { -namespace backward { -namespace v402 { - -class BigramDictContent : public SparseTableDictContent { - public: - BigramDictContent(const char *const dictPath, const bool hasHistoricalInfo, - const bool isUpdatable) - : SparseTableDictContent(dictPath, - Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION, - Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION, - Ver4DictConstants::BIGRAM_FILE_EXTENSION, isUpdatable, - Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, - Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE), - mHasHistoricalInfo(hasHistoricalInfo) {} - - BigramDictContent(const bool hasHistoricalInfo) - : SparseTableDictContent(Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, - Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE), - mHasHistoricalInfo(hasHistoricalInfo) {} - - const BigramEntry getBigramEntry(const int bigramEntryPos) const { - int readingPos = bigramEntryPos; - return getBigramEntryAndAdvancePosition(&readingPos); - } - - const BigramEntry getBigramEntryAndAdvancePosition(int *const bigramEntryPos) const; - - // Returns head position of bigram list for a PtNode specified by terminalId. 
- int getBigramListHeadPos(const int terminalId) const { - const SparseTable *const addressLookupTable = getAddressLookupTable(); - if (!addressLookupTable->contains(terminalId)) { - return NOT_A_DICT_POS; - } - return addressLookupTable->get(terminalId); - } - - bool writeBigramEntryAtTail(const BigramEntry *const bigramEntryToWrite) { - int writingPos = getContentBuffer()->getTailPosition(); - return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); - } - - bool writeBigramEntry(const BigramEntry *const bigramEntryToWrite, const int entryWritingPos) { - int writingPos = entryWritingPos; - return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); - } - - bool writeBigramEntryAndAdvancePosition(const BigramEntry *const bigramEntryToWrite, - int *const entryWritingPos); - - bool createNewBigramList(const int terminalId) { - const int bigramListPos = getContentBuffer()->getTailPosition(); - return getUpdatableAddressLookupTable()->set(terminalId, bigramListPos); - } - - bool copyBigramList(const int bigramListPos, const int toPos, int *const outTailEntryPos); - - bool flushToFile(const char *const dictPath) const { - return flush(dictPath, Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION, - Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION, - Ver4DictConstants::BIGRAM_FILE_EXTENSION); - } - - bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const BigramDictContent *const originalBigramDictContent, - int *const outBigramEntryCount); - - bool isContentTailPos(const int pos) const { - return pos == getContentBuffer()->getTailPosition(); - } - - private: - DISALLOW_COPY_AND_ASSIGN(BigramDictContent); - - int createAndGetBigramFlags(const bool hasNext) const { - return hasNext ? 
Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0; - } - - int getBigramEntrySize() const { - if (mHasHistoricalInfo) { - return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE - + Ver4DictConstants::TIME_STAMP_FIELD_SIZE - + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE - + Ver4DictConstants::WORD_COUNT_FIELD_SIZE - + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; - } else { - return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE - + Ver4DictConstants::PROBABILITY_SIZE - + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; - } - } - - bool runGCBigramList(const int bigramListPos, - const BigramDictContent *const sourceBigramDictContent, const int toPos, - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - int *const outEntryCount); - - bool mHasHistoricalInfo; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h deleted file mode 100644 index 480095a2f..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h - */ - -#ifndef LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H -#define LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H - -#include "defines.h" -#include "suggest/core/dictionary/property/historical_info.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" - -namespace latinime { -namespace backward { -namespace v402 { - -class BigramEntry { - public: - BigramEntry(const BigramEntry& bigramEntry) - : mHasNext(bigramEntry.mHasNext), mProbability(bigramEntry.mProbability), - mHistoricalInfo(), mTargetTerminalId(bigramEntry.mTargetTerminalId) {} - - // Entry with historical information. - BigramEntry(const bool hasNext, const int probability, const int targetTerminalId) - : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(), - mTargetTerminalId(targetTerminalId) {} - - // Entry with historical information. - BigramEntry(const bool hasNext, const int probability, - const HistoricalInfo *const historicalInfo, const int targetTerminalId) - : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(*historicalInfo), - mTargetTerminalId(targetTerminalId) {} - - const BigramEntry getInvalidatedEntry() const { - return updateTargetTerminalIdAndGetEntry(Ver4DictConstants::NOT_A_TERMINAL_ID); - } - - const BigramEntry updateHasNextAndGetEntry(const bool hasNext) const { - return BigramEntry(hasNext, mProbability, &mHistoricalInfo, mTargetTerminalId); - } - - const BigramEntry updateTargetTerminalIdAndGetEntry(const int newTargetTerminalId) const { - return BigramEntry(mHasNext, mProbability, &mHistoricalInfo, newTargetTerminalId); - } - - const BigramEntry updateProbabilityAndGetEntry(const int probability) const { - return BigramEntry(mHasNext, probability, &mHistoricalInfo, mTargetTerminalId); - } - - const BigramEntry updateHistoricalInfoAndGetEntry( - const HistoricalInfo *const historicalInfo) const { - return 
BigramEntry(mHasNext, mProbability, historicalInfo, mTargetTerminalId); - } - - bool isValid() const { - return mTargetTerminalId != Ver4DictConstants::NOT_A_TERMINAL_ID; - } - - bool hasNext() const { - return mHasNext; - } - - int getProbability() const { - return mProbability; - } - - bool hasHistoricalInfo() const { - return mHistoricalInfo.isValid(); - } - - const HistoricalInfo *getHistoricalInfo() const { - return &mHistoricalInfo; - } - - int getTargetTerminalId() const { - return mTargetTerminalId; - } - - private: - // Copy constructor is public to use this class as a type of return value. - DISALLOW_DEFAULT_CONSTRUCTOR(BigramEntry); - DISALLOW_ASSIGNMENT_OPERATOR(BigramEntry); - - const bool mHasNext; - const int mProbability; - const HistoricalInfo mHistoricalInfo; - const int mTargetTerminalId; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h deleted file mode 100644 index 0f2f25534..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! 
DO NOT EDIT THIS FILE !!!!! - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/dict_content.h - */ - -#ifndef LATINIME_BACKWARD_V402_DICT_CONTENT_H -#define LATINIME_BACKWARD_V402_DICT_CONTENT_H - -#include "defines.h" - -namespace latinime { -namespace backward { -namespace v402 { - -class DictContent { - public: - virtual ~DictContent() {} - virtual bool isValid() const = 0; - - protected: - DictContent() {} - - private: - DISALLOW_COPY_AND_ASSIGN(DictContent); -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp deleted file mode 100644 index 61ef4aa42..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h" - -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { -namespace backward { -namespace v402 { - -const ProbabilityEntry ProbabilityDictContent::getProbabilityEntry(const int terminalId) const { - if (terminalId < 0 || terminalId >= mSize) { - // This method can be called with invalid terminal id during GC. - return ProbabilityEntry(0 /* flags */, NOT_A_PROBABILITY); - } - const BufferWithExtendableBuffer *const buffer = getBuffer(); - int entryPos = getEntryPos(terminalId); - const int flags = buffer->readUintAndAdvancePosition( - Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &entryPos); - const int probability = buffer->readUintAndAdvancePosition( - Ver4DictConstants::PROBABILITY_SIZE, &entryPos); - if (mHasHistoricalInfo) { - const int timestamp = buffer->readUintAndAdvancePosition( - Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &entryPos); - const int level = buffer->readUintAndAdvancePosition( - Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &entryPos); - const int count = buffer->readUintAndAdvancePosition( - Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &entryPos); - // Hack for better migration. 
- const HistoricalInfo historicalInfo(timestamp, level, count + level); - return ProbabilityEntry(flags, probability, &historicalInfo); - } else { - return ProbabilityEntry(flags, probability); - } -} - -bool ProbabilityDictContent::setProbabilityEntry(const int terminalId, - const ProbabilityEntry *const probabilityEntry) { - if (terminalId < 0) { - return false; - } - const int entryPos = getEntryPos(terminalId); - if (terminalId >= mSize) { - ProbabilityEntry dummyEntry; - // Write new entry. - int writingPos = getBuffer()->getTailPosition(); - while (writingPos <= entryPos) { - // Fulfilling with dummy entries until writingPos. - if (!writeEntry(&dummyEntry, writingPos)) { - AKLOGE("Cannot write dummy entry. pos: %d, mSize: %d", writingPos, mSize); - return false; - } - writingPos += getEntrySize(); - } - mSize = terminalId + 1; - } - return writeEntry(probabilityEntry, entryPos); -} - -bool ProbabilityDictContent::flushToFile(const char *const dictPath) const { - if (getEntryPos(mSize) < getBuffer()->getTailPosition()) { - ProbabilityDictContent probabilityDictContentToWrite(mHasHistoricalInfo); - for (int i = 0; i < mSize; ++i) { - const ProbabilityEntry probabilityEntry = getProbabilityEntry(i); - if (!probabilityDictContentToWrite.setProbabilityEntry(i, &probabilityEntry)) { - AKLOGE("Cannot set probability entry in flushToFile. 
terminalId: %d", i); - return false; - } - } - return probabilityDictContentToWrite.flush(dictPath, - Ver4DictConstants::FREQ_FILE_EXTENSION); - } else { - return flush(dictPath, Ver4DictConstants::FREQ_FILE_EXTENSION); - } -} - -bool ProbabilityDictContent::runGC( - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const ProbabilityDictContent *const originalProbabilityDictContent) { - for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); - it != terminalIdMap->end(); ++it) { - const ProbabilityEntry probabilityEntry = - originalProbabilityDictContent->getProbabilityEntry(it->first); - if (!setProbabilityEntry(it->second, &probabilityEntry)) { - AKLOGE("Cannot set probability entry in runGC. terminalId: %d", it->second); - return false; - } - } - return true; -} - -int ProbabilityDictContent::getEntrySize() const { - if (mHasHistoricalInfo) { - return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE - + Ver4DictConstants::PROBABILITY_SIZE - + Ver4DictConstants::TIME_STAMP_FIELD_SIZE - + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE - + Ver4DictConstants::WORD_COUNT_FIELD_SIZE; - } else { - return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE - + Ver4DictConstants::PROBABILITY_SIZE; - } -} - -int ProbabilityDictContent::getEntryPos(const int terminalId) const { - return terminalId * getEntrySize(); -} - -bool ProbabilityDictContent::writeEntry(const ProbabilityEntry *const probabilityEntry, - const int entryPos) { - BufferWithExtendableBuffer *const bufferToWrite = getWritableBuffer(); - int writingPos = entryPos; - if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getFlags(), - Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) { - AKLOGE("Cannot write flags in probability dict content. 
pos: %d", writingPos); - return false; - } - if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getProbability(), - Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) { - AKLOGE("Cannot write probability in probability dict content. pos: %d", writingPos); - return false; - } - if (mHasHistoricalInfo) { - const HistoricalInfo *const historicalInfo = probabilityEntry->getHistoricalInfo(); - if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getTimestamp(), - Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &writingPos)) { - AKLOGE("Cannot write timestamp in probability dict content. pos: %d", writingPos); - return false; - } - if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getLevel(), - Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &writingPos)) { - AKLOGE("Cannot write level in probability dict content. pos: %d", writingPos); - return false; - } - if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getCount(), - Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &writingPos)) { - AKLOGE("Cannot write count in probability dict content. pos: %d", writingPos); - return false; - } - } - return true; -} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h deleted file mode 100644 index 3734797d4..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h - */ - -#ifndef LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H -#define LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { -namespace backward { -namespace v402 { - -class ProbabilityEntry; - -class ProbabilityDictContent : public SingleDictContent { - public: - ProbabilityDictContent(const char *const dictPath, const bool hasHistoricalInfo, - const bool isUpdatable) - : SingleDictContent(dictPath, Ver4DictConstants::FREQ_FILE_EXTENSION, isUpdatable), - mHasHistoricalInfo(hasHistoricalInfo), - mSize(getBuffer()->getTailPosition() / getEntrySize()) {} - - ProbabilityDictContent(const bool hasHistoricalInfo) - : mHasHistoricalInfo(hasHistoricalInfo), mSize(0) {} - - const ProbabilityEntry getProbabilityEntry(const int terminalId) const; - - bool setProbabilityEntry(const int terminalId, const ProbabilityEntry *const probabilityEntry); - - bool flushToFile(const char *const dictPath) const; - - bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const 
terminalIdMap, - const ProbabilityDictContent *const originalProbabilityDictContent); - - private: - DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent); - - int getEntrySize() const; - - int getEntryPos(const int terminalId) const; - - bool writeEntry(const ProbabilityEntry *const probabilityEntry, const int entryPos); - - bool mHasHistoricalInfo; - int mSize; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h deleted file mode 100644 index 4111a49c0..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h - */ - -#ifndef LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H -#define LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H - -#include "defines.h" -#include "suggest/core/dictionary/property/historical_info.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" - -namespace latinime { -namespace backward { -namespace v402 { - -class ProbabilityEntry { - public: - ProbabilityEntry(const ProbabilityEntry &probabilityEntry) - : mFlags(probabilityEntry.mFlags), mProbability(probabilityEntry.mProbability), - mHistoricalInfo(probabilityEntry.mHistoricalInfo) {} - - // Dummy entry - ProbabilityEntry() - : mFlags(0), mProbability(NOT_A_PROBABILITY), mHistoricalInfo() {} - - // Entry without historical information - ProbabilityEntry(const int flags, const int probability) - : mFlags(flags), mProbability(probability), mHistoricalInfo() {} - - // Entry with historical information. - ProbabilityEntry(const int flags, const int probability, - const HistoricalInfo *const historicalInfo) - : mFlags(flags), mProbability(probability), mHistoricalInfo(*historicalInfo) {} - - const ProbabilityEntry createEntryWithUpdatedProbability(const int probability) const { - return ProbabilityEntry(mFlags, probability, &mHistoricalInfo); - } - - const ProbabilityEntry createEntryWithUpdatedHistoricalInfo( - const HistoricalInfo *const historicalInfo) const { - return ProbabilityEntry(mFlags, mProbability, historicalInfo); - } - - bool hasHistoricalInfo() const { - return mHistoricalInfo.isValid(); - } - - int getFlags() const { - return mFlags; - } - - int getProbability() const { - return mProbability; - } - - const HistoricalInfo *getHistoricalInfo() const { - return &mHistoricalInfo; - } - - private: - // Copy constructor is public to use this class as a type of return value. 
- DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry); - - const int mFlags; - const int mProbability; - const HistoricalInfo mHistoricalInfo; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp deleted file mode 100644 index 56bc8b98d..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h" - -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { -namespace backward { -namespace v402 { - -void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePointCount, - int *const outCodePoint, int *const outCodePointCount, int *const outProbability, - bool *const outhasNext, int *const shortcutEntryPos) const { - const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer(); - if (*shortcutEntryPos < 0 || *shortcutEntryPos >= shortcutListBuffer->getTailPosition()) { - AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d", - *shortcutEntryPos, shortcutListBuffer->getTailPosition()); - ASSERT(false); - if (outhasNext) { - *outhasNext = false; - } - if (outCodePointCount) { - *outCodePointCount = 0; - } - return; - } - - const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); - if (outProbability) { - *outProbability = shortcutFlags & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK; - } - if (outhasNext) { - *outhasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; - } - if (outCodePoint && outCodePointCount) { - shortcutListBuffer->readCodePointsAndAdvancePosition( - maxCodePointCount, outCodePoint, outCodePointCount, shortcutEntryPos); - } -} - -int ShortcutDictContent::getShortcutListHeadPos(const int terminalId) const { - const SparseTable *const addressLookupTable = getAddressLookupTable(); - if (!addressLookupTable->contains(terminalId)) { - return NOT_A_DICT_POS; - } - return addressLookupTable->get(terminalId); -} - -bool ShortcutDictContent::flushToFile(const char *const dictPath) const { - return flush(dictPath, 
Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION, - Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION, - Ver4DictConstants::SHORTCUT_FILE_EXTENSION); -} - -bool ShortcutDictContent::runGC( - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const ShortcutDictContent *const originalShortcutDictContent) { - for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); - it != terminalIdMap->end(); ++it) { - const int originalShortcutListPos = - originalShortcutDictContent->getShortcutListHeadPos(it->first); - if (originalShortcutListPos == NOT_A_DICT_POS) { - continue; - } - const int shortcutListPos = getContentBuffer()->getTailPosition(); - // Copy shortcut list from original content. - if (!copyShortcutListFromDictContent(originalShortcutListPos, originalShortcutDictContent, - shortcutListPos)) { - AKLOGE("Cannot copy shortcut list during GC. original pos: %d, pos: %d", - originalShortcutListPos, shortcutListPos); - return false; - } - // Set shortcut list position to the lookup table. - if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) { - AKLOGE("Cannot set shortcut list position. 
terminal id: %d, pos: %d", - it->second, shortcutListPos); - return false; - } - } - return true; -} - -bool ShortcutDictContent::createNewShortcutList(const int terminalId) { - const int shortcutListListPos = getContentBuffer()->getTailPosition(); - return getUpdatableAddressLookupTable()->set(terminalId, shortcutListListPos); -} - -bool ShortcutDictContent::copyShortcutList(const int shortcutListPos, const int toPos) { - return copyShortcutListFromDictContent(shortcutListPos, this, toPos); -} - -bool ShortcutDictContent::copyShortcutListFromDictContent(const int shortcutListPos, - const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) { - bool hasNext = true; - int readingPos = shortcutListPos; - int writingPos = toPos; - int codePoints[MAX_WORD_LENGTH]; - while (hasNext) { - int probability = 0; - int codePointCount = 0; - sourceShortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, - codePoints, &codePointCount, &probability, &hasNext, &readingPos); - if (!writeShortcutEntryAndAdvancePosition(codePoints, codePointCount, probability, - hasNext, &writingPos)) { - AKLOGE("Cannot write shortcut entry to copy. 
pos: %d", writingPos); - return false; - } - } - return true; -} - -bool ShortcutDictContent::setProbability(const int probability, const int shortcutEntryPos) { - BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); - const int shortcutFlags = shortcutListBuffer->readUint( - Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); - const bool hasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; - const int shortcutFlagsToWrite = createAndGetShortcutFlags(probability, hasNext); - return shortcutListBuffer->writeUint(shortcutFlagsToWrite, - Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); -} - -bool ShortcutDictContent::writeShortcutEntryAndAdvancePosition(const int *const codePoint, - const int codePointCount, const int probability, const bool hasNext, - int *const shortcutEntryPos) { - BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); - const int shortcutFlags = createAndGetShortcutFlags(probability, hasNext); - if (!shortcutListBuffer->writeUintAndAdvancePosition(shortcutFlags, - Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos)) { - AKLOGE("Cannot write shortcut flags. flags; %x, pos: %d", shortcutFlags, *shortcutEntryPos); - return false; - } - if (!shortcutListBuffer->writeCodePointsAndAdvancePosition(codePoint, codePointCount, - true /* writesTerminator */, shortcutEntryPos)) { - AKLOGE("Cannot write shortcut target code points. pos: %d", *shortcutEntryPos); - return false; - } - return true; -} - -// Find a shortcut entry that has specified target and return its position. 
-int ShortcutDictContent::findShortcutEntryAndGetPos(const int shortcutListPos, - const int *const targetCodePointsToFind, const int codePointCount) const { - bool hasNext = true; - int readingPos = shortcutListPos; - int targetCodePoints[MAX_WORD_LENGTH]; - while (hasNext) { - const int entryPos = readingPos; - int probability = 0; - int targetCodePointCount = 0; - getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, targetCodePoints, &targetCodePointCount, - &probability, &hasNext, &readingPos); - if (targetCodePointCount != codePointCount) { - continue; - } - bool matched = true; - for (int i = 0; i < codePointCount; ++i) { - if (targetCodePointsToFind[i] != targetCodePoints[i]) { - matched = false; - break; - } - } - if (matched) { - return entryPos; - } - } - return NOT_A_DICT_POS; -} - -int ShortcutDictContent::createAndGetShortcutFlags(const int probability, - const bool hasNext) const { - return (probability & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK) - | (hasNext ? Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK : 0); -} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h deleted file mode 100644 index 179cec5bb..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h - */ - -#ifndef LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H -#define LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" - -namespace latinime { -namespace backward { -namespace v402 { - -class ShortcutDictContent : public SparseTableDictContent { - public: - ShortcutDictContent(const char *const dictPath, const bool isUpdatable) - : SparseTableDictContent(dictPath, - Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION, - Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION, - Ver4DictConstants::SHORTCUT_FILE_EXTENSION, isUpdatable, - Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, - Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} - - ShortcutDictContent() - : SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, - Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} - - void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint, - int *const outCodePointCount, int *const outProbability, bool *const outhasNext, - const int shortcutEntryPos) { - int readingPos = shortcutEntryPos; - return 
getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint, - outCodePointCount, outProbability, outhasNext, &readingPos); - } - - void getShortcutEntryAndAdvancePosition(const int maxCodePointCount, - int *const outCodePoint, int *const outCodePointCount, int *const outProbability, - bool *const outhasNext, int *const shortcutEntryPos) const; - - // Returns head position of shortcut list for a PtNode specified by terminalId. - int getShortcutListHeadPos(const int terminalId) const; - - bool flushToFile(const char *const dictPath) const; - - bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const ShortcutDictContent *const originalShortcutDictContent); - - bool createNewShortcutList(const int terminalId); - - bool copyShortcutList(const int shortcutListPos, const int toPos); - - bool setProbability(const int probability, const int shortcutEntryPos); - - bool writeShortcutEntry(const int *const codePoint, const int codePointCount, - const int probability, const bool hasNext, const int shortcutEntryPos) { - int writingPos = shortcutEntryPos; - return writeShortcutEntryAndAdvancePosition(codePoint, codePointCount, probability, - hasNext, &writingPos); - } - - bool writeShortcutEntryAndAdvancePosition(const int *const codePoint, - const int codePointCount, const int probability, const bool hasNext, - int *const shortcutEntryPos); - - int findShortcutEntryAndGetPos(const int shortcutListPos, - const int *const targetCodePointsToFind, const int codePointCount) const; - - private: - DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent); - - bool copyShortcutListFromDictContent(const int shortcutListPos, - const ShortcutDictContent *const sourceShortcutDictContent, const int toPos); - - int createAndGetShortcutFlags(const int probability, const bool hasNext) const; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H */ diff --git 
a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h deleted file mode 100644 index 49f446814..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h - */ - -#ifndef LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H -#define LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" -#include "utils/byte_array_view.h" - -namespace latinime { -namespace backward { -namespace v402 { - -class SingleDictContent : public DictContent { - public: - SingleDictContent(const char *const dictPath, const char *const contentFileName, - const bool isUpdatable) - : mMmappedBuffer(MmappedBuffer::openBuffer(dictPath, contentFileName, isUpdatable)), - mExpandableContentBuffer( - mMmappedBuffer ? 
mMmappedBuffer->getReadWriteByteArrayView() : - ReadWriteByteArrayView(), - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mIsValid(mMmappedBuffer) {} - - SingleDictContent() - : mMmappedBuffer(nullptr), - mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), mIsValid(true) {} - - virtual ~SingleDictContent() {} - - virtual bool isValid() const { - return mIsValid; - } - - bool isNearSizeLimit() const { - return mExpandableContentBuffer.isNearSizeLimit(); - } - - protected: - BufferWithExtendableBuffer *getWritableBuffer() { - return &mExpandableContentBuffer; - } - - const BufferWithExtendableBuffer *getBuffer() const { - return &mExpandableContentBuffer; - } - - bool flush(const char *const dictPath, const char *const contentFileNameSuffix) const { - return DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, - contentFileNameSuffix, &mExpandableContentBuffer); - } - - private: - DISALLOW_COPY_AND_ASSIGN(SingleDictContent); - - const MmappedBuffer::MmappedBufferPtr mMmappedBuffer; - BufferWithExtendableBuffer mExpandableContentBuffer; - const bool mIsValid; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp deleted file mode 100644 index 7c9b4967a..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h" - -namespace latinime { -namespace backward { -namespace v402 { - -bool SparseTableDictContent::flush(const char *const dictPath, - const char *const lookupTableFileNameSuffix, const char *const addressTableFileNameSuffix, - const char *const contentFileNameSuffix) const { - if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, lookupTableFileNameSuffix, - &mExpandableLookupTableBuffer)){ - return false; - } - if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, addressTableFileNameSuffix, - &mExpandableAddressTableBuffer)) { - return false; - } - if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, contentFileNameSuffix, - &mExpandableContentBuffer)) { - return false; - } - return true; -} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h deleted file mode 100644 index 3c626df11..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - 
* Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h - */ - -#ifndef LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H -#define LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" -#include "suggest/policyimpl/dictionary/utils/sparse_table.h" -#include "utils/byte_array_view.h" - -namespace latinime { -namespace backward { -namespace v402 { - -// TODO: Support multiple contents. 
-class SparseTableDictContent : public DictContent { - public: - AK_FORCE_INLINE SparseTableDictContent(const char *const dictPath, - const char *const lookupTableFileName, const char *const addressTableFileName, - const char *const contentFileName, const bool isUpdatable, - const int sparseTableBlockSize, const int sparseTableDataSize) - : mLookupTableBuffer( - MmappedBuffer::openBuffer(dictPath, lookupTableFileName, isUpdatable)), - mAddressTableBuffer( - MmappedBuffer::openBuffer(dictPath, addressTableFileName, isUpdatable)), - mContentBuffer( - MmappedBuffer::openBuffer(dictPath, contentFileName, isUpdatable)), - mExpandableLookupTableBuffer( - mLookupTableBuffer ? mLookupTableBuffer->getReadWriteByteArrayView() : - ReadWriteByteArrayView(), - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mExpandableAddressTableBuffer( - mAddressTableBuffer ? mAddressTableBuffer->getReadWriteByteArrayView() : - ReadWriteByteArrayView(), - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mExpandableContentBuffer( - mContentBuffer ? 
mContentBuffer->getReadWriteByteArrayView() : - ReadWriteByteArrayView(), - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer, - sparseTableBlockSize, sparseTableDataSize), - mIsValid(mLookupTableBuffer && mAddressTableBuffer && mContentBuffer) {} - - SparseTableDictContent(const int sparseTableBlockSize, const int sparseTableDataSize) - : mLookupTableBuffer(), mAddressTableBuffer(), mContentBuffer(), - mExpandableLookupTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), - mExpandableAddressTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), - mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), - mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer, - sparseTableBlockSize, sparseTableDataSize), mIsValid(true) {} - - virtual ~SparseTableDictContent() {} - - virtual bool isValid() const { - return mIsValid; - } - - bool isNearSizeLimit() const { - return mExpandableLookupTableBuffer.isNearSizeLimit() - || mExpandableAddressTableBuffer.isNearSizeLimit() - || mExpandableContentBuffer.isNearSizeLimit(); - } - - protected: - SparseTable *getUpdatableAddressLookupTable() { - return &mAddressLookupTable; - } - - const SparseTable *getAddressLookupTable() const { - return &mAddressLookupTable; - } - - BufferWithExtendableBuffer *getWritableContentBuffer() { - return &mExpandableContentBuffer; - } - - const BufferWithExtendableBuffer *getContentBuffer() const { - return &mExpandableContentBuffer; - } - - bool flush(const char *const dictDirPath, const char *const lookupTableFileName, - const char *const addressTableFileName, const char *const contentFileName) const; - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTableDictContent); - - const MmappedBuffer::MmappedBufferPtr mLookupTableBuffer; - const MmappedBuffer::MmappedBufferPtr mAddressTableBuffer; - const MmappedBuffer::MmappedBufferPtr mContentBuffer; - 
BufferWithExtendableBuffer mExpandableLookupTableBuffer; - BufferWithExtendableBuffer mExpandableAddressTableBuffer; - BufferWithExtendableBuffer mExpandableContentBuffer; - SparseTable mAddressLookupTable; - const bool mIsValid; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp deleted file mode 100644 index a9f841779..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" - -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { -namespace backward { -namespace v402 { - -int TerminalPositionLookupTable::getTerminalPtNodePosition(const int terminalId) const { - if (terminalId < 0 || terminalId >= mSize) { - return NOT_A_DICT_POS; - } - const int terminalPos = getBuffer()->readUint( - Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId)); - return (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) ? - NOT_A_DICT_POS : terminalPos; -} - -bool TerminalPositionLookupTable::setTerminalPtNodePosition( - const int terminalId, const int terminalPtNodePos) { - if (terminalId < 0) { - return NOT_A_DICT_POS; - } - while (terminalId >= mSize) { - // Write new entry. - if (!getWritableBuffer()->writeUint(Ver4DictConstants::NOT_A_TERMINAL_ADDRESS, - Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(mSize))) { - return false; - } - mSize++; - } - const int terminalPos = (terminalPtNodePos != NOT_A_DICT_POS) ? - terminalPtNodePos : Ver4DictConstants::NOT_A_TERMINAL_ADDRESS; - return getWritableBuffer()->writeUint(terminalPos, - Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId)); -} - -bool TerminalPositionLookupTable::flushToFile(const char *const dictPath) const { - // If the used buffer size is smaller than the actual buffer size, regenerate the lookup - // table and write the new table to the file. 
- if (getEntryPos(mSize) < getBuffer()->getTailPosition()) { - TerminalPositionLookupTable lookupTableToWrite; - for (int i = 0; i < mSize; ++i) { - const int terminalPtNodePosition = getTerminalPtNodePosition(i); - if (!lookupTableToWrite.setTerminalPtNodePosition(i, terminalPtNodePosition)) { - AKLOGE("Cannot set terminal position to lookupTableToWrite." - " terminalId: %d, position: %d", i, terminalPtNodePosition); - return false; - } - } - return lookupTableToWrite.flush(dictPath, - Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); - } else { - // We can simply use this lookup table because the buffer size has not been - // changed. - return flush(dictPath, Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); - } -} - -bool TerminalPositionLookupTable::runGCTerminalIds(TerminalIdMap *const terminalIdMap) { - int removedEntryCount = 0; - int nextNewTerminalId = 0; - for (int i = 0; i < mSize; ++i) { - const int terminalPos = getBuffer()->readUint( - Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(i)); - if (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) { - // This entry is a garbage. - removedEntryCount++; - } else { - // Give a new terminal id to the entry. - if (!getWritableBuffer()->writeUint(terminalPos, - Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, - getEntryPos(nextNewTerminalId))) { - return false; - } - // Memorize the mapping to the old terminal id to the new terminal id. 
- terminalIdMap->insert(TerminalIdMap::value_type(i, nextNewTerminalId)); - nextNewTerminalId++; - } - } - mSize = nextNewTerminalId; - return true; -} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h deleted file mode 100644 index eadfe0faa..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h - */ - -#ifndef LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H -#define LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" - -namespace latinime { -namespace backward { -namespace v402 { - -class TerminalPositionLookupTable : public SingleDictContent { - public: - typedef std::unordered_map TerminalIdMap; - - TerminalPositionLookupTable(const char *const dictPath, const bool isUpdatable) - : SingleDictContent(dictPath, - Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION, isUpdatable), - mSize(getBuffer()->getTailPosition() - / Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE) {} - - TerminalPositionLookupTable() : mSize(0) {} - - int getTerminalPtNodePosition(const int terminalId) const; - - bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos); - - int getNextTerminalId() const { - return mSize; - } - - bool flushToFile(const char *const dictPath) const; - - bool runGCTerminalIds(TerminalIdMap *const terminalIdMap); - - private: - DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable); - - int getEntryPos(const int terminalId) const { - return terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; - } - - int mSize; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif // LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h deleted file mode 100644 index 941fda748..000000000 --- 
a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! - * Do not edit this file other than updating policy's interface. - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h - */ - -#ifndef LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H -#define LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H - -#include "defines.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" - -namespace latinime { -namespace backward { -namespace v402 { - -class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { - public: - Ver4ShortcutListPolicy(ShortcutDictContent *const shortcutDictContent, - const TerminalPositionLookupTable *const terminalPositionLookupTable) - : mShortcutDictContent(shortcutDictContent) {} - - ~Ver4ShortcutListPolicy() {} - - int getStartPos(const int pos) const { - // The first shortcut entry is 
located at the head position of the shortcut list. - return pos; - } - - void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, - int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, - int *const pos) const { - int probability = 0; - mShortcutDictContent->getShortcutEntryAndAdvancePosition(maxCodePointCount, - outCodePoint, outCodePointCount, &probability, outHasNext, pos); - if (outIsWhitelist) { - *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(probability); - } - } - - void skipAllShortcuts(int *const pos) const { - // Do nothing because we don't need to skip shortcut lists in ver4 dictionaries. - } - - bool addNewShortcut(const int terminalId, const int *const codePoints, const int codePointCount, - const int probability) { - const int shortcutListPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); - if (shortcutListPos == NOT_A_DICT_POS) { - // Create shortcut list. - if (!mShortcutDictContent->createNewShortcutList(terminalId)) { - AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId); - return false; - } - const int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); - return mShortcutDictContent->writeShortcutEntry(codePoints, codePointCount, probability, - false /* hasNext */, writingPos); - } - const int entryPos = mShortcutDictContent->findShortcutEntryAndGetPos(shortcutListPos, - codePoints, codePointCount); - if (entryPos == NOT_A_DICT_POS) { - // Add new entry to the shortcut list. - // Create new shortcut list. - if (!mShortcutDictContent->createNewShortcutList(terminalId)) { - AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId); - return false; - } - int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); - if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints, - codePointCount, probability, true /* hasNext */, &writingPos)) { - AKLOGE("Cannot write shortcut entry. 
terminal id: %d, pos: %d", terminalId, - writingPos); - return false; - } - return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos); - } - // Overwrite existing entry. - bool hasNext = false; - mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, 0 /* outCodePoint */, - 0 /* outCodePointCount */ , 0 /* probability */, &hasNext, entryPos); - if (!mShortcutDictContent->writeShortcutEntry(codePoints, - codePointCount, probability, hasNext, entryPos)) { - AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId, - entryPos); - return false; - } - return true; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4ShortcutListPolicy); - - ShortcutDictContent *const mShortcutDictContent; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif // LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp deleted file mode 100644 index 3dfbd1c94..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" - -#include -#include -#include -#include - -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "suggest/policyimpl/dictionary/utils/file_utils.h" -#include "utils/byte_array_view.h" - -namespace latinime { -namespace backward { -namespace v402 { - -/* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers( - const char *const dictPath, MmappedBuffer::MmappedBufferPtr headerBuffer, - const FormatUtils::FORMAT_VERSION formatVersion) { - if (!headerBuffer) { - ASSERT(false); - AKLOGE("The header buffer must be valid to open ver4 dict buffers."); - return Ver4DictBuffersPtr(nullptr); - } - // TODO: take only dictDirPath, and open both header and trie files in the constructor below - const bool isUpdatable = headerBuffer->isUpdatable(); - return Ver4DictBuffersPtr(new Ver4DictBuffers(dictPath, std::move(headerBuffer), isUpdatable, - formatVersion)); -} - -bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath, - const BufferWithExtendableBuffer *const headerBuffer) const { - // Create temporary directory. - const int tmpDirPathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictDirPath, - DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE); - char tmpDirPath[tmpDirPathBufSize]; - FileUtils::getFilePathWithSuffix(dictDirPath, - DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE, tmpDirPathBufSize, - tmpDirPath); - if (FileUtils::existsDir(tmpDirPath)) { - if (!FileUtils::removeDirAndFiles(tmpDirPath)) { - AKLOGE("Existing directory %s cannot be removed.", tmpDirPath); - ASSERT(false); - return false; - } - } - umask(S_IWGRP | S_IWOTH); - if (mkdir(tmpDirPath, S_IRWXU) == -1) { - AKLOGE("Cannot create directory: %s. 
errno: %d.", tmpDirPath, errno); - return false; - } - // Get dictionary base path. - const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */; - char dictName[dictNameBufSize]; - FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName); - const int dictPathBufSize = FileUtils::getFilePathBufSize(tmpDirPath, dictName); - char dictPath[dictPathBufSize]; - FileUtils::getFilePath(tmpDirPath, dictName, dictPathBufSize, dictPath); - - // Write header file. - if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, - Ver4DictConstants::HEADER_FILE_EXTENSION, headerBuffer)) { - AKLOGE("Dictionary header file %s%s cannot be written.", tmpDirPath, - Ver4DictConstants::HEADER_FILE_EXTENSION); - return false; - } - // Write trie file. - if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, - Ver4DictConstants::TRIE_FILE_EXTENSION, &mExpandableTrieBuffer)) { - AKLOGE("Dictionary trie file %s%s cannot be written.", tmpDirPath, - Ver4DictConstants::TRIE_FILE_EXTENSION); - return false; - } - // Write dictionary contents. - if (!mTerminalPositionLookupTable.flushToFile(dictPath)) { - AKLOGE("Terminal position lookup table cannot be written. %s", tmpDirPath); - return false; - } - if (!mProbabilityDictContent.flushToFile(dictPath)) { - AKLOGE("Probability dict content cannot be written. %s", tmpDirPath); - return false; - } - if (!mBigramDictContent.flushToFile(dictPath)) { - AKLOGE("Bigram dict content cannot be written. %s", tmpDirPath); - return false; - } - if (!mShortcutDictContent.flushToFile(dictPath)) { - AKLOGE("Shortcut dict content cannot be written. %s", tmpDirPath); - return false; - } - // Remove existing dictionary. - if (!FileUtils::removeDirAndFiles(dictDirPath)) { - AKLOGE("Existing directory %s cannot be removed.", dictDirPath); - ASSERT(false); - return false; - } - // Rename temporary directory. 
- if (rename(tmpDirPath, dictDirPath) != 0) { - AKLOGE("%s cannot be renamed to %s", tmpDirPath, dictDirPath); - ASSERT(false); - return false; - } - return true; -} - -Ver4DictBuffers::Ver4DictBuffers(const char *const dictPath, - MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable, - const FormatUtils::FORMAT_VERSION formatVersion) - : mHeaderBuffer(std::move(headerBuffer)), - mDictBuffer(MmappedBuffer::openBuffer(dictPath, - Ver4DictConstants::TRIE_FILE_EXTENSION, isUpdatable)), - mHeaderPolicy(mHeaderBuffer->getReadOnlyByteArrayView().data(), formatVersion), - mExpandableHeaderBuffer(mHeaderBuffer->getReadWriteByteArrayView(), - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mExpandableTrieBuffer( - mDictBuffer ? mDictBuffer->getReadWriteByteArrayView() : - ReadWriteByteArrayView(), - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mTerminalPositionLookupTable(dictPath, isUpdatable), - mProbabilityDictContent(dictPath, mHeaderPolicy.hasHistoricalInfoOfWords(), isUpdatable), - mBigramDictContent(dictPath, mHeaderPolicy.hasHistoricalInfoOfWords(), isUpdatable), - mShortcutDictContent(dictPath, isUpdatable), - mIsUpdatable(isUpdatable) {} - -Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize) - : mHeaderBuffer(nullptr), mDictBuffer(nullptr), mHeaderPolicy(headerPolicy), - mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), - mExpandableTrieBuffer(maxTrieSize), mTerminalPositionLookupTable(), - mProbabilityDictContent(headerPolicy->hasHistoricalInfoOfWords()), - mBigramDictContent(headerPolicy->hasHistoricalInfoOfWords()), mShortcutDictContent(), - mIsUpdatable(true) {} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h deleted file mode 
100644 index e775be52e..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h - */ - -#ifndef LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H -#define LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" - -namespace latinime { -namespace backward { -namespace v402 { - -class Ver4DictBuffers { - public: - typedef std::unique_ptr Ver4DictBuffersPtr; - - static Ver4DictBuffersPtr openVer4DictBuffers(const char *const 
dictDirPath, - MmappedBuffer::MmappedBufferPtr headerBuffer, - const FormatUtils::FORMAT_VERSION formatVersion); - - static AK_FORCE_INLINE Ver4DictBuffersPtr createVer4DictBuffers( - const HeaderPolicy *const headerPolicy, const int maxTrieSize) { - return Ver4DictBuffersPtr(new Ver4DictBuffers(headerPolicy, maxTrieSize)); - } - - AK_FORCE_INLINE bool isValid() const { - return mHeaderBuffer && mDictBuffer && mHeaderPolicy.isValid() - && mProbabilityDictContent.isValid() && mTerminalPositionLookupTable.isValid() - && mBigramDictContent.isValid() && mShortcutDictContent.isValid(); - } - - AK_FORCE_INLINE bool isNearSizeLimit() const { - return mExpandableTrieBuffer.isNearSizeLimit() - || mTerminalPositionLookupTable.isNearSizeLimit() - || mProbabilityDictContent.isNearSizeLimit() - || mBigramDictContent.isNearSizeLimit() - || mShortcutDictContent.isNearSizeLimit(); - } - - AK_FORCE_INLINE const HeaderPolicy *getHeaderPolicy() const { - return &mHeaderPolicy; - } - - AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableHeaderBuffer() { - return &mExpandableHeaderBuffer; - } - - AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableTrieBuffer() { - return &mExpandableTrieBuffer; - } - - AK_FORCE_INLINE const BufferWithExtendableBuffer *getTrieBuffer() const { - return &mExpandableTrieBuffer; - } - - AK_FORCE_INLINE TerminalPositionLookupTable *getMutableTerminalPositionLookupTable() { - return &mTerminalPositionLookupTable; - } - - AK_FORCE_INLINE const TerminalPositionLookupTable *getTerminalPositionLookupTable() const { - return &mTerminalPositionLookupTable; - } - - AK_FORCE_INLINE ProbabilityDictContent *getMutableProbabilityDictContent() { - return &mProbabilityDictContent; - } - - AK_FORCE_INLINE const ProbabilityDictContent *getProbabilityDictContent() const { - return &mProbabilityDictContent; - } - - AK_FORCE_INLINE BigramDictContent *getMutableBigramDictContent() { - return &mBigramDictContent; - } - - AK_FORCE_INLINE const BigramDictContent 
*getBigramDictContent() const { - return &mBigramDictContent; - } - - AK_FORCE_INLINE ShortcutDictContent *getMutableShortcutDictContent() { - return &mShortcutDictContent; - } - - AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const { - return &mShortcutDictContent; - } - - AK_FORCE_INLINE bool isUpdatable() const { - return mIsUpdatable; - } - - bool flush(const char *const dictDirPath) const { - return flushHeaderAndDictBuffers(dictDirPath, &mExpandableHeaderBuffer); - } - - bool flushHeaderAndDictBuffers(const char *const dictDirPath, - const BufferWithExtendableBuffer *const headerBuffer) const; - - private: - DISALLOW_COPY_AND_ASSIGN(Ver4DictBuffers); - - Ver4DictBuffers(const char *const dictDirPath, - const MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable, - const FormatUtils::FORMAT_VERSION formatVersion); - - Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize); - - const MmappedBuffer::MmappedBufferPtr mHeaderBuffer; - const MmappedBuffer::MmappedBufferPtr mDictBuffer; - const HeaderPolicy mHeaderPolicy; - BufferWithExtendableBuffer mExpandableHeaderBuffer; - BufferWithExtendableBuffer mExpandableTrieBuffer; - TerminalPositionLookupTable mTerminalPositionLookupTable; - ProbabilityDictContent mProbabilityDictContent; - BigramDictContent mBigramDictContent; - ShortcutDictContent mShortcutDictContent; - const int mIsUpdatable; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp deleted file mode 100644 index 81d85f495..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - 
* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" - -namespace latinime { -namespace backward { -namespace v402 { - -// These values MUST match the definitions in FormatSpec.java. -const char *const Ver4DictConstants::TRIE_FILE_EXTENSION = ".trie"; -const char *const Ver4DictConstants::HEADER_FILE_EXTENSION = ".header"; -const char *const Ver4DictConstants::FREQ_FILE_EXTENSION = ".freq"; -// tat = Terminal Address Table -const char *const Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; -const char *const Ver4DictConstants::BIGRAM_FILE_EXTENSION = ".bigram_freq"; -const char *const Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION = ".bigram_lookup"; -const char *const Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION = ".bigram_index_freq"; -const char *const Ver4DictConstants::SHORTCUT_FILE_EXTENSION = ".shortcut_shortcut"; -const char *const Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION = ".shortcut_lookup"; -const char *const Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION = - ".shortcut_index_shortcut"; - -// Version 4 dictionary size is implicitly limited to 8MB due to 3-byte offsets. 
-const int Ver4DictConstants::MAX_DICTIONARY_SIZE = 8 * 1024 * 1024; -// Extended region size, which is not GCed region size in dict file + additional buffer size, is -// limited to 1MB to prevent from inefficient traversing. -const int Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE = 1 * 1024 * 1024; - -const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1; -const int Ver4DictConstants::PROBABILITY_SIZE = 1; -const int Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1; -const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; -const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0; -const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4; -const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4; -const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1; -const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1; - -const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16; -const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE = 4; -const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; -const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4; - -const int Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE = 3; -// Unsigned int max value of BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE-byte is used for representing -// invalid terminal ID in bigram lists. 
-const int Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID = - (1 << (BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE * 8)) - 1; -const int Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE = 1; -const int Ver4DictConstants::BIGRAM_PROBABILITY_MASK = 0x0F; -const int Ver4DictConstants::BIGRAM_HAS_NEXT_MASK = 0x80; -const int Ver4DictConstants::BIGRAM_LARGE_PROBABILITY_FIELD_SIZE = 1; - -const int Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1; -const int Ver4DictConstants::SHORTCUT_PROBABILITY_MASK = 0x0F; -const int Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK = 0x80; - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h deleted file mode 100644 index 88ebd6a75..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h - */ - -#ifndef LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H -#define LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H - -#include "defines.h" - -namespace latinime { -namespace backward { -namespace v402 { - -// TODO: Create PtConstants under the pt_common and move some constant values there. -// Note that there are corresponding definitions in FormatSpec.java. -class Ver4DictConstants { - public: - static const char *const TRIE_FILE_EXTENSION; - static const char *const HEADER_FILE_EXTENSION; - static const char *const FREQ_FILE_EXTENSION; - static const char *const TERMINAL_ADDRESS_TABLE_FILE_EXTENSION; - static const char *const BIGRAM_FILE_EXTENSION; - static const char *const BIGRAM_LOOKUP_TABLE_FILE_EXTENSION; - static const char *const BIGRAM_CONTENT_TABLE_FILE_EXTENSION; - static const char *const SHORTCUT_FILE_EXTENSION; - static const char *const SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION; - static const char *const SHORTCUT_CONTENT_TABLE_FILE_EXTENSION; - - static const int MAX_DICTIONARY_SIZE; - static const int MAX_DICT_EXTENDED_REGION_SIZE; - - static const int NOT_A_TERMINAL_ID; - static const int PROBABILITY_SIZE; - static const int FLAGS_IN_PROBABILITY_FILE_SIZE; - static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; - static const int NOT_A_TERMINAL_ADDRESS; - static const int TERMINAL_ID_FIELD_SIZE; - static const int TIME_STAMP_FIELD_SIZE; - static const int WORD_LEVEL_FIELD_SIZE; - static const int WORD_COUNT_FIELD_SIZE; - - static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE; - static const int BIGRAM_ADDRESS_TABLE_DATA_SIZE; - static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE; - static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE; - - static const int BIGRAM_FLAGS_FIELD_SIZE; - static const int BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; - static const int INVALID_BIGRAM_TARGET_TERMINAL_ID; - static const int BIGRAM_PROBABILITY_MASK; - static const int 
BIGRAM_HAS_NEXT_MASK; - // Used when bigram list has time stamp. - static const int BIGRAM_LARGE_PROBABILITY_FIELD_SIZE; - - static const int SHORTCUT_FLAGS_FIELD_SIZE; - static const int SHORTCUT_PROBABILITY_MASK; - static const int SHORTCUT_HAS_NEXT_MASK; - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4DictConstants); -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp deleted file mode 100644 index 5c639b19c..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" - -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" - -namespace latinime { -namespace backward { -namespace v402 { - -const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode( - const int ptNodePos, const int siblingNodePos) const { - if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) { - // Reading invalid position because of bug or broken dictionary. 
- AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d", - ptNodePos, mBuffer->getTailPosition()); - ASSERT(false); - return PtNodeParams(); - } - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos); - const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); - int pos = ptNodePos; - const int headPos = ptNodePos; - if (usesAdditionalBuffer) { - pos -= mBuffer->getOriginalBufferSize(); - } - const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); - const int parentPosOffset = - DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( - dictBuf, &pos); - const int parentPos = - DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); - int codePoints[MAX_WORD_LENGTH]; - const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( - dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos); - int terminalIdFieldPos = NOT_A_DICT_POS; - int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; - int probability = NOT_A_PROBABILITY; - if (PatriciaTrieReadingUtils::isTerminal(flags)) { - terminalIdFieldPos = pos; - if (usesAdditionalBuffer) { - terminalIdFieldPos += mBuffer->getOriginalBufferSize(); - } - terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos); - const ProbabilityEntry probabilityEntry = - mProbabilityDictContent->getProbabilityEntry(terminalId); - if (probabilityEntry.hasHistoricalInfo()) { - probability = ForgettingCurveUtils::decodeProbability( - probabilityEntry.getHistoricalInfo(), mHeaderPolicy); - } else { - probability = probabilityEntry.getProbability(); - } - } - int childrenPosFieldPos = pos; - if (usesAdditionalBuffer) { - childrenPosFieldPos += mBuffer->getOriginalBufferSize(); - } - int childrenPos = DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition( - dictBuf, &pos); - if (usesAdditionalBuffer 
&& childrenPos != NOT_A_DICT_POS) { - childrenPos += mBuffer->getOriginalBufferSize(); - } - if (usesAdditionalBuffer) { - pos += mBuffer->getOriginalBufferSize(); - } - // Sibling position is the tail position of original PtNode. - int newSiblingNodePos = (siblingNodePos == NOT_A_DICT_POS) ? pos : siblingNodePos; - // Read destination node if the read node is a moved node. - if (DynamicPtReadingUtils::isMoved(flags)) { - // The destination position is stored at the same place as the parent position. - return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos); - } else { - return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints, - terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos, - newSiblingNodePos); - } -} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h deleted file mode 100644 index 1999a51a6..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h - */ - -#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H -#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" - -namespace latinime { -namespace backward { -namespace v402 { - -} // namespace v402 -} // namespace backward -class BufferWithExtendableBuffer; -namespace backward { -namespace v402 { -} // namespace v402 -} // namespace backward -class HeaderPolicy; -namespace backward { -namespace v402 { -class ProbabilityDictContent; - -/* - * This class is used for helping to read nodes of ver4 patricia trie. This class handles moved - * node and reads node attributes including probability form probabilityBuffer. - */ -class Ver4PatriciaTrieNodeReader : public PtNodeReader { - public: - Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer, - const ProbabilityDictContent *const probabilityDictContent, - const HeaderPolicy *const headerPolicy) - : mBuffer(buffer), mProbabilityDictContent(probabilityDictContent), - mHeaderPolicy(headerPolicy) {} - - ~Ver4PatriciaTrieNodeReader() {} - - virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const { - return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos, - NOT_A_DICT_POS /* siblingNodePos */); - } - - private: - DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader); - - const BufferWithExtendableBuffer *const mBuffer; - const ProbabilityDictContent *const mProbabilityDictContent; - const HeaderPolicy *const mHeaderPolicy; - - const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos, - const int siblingNodePos) const; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* 
LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp deleted file mode 100644 index d558b949a..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" - -#include "suggest/core/dictionary/property/unigram_property.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" - -namespace latinime { -namespace backward { -namespace v402 { - -const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3; - -bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted( - const PtNodeParams *const toBeUpdatedPtNodeParams) { - int pos = toBeUpdatedPtNodeParams->getHeadPos(); - const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); - const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - pos -= mTrieBuffer->getOriginalBufferSize(); - } - // Read original flags - const PatriciaTrieReadingUtils::NodeFlags originalFlags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); - const PatriciaTrieReadingUtils::NodeFlags 
updatedFlags = - DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, - true /* isDeleted */, false /* willBecomeNonTerminal */); - int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); - // Update flags. - if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, - &writingPos)) { - return false; - } - if (toBeUpdatedPtNodeParams->isTerminal()) { - // The PtNode is a terminal. Delete entry from the terminal position lookup table. - return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( - toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */); - } else { - return true; - } -} - -bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved( - const PtNodeParams *const toBeUpdatedPtNodeParams, - const int movedPos, const int bigramLinkedNodePos) { - int pos = toBeUpdatedPtNodeParams->getHeadPos(); - const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); - const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - pos -= mTrieBuffer->getOriginalBufferSize(); - } - // Read original flags - const PatriciaTrieReadingUtils::NodeFlags originalFlags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); - const PatriciaTrieReadingUtils::NodeFlags updatedFlags = - DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */, - false /* isDeleted */, false /* willBecomeNonTerminal */); - int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); - // Update flags. - if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, - &writingPos)) { - return false; - } - // Update moved position, which is stored in the parent offset field. 
- if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( - mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { - return false; - } - if (toBeUpdatedPtNodeParams->hasChildren()) { - // Update children's parent position. - mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos()); - while (!mReadingHelper.isEnd()) { - const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams()); - int parentOffsetFieldPos = childPtNodeParams.getHeadPos() - + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; - if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( - mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(), - &parentOffsetFieldPos)) { - // Parent offset cannot be written because of a bug or a broken dictionary; thus, - // we give up to update dictionary. - return false; - } - mReadingHelper.readNextSiblingNode(childPtNodeParams); - } - } - return true; -} - -bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal( - const PtNodeParams *const toBeUpdatedPtNodeParams) { - int pos = toBeUpdatedPtNodeParams->getHeadPos(); - const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); - const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - pos -= mTrieBuffer->getOriginalBufferSize(); - } - // Read original flags - const PatriciaTrieReadingUtils::NodeFlags originalFlags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); - const PatriciaTrieReadingUtils::NodeFlags updatedFlags = - DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, - false /* isDeleted */, true /* willBecomeNonTerminal */); - if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( - toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) { - AKLOGE("Cannot update terminal position lookup table. 
terminal id: %d", - toBeUpdatedPtNodeParams->getTerminalId()); - return false; - } - // Update flags. - int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); - return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, - &writingPos); -} - -bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty( - const PtNodeParams *const toBeUpdatedPtNodeParams, - const UnigramProperty *const unigramProperty) { - // Update probability and historical information. - // TODO: Update other information in the unigram property. - if (!toBeUpdatedPtNodeParams->isTerminal()) { - return false; - } - const ProbabilityEntry originalProbabilityEntry = - mBuffers->getProbabilityDictContent()->getProbabilityEntry( - toBeUpdatedPtNodeParams->getTerminalId()); - const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry, - unigramProperty); - return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( - toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry); -} - -bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( - const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) { - if (!toBeUpdatedPtNodeParams->isTerminal()) { - AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode."); - return false; - } - const ProbabilityEntry originalProbabilityEntry = - mBuffers->getProbabilityDictContent()->getProbabilityEntry( - toBeUpdatedPtNodeParams->getTerminalId()); - if (originalProbabilityEntry.hasHistoricalInfo()) { - const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( - originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy); - const ProbabilityEntry probabilityEntry = - originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo); - if (!mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( - toBeUpdatedPtNodeParams->getTerminalId(), 
&probabilityEntry)) { - AKLOGE("Cannot write updated probability entry. terminalId: %d", - toBeUpdatedPtNodeParams->getTerminalId()); - return false; - } - const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy); - if (!isValid) { - if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { - AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); - return false; - } - } - *outNeedsToKeepPtNode = isValid; - } else { - // No need to update probability. - *outNeedsToKeepPtNode = true; - } - return true; -} - -bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition( - const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) { - int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos(); - return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, - newChildrenPosition, &childrenPosFieldPos); -} - -bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int newTerminalId) { - return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, - toBeUpdatedPtNodeParams->getTerminalIdFieldPos()); -} - -bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition( - const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) { - return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */, - ptNodeWritingPos); -} - - -bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( - const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty, - int *const ptNodeWritingPos) { - int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; - if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId, - ptNodeWritingPos)) { - return false; - } - // Write probability. 
- ProbabilityEntry newProbabilityEntry; - const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom( - &newProbabilityEntry, unigramProperty); - return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(terminalId, - &probabilityEntryToWrite); -} - -bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, - const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) { - if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, ngramProperty, outAddedNewEntry)) { - AKLOGE("Cannot add new bigram entry. prevWordId: %d, wordId: %d", - prevWordIds[0], wordId); - return false; - } - const int ptNodePos = - mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(prevWordIds[0]); - const PtNodeParams sourcePtNodeParams = - mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - if (!sourcePtNodeParams.hasBigrams()) { - // Update has bigrams flag. - return updatePtNodeFlags(sourcePtNodeParams.getHeadPos(), - sourcePtNodeParams.isPossiblyOffensive(), sourcePtNodeParams.isNotAWord(), - sourcePtNodeParams.isTerminal(), sourcePtNodeParams.hasShortcutTargets(), - true /* hasBigrams */, - sourcePtNodeParams.getCodePointCount() > 1 /* hasMultipleChars */); - } - return true; -} - -bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds, - const int wordId) { - return mBigramPolicy->removeEntry(prevWordIds[0], wordId); -} - -bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries( - const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) { - return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries( - sourcePtNodeParams->getTerminalId(), outBigramEntryCount); -} - -bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields( - const PtNodeParams *const toBeUpdatedPtNodeParams, - const DictPositionRelocationMap *const dictPositionRelocationMap, - int *const outBigramEntryCount) { - int parentPos = 
toBeUpdatedPtNodeParams->getParentPos(); - if (parentPos != NOT_A_DICT_POS) { - PtNodeWriter::PtNodePositionRelocationMap::const_iterator it = - dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos); - if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) { - parentPos = it->second; - } - } - int writingPos = toBeUpdatedPtNodeParams->getHeadPos() - + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; - // Write updated parent offset. - if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, - parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { - return false; - } - - // Updates children position. - int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos(); - if (childrenPos != NOT_A_DICT_POS) { - PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it = - dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos); - if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) { - childrenPos = it->second; - } - } - if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) { - return false; - } - - // Counts bigram entries. - if (outBigramEntryCount) { - *outBigramEntryCount = mBigramPolicy->getBigramEntryConut( - toBeUpdatedPtNodeParams->getTerminalId()); - } - return true; -} - -bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams, - const int *const targetCodePoints, const int targetCodePointCount, - const int shortcutProbability) { - if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(), - targetCodePoints, targetCodePointCount, shortcutProbability)) { - AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId()); - return false; - } - if (!ptNodeParams->hasShortcutTargets()) { - // Update has shortcut targets flag. 
- return updatePtNodeFlags(ptNodeParams->getHeadPos(), - ptNodeParams->isPossiblyOffensive(), ptNodeParams->isNotAWord(), - ptNodeParams->isTerminal(), true /* hasShortcutTargets */, - ptNodeParams->hasBigrams(), - ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); - } - return true; -} - -bool Ver4PatriciaTrieNodeWriter::updatePtNodeHasBigramsAndShortcutTargetsFlags( - const PtNodeParams *const ptNodeParams) { - const bool hasBigrams = mBuffers->getBigramDictContent()->getBigramListHeadPos( - ptNodeParams->getTerminalId()) != NOT_A_DICT_POS; - const bool hasShortcutTargets = mBuffers->getShortcutDictContent()->getShortcutListHeadPos( - ptNodeParams->getTerminalId()) != NOT_A_DICT_POS; - return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isPossiblyOffensive(), - ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), hasShortcutTargets, - hasBigrams, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); -} - -bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( - const PtNodeParams *const ptNodeParams, int *const outTerminalId, - int *const ptNodeWritingPos) { - const int nodePos = *ptNodeWritingPos; - // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the - // PtNode writing. - if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, - 0 /* nodeFlags */, ptNodeWritingPos)) { - return false; - } - // Calculate a parent offset and write the offset. 
- if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, - ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) { - return false; - } - // Write code points - if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer, - ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) { - return false; - } - int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; - if (!ptNodeParams->willBecomeNonTerminal()) { - if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) { - terminalId = ptNodeParams->getTerminalId(); - } else if (ptNodeParams->isTerminal()) { - // Write terminal information using a new terminal id. - // Get a new unused terminal id. - terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId(); - } - } - const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID; - if (isTerminal) { - // Update the lookup table. - if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( - terminalId, nodePos)) { - return false; - } - // Write terminal Id. - if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId, - Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) { - return false; - } - if (outTerminalId) { - *outTerminalId = terminalId; - } - } - // Write children position - if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, - ptNodeParams->getChildrenPos(), ptNodeWritingPos)) { - return false; - } - return updatePtNodeFlags(nodePos, ptNodeParams->isPossiblyOffensive(), - ptNodeParams->isNotAWord(), isTerminal, ptNodeParams->hasShortcutTargets(), - ptNodeParams->hasBigrams(), - ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); -} - -const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( - const ProbabilityEntry *const originalProbabilityEntry, - const UnigramProperty *const unigramProperty) const { - // TODO: Consolidate historical info and probability. 
- if (mHeaderPolicy->hasHistoricalInfoOfWords()) { - const HistoricalInfo &historicalInfoForUpdate = unigramProperty->getHistoricalInfo(); - const HistoricalInfo updatedHistoricalInfo = - ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalProbabilityEntry->getHistoricalInfo(), - unigramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy); - return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo( - &updatedHistoricalInfo); - } else { - return originalProbabilityEntry->createEntryWithUpdatedProbability( - unigramProperty->getProbability()); - } -} - -bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, - const bool isBlacklisted, const bool isNotAWord, const bool isTerminal, - const bool hasShortcutTargets, const bool hasBigrams, const bool hasMultipleChars) { - // Create node flags and write them. - PatriciaTrieReadingUtils::NodeFlags nodeFlags = - PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal, - hasShortcutTargets, hasBigrams, hasMultipleChars, - CHILDREN_POSITION_FIELD_SIZE); - if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) { - AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos); - return false; - } - return true; -} - -bool Ver4PatriciaTrieNodeWriter::suppressUnigramEntry(const PtNodeParams *const ptNodeParams) { - if (!mHeaderPolicy->hasHistoricalInfoOfWords()) { - // Require historical info to suppress unigram entry. 
- return false; - } - const HistoricalInfo suppressedHistorycalInfo(0 /* timestamp */, 0 /* level */, 0 /* count */); - const ProbabilityEntry probabilityEntryToWrite = - ProbabilityEntry().createEntryWithUpdatedHistoricalInfo(&suppressedHistorycalInfo); - return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( - ptNodeParams->getTerminalId(), &probabilityEntryToWrite); -} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h deleted file mode 100644 index d0bab50f8..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h - */ - -#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H -#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h" -#include "utils/int_array_view.h" - -namespace latinime { -namespace backward { -namespace v402 { - -} // namespace v402 -} // namespace backward -class BufferWithExtendableBuffer; -namespace backward { -namespace v402 { -} // namespace v402 -} // namespace backward -class HeaderPolicy; -namespace backward { -namespace v402 { -class Ver4BigramListPolicy; -class Ver4DictBuffers; -class Ver4PatriciaTrieNodeReader; -class Ver4PtNodeArrayReader; -class Ver4ShortcutListPolicy; - -/* - * This class is used for helping to writes nodes of ver4 patricia trie. 
- */ -class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { - public: - Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer, - Ver4DictBuffers *const buffers, const HeaderPolicy *const headerPolicy, - const PtNodeReader *const ptNodeReader, - const PtNodeArrayReader *const ptNodeArrayReader, - Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy) - : mTrieBuffer(trieBuffer), mBuffers(buffers), mHeaderPolicy(headerPolicy), - mPtNodeReader(ptNodeReader), mReadingHelper(ptNodeReader, ptNodeArrayReader), - mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {} - - virtual ~Ver4PatriciaTrieNodeWriter() {} - - virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams); - - virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int movedPos, const int bigramLinkedNodePos); - - virtual bool markPtNodeAsWillBecomeNonTerminal( - const PtNodeParams *const toBeUpdatedPtNodeParams); - - virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams, - const UnigramProperty *const unigramProperty); - - virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( - const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode); - - virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int newChildrenPosition); - - bool updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int newTerminalId); - - virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, - int *const ptNodeWritingPos); - - virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, - const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos); - - virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, - const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); - 
- virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId); - - virtual bool updateAllBigramEntriesAndDeleteUselessEntries( - const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount); - - virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams, - const DictPositionRelocationMap *const dictPositionRelocationMap, - int *const outBigramEntryCount); - - virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams, - const int *const targetCodePoints, const int targetCodePointCount, - const int shortcutProbability); - - bool updatePtNodeHasBigramsAndShortcutTargetsFlags(const PtNodeParams *const ptNodeParams); - - // Suppress unigram not to use the word for generating suggestions. So, this method can be used - // only for dictionaries with historical info. Also, suppressed entries are included in unigram - // count. They will be removed from the dictionary during GC. - bool suppressUnigramEntry(const PtNodeParams *const ptNodeParams); - - private: - DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter); - - bool writePtNodeAndGetTerminalIdAndAdvancePosition( - const PtNodeParams *const ptNodeParams, int *const outTerminalId, - int *const ptNodeWritingPos); - - // Create updated probability entry using given unigram property. In addition to the - // probability, this method updates historical information if needed. - // TODO: Update flags belonging to the unigram property. 
- const ProbabilityEntry createUpdatedEntryFrom( - const ProbabilityEntry *const originalProbabilityEntry, - const UnigramProperty *const unigramProperty) const; - - bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord, - const bool isTerminal, const bool hasShortcutTargets, const bool hasBigrams, - const bool hasMultipleChars); - - static const int CHILDREN_POSITION_FIELD_SIZE; - - BufferWithExtendableBuffer *const mTrieBuffer; - Ver4DictBuffers *const mBuffers; - const HeaderPolicy *const mHeaderPolicy; - const PtNodeReader *const mPtNodeReader; - DynamicPtReadingHelper mReadingHelper; - Ver4BigramListPolicy *const mBigramPolicy; - Ver4ShortcutListPolicy *const mShortcutPolicy; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp deleted file mode 100644 index 051aed45a..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp +++ /dev/null @@ -1,662 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! 
- * Do not edit this file other than updating policy's interface. - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h" - -#include - -#include "suggest/core/dicnode/dic_node.h" -#include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/multi_bigram_map.h" -#include "suggest/core/dictionary/ngram_listener.h" -#include "suggest/core/dictionary/property/ngram_property.h" -#include "suggest/core/dictionary/property/unigram_property.h" -#include "suggest/core/dictionary/property/word_property.h" -#include "suggest/core/session/ngram_context.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" - -namespace latinime { -namespace backward { -namespace v402 { - -// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and -// BinaryDictionaryDecayingTests. 
-const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; -const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; -const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; -const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; -const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; -const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = - Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; -const int Ver4PatriciaTriePolicy::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1; - -void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, - DicNodeVector *const childDicNodes) const { - if (!dicNode->hasChildren()) { - return; - } - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); - while (!readingHelper.isEnd()) { - const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); - if (!ptNodeParams.isValid()) { - break; - } - bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); - if (isTerminal && mHeaderPolicy->isDecayingDict()) { - // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose - // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a - // valid terminal DicNode. - isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; - } - readingHelper.readNextSiblingNode(ptNodeParams); - if (ptNodeParams.representsNonWordInfo()) { - // Skip PtNodes that represent non-word information. - continue; - } - const int wordId = isTerminal ? 
ptNodeParams.getHeadPos() : NOT_A_WORD_ID; - childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(), - wordId, ptNodeParams.getCodePointArrayView()); - } - if (readingHelper.isError()) { - mIsCorrupted = true; - AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); - } -} - -int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, - const int maxCodePointCount, int *const outCodePoints) const { - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); - readingHelper.initWithPtNodePos(ptNodePos); - const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount( - maxCodePointCount, outCodePoints); - if (readingHelper.isError()) { - mIsCorrupted = true; - AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); - } - return codePointCount; -} - -int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, - const bool forceLowerCaseSearch) const { - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(getRootPosition()); - const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), - wordCodePoints.size(), forceLowerCaseSearch); - if (readingHelper.isError()) { - mIsCorrupted = true; - AKLOGE("Dictionary reading error in getWordId()."); - } - return getWordIdFromTerminalPtNodePos(ptNodePos); -} - -const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( - const WordIdArrayView prevWordIds, const int wordId, - MultiBigramMap *const multiBigramMap) const { - if (wordId == NOT_A_WORD_ID) { - return WordAttributes(); - } - const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); - const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); - if (multiBigramMap) { - const int probability = 
multiBigramMap->getBigramProbability(this /* structurePolicy */, - prevWordIds, wordId, ptNodeParams.getProbability()); - return getWordAttributes(probability, ptNodeParams); - } - if (!prevWordIds.empty()) { - const int probability = getProbabilityOfWord(prevWordIds, wordId); - if (probability != NOT_A_PROBABILITY) { - return getWordAttributes(probability, ptNodeParams); - } - } - return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY), - ptNodeParams); -} - -const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability, - const PtNodeParams &ptNodeParams) const { - return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), - ptNodeParams.getProbability() == 0); -} - -int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, - const int bigramProbability) const { - // In the v4 format, bigramProbability is a conditional probability. - const int bigramConditionalProbability = bigramProbability; - if (unigramProbability == NOT_A_PROBABILITY) { - return NOT_A_PROBABILITY; - } - if (bigramConditionalProbability == NOT_A_PROBABILITY) { - return ProbabilityUtils::backoff(unigramProbability); - } - return bigramConditionalProbability; -} - -int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, - const int wordId) const { - if (wordId == NOT_A_WORD_ID) { - return NOT_A_PROBABILITY; - } - const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); - const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); - if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) { - return NOT_A_PROBABILITY; - } - if (prevWordIds.empty()) { - return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); - } - if (prevWordIds[0] == NOT_A_WORD_ID) { - return NOT_A_PROBABILITY; - } - const PtNodeParams prevWordPtNodeParams = - mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]); - if 
(prevWordPtNodeParams.isDeleted()) { - return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); - } - const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos( - prevWordPtNodeParams.getTerminalId()); - BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); - while (bigramsIt.hasNext()) { - bigramsIt.next(); - if (bigramsIt.getBigramPos() == ptNodePos - && bigramsIt.getProbability() != NOT_A_PROBABILITY) { - const int bigramConditionalProbability = getBigramConditionalProbability( - prevWordPtNodeParams.getProbability(), - prevWordPtNodeParams.representsBeginningOfSentence(), - bigramsIt.getProbability()); - return getProbability(ptNodeParams.getProbability(), bigramConditionalProbability); - } - } - return NOT_A_PROBABILITY; -} - -void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, - NgramListener *const listener) const { - if (prevWordIds.firstOrDefault(NOT_A_DICT_POS) == NOT_A_DICT_POS) { - return; - } - const PtNodeParams prevWordPtNodeParams = - mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]); - if (prevWordPtNodeParams.isDeleted()) { - return; - } - const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos( - prevWordPtNodeParams.getTerminalId()); - BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); - while (bigramsIt.hasNext()) { - bigramsIt.next(); - const int bigramConditionalProbability = getBigramConditionalProbability( - prevWordPtNodeParams.getProbability(), - prevWordPtNodeParams.representsBeginningOfSentence(), bigramsIt.getProbability()); - listener->onVisitEntry(bigramConditionalProbability, - getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos())); - } -} - -int Ver4PatriciaTriePolicy::getBigramConditionalProbability(const int prevWordUnigramProbability, - const bool isInBeginningOfSentenceContext, const int bigramProbability) const { - if (mHeaderPolicy->hasHistoricalInfoOfWords()) { 
- if (isInBeginningOfSentenceContext) { - return bigramProbability; - } - // Calculate conditional probability. - return std::min(MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability, - MAX_PROBABILITY); - } else { - // bigramProbability is a conditional probability. - return bigramProbability; - } -} - -BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator( - const int wordId) const { - const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId)); - return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos); -} - -int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { - if (ptNodePos == NOT_A_DICT_POS) { - return NOT_A_DICT_POS; - } - const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); - if (ptNodeParams.isDeleted()) { - return NOT_A_DICT_POS; - } - return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( - ptNodeParams.getTerminalId()); -} - -int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { - if (ptNodePos == NOT_A_DICT_POS) { - return NOT_A_DICT_POS; - } - const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); - if (ptNodeParams.isDeleted()) { - return NOT_A_DICT_POS; - } - return mBuffers->getBigramDictContent()->getBigramListHeadPos( - ptNodeParams.getTerminalId()); -} - -bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints, - const UnigramProperty *const unigramProperty) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); - return false; - } - if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { - AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", - mDictBuffer->getTailPosition()); - return false; - } - if (wordCodePoints.size() > MAX_WORD_LENGTH) { - AKLOGE("The word is too long to insert to the dictionary, length: %zd", - wordCodePoints.size()); - return false; - } - for (const auto &shortcut : unigramProperty->getShortcuts()) { - if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { - AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", - shortcut.getTargetCodePoints()->size()); - return false; - } - } - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(getRootPosition()); - bool addedNewUnigram = false; - int codePointsToAdd[MAX_WORD_LENGTH]; - int codePointCountToAdd = wordCodePoints.size(); - memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd); - if (unigramProperty->representsBeginningOfSentence()) { - codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, - codePointCountToAdd, MAX_WORD_LENGTH); - } - if (codePointCountToAdd <= 0) { - return false; - } - const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd); - if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, - &addedNewUnigram)) { - if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { - mEntryCounters.incrementNgramCount(NgramType::Unigram); - } - if (unigramProperty->getShortcuts().size() > 0) { - // Add shortcut target. 
- const int wordPos = getTerminalPtNodePosFromWordId( - getWordId(codePointArrayView, false /* forceLowerCaseSearch */)); - if (wordPos == NOT_A_DICT_POS) { - AKLOGE("Cannot find terminal PtNode position to add shortcut target."); - return false; - } - for (const auto &shortcut : unigramProperty->getShortcuts()) { - if (!mUpdatingHelper.addShortcutTarget(wordPos, - CodePointArrayView(*shortcut.getTargetCodePoints()), - shortcut.getProbability())) { - AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " - "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), - shortcut.getProbability()); - return false; - } - } - } - return true; - } else { - return false; - } -} - -bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); - return false; - } - const int ptNodePos = getTerminalPtNodePosFromWordId( - getWordId(wordCodePoints, false /* forceLowerCaseSearch */)); - if (ptNodePos == NOT_A_DICT_POS) { - return false; - } - const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - return mNodeWriter.suppressUnigramEntry(&ptNodeParams); -} - -bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); - return false; - } - if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { - AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", - mDictBuffer->getTailPosition()); - return false; - } - const NgramContext *const ngramContext = ngramProperty->getNgramContext(); - if (!ngramContext->isValid()) { - AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary."); - return false; - } - if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { - AKLOGE("The word is too long to insert the ngram to the dictionary. " - "length: %zd", ngramProperty->getTargetCodePoints()->size()); - return false; - } - WordIdArray prevWordIdArray; - const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, - false /* tryLowerCaseSearch */); - if (prevWordIds.empty()) { - return false; - } - if (prevWordIds[0] == NOT_A_WORD_ID) { - if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { - const UnigramProperty beginningOfSentenceUnigramProperty( - true /* representsBeginningOfSentence */, true /* isNotAWord */, - false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo()); - if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), - &beginningOfSentenceUnigramProperty)) { - AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); - return false; - } - // Refresh word ids. 
- ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); - } else { - return false; - } - } - const int wordPos = getTerminalPtNodePosFromWordId(getWordId( - CodePointArrayView(*ngramProperty->getTargetCodePoints()), - false /* forceLowerCaseSearch */)); - if (wordPos == NOT_A_DICT_POS) { - return false; - } - bool addedNewBigram = false; - const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); - if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos), - wordPos, ngramProperty, &addedNewBigram)) { - if (addedNewBigram) { - mEntryCounters.incrementNgramCount(NgramType::Bigram); - } - return true; - } else { - return false; - } -} - -bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext, - const CodePointArrayView wordCodePoints) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); - return false; - } - if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { - AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", - mDictBuffer->getTailPosition()); - return false; - } - if (!ngramContext->isValid()) { - AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary."); - return false; - } - if (wordCodePoints.size() > MAX_WORD_LENGTH) { - AKLOGE("word is too long to remove n-gram entry form the dictionary. 
length: %zd", - wordCodePoints.size()); - } - WordIdArray prevWordIdArray; - const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, - false /* tryLowerCaseSerch */); - if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { - return false; - } - const int wordPos = getTerminalPtNodePosFromWordId(getWordId(wordCodePoints, - false /* forceLowerCaseSearch */)); - if (wordPos == NOT_A_DICT_POS) { - return false; - } - const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); - if (mUpdatingHelper.removeNgramEntry( - PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) { - mEntryCounters.decrementNgramCount(NgramType::Bigram); - return true; - } else { - return false; - } -} - - -bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( - const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints, - const bool isValidWord, const HistoricalInfo historicalInfo) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " - "dictionary."); - return false; - } - const int probability = isValidWord ? DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY; - const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, - false /* isNotAWord */, false /*isBlacklisted*/, probability, historicalInfo); - if (!addUnigramEntry(wordCodePoints, &unigramProperty)) { - AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext()."); - return false; - } - const int probabilityForNgram = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) - ? 
NOT_A_PROBABILITY : probability; - const NgramProperty ngramProperty(*ngramContext, wordCodePoints.toVector(), probabilityForNgram, - historicalInfo); - if (!addNgramEntry(&ngramProperty)) { - AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext()."); - return false; - } - return true; -} - -bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); - return false; - } - if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) { - AKLOGE("Cannot flush the dictionary to file."); - mIsCorrupted = true; - return false; - } - return true; -} - -bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); - return false; - } - if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { - AKLOGE("Cannot flush the dictionary to file with GC."); - mIsCorrupted = true; - return false; - } - return true; -} - -bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); - return false; - } - if (mBuffers->isNearSizeLimit()) { - // Additional buffer size is near the limit. - return true; - } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() - > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { - // Total extended region size of the trie exceeds the limit. - return true; - } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS - && mDictBuffer->getUsedAdditionalBufferSize() > 0) { - // Needs to reduce dictionary size. 
- return true; - } else if (mHeaderPolicy->isDecayingDict()) { - return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), - mHeaderPolicy); - } - return false; -} - -void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, - char *const outResult, const int maxResultLength) { - const int compareLength = queryLength + 1 /* terminator */; - if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", - mEntryCounters.getNgramCount(NgramType::Unigram)); - } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram)); - } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", - mHeaderPolicy->isDecayingDict() ? - ForgettingCurveUtils::getEntryCountHardLimit( - mHeaderPolicy->getMaxNgramCounts().getNgramCount( - NgramType::Unigram)) : - static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); - } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", - mHeaderPolicy->isDecayingDict() ? 
- ForgettingCurveUtils::getEntryCountHardLimit( - mHeaderPolicy->getMaxNgramCounts().getNgramCount( - NgramType::Bigram)) : - static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); - } -} - -const WordProperty Ver4PatriciaTriePolicy::getWordProperty( - const CodePointArrayView wordCodePoints) const { - const int ptNodePos = getTerminalPtNodePosFromWordId( - getWordId(wordCodePoints, false /* forceLowerCaseSearch */)); - if (ptNodePos == NOT_A_DICT_POS) { - AKLOGE("getWordProperty is called for invalid word."); - return WordProperty(); - } - const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - const ProbabilityEntry probabilityEntry = - mBuffers->getProbabilityDictContent()->getProbabilityEntry( - ptNodeParams.getTerminalId()); - const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); - // Fetch bigram information. - std::vector ngrams; - const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); - if (bigramListPos != NOT_A_DICT_POS) { - int bigramWord1CodePoints[MAX_WORD_LENGTH]; - const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent(); - const TerminalPositionLookupTable *const terminalPositionLookupTable = - mBuffers->getTerminalPositionLookupTable(); - bool hasNext = true; - int readingPos = bigramListPos; - while (hasNext) { - const BigramEntry bigramEntry = - bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); - hasNext = bigramEntry.hasNext(); - const int word1TerminalId = bigramEntry.getTargetTerminalId(); - const int word1TerminalPtNodePos = - terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId); - if (word1TerminalPtNodePos == NOT_A_DICT_POS) { - continue; - } - const int codePointCount = getCodePointsAndReturnCodePointCount( - getWordIdFromTerminalPtNodePos(word1TerminalPtNodePos), MAX_WORD_LENGTH, - bigramWord1CodePoints); - const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); - const int 
rawBigramProbability = bigramEntry.hasHistoricalInfo() - ? ForgettingCurveUtils::decodeProbability( - bigramEntry.getHistoricalInfo(), mHeaderPolicy) - : bigramEntry.getProbability(); - const int probability = getBigramConditionalProbability(ptNodeParams.getProbability(), - ptNodeParams.representsBeginningOfSentence(), rawBigramProbability); - ngrams.emplace_back( - NgramContext(wordCodePoints.data(), wordCodePoints.size(), - ptNodeParams.representsBeginningOfSentence()), - CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(), - probability, *historicalInfo); - } - } - // Fetch shortcut information. - std::vector shortcuts; - int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); - if (shortcutPos != NOT_A_DICT_POS) { - int shortcutTarget[MAX_WORD_LENGTH]; - const ShortcutDictContent *const shortcutDictContent = - mBuffers->getShortcutDictContent(); - bool hasNext = true; - while (hasNext) { - int shortcutTargetLength = 0; - int shortcutProbability = NOT_A_PROBABILITY; - shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, - &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); - shortcuts.emplace_back( - CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(), - shortcutProbability); - } - } - const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), - ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(), - ptNodeParams.getProbability(), *historicalInfo, std::move(shortcuts)); - return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); -} - -int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, - int *const outCodePointCount) { - *outCodePointCount = 0; - if (token == 0) { - mTerminalPtNodePositionsForIteratingWords.clear(); - DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( - &mTerminalPtNodePositionsForIteratingWords); - DynamicPtReadingHelper 
readingHelper(&mNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(getRootPosition()); - readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); - } - const int terminalPtNodePositionsVectorSize = - static_cast(mTerminalPtNodePositionsForIteratingWords.size()); - if (token < 0 || token >= terminalPtNodePositionsVectorSize) { - AKLOGE("Given token %d is invalid.", token); - return 0; - } - const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; - *outCodePointCount = getCodePointsAndReturnCodePointCount( - getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints); - const int nextToken = token + 1; - if (nextToken >= terminalPtNodePositionsVectorSize) { - // All words have been iterated. - mTerminalPtNodePositionsForIteratingWords.clear(); - return 0; - } - return nextToken; -} - -int Ver4PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const { - return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos; -} - -int Ver4PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const { - return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId; -} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h deleted file mode 100644 index 80b1111b4..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! - * Do not edit this file other than updating policy's interface. - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h - */ - -#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H -#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H - -#include - -#include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" -#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include 
"suggest/policyimpl/dictionary/utils/entry_counters.h" -#include "utils/int_array_view.h" - -namespace latinime { -namespace backward { -namespace v402 { - -} // namespace v402 -} // namespace backward -class DicNode; -namespace backward { -namespace v402 { -} // namespace v402 -} // namespace backward -class DicNodeVector; -namespace backward { -namespace v402 { - -// Word id = Position of a PtNode that represents the word. -// Max supported n-gram is bigram. -class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { - public: - Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers) - : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()), - mDictBuffer(mBuffers->getWritableTrieBuffer()), - mBigramPolicy(mBuffers->getMutableBigramDictContent(), - mBuffers->getTerminalPositionLookupTable(), mHeaderPolicy), - mShortcutPolicy(mBuffers->getMutableShortcutDictContent(), - mBuffers->getTerminalPositionLookupTable()), - mNodeReader(mDictBuffer, mBuffers->getProbabilityDictContent(), mHeaderPolicy), - mPtNodeArrayReader(mDictBuffer), - mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader, - &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy), - mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), - mWritingHelper(mBuffers.get()), - mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()), - mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; - - virtual int getRootPosition() const { - return 0; - } - - void createAndGetAllChildDicNodes(const DicNode *const dicNode, - DicNodeVector *const childDicNodes) const; - - int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, - int *const outCodePoints) const; - - int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; - - const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, - const int wordId, MultiBigramMap *const multiBigramMap) 
const; - - int getProbability(const int unigramProbability, const int bigramProbability) const; - - int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; - - void iterateNgramEntries(const WordIdArrayView prevWordIds, - NgramListener *const listener) const; - - BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; - - const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { - return mHeaderPolicy; - } - - bool addUnigramEntry(const CodePointArrayView wordCodePoints, - const UnigramProperty *const unigramProperty); - - bool removeUnigramEntry(const CodePointArrayView wordCodePoints); - - bool addNgramEntry(const NgramProperty *const ngramProperty); - - bool removeNgramEntry(const NgramContext *const ngramContext, - const CodePointArrayView wordCodePoints); - - bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, - const CodePointArrayView wordCodePoints, const bool isValidWord, - const HistoricalInfo historicalInfo); - - bool flush(const char *const filePath); - - bool flushWithGC(const char *const filePath); - - bool needsToRunGC(const bool mindsBlockByGC) const; - - void getProperty(const char *const query, const int queryLength, char *const outResult, - const int maxResultLength); - - const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; - - int getNextWordAndNextToken(const int token, int *const outCodePoints, - int *const outCodePointCount); - - bool isCorrupted() const { - return mIsCorrupted; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy); - - static const char *const UNIGRAM_COUNT_QUERY; - static const char *const BIGRAM_COUNT_QUERY; - static const char *const MAX_UNIGRAM_COUNT_QUERY; - static const char *const MAX_BIGRAM_COUNT_QUERY; - // When the dictionary size is near the maximum size, we have to refuse dynamic operations to - // prevent the dictionary from overflowing. 
- static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; - static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; - static const int DUMMY_PROBABILITY_FOR_VALID_WORDS; - - const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers; - const HeaderPolicy *const mHeaderPolicy; - BufferWithExtendableBuffer *const mDictBuffer; - Ver4BigramListPolicy mBigramPolicy; - Ver4ShortcutListPolicy mShortcutPolicy; - Ver4PatriciaTrieNodeReader mNodeReader; - Ver4PtNodeArrayReader mPtNodeArrayReader; - Ver4PatriciaTrieNodeWriter mNodeWriter; - DynamicPtUpdatingHelper mUpdatingHelper; - Ver4PatriciaTrieWritingHelper mWritingHelper; - MutableEntryCounters mEntryCounters; - std::vector mTerminalPtNodePositionsForIteratingWords; - mutable bool mIsCorrupted; - - int getBigramsPositionOfPtNode(const int ptNodePos) const; - int getShortcutPositionOfPtNode(const int ptNodePos) const; - int getWordIdFromTerminalPtNodePos(const int ptNodePos) const; - int getTerminalPtNodePosFromWordId(const int wordId) const; - const WordAttributes getWordAttributes(const int probability, - const PtNodeParams &ptNodeParams) const; - int getBigramConditionalProbability(const int prevWordUnigramProbability, - const bool isInBeginningOfSentenceContext, const int bigramProbability) const; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif // LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp deleted file mode 100644 index 80d531198..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in 
compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" - -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" - -namespace latinime { -namespace backward { -namespace v402 { - -/* static */ int Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition( - const uint8_t *const buffer, int *pos) { - return ByteArrayUtils::readUint32AndAdvancePosition(buffer, pos); -} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h deleted file mode 100644 index 3579c26d6..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h - */ - -#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H -#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H - -#include - -#include "defines.h" - -namespace latinime { -namespace backward { -namespace v402 { - -} // namespace v402 -} // namespace backward -class BufferWithExtendableBuffer; -namespace backward { -namespace v402 { - -class Ver4PatriciaTrieReadingUtils { - public: - static int getTerminalIdAndAdvancePosition(const uint8_t *const buffer, - int *const pos); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieReadingUtils); -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp deleted file mode 100644 index 985c16803..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" - -#include -#include - -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/file_utils.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" - -namespace latinime { -namespace backward { -namespace v402 { - -bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, - const EntryCounts &entryCounts) const { - const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); - BufferWithExtendableBuffer headerBuffer( - 
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); - const int extendedRegionSize = headerPolicy->getExtendedRegionSize() - + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); - if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, - entryCounts, extendedRegionSize, &headerBuffer)) { - AKLOGE("Cannot write header structure to buffer. " - "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " - "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram), - entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize); - return false; - } - return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); -} - -bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, - const char *const dictDirPath) { - const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); - Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( - Ver4DictBuffers::createVer4DictBuffers(headerPolicy, - Ver4DictConstants::MAX_DICTIONARY_SIZE)); - int unigramCount = 0; - int bigramCount = 0; - if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) { - return false; - } - BufferWithExtendableBuffer headerBuffer( - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); - MutableEntryCounters entryCounters; - entryCounters.setNgramCount(NgramType::Unigram, unigramCount); - entryCounters.setNgramCount(NgramType::Bigram, bigramCount); - if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, - entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) { - return false; - } - return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); -} - -bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, - const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, - int *const outUnigramCount, int *const outBigramCount) { - 
Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(), - mBuffers->getProbabilityDictContent(), headerPolicy); - Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); - Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(), - mBuffers->getTerminalPositionLookupTable(), headerPolicy); - Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), - mBuffers->getTerminalPositionLookupTable()); - Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), - mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, - &shortcutPolicy); - - DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - DynamicPtGcEventListeners - ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( - &ptNodeWriter); - if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( - &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { - return false; - } - const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - .getValidUnigramCount(); - const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram); - if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) { - if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) { - AKLOGE("Cannot remove unigrams. 
current: %d, max: %d", unigramCount, - maxUnigramCount); - return false; - } - } - - readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability - traversePolicyToUpdateBigramProbability(&ptNodeWriter); - if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( - &traversePolicyToUpdateBigramProbability)) { - return false; - } - const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); - const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram); - if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) { - if (!truncateBigrams(maxBigramCount)) { - AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount); - return false; - } - } - - // Mapping from positions in mBuffer to positions in bufferToWrite. - PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; - readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), - buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, - &shortcutPolicy); - DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer - traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, - buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); - if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( - &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { - return false; - } - - // Create policy instances for the GCed dictionary. 
- Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(), - buffersToWrite->getProbabilityDictContent(), headerPolicy); - Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); - Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(), - buffersToWrite->getTerminalPositionLookupTable(), headerPolicy); - Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), - buffersToWrite->getTerminalPositionLookupTable()); - Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), - buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy, - &newShortcutPolicy); - // Re-assign terminal IDs for valid terminal PtNodes. - TerminalPositionLookupTable::TerminalIdMap terminalIdMap; - if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds( - &terminalIdMap)) { - return false; - } - // Run GC for probability dict content. - if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap, - mBuffers->getProbabilityDictContent())) { - return false; - } - // Run GC for bigram dict content. - if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap, - mBuffers->getBigramDictContent(), outBigramCount)) { - return false; - } - // Run GC for shortcut dict content. 
- if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap, - mBuffers->getShortcutDictContent())) { - return false; - } - DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader); - newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields - traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap); - if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( - &traversePolicyToUpdateAllPositionFields)) { - return false; - } - newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds - traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap); - if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner( - &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) { - return false; - } - *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount(); - return true; -} - -bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( - const Ver4PatriciaTrieNodeReader *const ptNodeReader, - Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) { - const TerminalPositionLookupTable *const terminalPosLookupTable = - mBuffers->getTerminalPositionLookupTable(); - const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); - std::priority_queue, DictProbabilityComparator> - priorityQueue; - for (int i = 0; i < nextTerminalId; ++i) { - const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i); - if (terminalPos == NOT_A_DICT_POS) { - continue; - } - const ProbabilityEntry probabilityEntry = - mBuffers->getProbabilityDictContent()->getProbabilityEntry(i); - const int probability = probabilityEntry.hasHistoricalInfo() ? 
- ForgettingCurveUtils::decodeProbability( - probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : - probabilityEntry.getProbability(); - priorityQueue.push(DictProbability(terminalPos, probability, - probabilityEntry.getHistoricalInfo()->getTimestamp())); - } - - // Delete unigrams. - while (static_cast(priorityQueue.size()) > maxUnigramCount) { - const int ptNodePos = priorityQueue.top().getDictPos(); - priorityQueue.pop(); - const PtNodeParams ptNodeParams = - ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - if (ptNodeParams.representsNonWordInfo()) { - continue; - } - if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) { - AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos); - return false; - } - } - return true; -} - -bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) { - const TerminalPositionLookupTable *const terminalPosLookupTable = - mBuffers->getTerminalPositionLookupTable(); - const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); - std::priority_queue, DictProbabilityComparator> - priorityQueue; - BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent(); - for (int i = 0; i < nextTerminalId; ++i) { - const int bigramListPos = bigramDictContent->getBigramListHeadPos(i); - if (bigramListPos == NOT_A_DICT_POS) { - continue; - } - bool hasNext = true; - int readingPos = bigramListPos; - while (hasNext) { - const int entryPos = readingPos; - const BigramEntry bigramEntry = - bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); - hasNext = bigramEntry.hasNext(); - if (!bigramEntry.isValid()) { - continue; - } - const int probability = bigramEntry.hasHistoricalInfo() ? 
- ForgettingCurveUtils::decodeProbability( - bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : - bigramEntry.getProbability(); - priorityQueue.push(DictProbability(entryPos, probability, - bigramEntry.getHistoricalInfo()->getTimestamp())); - } - } - - // Delete bigrams. - while (static_cast(priorityQueue.size()) > maxBigramCount) { - const int entryPos = priorityQueue.top().getDictPos(); - const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos); - const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry(); - if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) { - AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos); - return false; - } - priorityQueue.pop(); - } - return true; -} - -bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds - ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { - if (!ptNodeParams->isTerminal()) { - return true; - } - TerminalPositionLookupTable::TerminalIdMap::const_iterator it = - mTerminalIdMap->find(ptNodeParams->getTerminalId()); - if (it == mTerminalIdMap->end()) { - AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", - ptNodeParams->getTerminalId(), mTerminalIdMap->size()); - return false; - } - if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { - AKLOGE("Cannot update terminal id. 
%d -> %d", it->first, it->second); - } - return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams); -} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h deleted file mode 100644 index 1aad33e38..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h - */ - -#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H -#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/utils/entry_counters.h" - -namespace latinime { -namespace backward { -namespace v402 { - -} // namespace v402 -} // namespace backward -class HeaderPolicy; -namespace backward { -namespace v402 { -class Ver4DictBuffers; -class Ver4PatriciaTrieNodeReader; -class Ver4PatriciaTrieNodeWriter; - -class Ver4PatriciaTrieWritingHelper { - public: - Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers) - : mBuffers(buffers) {} - - bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const; - - // This method cannot be const because the original dictionary buffer will be updated to detect - // useless PtNodes during GC. 
- bool writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const dictDirPath); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper); - - class TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds - : public DynamicPtReadingHelper::TraversingEventListener { - public: - TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds( - Ver4PatriciaTrieNodeWriter *const ptNodeWriter, - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap) - : mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {} - - bool onAscend() { return true; } - - bool onDescend(const int ptNodeArrayPos) { return true; } - - bool onReadingPtNodeArrayTail() { return true; } - - bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds); - - Ver4PatriciaTrieNodeWriter *const mPtNodeWriter; - const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap; - }; - - // For truncateUnigrams() and truncateBigrams(). - class DictProbability { - public: - DictProbability(const int dictPos, const int probability, const int timestamp) - : mDictPos(dictPos), mProbability(probability), mTimestamp(timestamp) {} - - int getDictPos() const { - return mDictPos; - } - - int getProbability() const { - return mProbability; - } - - int getTimestamp() const { - return mTimestamp; - } - - private: - DISALLOW_DEFAULT_CONSTRUCTOR(DictProbability); - - int mDictPos; - int mProbability; - int mTimestamp; - }; - - // For truncateUnigrams() and truncateBigrams(). 
- class DictProbabilityComparator { - public: - bool operator()(const DictProbability &left, const DictProbability &right) { - if (left.getProbability() != right.getProbability()) { - return left.getProbability() > right.getProbability(); - } - if (left.getTimestamp() != right.getTimestamp()) { - return left.getTimestamp() < right.getTimestamp(); - } - return left.getDictPos() > right.getDictPos(); - } - - private: - DISALLOW_ASSIGNMENT_OPERATOR(DictProbabilityComparator); - }; - - bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, - Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount, - int *const outBigramCount); - - bool truncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader, - Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount); - - bool truncateBigrams(const int maxBigramCount); - - Ver4DictBuffers *const mBuffers; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime - -#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp deleted file mode 100644 index 537a6d420..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! - * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp - */ - -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" - -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { -namespace backward { -namespace v402 { - -bool Ver4PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, - int *const outPtNodeCount, int *const outFirstPtNodePos) const { - if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mBuffer->getTailPosition()) { - // Reading invalid position because of a bug or a broken dictionary. 
- AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d", - ptNodeArrayPos, mBuffer->getTailPosition()); - ASSERT(false); - return false; - } - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodeArrayPos); - const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); - int readingPos = ptNodeArrayPos; - if (usesAdditionalBuffer) { - readingPos -= mBuffer->getOriginalBufferSize(); - } - const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( - dictBuf, &readingPos); - if (usesAdditionalBuffer) { - readingPos += mBuffer->getOriginalBufferSize(); - } - if (ptNodeCountInArray < 0) { - AKLOGE("Invalid PtNode count in an array: %d.", ptNodeCountInArray); - return false; - } - *outPtNodeCount = ptNodeCountInArray; - *outFirstPtNodePos = readingPos; - return true; -} - -bool Ver4PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos, - int *const outNextPtNodeArrayPos) const { - if (forwordLinkPos < 0 || forwordLinkPos >= mBuffer->getTailPosition()) { - // Reading invalid position because of bug or broken dictionary. 
- AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d", - forwordLinkPos, mBuffer->getTailPosition()); - ASSERT(false); - return false; - } - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(forwordLinkPos); - const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); - int readingPos = forwordLinkPos; - if (usesAdditionalBuffer) { - readingPos -= mBuffer->getOriginalBufferSize(); - } - const int nextPtNodeArrayOffset = - DynamicPtReadingUtils::getForwardLinkPosition(dictBuf, readingPos); - if (DynamicPtReadingUtils::isValidForwardLinkPosition(nextPtNodeArrayOffset)) { - *outNextPtNodeArrayPos = forwordLinkPos + nextPtNodeArrayOffset; - } else { - *outNextPtNodeArrayPos = NOT_A_DICT_POS; - } - return true; -} - -} // namespace v402 -} // namespace backward -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h deleted file mode 100644 index 4f8056801..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * !!!!! DO NOT EDIT THIS FILE !!!!! 
- * - * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h - */ - -#ifndef LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H -#define LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h" - -namespace latinime { -namespace backward { -namespace v402 { - -} // namespace v402 -} // namespace backward -class BufferWithExtendableBuffer; -namespace backward { -namespace v402 { - -class Ver4PtNodeArrayReader : public PtNodeArrayReader { - public: - Ver4PtNodeArrayReader(const BufferWithExtendableBuffer *const buffer) : mBuffer(buffer) {}; - - virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, - int *const outPtNodeCount, int *const outFirstPtNodePos) const; - virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, - int *const outNextPtNodeArrayPos) const; - - private: - DISALLOW_COPY_AND_ASSIGN(Ver4PtNodeArrayReader); - - const BufferWithExtendableBuffer *const mBuffer; -}; -} // namespace v402 -} // namespace backward -} // namespace latinime -#endif /* LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp deleted file mode 100644 index 9a9a21b6b..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h" - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" -#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "suggest/policyimpl/dictionary/utils/file_utils.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr - DictionaryStructureWithBufferPolicyFactory::newPolicyForExistingDictFile( - const char *const path, const int bufOffset, const int size, - const bool isUpdatable) { - if (FileUtils::existsDir(path)) { - // Given path represents a directory. 
- return newPolicyForDirectoryDict(path, isUpdatable); - } else { - if (isUpdatable) { - AKLOGE("One file dictionaries don't support updating. path: %s", path); - ASSERT(false); - return nullptr; - } - return newPolicyForFileDict(path, bufOffset, size); - } -} - -/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr - DictionaryStructureWithBufferPolicyFactory:: newPolicyForOnMemoryDict( - const int formatVersion, const std::vector &locale, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { - FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion); - switch (dictFormatVersion) { - case FormatUtils::VERSION_402: { - return newPolicyForOnMemoryV4Dict( - dictFormatVersion, locale, attributeMap); - } - case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_403: { - return newPolicyForOnMemoryV4Dict( - dictFormatVersion, locale, attributeMap); - } - default: - AKLOGE("DICT: dictionary format %d is not supported for on memory dictionary", - formatVersion); - break; - } - return nullptr; -} - -template -/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr - DictionaryStructureWithBufferPolicyFactory::newPolicyForOnMemoryV4Dict( - const FormatUtils::FORMAT_VERSION formatVersion, - const std::vector &locale, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { - HeaderPolicy headerPolicy(formatVersion, locale, attributeMap); - DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy, - DictConstants::MAX_DICT_EXTENDED_REGION_SIZE); - if (!DynamicPtWritingUtils::writeEmptyDictionary( - dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) { - AKLOGE("Empty ver4 dictionary structure cannot be created on memory."); - return nullptr; - } - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( - new StructurePolicy(std::move(dictBuffers))); -} - -/* static */ 
DictionaryStructureWithBufferPolicy::StructurePolicyPtr - DictionaryStructureWithBufferPolicyFactory::newPolicyForDirectoryDict( - const char *const path, const bool isUpdatable) { - const int headerFilePathBufSize = PATH_MAX + 1 /* terminator */; - char headerFilePath[headerFilePathBufSize]; - getHeaderFilePathInDictDir(path, headerFilePathBufSize, headerFilePath); - // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of - // MmappedBufferPtr if the instance has the responsibility. - MmappedBuffer::MmappedBufferPtr mmappedBuffer = - MmappedBuffer::openBuffer(headerFilePath, isUpdatable); - if (!mmappedBuffer) { - return nullptr; - } - const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::detectFormatVersion( - mmappedBuffer->getReadOnlyByteArrayView()); - switch (formatVersion) { - case FormatUtils::VERSION_2: - case FormatUtils::VERSION_201: - case FormatUtils::VERSION_202: - AKLOGE("Given path is a directory but the format is version 2xx. path: %s", path); - break; - case FormatUtils::VERSION_402: { - return newPolicyForV4Dict( - headerFilePath, formatVersion, std::move(mmappedBuffer)); - } - case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_403: { - return newPolicyForV4Dict( - headerFilePath, formatVersion, std::move(mmappedBuffer)); - } - default: - AKLOGE("DICT: dictionary format is unknown, bad magic number. 
path: %s", path); - break; - } - ASSERT(false); - return nullptr; -} - -template -/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr - DictionaryStructureWithBufferPolicyFactory::newPolicyForV4Dict( - const char *const headerFilePath, const FormatUtils::FORMAT_VERSION formatVersion, - MmappedBuffer::MmappedBufferPtr &&mmappedBuffer) { - const int dictDirPathBufSize = strlen(headerFilePath) + 1 /* terminator */; - char dictPath[dictDirPathBufSize]; - if (!FileUtils::getFilePathWithoutSuffix(headerFilePath, - DictConstants::HEADER_FILE_EXTENSION, dictDirPathBufSize, dictPath)) { - AKLOGE("Dictionary file name is not valid as a ver4 dictionary. header path: %s", - headerFilePath); - ASSERT(false); - return nullptr; - } - DictBuffersPtr dictBuffers = - DictBuffers::openVer4DictBuffers(dictPath, std::move(mmappedBuffer), formatVersion); - if (!dictBuffers || !dictBuffers->isValid()) { - AKLOGE("DICT: The dictionary doesn't satisfy ver4 format requirements. path: %s", - dictPath); - ASSERT(false); - return nullptr; - } - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( - new StructurePolicy(std::move(dictBuffers))); -} - -/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr - DictionaryStructureWithBufferPolicyFactory::newPolicyForFileDict( - const char *const path, const int bufOffset, const int size) { - // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of - // MmappedBufferPtr if the instance has the responsibility. 
- MmappedBuffer::MmappedBufferPtr mmappedBuffer( - MmappedBuffer::openBuffer(path, bufOffset, size, false /* isUpdatable */)); - if (!mmappedBuffer) { - return nullptr; - } - switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) { - case FormatUtils::VERSION_2: - case FormatUtils::VERSION_201: - AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); - break; - case FormatUtils::VERSION_202: - return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( - new PatriciaTriePolicy(std::move(mmappedBuffer))); - case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_402: - case FormatUtils::VERSION_403: - AKLOGE("Given path is a file but the format is version 4. path: %s", path); - break; - default: - AKLOGE("DICT: dictionary format is unknown, bad magic number. path: %s", path); - break; - } - ASSERT(false); - return nullptr; -} - -/* static */ void DictionaryStructureWithBufferPolicyFactory::getHeaderFilePathInDictDir( - const char *const dictDirPath, const int outHeaderFileBufSize, - char *const outHeaderFilePath) { - const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */; - char dictName[dictNameBufSize]; - FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName); - snprintf(outHeaderFilePath, outHeaderFileBufSize, "%s/%s%s", dictDirPath, - dictName, Ver4DictConstants::HEADER_FILE_EXTENSION); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h b/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h deleted file mode 100644 index 768454d8d..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may 
not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H -#define LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H - -#include - -#include "defines.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" - -namespace latinime { - -class DictionaryStructureWithBufferPolicyFactory { - public: - static DictionaryStructureWithBufferPolicy::StructurePolicyPtr - newPolicyForExistingDictFile(const char *const path, const int bufOffset, - const int size, const bool isUpdatable); - - static DictionaryStructureWithBufferPolicy::StructurePolicyPtr - newPolicyForOnMemoryDict(const int formatVersion, const std::vector &locale, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryStructureWithBufferPolicyFactory); - - template - static DictionaryStructureWithBufferPolicy::StructurePolicyPtr - newPolicyForOnMemoryV4Dict(const FormatUtils::FORMAT_VERSION formatVersion, - const std::vector &locale, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); - - static DictionaryStructureWithBufferPolicy::StructurePolicyPtr - newPolicyForDirectoryDict(const char *const path, const bool isUpdatable); - - template - static DictionaryStructureWithBufferPolicy::StructurePolicyPtr 
newPolicyForV4Dict( - const char *const headerFilePath, const FormatUtils::FORMAT_VERSION formatVersion, - MmappedBuffer::MmappedBufferPtr &&mmappedBuffer); - - static DictionaryStructureWithBufferPolicy::StructurePolicyPtr - newPolicyForFileDict(const char *const path, const int bufOffset, const int size); - - static void getHeaderFilePathInDictDir(const char *const dirPath, - const int outHeaderFileBufSize, char *const outHeaderFilePath); -}; -} // namespace latinime -#endif // LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp deleted file mode 100644 index 1b2f857ab..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" - -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::MASK_ATTRIBUTE_ADDRESS_TYPE = - 0x30; -const BigramListReadWriteUtils::BigramFlags - BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; -const BigramListReadWriteUtils::BigramFlags - BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; -const BigramListReadWriteUtils::BigramFlags - BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; -const BigramListReadWriteUtils::BigramFlags - BigramListReadWriteUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; -// Flag for presence of more attributes -const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::FLAG_ATTRIBUTE_HAS_NEXT = - 0x80; -// Mask for attribute probability, stored on 4 bits inside the flags byte. -const BigramListReadWriteUtils::BigramFlags - BigramListReadWriteUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; - -/* static */ bool BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( - const ReadOnlyByteArrayView buffer, BigramFlags *const outBigramFlags, - int *const outTargetPtNodePos, int *const bigramEntryPos) { - if (static_cast(buffer.size()) <= *bigramEntryPos) { - AKLOGE("Read invalid pos in getBigramEntryPropertiesAndAdvancePosition(). 
bufSize: %zd, " - "bigramEntryPos: %d.", buffer.size(), *bigramEntryPos); - return false; - } - const BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), - bigramEntryPos); - if (outBigramFlags) { - *outBigramFlags = bigramFlags; - } - const int targetPos = getBigramAddressAndAdvancePosition(buffer, bigramFlags, bigramEntryPos); - if (outTargetPtNodePos) { - *outTargetPtNodePos = targetPos; - } - return true; -} - -/* static */ bool BigramListReadWriteUtils::skipExistingBigrams(const ReadOnlyByteArrayView buffer, - int *const bigramListPos) { - BigramFlags flags; - do { - if (!getBigramEntryPropertiesAndAdvancePosition(buffer, &flags, 0 /* outTargetPtNodePos */, - bigramListPos)) { - return false; - } - } while(hasNext(flags)); - return true; -} - -/* static */ int BigramListReadWriteUtils::getBigramAddressAndAdvancePosition( - const ReadOnlyByteArrayView buffer, const BigramFlags flags, int *const pos) { - int offset = 0; - const int origin = *pos; - switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { - case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: - offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), pos); - break; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: - offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer.data(), pos); - break; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: - offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer.data(), pos); - break; - } - if (isOffsetNegative(flags)) { - return origin - offset; - } else { - return origin + offset; - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h deleted file mode 100644 index a0f7d5e83..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * 
Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H -#define LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H - -#include -#include - -#include "defines.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -class BufferWithExtendableBuffer; - -class BigramListReadWriteUtils { -public: - typedef uint8_t BigramFlags; - - static bool getBigramEntryPropertiesAndAdvancePosition(const ReadOnlyByteArrayView buffer, - BigramFlags *const outBigramFlags, int *const outTargetPtNodePos, - int *const bigramEntryPos); - - static AK_FORCE_INLINE int getProbabilityFromFlags(const BigramFlags flags) { - return flags & MASK_ATTRIBUTE_PROBABILITY; - } - - static AK_FORCE_INLINE bool hasNext(const BigramFlags flags) { - return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0; - } - - // Bigrams reading methods - static bool skipExistingBigrams(const ReadOnlyByteArrayView buffer, int *const bigramListPos); - -private: - DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadWriteUtils); - - static const BigramFlags MASK_ATTRIBUTE_ADDRESS_TYPE; - static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; - static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; - static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; - static const BigramFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE; - static const BigramFlags FLAG_ATTRIBUTE_HAS_NEXT; - static const BigramFlags 
MASK_ATTRIBUTE_PROBABILITY; - - static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) { - return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0; - } - - static int getBigramAddressAndAdvancePosition(const ReadOnlyByteArrayView buffer, - const BigramFlags flags, int *const pos); -}; -} // namespace latinime -#endif // LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp deleted file mode 100644 index db1a802d0..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" - -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" - -namespace latinime { - -bool DynamicPtGcEventListeners - ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { - // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless - // children. - bool isUselessPtNode = !ptNodeParams->isTerminal(); - if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) { - bool needsToKeepPtNode = true; - if (!mPtNodeWriter->updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( - ptNodeParams, &needsToKeepPtNode)) { - AKLOGE("Cannot update PtNode probability or get needs to keep PtNode after GC."); - return false; - } - if (!needsToKeepPtNode) { - isUselessPtNode = true; - } - } - if (mChildrenValue > 0) { - isUselessPtNode = false; - } else if (ptNodeParams->isTerminal()) { - // Remove children as all children are useless. - if (!mPtNodeWriter->updateChildrenPosition(ptNodeParams, - NOT_A_DICT_POS /* newChildrenPosition */)) { - return false; - } - } - if (isUselessPtNode) { - // Current PtNode is no longer needed. Mark it as deleted. 
- if (!mPtNodeWriter->markPtNodeAsDeleted(ptNodeParams)) { - return false; - } - } else { - mValueStack.back() += 1; - if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) { - mValidUnigramCount += 1; - } - } - return true; -} - -bool DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability - ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { - if (!ptNodeParams->isDeleted()) { - int bigramEntryCount = 0; - if (!mPtNodeWriter->updateAllBigramEntriesAndDeleteUselessEntries(ptNodeParams, - &bigramEntryCount)) { - return false; - } - mValidBigramEntryCount += bigramEntryCount; - } - return true; -} - -// Writes dummy PtNode array size when the head of PtNode array is read. -bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer - ::onDescend(const int ptNodeArrayPos) { - mValidPtNodeCount = 0; - int writingPos = mBufferToWrite->getTailPosition(); - mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.insert( - PtNodeWriter::PtNodeArrayPositionRelocationMap::value_type(ptNodeArrayPos, writingPos)); - // Writes dummy PtNode array size because arrays can have a forward link or needles PtNodes. - // This field will be updated later in onReadingPtNodeArrayTail() with actual PtNode count. - mPtNodeArraySizeFieldPos = writingPos; - return DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition( - mBufferToWrite, 0 /* arraySize */, &writingPos); -} - -// Write PtNode array terminal and actual PtNode array size. -bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer - ::onReadingPtNodeArrayTail() { - int writingPos = mBufferToWrite->getTailPosition(); - // Write PtNode array terminal. - if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition( - mBufferToWrite, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { - return false; - } - // Write actual PtNode array size. 
- if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition( - mBufferToWrite, mValidPtNodeCount, &mPtNodeArraySizeFieldPos)) { - return false; - } - return true; -} - -// Write valid PtNode to buffer and memorize mapping from the old position to the new position. -bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer - ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { - if (ptNodeParams->isDeleted()) { - // Current PtNode is not written in new buffer because it has been deleted. - mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( - PtNodeWriter::PtNodePositionRelocationMap::value_type( - ptNodeParams->getHeadPos(), NOT_A_DICT_POS)); - return true; - } - int writingPos = mBufferToWrite->getTailPosition(); - mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( - PtNodeWriter::PtNodePositionRelocationMap::value_type( - ptNodeParams->getHeadPos(), writingPos)); - mValidPtNodeCount++; - // Writes current PtNode. - return mPtNodeWriter->writePtNodeAndAdvancePosition(ptNodeParams, &writingPos); -} - -bool DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields - ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { - // Updates parent position. 
- int bigramCount = 0; - if (!mPtNodeWriter->updateAllPositionFields(ptNodeParams, mDictPositionRelocationMap, - &bigramCount)) { - return false; - } - mBigramCount += bigramCount; - if (ptNodeParams->isTerminal()) { - mUnigramCount++; - } - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h deleted file mode 100644 index b8a4a92e8..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H -#define LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -class PtNodeParams; - -class DynamicPtGcEventListeners { - public: - // Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or - // not and marks useless PtNodes as deleted. Such deleted PtNodes will be discarded in the GC. 
- // TODO: Concatenate non-terminal PtNodes. - class TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - : public DynamicPtReadingHelper::TraversingEventListener { - public: - TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( - PtNodeWriter *const ptNodeWriter) - : mPtNodeWriter(ptNodeWriter), mValueStack(), mChildrenValue(0), - mValidUnigramCount(0) {} - - ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {}; - - bool onAscend() { - if (mValueStack.empty()) { - return false; - } - mChildrenValue = mValueStack.back(); - mValueStack.pop_back(); - return true; - } - - bool onDescend(const int ptNodeArrayPos) { - mValueStack.push_back(0); - mChildrenValue = 0; - return true; - } - - bool onReadingPtNodeArrayTail() { return true; } - - bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); - - int getValidUnigramCount() const { - return mValidUnigramCount; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS( - TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted); - - PtNodeWriter *const mPtNodeWriter; - std::vector mValueStack; - int mChildrenValue; - int mValidUnigramCount; - }; - - // TODO: Remove when we stop supporting v402 format. - // Updates all bigram entries that are held by valid PtNodes. This removes useless bigram - // entries. 
- class TraversePolicyToUpdateBigramProbability - : public DynamicPtReadingHelper::TraversingEventListener { - public: - TraversePolicyToUpdateBigramProbability(PtNodeWriter *const ptNodeWriter) - : mPtNodeWriter(ptNodeWriter), mValidBigramEntryCount(0) {} - - bool onAscend() { return true; } - - bool onDescend(const int ptNodeArrayPos) { return true; } - - bool onReadingPtNodeArrayTail() { return true; } - - bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); - - int getValidBigramEntryCount() const { - return mValidBigramEntryCount; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability); - - PtNodeWriter *const mPtNodeWriter; - int mValidBigramEntryCount; - }; - - class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer - : public DynamicPtReadingHelper::TraversingEventListener { - public: - TraversePolicyToPlaceAndWriteValidPtNodesToBuffer( - PtNodeWriter *const ptNodeWriter, BufferWithExtendableBuffer *const bufferToWrite, - PtNodeWriter::DictPositionRelocationMap *const dictPositionRelocationMap) - : mPtNodeWriter(ptNodeWriter), mBufferToWrite(bufferToWrite), - mDictPositionRelocationMap(dictPositionRelocationMap), mValidPtNodeCount(0), - mPtNodeArraySizeFieldPos(NOT_A_DICT_POS) {}; - - bool onAscend() { return true; } - - bool onDescend(const int ptNodeArrayPos); - - bool onReadingPtNodeArrayTail(); - - bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToPlaceAndWriteValidPtNodesToBuffer); - - PtNodeWriter *const mPtNodeWriter; - BufferWithExtendableBuffer *const mBufferToWrite; - PtNodeWriter::DictPositionRelocationMap *const mDictPositionRelocationMap; - int mValidPtNodeCount; - int mPtNodeArraySizeFieldPos; - }; - - class TraversePolicyToUpdateAllPositionFields - : public DynamicPtReadingHelper::TraversingEventListener { - public: - TraversePolicyToUpdateAllPositionFields(PtNodeWriter *const ptNodeWriter, - const 
PtNodeWriter::DictPositionRelocationMap *const dictPositionRelocationMap) - : mPtNodeWriter(ptNodeWriter), - mDictPositionRelocationMap(dictPositionRelocationMap), mUnigramCount(0), - mBigramCount(0) {}; - - bool onAscend() { return true; } - - bool onDescend(const int ptNodeArrayPos) { return true; } - - bool onReadingPtNodeArrayTail() { return true; } - - bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); - - int getUnigramCount() const { - return mUnigramCount; - } - - int getBigramCount() const { - return mBigramCount; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields); - - PtNodeWriter *const mPtNodeWriter; - const PtNodeWriter::DictPositionRelocationMap *const mDictPositionRelocationMap; - int mUnigramCount; - int mBigramCount; - }; - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtGcEventListeners); -}; -} // namespace latinime -#endif /* LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp deleted file mode 100644 index 5e4a4b166..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" - -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h" -#include "utils/char_utils.h" - -namespace latinime { - -// To avoid infinite loop caused by invalid or malicious forward links. -const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; -const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000; -const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH; - -bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode( - const PtNodeParams *const ptNodeParams) { - if (ptNodeParams->isTerminal() && !ptNodeParams->isDeleted()) { - mTerminalPositions->push_back(ptNodeParams->getHeadPos()); - } - return true; -} - -// Visits all PtNodes in post-order depth first manner. -// For example, visits c -> b -> y -> x -> a for the following dictionary: -// a _ b _ c -// \ x _ y -bool DynamicPtReadingHelper::traverseAllPtNodesInPostorderDepthFirstManner( - TraversingEventListener *const listener) { - bool alreadyVisitedChildren = false; - // Descend from the root to the root PtNode array. - if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { - return false; - } - while (!isEnd()) { - const PtNodeParams ptNodeParams(getPtNodeParams()); - if (!ptNodeParams.isValid()) { - break; - } - if (!alreadyVisitedChildren) { - if (ptNodeParams.hasChildren()) { - // Move to the first child. - if (!listener->onDescend(ptNodeParams.getChildrenPos())) { - return false; - } - pushReadingStateToStack(); - readChildNode(ptNodeParams); - } else { - alreadyVisitedChildren = true; - } - } else { - if (!listener->onVisitingPtNode(&ptNodeParams)) { - return false; - } - readNextSiblingNode(ptNodeParams); - if (isEnd()) { - // All PtNodes in current linked PtNode arrays have been visited. - // Return to the parent. 
- if (!listener->onReadingPtNodeArrayTail()) { - return false; - } - if (mReadingStateStack.size() <= 0) { - break; - } - if (!listener->onAscend()) { - return false; - } - popReadingStateFromStack(); - alreadyVisitedChildren = true; - } else { - // Process sibling PtNode. - alreadyVisitedChildren = false; - } - } - } - // Ascend from the root PtNode array to the root. - if (!listener->onAscend()) { - return false; - } - return !isError(); -} - -// Visits all PtNodes in PtNode array level pre-order depth first manner, which is the same order -// that PtNodes are written in the dictionary buffer. -// For example, visits a -> b -> x -> c -> y for the following dictionary: -// a _ b _ c -// \ x _ y -bool DynamicPtReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( - TraversingEventListener *const listener) { - bool alreadyVisitedAllPtNodesInArray = false; - bool alreadyVisitedChildren = false; - // Descend from the root to the root PtNode array. - if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { - return false; - } - if (isEnd()) { - // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array. - if (!listener->onReadingPtNodeArrayTail()) { - return false; - } - } - pushReadingStateToStack(); - while (!isEnd()) { - const PtNodeParams ptNodeParams(getPtNodeParams()); - if (!ptNodeParams.isValid()) { - break; - } - if (alreadyVisitedAllPtNodesInArray) { - if (alreadyVisitedChildren) { - // Move to next sibling PtNode's children. - readNextSiblingNode(ptNodeParams); - if (isEnd()) { - // Return to the parent PTNode. - if (!listener->onAscend()) { - return false; - } - if (mReadingStateStack.size() <= 0) { - break; - } - popReadingStateFromStack(); - alreadyVisitedChildren = true; - alreadyVisitedAllPtNodesInArray = true; - } else { - alreadyVisitedChildren = false; - } - } else { - if (ptNodeParams.hasChildren()) { - // Move to the first child. 
- if (!listener->onDescend(ptNodeParams.getChildrenPos())) { - return false; - } - pushReadingStateToStack(); - readChildNode(ptNodeParams); - // Push state to return the head of PtNode array. - pushReadingStateToStack(); - alreadyVisitedAllPtNodesInArray = false; - alreadyVisitedChildren = false; - } else { - alreadyVisitedChildren = true; - } - } - } else { - if (!listener->onVisitingPtNode(&ptNodeParams)) { - return false; - } - readNextSiblingNode(ptNodeParams); - if (isEnd()) { - if (!listener->onReadingPtNodeArrayTail()) { - return false; - } - // Return to the head of current PtNode array. - popReadingStateFromStack(); - alreadyVisitedAllPtNodesInArray = true; - } - } - } - popReadingStateFromStack(); - // Ascend from the root PtNode array to the root. - if (!listener->onAscend()) { - return false; - } - return !isError(); -} - -int DynamicPtReadingHelper::getCodePointsAndReturnCodePointCount(const int maxCodePointCount, - int *const outCodePoints) { - // This method traverses parent nodes from the terminal by following parent pointers; thus, - // node code points are stored in the buffer in the reverse order. - int reverseCodePoints[maxCodePointCount]; - const PtNodeParams terminalPtNodeParams(getPtNodeParams()); - // First, read the terminal node and get its probability. - if (!isValidTerminalNode(terminalPtNodeParams)) { - // Node at the ptNodePos is not a valid terminal node. - return 0; - } - // Then, following parent node link to the dictionary root and fetch node code points. - int totalCodePointCount = 0; - while (!isEnd()) { - const PtNodeParams ptNodeParams(getPtNodeParams()); - totalCodePointCount = getTotalCodePointCount(ptNodeParams); - if (!ptNodeParams.isValid() || totalCodePointCount > maxCodePointCount) { - // The ptNodePos is not a valid terminal node position in the dictionary. - return 0; - } - // Store node code points to buffer in the reverse order. 
- fetchMergedNodeCodePointsInReverseOrder(ptNodeParams, getPrevTotalCodePointCount(), - reverseCodePoints); - // Follow parent node toward the root node. - readParentNode(ptNodeParams); - } - if (isError()) { - // The node position or the dictionary is invalid. - return 0; - } - // Reverse the stored code points to output them. - for (int i = 0; i < totalCodePointCount; ++i) { - outCodePoints[i] = reverseCodePoints[totalCodePointCount - i - 1]; - } - return totalCodePointCount; -} - -int DynamicPtReadingHelper::getTerminalPtNodePositionOfWord(const int *const inWord, - const size_t length, const bool forceLowerCaseSearch) { - int searchCodePoints[length]; - for (size_t i = 0; i < length; ++i) { - searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i]; - } - while (!isEnd()) { - const PtNodeParams ptNodeParams(getPtNodeParams()); - const int matchedCodePointCount = getPrevTotalCodePointCount(); - if (getTotalCodePointCount(ptNodeParams) > length - || !isMatchedCodePoint(ptNodeParams, 0 /* index */, - searchCodePoints[matchedCodePointCount])) { - // Current node has too many code points or its first code point is different from - // target code point. Skip this node and read the next sibling node. - readNextSiblingNode(ptNodeParams); - continue; - } - // Check following merged node code points. - const int nodeCodePointCount = ptNodeParams.getCodePointCount(); - for (int j = 1; j < nodeCodePointCount; ++j) { - if (!isMatchedCodePoint(ptNodeParams, j, searchCodePoints[matchedCodePointCount + j])) { - // Different code point is found. The given word is not included in the dictionary. - return NOT_A_DICT_POS; - } - } - // All characters are matched. - if (length == getTotalCodePointCount(ptNodeParams)) { - if (!ptNodeParams.isTerminal()) { - return NOT_A_DICT_POS; - } - // Terminal position is found. 
- return ptNodeParams.getHeadPos(); - } - if (!ptNodeParams.hasChildren()) { - return NOT_A_DICT_POS; - } - // Advance to the children nodes. - readChildNode(ptNodeParams); - } - // If we already traversed the tree further than the word is long, there means - // there was no match (or we would have found it). - return NOT_A_DICT_POS; -} - -// Read node array size and process empty node arrays. Nodes and arrays are counted up in this -// method to avoid an infinite loop. -void DynamicPtReadingHelper::nextPtNodeArray() { - int ptNodeCountInArray = 0; - int firstPtNodePos = NOT_A_DICT_POS; - if (!mPtNodeArrayReader->readPtNodeArrayInfoAndReturnIfValid( - mReadingState.mPos, &ptNodeCountInArray, &firstPtNodePos)) { - mIsError = true; - mReadingState.mPos = NOT_A_DICT_POS; - return; - } - mReadingState.mPosOfThisPtNodeArrayHead = mReadingState.mPos; - mReadingState.mRemainingPtNodeCountInThisArray = ptNodeCountInArray; - mReadingState.mPos = firstPtNodePos; - // Count up nodes and node arrays to avoid infinite loop. - mReadingState.mTotalPtNodeIndexInThisArrayChain += - mReadingState.mRemainingPtNodeCountInThisArray; - mReadingState.mPtNodeArrayIndexInThisArrayChain++; - if (mReadingState.mRemainingPtNodeCountInThisArray < 0 - || mReadingState.mTotalPtNodeIndexInThisArrayChain - > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP - || mReadingState.mPtNodeArrayIndexInThisArrayChain - > MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) { - // Invalid dictionary. - AKLOGI("Invalid dictionary. 
nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d" - "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d", - mReadingState.mRemainingPtNodeCountInThisArray, - mReadingState.mTotalPtNodeIndexInThisArrayChain, - MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, - mReadingState.mPtNodeArrayIndexInThisArrayChain, - MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP); - ASSERT(false); - mIsError = true; - mReadingState.mPos = NOT_A_DICT_POS; - return; - } - if (mReadingState.mRemainingPtNodeCountInThisArray == 0) { - // Empty node array. Try following forward link. - followForwardLink(); - } -} - -// Follow the forward link and read the next node array if exists. -void DynamicPtReadingHelper::followForwardLink() { - int nextPtNodeArrayPos = NOT_A_DICT_POS; - if (!mPtNodeArrayReader->readForwardLinkAndReturnIfValid( - mReadingState.mPos, &nextPtNodeArrayPos)) { - mIsError = true; - mReadingState.mPos = NOT_A_DICT_POS; - return; - } - mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos; - if (nextPtNodeArrayPos != NOT_A_DICT_POS) { - // Follow the forward link. - mReadingState.mPos = nextPtNodeArrayPos; - nextPtNodeArray(); - } else { - // All node arrays have been read. - mReadingState.mPos = NOT_A_DICT_POS; - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h deleted file mode 100644 index 21c287fdc..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DYNAMIC_PT_READING_HELPER_H -#define LATINIME_DYNAMIC_PT_READING_HELPER_H - -#include -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" - -namespace latinime { - -class DictionaryShortcutsStructurePolicy; -class PtNodeArrayReader; - -/* - * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and - * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop. - */ -class DynamicPtReadingHelper { - public: - class TraversingEventListener { - public: - virtual ~TraversingEventListener() {}; - - // Returns whether the event handling was succeeded or not. - virtual bool onAscend() = 0; - - // Returns whether the event handling was succeeded or not. - virtual bool onDescend(const int ptNodeArrayPos) = 0; - - // Returns whether the event handling was succeeded or not. - virtual bool onReadingPtNodeArrayTail() = 0; - - // Returns whether the event handling was succeeded or not. 
- virtual bool onVisitingPtNode(const PtNodeParams *const node) = 0; - - protected: - TraversingEventListener() {}; - - private: - DISALLOW_COPY_AND_ASSIGN(TraversingEventListener); - }; - - class TraversePolicyToGetAllTerminalPtNodePositions : public TraversingEventListener { - public: - TraversePolicyToGetAllTerminalPtNodePositions(std::vector *const terminalPositions) - : mTerminalPositions(terminalPositions) {} - bool onAscend() { return true; } - bool onDescend(const int ptNodeArrayPos) { return true; } - bool onReadingPtNodeArrayTail() { return true; } - bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToGetAllTerminalPtNodePositions); - - std::vector *const mTerminalPositions; - }; - - DynamicPtReadingHelper(const PtNodeReader *const ptNodeReader, - const PtNodeArrayReader *const ptNodeArrayReader) - : mIsError(false), mReadingState(), mPtNodeReader(ptNodeReader), - mPtNodeArrayReader(ptNodeArrayReader), mReadingStateStack() {} - - ~DynamicPtReadingHelper() {} - - AK_FORCE_INLINE bool isError() const { - return mIsError; - } - - AK_FORCE_INLINE bool isEnd() const { - return mReadingState.mPos == NOT_A_DICT_POS; - } - - // Initialize reading state with the head position of a PtNode array. - AK_FORCE_INLINE void initWithPtNodeArrayPos(const int ptNodeArrayPos) { - if (ptNodeArrayPos == NOT_A_DICT_POS) { - mReadingState.mPos = NOT_A_DICT_POS; - } else { - mIsError = false; - mReadingState.mPos = ptNodeArrayPos; - mReadingState.mTotalCodePointCountSinceInitialization = 0; - mReadingState.mTotalPtNodeIndexInThisArrayChain = 0; - mReadingState.mPtNodeArrayIndexInThisArrayChain = 0; - mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; - mReadingStateStack.clear(); - nextPtNodeArray(); - } - } - - // Initialize reading state with the head position of a node. 
- AK_FORCE_INLINE void initWithPtNodePos(const int ptNodePos) { - if (ptNodePos == NOT_A_DICT_POS) { - mReadingState.mPos = NOT_A_DICT_POS; - } else { - mIsError = false; - mReadingState.mPos = ptNodePos; - mReadingState.mRemainingPtNodeCountInThisArray = 1; - mReadingState.mTotalCodePointCountSinceInitialization = 0; - mReadingState.mTotalPtNodeIndexInThisArrayChain = 1; - mReadingState.mPtNodeArrayIndexInThisArrayChain = 1; - mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; - mReadingState.mPosOfThisPtNodeArrayHead = NOT_A_DICT_POS; - mReadingStateStack.clear(); - } - } - - AK_FORCE_INLINE const PtNodeParams getPtNodeParams() const { - if (isEnd()) { - return PtNodeParams(); - } - return mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(mReadingState.mPos); - } - - AK_FORCE_INLINE bool isValidTerminalNode(const PtNodeParams &ptNodeParams) const { - return !isEnd() && !ptNodeParams.isDeleted() && ptNodeParams.isTerminal(); - } - - AK_FORCE_INLINE bool isMatchedCodePoint(const PtNodeParams &ptNodeParams, const int index, - const int codePoint) const { - return ptNodeParams.getCodePoints()[index] == codePoint; - } - - // Return code point count exclude the last read node's code points. - AK_FORCE_INLINE size_t getPrevTotalCodePointCount() const { - return mReadingState.mTotalCodePointCountSinceInitialization; - } - - // Return code point count include the last read node's code points. 
- AK_FORCE_INLINE size_t getTotalCodePointCount(const PtNodeParams &ptNodeParams) const { - return mReadingState.mTotalCodePointCountSinceInitialization - + ptNodeParams.getCodePointCount(); - } - - AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder(const PtNodeParams &ptNodeParams, - const int index, int *const outCodePoints) const { - const int nodeCodePointCount = ptNodeParams.getCodePointCount(); - const int *const nodeCodePoints = ptNodeParams.getCodePoints(); - for (int i = 0; i < nodeCodePointCount; ++i) { - outCodePoints[index + i] = nodeCodePoints[nodeCodePointCount - 1 - i]; - } - } - - AK_FORCE_INLINE void readNextSiblingNode(const PtNodeParams &ptNodeParams) { - mReadingState.mRemainingPtNodeCountInThisArray -= 1; - mReadingState.mPos = ptNodeParams.getSiblingNodePos(); - if (mReadingState.mRemainingPtNodeCountInThisArray <= 0) { - // All nodes in the current node array have been read. - followForwardLink(); - } - } - - // Read the first child node of the current node. - AK_FORCE_INLINE void readChildNode(const PtNodeParams &ptNodeParams) { - if (ptNodeParams.hasChildren()) { - mReadingState.mTotalCodePointCountSinceInitialization += - ptNodeParams.getCodePointCount(); - mReadingState.mTotalPtNodeIndexInThisArrayChain = 0; - mReadingState.mPtNodeArrayIndexInThisArrayChain = 0; - mReadingState.mPos = ptNodeParams.getChildrenPos(); - mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; - // Read children node array. - nextPtNodeArray(); - } else { - mReadingState.mPos = NOT_A_DICT_POS; - } - } - - // Read the parent node of the current node. 
- AK_FORCE_INLINE void readParentNode(const PtNodeParams &ptNodeParams) { - if (ptNodeParams.getParentPos() != NOT_A_DICT_POS) { - mReadingState.mTotalCodePointCountSinceInitialization += - ptNodeParams.getCodePointCount(); - mReadingState.mTotalPtNodeIndexInThisArrayChain = 1; - mReadingState.mPtNodeArrayIndexInThisArrayChain = 1; - mReadingState.mRemainingPtNodeCountInThisArray = 1; - mReadingState.mPos = ptNodeParams.getParentPos(); - mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; - mReadingState.mPosOfThisPtNodeArrayHead = NOT_A_DICT_POS; - } else { - mReadingState.mPos = NOT_A_DICT_POS; - } - } - - AK_FORCE_INLINE int getPosOfLastForwardLinkField() const { - return mReadingState.mPosOfLastForwardLinkField; - } - - AK_FORCE_INLINE int getPosOfLastPtNodeArrayHead() const { - return mReadingState.mPosOfThisPtNodeArrayHead; - } - - bool traverseAllPtNodesInPostorderDepthFirstManner(TraversingEventListener *const listener); - - bool traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( - TraversingEventListener *const listener); - - int getCodePointsAndReturnCodePointCount(const int maxCodePointCount, int *const outCodePoints); - - int getTerminalPtNodePositionOfWord(const int *const inWord, const size_t length, - const bool forceLowerCaseSearch); - - private: - DISALLOW_COPY_AND_ASSIGN(DynamicPtReadingHelper); - - // This class encapsulates the reading state of a position in the dictionary. It points at a - // specific PtNode in the dictionary. - class PtNodeReadingState { - public: - // Note that copy constructor and assignment operator are used for this class to use - // std::vector. - PtNodeReadingState() : mPos(NOT_A_DICT_POS), mRemainingPtNodeCountInThisArray(0), - mTotalCodePointCountSinceInitialization(0), mTotalPtNodeIndexInThisArrayChain(0), - mPtNodeArrayIndexInThisArrayChain(0), mPosOfLastForwardLinkField(NOT_A_DICT_POS), - mPosOfThisPtNodeArrayHead(NOT_A_DICT_POS) {} - - int mPos; - // Remaining node count in the current array. 
- int mRemainingPtNodeCountInThisArray; - size_t mTotalCodePointCountSinceInitialization; - // Counter of PtNodes used to avoid infinite loops caused by broken or malicious links. - int mTotalPtNodeIndexInThisArrayChain; - // Counter of PtNode arrays used to avoid infinite loops caused by cyclic links of empty - // PtNode arrays. - int mPtNodeArrayIndexInThisArrayChain; - int mPosOfLastForwardLinkField; - int mPosOfThisPtNodeArrayHead; - }; - - static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; - static const int MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP; - static const size_t MAX_READING_STATE_STACK_SIZE; - - // TODO: Introduce error code to track what caused the error. - bool mIsError; - PtNodeReadingState mReadingState; - const PtNodeReader *const mPtNodeReader; - const PtNodeArrayReader *const mPtNodeArrayReader; - std::vector mReadingStateStack; - - void nextPtNodeArray(); - - void followForwardLink(); - - AK_FORCE_INLINE void pushReadingStateToStack() { - if (mReadingStateStack.size() > MAX_READING_STATE_STACK_SIZE) { - AKLOGI("Reading state stack overflow. 
Max size: %zd", MAX_READING_STATE_STACK_SIZE); - ASSERT(false); - mIsError = true; - mReadingState.mPos = NOT_A_DICT_POS; - } else { - mReadingStateStack.push_back(mReadingState); - } - } - - AK_FORCE_INLINE void popReadingStateFromStack() { - if (mReadingStateStack.empty()) { - mReadingState.mPos = NOT_A_DICT_POS; - } else { - mReadingState = mReadingStateStack.back(); - mReadingStateStack.pop_back(); - } - } -}; -} // namespace latinime -#endif /* LATINIME_DYNAMIC_PT_READING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp deleted file mode 100644 index 3586b50ab..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" - -#include "defines.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" - -namespace latinime { - -const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::MASK_MOVED = 0xC0; -const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_NOT_MOVED = 0xC0; -const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_MOVED = 0x40; -const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_DELETED = 0x80; -const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_WILL_BECOME_NON_TERMINAL = 0x00; - -// TODO: Make DICT_OFFSET_ZERO_OFFSET = 0. -// Currently, DICT_OFFSET_INVALID is 0 in Java side but offset can be 0 during GC. So, the maximum -// value of offsets, which is 0x7FFFFF is used to represent 0 offset. -const int DynamicPtReadingUtils::DICT_OFFSET_INVALID = 0; -const int DynamicPtReadingUtils::DICT_OFFSET_ZERO_OFFSET = 0x7FFFFF; - -/* static */ int DynamicPtReadingUtils::getForwardLinkPosition(const uint8_t *const buffer, - const int pos) { - int linkAddressPos = pos; - return ByteArrayUtils::readSint24AndAdvancePosition(buffer, &linkAddressPos); -} - -/* static */ int DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( - const uint8_t *const buffer, int *const pos) { - return ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); -} - -/* static */ int DynamicPtReadingUtils::getParentPtNodePos(const int parentOffset, - const int ptNodePos) { - if (parentOffset == DICT_OFFSET_INVALID) { - return NOT_A_DICT_POS; - } else if (parentOffset == DICT_OFFSET_ZERO_OFFSET) { - return ptNodePos; - } else { - return parentOffset + ptNodePos; - } -} - -/* static */ int DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition( - const uint8_t *const buffer, int *const pos) { - const int base = *pos; - const int offset = ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); - if (offset == 
DICT_OFFSET_INVALID) { - // The PtNode does not have children. - return NOT_A_DICT_POS; - } else if (offset == DICT_OFFSET_ZERO_OFFSET) { - return base; - } else { - return base + offset; - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h deleted file mode 100644 index b13a075d5..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_DYNAMIC_PT_READING_UTILS_H -#define LATINIME_DYNAMIC_PT_READING_UTILS_H - -#include - -#include "defines.h" - -namespace latinime { - -class DynamicPtReadingUtils { - public: - typedef uint8_t NodeFlags; - - static const int DICT_OFFSET_INVALID; - static const int DICT_OFFSET_ZERO_OFFSET; - - static int getForwardLinkPosition(const uint8_t *const buffer, const int pos); - - static AK_FORCE_INLINE bool isValidForwardLinkPosition(const int forwardLinkAddress) { - return forwardLinkAddress != 0; - } - - static int getParentPtNodePosOffsetAndAdvancePosition(const uint8_t *const buffer, - int *const pos); - - static int getParentPtNodePos(const int parentOffset, const int ptNodePos); - - static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer, int *const pos); - - /** - * Node Flags - */ - static AK_FORCE_INLINE bool isMoved(const NodeFlags flags) { - return FLAG_IS_MOVED == (MASK_MOVED & flags); - } - - static AK_FORCE_INLINE bool isDeleted(const NodeFlags flags) { - return FLAG_IS_DELETED == (MASK_MOVED & flags); - } - - static AK_FORCE_INLINE bool willBecomeNonTerminal(const NodeFlags flags) { - return FLAG_WILL_BECOME_NON_TERMINAL == (MASK_MOVED & flags); - } - - static AK_FORCE_INLINE NodeFlags updateAndGetFlags(const NodeFlags originalFlags, - const bool isMoved, const bool isDeleted, const bool willBecomeNonTerminal) { - NodeFlags flags = originalFlags; - flags = willBecomeNonTerminal ? - ((flags & (~MASK_MOVED)) | FLAG_WILL_BECOME_NON_TERMINAL) : flags; - flags = isMoved ? ((flags & (~MASK_MOVED)) | FLAG_IS_MOVED) : flags; - flags = isDeleted ? ((flags & (~MASK_MOVED)) | FLAG_IS_DELETED) : flags; - flags = (!isMoved && !isDeleted && !willBecomeNonTerminal) ? 
- ((flags & (~MASK_MOVED)) | FLAG_IS_NOT_MOVED) : flags; - return flags; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtReadingUtils); - - static const NodeFlags MASK_MOVED; - static const NodeFlags FLAG_IS_NOT_MOVED; - static const NodeFlags FLAG_IS_MOVED; - static const NodeFlags FLAG_IS_DELETED; - static const NodeFlags FLAG_WILL_BECOME_NON_TERMINAL; -}; -} // namespace latinime -#endif /* LATINIME_DYNAMIC_PT_READING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp deleted file mode 100644 index e524e86e5..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" - -#include "suggest/core/dictionary/property/unigram_property.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; - -bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readingHelper, - const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty, - bool *const outAddedNewUnigram) { - int parentPos = NOT_A_DICT_POS; - while (!readingHelper->isEnd()) { - const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); - if (!ptNodeParams.isValid()) { - break; - } - const size_t matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); - if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */, - wordCodePoints[matchedCodePointCount])) { - // The first code point is different from target code point. Skip this node and read - // the next sibling node. - readingHelper->readNextSiblingNode(ptNodeParams); - continue; - } - // Check following merged node code points. 
- const size_t nodeCodePointCount = ptNodeParams.getCodePointArrayView().size(); - for (size_t j = 1; j < nodeCodePointCount; ++j) { - const size_t nextIndex = matchedCodePointCount + j; - if (nextIndex >= wordCodePoints.size() - || !readingHelper->isMatchedCodePoint(ptNodeParams, j, - wordCodePoints[matchedCodePointCount + j])) { - *outAddedNewUnigram = true; - return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty, - wordCodePoints.skip(matchedCodePointCount)); - } - } - // All characters are matched. - if (wordCodePoints.size() == readingHelper->getTotalCodePointCount(ptNodeParams)) { - return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram); - } - if (!ptNodeParams.hasChildren()) { - *outAddedNewUnigram = true; - return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty, - wordCodePoints.skip(readingHelper->getTotalCodePointCount(ptNodeParams))); - } - // Advance to the children nodes. - parentPos = ptNodeParams.getHeadPos(); - readingHelper->readChildNode(ptNodeParams); - } - if (readingHelper->isError()) { - // The dictionary is invalid. 
- return false; - } - int pos = readingHelper->getPosOfLastForwardLinkField(); - *outAddedNewUnigram = true; - return createAndInsertNodeIntoPtNodeArray(parentPos, - wordCodePoints.skip(readingHelper->getPrevTotalCodePointCount()), unigramProperty, - &pos); -} - -bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, - const int wordPos, const NgramProperty *const ngramProperty, - bool *const outAddedNewEntry) { - if (prevWordsPtNodePos.empty()) { - return false; - } - ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); - int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { - prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( - prevWordsPtNodePos[i]).getTerminalId(); - } - const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); - const int wordId = - mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); - return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry); -} - -bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, - const int wordPos) { - if (prevWordsPtNodePos.empty()) { - return false; - } - ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); - int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { - prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( - prevWordsPtNodePos[i]).getTerminalId(); - } - const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); - const int wordId = - mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); - return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId); -} - -bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos, - const CodePointArrayView targetCodePoints, const int shortcutProbability) { 
- const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos)); - return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints.data(), - targetCodePoints.size(), shortcutProbability); -} - -bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, - const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty, - int *const forwardLinkFieldPos) { - const int newPtNodeArrayPos = mBuffer->getTailPosition(); - if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, - newPtNodeArrayPos, forwardLinkFieldPos)) { - return false; - } - return createNewPtNodeArrayWithAChildPtNode(parentPos, ptNodeCodePoints, unigramProperty); -} - -bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, - const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { - if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) { - // Overwrites the probability. - *outAddedNewUnigram = false; - return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty); - } else { - // Make the node terminal and write the probability. 
- *outAddedNewUnigram = true; - const int movedPos = mBuffer->getTailPosition(); - int writingPos = movedPos; - const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams, - unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), - true /* isTerminal */, originalPtNodeParams->getParentPos(), - originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability())); - if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, - unigramProperty, &writingPos)) { - return false; - } - if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) { - return false; - } - } - return true; -} - -bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode( - const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty, - const CodePointArrayView codePoints) { - const int newPtNodeArrayPos = mBuffer->getTailPosition(); - if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) { - return false; - } - return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints, - unigramProperty); -} - -bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( - const int parentPtNodePos, const CodePointArrayView ptNodeCodePoints, - const UnigramProperty *const unigramProperty) { - int writingPos = mBuffer->getTailPosition(); - if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, - 1 /* arraySize */, &writingPos)) { - return false; - } - const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( - unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), - true /* isTerminal */, parentPtNodePos, ptNodeCodePoints, - unigramProperty->getProbability())); - if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, - unigramProperty, &writingPos)) { - return false; - } - if 
(!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, - NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { - return false; - } - return true; -} - -// Returns whether the dictionary updating was succeeded or not. -bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( - const PtNodeParams *const reallocatingPtNodeParams, const size_t overlappingCodePointCount, - const UnigramProperty *const unigramProperty, - const CodePointArrayView newPtNodeCodePoints) { - // When addsExtraChild is true, split the reallocating PtNode and add new child. - // Reallocating PtNode: abcde, newNode: abcxy. - // abc (1st, not terminal) __ de (2nd) - // \_ xy (extra child, terminal) - // Otherwise, this method makes 1st part terminal and write information in unigramProperty. - // Reallocating PtNode: abcde, newNode: abc. - // abc (1st, terminal) __ de (2nd) - const bool addsExtraChild = newPtNodeCodePoints.size() > overlappingCodePointCount; - const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); - int writingPos = firstPartOfReallocatedPtNodePos; - // Write the 1st part of the reallocating node. The children position will be updated later - // with actual children position. 
- const CodePointArrayView firstPtNodeCodePoints = - reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount); - if (addsExtraChild) { - const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( - false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */, - reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints, - NOT_A_PROBABILITY)); - if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { - return false; - } - } else { - const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( - unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), - true /* isTerminal */, reallocatingPtNodeParams->getParentPos(), - firstPtNodeCodePoints, unigramProperty->getProbability())); - if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, - unigramProperty, &writingPos)) { - return false; - } - } - const int actualChildrenPos = writingPos; - // Create new children PtNode array. - const size_t newPtNodeCount = addsExtraChild ? 2 : 1; - if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, - newPtNodeCount, &writingPos)) { - return false; - } - // Write the 2nd part of the reallocating node. 
- const int secondPartOfReallocatedPtNodePos = writingPos; - const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams, - reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(), - reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos, - reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount), - reallocatingPtNodeParams->getProbability())); - if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) { - return false; - } - if (addsExtraChild) { - const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( - unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), - true /* isTerminal */, firstPartOfReallocatedPtNodePos, - newPtNodeCodePoints.skip(overlappingCodePointCount), - unigramProperty->getProbability())); - if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams, - unigramProperty, &writingPos)) { - return false; - } - } - if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, - NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { - return false; - } - // Update original reallocating PtNode as moved. - if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos, - secondPartOfReallocatedPtNodePos)) { - return false; - } - // Load node info. Information of the 1st part will be fetched. - const PtNodeParams ptNodeParams( - mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos)); - // Update children position. 
- return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos); -} - -const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams( - const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, - const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, - const CodePointArrayView codePoints, const int probability) const { - const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( - isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */, - false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */, - CHILDREN_POSITION_FIELD_SIZE); - return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability); -} - -const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord, - const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, - const CodePointArrayView codePoints, const int probability) const { - const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( - isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */, - false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */, - CHILDREN_POSITION_FIELD_SIZE); - return PtNodeParams(flags, parentPos, codePoints, probability); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h deleted file mode 100644 index db5f6ab17..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DYNAMIC_PT_UPDATING_HELPER_H -#define LATINIME_DYNAMIC_PT_UPDATING_HELPER_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "utils/int_array_view.h" - -namespace latinime { - -class NgramProperty; -class BufferWithExtendableBuffer; -class DynamicPtReadingHelper; -class PtNodeReader; -class PtNodeWriter; -class UnigramProperty; - -class DynamicPtUpdatingHelper { - public: - DynamicPtUpdatingHelper(BufferWithExtendableBuffer *const buffer, - const PtNodeReader *const ptNodeReader, PtNodeWriter *const ptNodeWriter) - : mBuffer(buffer), mPtNodeReader(ptNodeReader), mPtNodeWriter(ptNodeWriter) {} - - ~DynamicPtUpdatingHelper() {} - - // Add a word to the dictionary. If the word already exists, update the probability. - bool addUnigramWord(DynamicPtReadingHelper *const readingHelper, - const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty, - bool *const outAddedNewUnigram); - - // TODO: Remove after stopping supporting v402. - // Add an n-gram entry. - bool addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos, - const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); - - // TODO: Remove after stopping supporting v402. - // Remove an n-gram entry. - bool removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos); - - // Add a shortcut target. 
- bool addShortcutTarget(const int wordPos, const CodePointArrayView targetCodePoints, - const int shortcutProbability); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtUpdatingHelper); - - static const int CHILDREN_POSITION_FIELD_SIZE; - - BufferWithExtendableBuffer *const mBuffer; - const PtNodeReader *const mPtNodeReader; - PtNodeWriter *const mPtNodeWriter; - - bool createAndInsertNodeIntoPtNodeArray(const int parentPos, - const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty, - int *const forwardLinkFieldPos); - - bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, - const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram); - - bool createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams *const parentPtNodeParams, - const UnigramProperty *const unigramProperty, - const CodePointArrayView remainingCodePoints); - - bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, - const CodePointArrayView ptNodeCodePoints, - const UnigramProperty *const unigramProperty); - - bool reallocatePtNodeAndAddNewPtNodes(const PtNodeParams *const reallocatingPtNodeParams, - const size_t overlappingCodePointCount, const UnigramProperty *const unigramProperty, - const CodePointArrayView newPtNodeCodePoints); - - const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams, - const bool isNotAWord, const bool isPossiblyOffensive, const bool isTerminal, - const int parentPos, const CodePointArrayView codePoints, const int probability) const; - - const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord, - const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, - const CodePointArrayView codePoints, const int probability) const; -}; -} // namespace latinime -#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_UPDATING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp 
b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp deleted file mode 100644 index 664aeebbb..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" - -#include -#include -#include - -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -const size_t DynamicPtWritingUtils::MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD = 0x7F; -const size_t DynamicPtWritingUtils::MAX_PTNODE_ARRAY_SIZE = 0x7FFF; -const int DynamicPtWritingUtils::SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE = 1; -const int DynamicPtWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE = 2; -const int DynamicPtWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG = 0x8000; -const int DynamicPtWritingUtils::DICT_OFFSET_FIELD_SIZE = 3; -const int DynamicPtWritingUtils::MAX_DICT_OFFSET_VALUE = 0x7FFFFF; -const int DynamicPtWritingUtils::MIN_DICT_OFFSET_VALUE = -0x7FFFFF; -const int DynamicPtWritingUtils::DICT_OFFSET_NEGATIVE_FLAG = 0x800000; -const int DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE = 1; - -/* static */ bool DynamicPtWritingUtils::writeEmptyDictionary( - BufferWithExtendableBuffer *const buffer, const int rootPos) { - 
int writingPos = rootPos; - if (!writePtNodeArraySizeAndAdvancePosition(buffer, 0 /* arraySize */, &writingPos)) { - return false; - } - return writeForwardLinkPositionAndAdvancePosition(buffer, NOT_A_DICT_POS /* forwardLinkPos */, - &writingPos); -} - -/* static */ bool DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition( - BufferWithExtendableBuffer *const buffer, const int forwardLinkPos, - int *const forwardLinkFieldPos) { - return writeDictOffset(buffer, forwardLinkPos, (*forwardLinkFieldPos), forwardLinkFieldPos); -} - -/* static */ bool DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition( - BufferWithExtendableBuffer *const buffer, const size_t arraySize, - int *const arraySizeFieldPos) { - // Currently, all array size field to be created has LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE to - // simplify updating process. - // TODO: Use SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE for small arrays. - /*if (arraySize <= MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD) { - return buffer->writeUintAndAdvancePosition(arraySize, SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE, - arraySizeFieldPos); - } else */ - if (arraySize <= MAX_PTNODE_ARRAY_SIZE) { - uint32_t data = arraySize | LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG; - return buffer->writeUintAndAdvancePosition(data, LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE, - arraySizeFieldPos); - } else { - AKLOGI("PtNode array size cannot be written because arraySize is too large: %zd", - arraySize); - ASSERT(false); - return false; - } -} - -/* static */ bool DynamicPtWritingUtils::writeFlagsAndAdvancePosition( - BufferWithExtendableBuffer *const buffer, - const DynamicPtReadingUtils::NodeFlags nodeFlags, int *const nodeFlagsFieldPos) { - return buffer->writeUintAndAdvancePosition(nodeFlags, NODE_FLAG_FIELD_SIZE, nodeFlagsFieldPos); -} - -// Note that parentOffset is offset from node's head position. 
-/* static */ bool DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( - BufferWithExtendableBuffer *const buffer, const int parentPos, const int basePos, - int *const parentPosFieldPos) { - return writeDictOffset(buffer, parentPos, basePos, parentPosFieldPos); -} - -/* static */ bool DynamicPtWritingUtils::writeCodePointsAndAdvancePosition( - BufferWithExtendableBuffer *const buffer, const int *const codePoints, - const int codePointCount, int *const codePointFieldPos) { - if (codePointCount <= 0) { - AKLOGI("code points cannot be written because codePointCount is invalid: %d", - codePointCount); - ASSERT(false); - return false; - } - const bool hasMultipleCodePoints = codePointCount > 1; - return buffer->writeCodePointsAndAdvancePosition(codePoints, codePointCount, - hasMultipleCodePoints, codePointFieldPos); -} - -/* static */ bool DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition( - BufferWithExtendableBuffer *const buffer, const int childrenPosition, - int *const childrenPositionFieldPos) { - return writeDictOffset(buffer, childrenPosition, (*childrenPositionFieldPos), - childrenPositionFieldPos); -} - -/* static */ bool DynamicPtWritingUtils::writeDictOffset(BufferWithExtendableBuffer *const buffer, - const int targetPos, const int basePos, int *const offsetFieldPos) { - int offset = targetPos - basePos; - if (targetPos == NOT_A_DICT_POS) { - offset = DynamicPtReadingUtils::DICT_OFFSET_INVALID; - } else if (offset == 0) { - offset = DynamicPtReadingUtils::DICT_OFFSET_ZERO_OFFSET; - } - if (offset > MAX_DICT_OFFSET_VALUE || offset < MIN_DICT_OFFSET_VALUE) { - AKLOGI("offset cannot be written because the offset is too large or too small: %d", - offset); - ASSERT(false); - return false; - } - uint32_t data = 0; - if (offset >= 0) { - data = offset; - } else { - data = abs(offset) | DICT_OFFSET_NEGATIVE_FLAG; - } - return buffer->writeUintAndAdvancePosition(data, DICT_OFFSET_FIELD_SIZE, offsetFieldPos); -} -} diff --git 
a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h deleted file mode 100644 index 362fbd1cc..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DYNAMIC_PT_WRITING_UTILS_H -#define LATINIME_DYNAMIC_PT_WRITING_UTILS_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" - -namespace latinime { - -class BufferWithExtendableBuffer; - -class DynamicPtWritingUtils { - public: - static const int NODE_FLAG_FIELD_SIZE; - - static bool writeEmptyDictionary(BufferWithExtendableBuffer *const buffer, const int rootPos); - - static bool writeForwardLinkPositionAndAdvancePosition( - BufferWithExtendableBuffer *const buffer, const int forwardLinkPos, - int *const forwardLinkFieldPos); - - static bool writePtNodeArraySizeAndAdvancePosition(BufferWithExtendableBuffer *const buffer, - const size_t arraySize, int *const arraySizeFieldPos); - - static bool writeFlags(BufferWithExtendableBuffer *const buffer, - const DynamicPtReadingUtils::NodeFlags nodeFlags, - const int nodeFlagsFieldPos) { - int writingPos = nodeFlagsFieldPos; - return 
writeFlagsAndAdvancePosition(buffer, nodeFlags, &writingPos); - } - - static bool writeFlagsAndAdvancePosition(BufferWithExtendableBuffer *const buffer, - const DynamicPtReadingUtils::NodeFlags nodeFlags, - int *const nodeFlagsFieldPos); - - static bool writeParentPosOffsetAndAdvancePosition(BufferWithExtendableBuffer *const buffer, - const int parentPosition, const int basePos, int *const parentPosFieldPos); - - static bool writeCodePointsAndAdvancePosition(BufferWithExtendableBuffer *const buffer, - const int *const codePoints, const int codePointCount, int *const codePointFieldPos); - - static bool writeChildrenPositionAndAdvancePosition(BufferWithExtendableBuffer *const buffer, - const int childrenPosition, int *const childrenPositionFieldPos); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtWritingUtils); - - static const size_t MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD; - static const size_t MAX_PTNODE_ARRAY_SIZE; - static const int SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE; - static const int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE; - static const int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG; - static const int DICT_OFFSET_FIELD_SIZE; - static const int MAX_DICT_OFFSET_VALUE; - static const int MIN_DICT_OFFSET_VALUE; - static const int DICT_OFFSET_NEGATIVE_FLAG; - - static bool writeDictOffset(BufferWithExtendableBuffer *const buffer, const int targetPos, - const int basePos, int *const offsetFieldPos); -}; -} // namespace latinime -#endif /* LATINIME_DYNAMIC_PT_WRITING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp deleted file mode 100644 index b8d78bf10..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the 
Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" - -#include "defines.h" -#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" - -namespace latinime { - -typedef PatriciaTrieReadingUtils PtReadingUtils; - -const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0; -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00; -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40; -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80; -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0; - -// Flag for single/multiple char group -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20; -// Flag for terminal PtNodes -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10; -// Flag for shortcut targets presence -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08; -// Flag for bigram presence -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04; -// Flag for non-words (typically, shortcut only entries) -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02; 
-// Flag for possibly offensive words -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01; - -/* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition( - const uint8_t *const buffer, int *const pos) { - const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); - if (firstByte < 0x80) { - return firstByte; - } else { - return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition( - buffer, pos); - } -} - -/* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition( - const uint8_t *const buffer, int *const pos) { - return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); -} - -/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer, - const int *const codePointTable, int *const pos) { - return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos); -} - -// Returns the number of read characters. -/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer, - const NodeFlags flags, const int maxLength, const int *const codePointTable, - int *const outBuffer, int *const pos) { - int length = 0; - if (hasMultipleChars(flags)) { - length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable, - outBuffer, pos); - } else { - const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos); - if (codePoint == NOT_A_CODE_POINT) { - // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is - // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR - // when the PtNode has a single code point. - length = 0; - AKLOGE("codePoint is NOT_A_CODE_POINT. 
pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x", - *pos - 1, codePoint, buffer[*pos - 1]); - ASSERT(false); - } else if (maxLength > 0) { - outBuffer[0] = codePoint; - length = 1; - } - } - return length; -} - -// Returns the number of skipped characters. -/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags, - const int maxLength, const int *const codePointTable, int *const pos) { - if (hasMultipleChars(flags)) { - return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos); - } else { - if (maxLength > 0) { - getCodePointAndAdvancePosition(buffer, codePointTable, pos); - return 1; - } else { - return 0; - } - } -} - -/* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer, - int *const pos) { - return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); -} - -/* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition( - const uint8_t *const buffer, const NodeFlags flags, int *const pos) { - const int base = *pos; - int offset = 0; - switch (MASK_CHILDREN_POSITION_TYPE & flags) { - case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE: - offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); - break; - case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES: - offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos); - break; - case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES: - offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos); - break; - default: - // If we come here, it means we asked for the children of a word with - // no children. 
- return NOT_A_DICT_POS; - } - return base + offset; -} - -/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, - const DictionaryShortcutsStructurePolicy *const shortcutPolicy, - const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable, - NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, - int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, - int *const outBigramPos, int *const outSiblingPos) { - int readingPos = ptNodePos; - const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos); - *outFlags = flags; - *outCodePointCount = getCharsAndAdvancePosition( - dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos); - *outProbability = isTerminal(flags) ? - readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY; - *outChildrenPos = hasChildrenInFlags(flags) ? - readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS; - *outShortcutPos = NOT_A_DICT_POS; - if (hasShortcutTargets(flags)) { - *outShortcutPos = readingPos; - shortcutPolicy->skipAllShortcuts(&readingPos); - } - *outBigramPos = NOT_A_DICT_POS; - if (hasBigrams(flags)) { - *outBigramPos = readingPos; - bigramPolicy->skipAllBigrams(&readingPos); - } - *outSiblingPos = readingPos; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h deleted file mode 100644 index 6a2bf5d3c..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the 
License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_PATRICIA_TRIE_READING_UTILS_H -#define LATINIME_PATRICIA_TRIE_READING_UTILS_H - -#include - -#include "defines.h" - -namespace latinime { - -class DictionaryShortcutsStructurePolicy; -class DictionaryBigramsStructurePolicy; - -class PatriciaTrieReadingUtils { - public: - typedef uint8_t NodeFlags; - - static int getPtNodeArraySizeAndAdvancePosition(const uint8_t *const buffer, int *const pos); - - static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos); - - static int getCodePointAndAdvancePosition(const uint8_t *const buffer, - const int *const codePointTable, int *const pos); - - // Returns the number of read characters. - static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags, - const int maxLength, const int *const codePointTable, int *const outBuffer, - int *const pos); - - // Returns the number of skipped characters. 
- static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags, - const int maxLength, const int *const codePointTable, int *const pos); - - static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos); - - static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer, - const NodeFlags flags, int *const pos); - - /** - * Node Flags - */ - static AK_FORCE_INLINE bool isPossiblyOffensive(const NodeFlags flags) { - return (flags & FLAG_IS_POSSIBLY_OFFENSIVE) != 0; - } - - static AK_FORCE_INLINE bool isNotAWord(const NodeFlags flags) { - return (flags & FLAG_IS_NOT_A_WORD) != 0; - } - - static AK_FORCE_INLINE bool isTerminal(const NodeFlags flags) { - return (flags & FLAG_IS_TERMINAL) != 0; - } - - static AK_FORCE_INLINE bool hasShortcutTargets(const NodeFlags flags) { - return (flags & FLAG_HAS_SHORTCUT_TARGETS) != 0; - } - - static AK_FORCE_INLINE bool hasBigrams(const NodeFlags flags) { - return (flags & FLAG_HAS_BIGRAMS) != 0; - } - - static AK_FORCE_INLINE bool hasMultipleChars(const NodeFlags flags) { - return (flags & FLAG_HAS_MULTIPLE_CHARS) != 0; - } - - static AK_FORCE_INLINE bool hasChildrenInFlags(const NodeFlags flags) { - return FLAG_CHILDREN_POSITION_TYPE_NOPOSITION != (MASK_CHILDREN_POSITION_TYPE & flags); - } - - static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isPossiblyOffensive, - const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets, - const bool hasBigrams, const bool hasMultipleChars, - const int childrenPositionFieldSize) { - NodeFlags nodeFlags = 0; - nodeFlags = isPossiblyOffensive ? (nodeFlags | FLAG_IS_POSSIBLY_OFFENSIVE) : nodeFlags; - nodeFlags = isNotAWord ? (nodeFlags | FLAG_IS_NOT_A_WORD) : nodeFlags; - nodeFlags = isTerminal ? (nodeFlags | FLAG_IS_TERMINAL) : nodeFlags; - nodeFlags = hasShortcutTargets ? (nodeFlags | FLAG_HAS_SHORTCUT_TARGETS) : nodeFlags; - nodeFlags = hasBigrams ? 
(nodeFlags | FLAG_HAS_BIGRAMS) : nodeFlags; - nodeFlags = hasMultipleChars ? (nodeFlags | FLAG_HAS_MULTIPLE_CHARS) : nodeFlags; - if (childrenPositionFieldSize == 1) { - nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_ONEBYTE; - } else if (childrenPositionFieldSize == 2) { - nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_TWOBYTES; - } else if (childrenPositionFieldSize == 3) { - nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_THREEBYTES; - } else { - nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_NOPOSITION; - } - return nodeFlags; - } - - static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, - const DictionaryShortcutsStructurePolicy *const shortcutPolicy, - const DictionaryBigramsStructurePolicy *const bigramPolicy, - const int *const codePointTable, NodeFlags *const outFlags, - int *const outCodePointCount, int *const outCodePoint, int *const outProbability, - int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos, - int *const outSiblingPos); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils); - - static const NodeFlags MASK_CHILDREN_POSITION_TYPE; - static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_NOPOSITION; - static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_ONEBYTE; - static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_TWOBYTES; - static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_THREEBYTES; - - static const NodeFlags FLAG_HAS_MULTIPLE_CHARS; - static const NodeFlags FLAG_IS_TERMINAL; - static const NodeFlags FLAG_HAS_SHORTCUT_TARGETS; - static const NodeFlags FLAG_HAS_BIGRAMS; - static const NodeFlags FLAG_IS_NOT_A_WORD; - static const NodeFlags FLAG_IS_POSSIBLY_OFFENSIVE; -}; -} // namespace latinime -#endif /* LATINIME_PATRICIA_TRIE_NODE_READING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h deleted file mode 100644 index 6078d8285..000000000 --- 
a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_PT_NODE_ARRAY_READER_H -#define LATINIME_PT_NODE_ARRAY_READER_H - -#include "defines.h" - -namespace latinime { - -// Interface class used to read PtNode array information. -class PtNodeArrayReader { - public: - virtual ~PtNodeArrayReader() {} - - // Returns if the position is valid or not. - virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, - int *const outPtNodeCount, int *const outFirstPtNodePos) const = 0; - - // Returns if the position is valid or not. NOT_A_DICT_POS is set to outNextPtNodeArrayPos when - // the next array doesn't exist. 
- virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, - int *const outNextPtNodeArrayPos) const = 0; - - protected: - PtNodeArrayReader() {}; - - private: - DISALLOW_COPY_AND_ASSIGN(PtNodeArrayReader); -}; -} // namespace latinime -#endif /* LATINIME_PT_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h deleted file mode 100644 index e52706e07..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_PT_NODE_PARAMS_H -#define LATINIME_PT_NODE_PARAMS_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "utils/char_utils.h" -#include "utils/int_array_view.h" - -namespace latinime { - -// This class has information of a PtNode. This class is immutable. -class PtNodeParams { - public: - // Invalid PtNode. 
- PtNodeParams() : mHeadPos(NOT_A_DICT_POS), mFlags(0), mHasMovedFlag(false), - mParentPos(NOT_A_DICT_POS), mCodePointCount(0), mCodePoints(), - mTerminalIdFieldPos(NOT_A_DICT_POS), mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), - mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), - mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS), - mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS), - mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) {} - - PtNodeParams(const PtNodeParams& ptNodeParams) - : mHeadPos(ptNodeParams.mHeadPos), mFlags(ptNodeParams.mFlags), - mHasMovedFlag(ptNodeParams.mHasMovedFlag), mParentPos(ptNodeParams.mParentPos), - mCodePointCount(ptNodeParams.mCodePointCount), mCodePoints(), - mTerminalIdFieldPos(ptNodeParams.mTerminalIdFieldPos), - mTerminalId(ptNodeParams.mTerminalId), - mProbabilityFieldPos(ptNodeParams.mProbabilityFieldPos), - mProbability(ptNodeParams.mProbability), - mChildrenPosFieldPos(ptNodeParams.mChildrenPosFieldPos), - mChildrenPos(ptNodeParams.mChildrenPos), - mBigramLinkedNodePos(ptNodeParams.mBigramLinkedNodePos), - mShortcutPos(ptNodeParams.mShortcutPos), mBigramPos(ptNodeParams.mBigramPos), - mSiblingPos(ptNodeParams.mSiblingPos) { - memcpy(mCodePoints, ptNodeParams.getCodePoints(), sizeof(int) * mCodePointCount); - } - - // PtNode read from version 2 dictionary. 
- PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags, - const int codePointCount, const int *const codePoints, const int probability, - const int childrenPos, const int shortcutPos, const int bigramPos, - const int siblingPos) - : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(false), mParentPos(NOT_A_DICT_POS), - mCodePointCount(codePointCount), mCodePoints(), mTerminalIdFieldPos(NOT_A_DICT_POS), - mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), - mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability), - mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(childrenPos), - mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(shortcutPos), - mBigramPos(bigramPos), mSiblingPos(siblingPos) { - memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount); - } - - // PtNode with a terminal id. - PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags, - const int parentPos, const int codePointCount, const int *const codePoints, - const int terminalIdFieldPos, const int terminalId, const int probability, - const int childrenPosFieldPos, const int childrenPos, const int siblingPos) - : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos), - mCodePointCount(codePointCount), mCodePoints(), - mTerminalIdFieldPos(terminalIdFieldPos), mTerminalId(terminalId), - mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability), - mChildrenPosFieldPos(childrenPosFieldPos), mChildrenPos(childrenPos), - mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(terminalId), - mBigramPos(terminalId), mSiblingPos(siblingPos) { - memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount); - } - - // Construct new params by updating existing PtNode params. 
- PtNodeParams(const PtNodeParams *const ptNodeParams, - const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos, - const CodePointArrayView codePoints, const int probability) - : mHeadPos(ptNodeParams->getHeadPos()), mFlags(flags), mHasMovedFlag(true), - mParentPos(parentPos), mCodePointCount(codePoints.size()), mCodePoints(), - mTerminalIdFieldPos(ptNodeParams->getTerminalIdFieldPos()), - mTerminalId(ptNodeParams->getTerminalId()), - mProbabilityFieldPos(ptNodeParams->getProbabilityFieldPos()), - mProbability(probability), - mChildrenPosFieldPos(ptNodeParams->getChildrenPosFieldPos()), - mChildrenPos(ptNodeParams->getChildrenPos()), - mBigramLinkedNodePos(ptNodeParams->getBigramLinkedNodePos()), - mShortcutPos(ptNodeParams->getShortcutPos()), - mBigramPos(ptNodeParams->getBigramsPos()), - mSiblingPos(ptNodeParams->getSiblingNodePos()) { - memcpy(mCodePoints, codePoints.data(), sizeof(int) * mCodePointCount); - } - - PtNodeParams(const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos, - const CodePointArrayView codePoints, const int probability) - : mHeadPos(NOT_A_DICT_POS), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos), - mCodePointCount(codePoints.size()), mCodePoints(), - mTerminalIdFieldPos(NOT_A_DICT_POS), - mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), - mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability), - mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS), - mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS), - mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) { - memcpy(mCodePoints, codePoints.data(), sizeof(int) * mCodePointCount); - } - - AK_FORCE_INLINE bool isValid() const { - return mCodePointCount > 0; - } - - // Head position of the PtNode - AK_FORCE_INLINE int getHeadPos() const { - return mHeadPos; - } - - // Flags - AK_FORCE_INLINE bool isDeleted() const { - return mHasMovedFlag && DynamicPtReadingUtils::isDeleted(mFlags); - } - - AK_FORCE_INLINE bool 
willBecomeNonTerminal() const { - return mHasMovedFlag && DynamicPtReadingUtils::willBecomeNonTerminal(mFlags); - } - - AK_FORCE_INLINE bool hasChildren() const { - return mChildrenPos != NOT_A_DICT_POS; - } - - AK_FORCE_INLINE bool isTerminal() const { - return PatriciaTrieReadingUtils::isTerminal(mFlags); - } - - AK_FORCE_INLINE bool isPossiblyOffensive() const { - return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags); - } - - AK_FORCE_INLINE bool isNotAWord() const { - return PatriciaTrieReadingUtils::isNotAWord(mFlags); - } - - AK_FORCE_INLINE bool hasBigrams() const { - return PatriciaTrieReadingUtils::hasBigrams(mFlags); - } - - AK_FORCE_INLINE bool hasShortcutTargets() const { - return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags); - } - - AK_FORCE_INLINE bool representsNonWordInfo() const { - return getCodePointCount() > 0 && !CharUtils::isInUnicodeSpace(getCodePoints()[0]) - && isNotAWord(); - } - - AK_FORCE_INLINE int representsBeginningOfSentence() const { - return getCodePointCount() > 0 && getCodePoints()[0] == CODE_POINT_BEGINNING_OF_SENTENCE - && isNotAWord(); - } - - // Parent node position - AK_FORCE_INLINE int getParentPos() const { - return mParentPos; - } - - AK_FORCE_INLINE const CodePointArrayView getCodePointArrayView() const { - return CodePointArrayView(mCodePoints, mCodePointCount); - } - - // TODO: Remove - // Number of code points - AK_FORCE_INLINE uint8_t getCodePointCount() const { - return mCodePointCount; - } - - // TODO: Remove - AK_FORCE_INLINE const int *getCodePoints() const { - return mCodePoints; - } - - // Probability - AK_FORCE_INLINE int getTerminalIdFieldPos() const { - return mTerminalIdFieldPos; - } - - AK_FORCE_INLINE int getTerminalId() const { - return mTerminalId; - } - - // Probability - AK_FORCE_INLINE int getProbabilityFieldPos() const { - return mProbabilityFieldPos; - } - - AK_FORCE_INLINE int getProbability() const { - return mProbability; - } - - // Children PtNode array position - AK_FORCE_INLINE 
int getChildrenPosFieldPos() const { - return mChildrenPosFieldPos; - } - - AK_FORCE_INLINE int getChildrenPos() const { - return mChildrenPos; - } - - // Bigram linked node position. - AK_FORCE_INLINE int getBigramLinkedNodePos() const { - return mBigramLinkedNodePos; - } - - // Shortcutlist position - AK_FORCE_INLINE int getShortcutPos() const { - return mShortcutPos; - } - - // Bigrams position - AK_FORCE_INLINE int getBigramsPos() const { - return mBigramPos; - } - - // Sibling node position - AK_FORCE_INLINE int getSiblingNodePos() const { - return mSiblingPos; - } - - private: - // This class have a public copy constructor to be used as a return value. - DISALLOW_ASSIGNMENT_OPERATOR(PtNodeParams); - - const int mHeadPos; - const PatriciaTrieReadingUtils::NodeFlags mFlags; - const bool mHasMovedFlag; - const int mParentPos; - const uint8_t mCodePointCount; - int mCodePoints[MAX_WORD_LENGTH]; - const int mTerminalIdFieldPos; - const int mTerminalId; - const int mProbabilityFieldPos; - const int mProbability; - const int mChildrenPosFieldPos; - const int mChildrenPos; - const int mBigramLinkedNodePos; - const int mShortcutPos; - const int mBigramPos; - const int mSiblingPos; -}; -} // namespace latinime -#endif /* LATINIME_PT_NODE_PARAMS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h deleted file mode 100644 index 31299a707..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_PT_NODE_READER_H -#define LATINIME_PT_NODE_READER_H - -#include "defines.h" - -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" - -namespace latinime { - -// Interface class used to read PtNode information. -class PtNodeReader { - public: - virtual ~PtNodeReader() {} - virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos( - const int ptNodePos) const = 0; - - protected: - PtNodeReader() {}; - - private: - DISALLOW_COPY_AND_ASSIGN(PtNodeReader); -}; -} // namespace latinime -#endif /* LATINIME_PT_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h deleted file mode 100644 index 954db9b0a..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_PT_NODE_WRITER_H -#define LATINIME_PT_NODE_WRITER_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "utils/int_array_view.h" - -namespace latinime { - -class NgramProperty; -class UnigramProperty; - -// Interface class used to write PtNode information. -class PtNodeWriter { - public: - typedef std::unordered_map PtNodeArrayPositionRelocationMap; - typedef std::unordered_map PtNodePositionRelocationMap; - struct DictPositionRelocationMap { - public: - DictPositionRelocationMap() - : mPtNodeArrayPositionRelocationMap(), mPtNodePositionRelocationMap() {} - - PtNodeArrayPositionRelocationMap mPtNodeArrayPositionRelocationMap; - PtNodePositionRelocationMap mPtNodePositionRelocationMap; - - private: - DISALLOW_COPY_AND_ASSIGN(DictPositionRelocationMap); - }; - - virtual ~PtNodeWriter() {} - - virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams) = 0; - - virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int movedPos, const int bigramLinkedNodePos) = 0; - - virtual bool markPtNodeAsWillBecomeNonTerminal( - const PtNodeParams *const toBeUpdatedPtNodeParams) = 0; - - virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams, - const UnigramProperty *const unigramProperty) = 0; - - virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( - const PtNodeParams *const toBeUpdatedPtNodeParams, - bool *const outNeedsToKeepPtNode) = 0; - - virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int newChildrenPosition) = 0; - - virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, - int *const ptNodeWritingPos) = 0; - - virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, - const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) = 0; - - 
virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, - const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) = 0; - - virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId) = 0; - - virtual bool updateAllBigramEntriesAndDeleteUselessEntries( - const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) = 0; - - virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams, - const DictPositionRelocationMap *const dictPositionRelocationMap, - int *const outBigramEntryCount) = 0; - - virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams, - const int *const targetCodePoints, const int targetCodePointCount, - const int shortcutProbability) = 0; - - protected: - PtNodeWriter() {}; - - private: - DISALLOW_COPY_AND_ASSIGN(PtNodeWriter); -}; -} // namespace latinime -#endif /* LATINIME_PT_NODE_WRITER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp deleted file mode 100644 index 40b872055..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" - -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" - -namespace latinime { - -// Flag for presence of more attributes -const ShortcutListReadingUtils::ShortcutFlags - ShortcutListReadingUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80; -// Mask for attribute probability, stored on 4 bits inside the flags byte. -const ShortcutListReadingUtils::ShortcutFlags - ShortcutListReadingUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; -const int ShortcutListReadingUtils::SHORTCUT_LIST_SIZE_FIELD_SIZE = 2; -// The numeric value of the shortcut probability that means 'whitelist'. -const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15; - -/* static */ ShortcutListReadingUtils::ShortcutFlags - ShortcutListReadingUtils::getFlagsAndForwardPointer(const ReadOnlyByteArrayView buffer, - int *const pos) { - return ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), pos); -} - -/* static */ int ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer( - const ReadOnlyByteArrayView buffer, int *const pos) { - // readUint16andAdvancePosition() returns an offset *including* the uint16 field itself. - return ByteArrayUtils::readUint16AndAdvancePosition(buffer.data(), pos) - - SHORTCUT_LIST_SIZE_FIELD_SIZE; -} - -/* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer, - const int maxLength, int *const outWord, int *const pos) { - // TODO: Use codePointTable for shortcuts. 
- return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength, - nullptr /* codePointTable */, outWord, pos); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h deleted file mode 100644 index 71cb8cc2c..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_SHORTCUT_LIST_READING_UTILS_H -#define LATINIME_SHORTCUT_LIST_READING_UTILS_H - -#include - -#include "defines.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -class ShortcutListReadingUtils { - public: - typedef uint8_t ShortcutFlags; - - static ShortcutFlags getFlagsAndForwardPointer(const ReadOnlyByteArrayView buffer, - int *const pos); - - static AK_FORCE_INLINE int getProbabilityFromFlags(const ShortcutFlags flags) { - return flags & MASK_ATTRIBUTE_PROBABILITY; - } - - static AK_FORCE_INLINE bool hasNext(const ShortcutFlags flags) { - return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0; - } - - // This method returns the size of the shortcut list region excluding the shortcut list size - // field at the beginning. 
- static int getShortcutListSizeAndForwardPointer(const ReadOnlyByteArrayView buffer, - int *const pos); - - static AK_FORCE_INLINE int getShortcutListSizeFieldSize() { - return SHORTCUT_LIST_SIZE_FIELD_SIZE; - } - - static AK_FORCE_INLINE void skipShortcuts(const ReadOnlyByteArrayView buffer, int *const pos) { - const int shortcutListSize = getShortcutListSizeAndForwardPointer(buffer, pos); - *pos += shortcutListSize; - } - - static AK_FORCE_INLINE bool isWhitelist(const ShortcutFlags flags) { - return getProbabilityFromFlags(flags) == WHITELIST_SHORTCUT_PROBABILITY; - } - - static int readShortcutTarget(const ReadOnlyByteArrayView buffer, const int maxLength, - int *const outWord, int *const pos); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListReadingUtils); - - static const ShortcutFlags FLAG_ATTRIBUTE_HAS_NEXT; - static const ShortcutFlags MASK_ATTRIBUTE_PROBABILITY; - static const int SHORTCUT_LIST_SIZE_FIELD_SIZE; - static const int WHITELIST_SHORTCUT_PROBABILITY; -}; -} // namespace latinime -#endif // LATINIME_SHORTCUT_LIST_READING_UTILS_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h deleted file mode 100644 index e2608435c..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_BIGRAM_LIST_POLICY_H -#define LATINIME_BIGRAM_LIST_POLICY_H - -#include - -#include "defines.h" -#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -class BigramListPolicy : public DictionaryBigramsStructurePolicy { - public: - BigramListPolicy(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {} - - ~BigramListPolicy() {} - - void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, - int *const pos) const { - BigramListReadWriteUtils::BigramFlags flags; - if (!BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(mBuffer, &flags, - outBigramPos, pos)) { - AKLOGE("Cannot read bigram entry. bufSize: %zd, pos: %d. ", mBuffer.size(), *pos); - *outProbability = NOT_A_PROBABILITY; - *outHasNext = false; - return; - } - *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(flags); - *outHasNext = BigramListReadWriteUtils::hasNext(flags); - } - - bool skipAllBigrams(int *const pos) const { - return BigramListReadWriteUtils::skipExistingBigrams(mBuffer, pos); - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListPolicy); - - const ReadOnlyByteArrayView mBuffer; -}; -} // namespace latinime -#endif // LATINIME_BIGRAM_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp deleted file mode 100644 index 1a51acad5..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ /dev/null @@ -1,526 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 
(the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h" - -#include "defines.h" -#include "suggest/core/dicnode/dic_node.h" -#include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" -#include "suggest/core/dictionary/multi_bigram_map.h" -#include "suggest/core/dictionary/ngram_listener.h" -#include "suggest/core/session/ngram_context.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" -#include "utils/char_utils.h" - -namespace latinime { - -void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, - DicNodeVector *const childDicNodes) const { - if (!dicNode->hasChildren()) { - return; - } - int nextPos = dicNode->getChildrenPtNodeArrayPos(); - if (!isValidPos(nextPos)) { - AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %zd", - nextPos, mBuffer.size()); - mIsCorrupted = true; - ASSERT(false); - return; - } - const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( - mBuffer.data(), &nextPos); - for (int i = 0; i < childCount; i++) { - if (!isValidPos(nextPos)) { - AKLOGE("Child PtNode position is invalid. 
pos: %d, dict size: %zd, childCount: %d / %d", - nextPos, mBuffer.size(), i, childCount); - mIsCorrupted = true; - ASSERT(false); - return; - } - nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes); - } -} - -int PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, - const int maxCodePointCount, int *const outCodePoints) const { - return getCodePointsAndProbabilityAndReturnCodePointCount(wordId, maxCodePointCount, - outCodePoints, nullptr /* outUnigramProbability */); -} -// This retrieves code points and the probability of the word by its id. -// Due to the fact that words are ordered in the dictionary in a strict breadth-first order, -// it is possible to check for this with advantageous complexity. For each PtNode array, we search -// for PtNodes with children and compare the children position with the position we look for. -// When we shoot the position we look for, it means the word we look for is in the children -// of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a -// PtNode array with the last PtNode's children position still less than what we are searching for, -// we must descend the last PtNode's children (for example, if the word we are searching for starts -// with a z, it's the last PtNode of the root array, so all children addresses will be smaller -// than the position we look for, and we have to descend the z PtNode). -/* Parameters : - * wordId: Id of the word we are searching for. - * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size. - * outUnigramProbability: a pointer to an int to write the probability into. - * Return value : the code point count, of 0 if the word was not found. 
- */ -// TODO: Split this function to be more readable -int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( - const int wordId, const int maxCodePointCount, int *const outCodePoints, - int *const outUnigramProbability) const { - const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); - int pos = getRootPosition(); - int wordPos = 0; - const int *const codePointTable = mHeaderPolicy.getCodePointTable(); - if (outUnigramProbability) { - *outUnigramProbability = NOT_A_PROBABILITY; - } - // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will - // only traverse PtNodes that are actually a part of the terminal we are searching, so each - // time we enter this loop we are one depth level further than last time. - // The only reason we count PtNodes is because we want to reduce the probability of infinite - // looping in case there is a bug. Since we know there is an upper bound to the depth we are - // supposed to traverse, it does not hurt to count iterations. - for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) { - int lastCandidatePtNodePos = 0; - // Let's loop through PtNodes in this PtNode array searching for either the terminal - // or one of its ascendants. - if (!isValidPos(pos)) { - AKLOGE("PtNode array position is invalid. pos: %d, dict size: %zd", - pos, mBuffer.size()); - mIsCorrupted = true; - ASSERT(false); - return 0; - } - for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( - mBuffer.data(), &pos); ptNodeCount > 0; --ptNodeCount) { - const int startPos = pos; - if (!isValidPos(pos)) { - AKLOGE("PtNode position is invalid. 
pos: %d, dict size: %zd", pos, mBuffer.size()); - mIsCorrupted = true; - ASSERT(false); - return 0; - } - const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos); - const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mBuffer.data(), codePointTable, &pos); - if (ptNodePos == startPos) { - // We found the position. Copy the rest of the code points in the buffer and return - // the length. - outCodePoints[wordPos] = character; - if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { - int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mBuffer.data(), codePointTable, &pos); - // We count code points in order to avoid infinite loops if the file is broken - // or if there is some other bug - int charCount = maxCodePointCount; - while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { - outCodePoints[++wordPos] = nextChar; - nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mBuffer.data(), codePointTable, &pos); - } - } - if (outUnigramProbability) { - *outUnigramProbability = - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition( - mBuffer.data(), &pos); - } - return ++wordPos; - } - // We need to skip past this PtNode, so skip any remaining code points after the - // first and possibly the probability. - if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { - PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH, - codePointTable, &pos); - } - if (PatriciaTrieReadingUtils::isTerminal(flags)) { - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos); - } - // The fact that this PtNode has children is very important. Since we already know - // that this PtNode does not match, if it has no children we know it is irrelevant - // to what we are searching for. 
- const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags); - // We will write in `found' whether we have passed the children position we are - // searching for. For example if we search for "beer", the children of b are less - // than the address we are searching for and the children of c are greater. When we - // come here for c, we realize this is too big, and that we should descend b. - bool found; - if (hasChildren) { - int currentPos = pos; - // Here comes the tricky part. First, read the children position. - const int childrenPos = PatriciaTrieReadingUtils - ::readChildrenPositionAndAdvancePosition(mBuffer.data(), flags, - ¤tPos); - if (childrenPos > ptNodePos) { - // If the children pos is greater than the position, it means the previous - // PtNode, which position is stored in lastCandidatePtNodePos, was the right - // one. - found = true; - } else if (1 >= ptNodeCount) { - // However if we are on the LAST PtNode of this array, and we have NOT shot the - // position we should descend THIS PtNode. So we trick the - // lastCandidatePtNodePos so that we will descend this PtNode, not the previous - // one. - lastCandidatePtNodePos = startPos; - found = true; - } else { - // Else, we should continue looking. - found = false; - } - } else { - // Even if we don't have children here, we could still be on the last PtNode of - // this array. If this is the case, we should descend the last PtNode that had - // children, and their position is already in lastCandidatePtNodePos. - found = (1 >= ptNodeCount); - } - - if (found) { - // Okay, we found the PtNode we should descend. Its position is in - // the lastCandidatePtNodePos variable, so we just re-read it. 
- if (0 != lastCandidatePtNodePos) { - const PatriciaTrieReadingUtils::NodeFlags lastFlags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition( - mBuffer.data(), &lastCandidatePtNodePos); - const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mBuffer.data(), codePointTable, &lastCandidatePtNodePos); - // We copy all the characters in this PtNode to the buffer - outCodePoints[wordPos] = lastChar; - if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) { - int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mBuffer.data(), codePointTable, &lastCandidatePtNodePos); - int charCount = maxCodePointCount; - while (-1 != nextChar && --charCount > 0) { - outCodePoints[++wordPos] = nextChar; - nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mBuffer.data(), codePointTable, &lastCandidatePtNodePos); - } - } - ++wordPos; - // Now we only need to branch to the children address. Skip the probability if - // it's there, read pos, and break to resume the search at pos. - if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) { - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), - &lastCandidatePtNodePos); - } - pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( - mBuffer.data(), lastFlags, &lastCandidatePtNodePos); - break; - } else { - // Here is a little tricky part: we come here if we found out that all children - // addresses in this PtNode are bigger than the address we are searching for. - // Should we conclude the word is not in the dictionary? No! It could still be - // one of the remaining PtNodes in this array, so we have to keep looking in - // this array until we find it (or we realize it's not there either, in which - // case it's actually not in the dictionary). Pass the end of this PtNode, - // ready to start the next one. 
- if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { - PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( - mBuffer.data(), flags, &pos); - } - if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { - mShortcutListPolicy.skipAllShortcuts(&pos); - } - if (PatriciaTrieReadingUtils::hasBigrams(flags)) { - if (!mBigramListPolicy.skipAllBigrams(&pos)) { - AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), - pos); - mIsCorrupted = true; - ASSERT(false); - return 0; - } - } - } - } else { - // If we did not find it, we should record the last children address for the next - // iteration. - if (hasChildren) lastCandidatePtNodePos = startPos; - // Now skip the end of this PtNode (children pos and the attributes if any) so that - // our pos is after the end of this PtNode, at the start of the next one. - if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { - PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( - mBuffer.data(), flags, &pos); - } - if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { - mShortcutListPolicy.skipAllShortcuts(&pos); - } - if (PatriciaTrieReadingUtils::hasBigrams(flags)) { - if (!mBigramListPolicy.skipAllBigrams(&pos)) { - AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos); - mIsCorrupted = true; - ASSERT(false); - return 0; - } - } - } - - } - } - // If we have looked through all the PtNodes and found no match, the ptNodePos is - // not the position of a terminal in this dictionary. - return 0; -} - -// This function gets the position of the terminal PtNode of the exact matching word in the -// dictionary. If no match is found, it returns NOT_A_WORD_ID. 
-int PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, - const bool forceLowerCaseSearch) const { - DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(getRootPosition()); - const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), - wordCodePoints.size(), forceLowerCaseSearch); - if (readingHelper.isError()) { - mIsCorrupted = true; - AKLOGE("Dictionary reading error in getWordId()."); - } - return getWordIdFromTerminalPtNodePos(ptNodePos); -} - -const WordAttributes PatriciaTriePolicy::getWordAttributesInContext( - const WordIdArrayView prevWordIds, const int wordId, - MultiBigramMap *const multiBigramMap) const { - if (wordId == NOT_A_WORD_ID) { - return WordAttributes(); - } - const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); - const PtNodeParams ptNodeParams = - mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - if (multiBigramMap) { - const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */, - prevWordIds, wordId, ptNodeParams.getProbability()); - return getWordAttributes(probability, ptNodeParams); - } - if (!prevWordIds.empty()) { - const int bigramProbability = getProbabilityOfWord(prevWordIds, wordId); - if (bigramProbability != NOT_A_PROBABILITY) { - return getWordAttributes(bigramProbability, ptNodeParams); - } - } - return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY), - ptNodeParams); -} - -const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability, - const PtNodeParams &ptNodeParams) const { - return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), - ptNodeParams.isPossiblyOffensive()); -} - -int PatriciaTriePolicy::getProbability(const int unigramProbability, - const int bigramProbability) const { - // Due to space constraints, the probability for bigrams is approximate - the lower the 
unigram - // probability, the worse the precision. The theoritical maximum error in resulting probability - // is 8 - although in the practice it's never bigger than 3 or 4 in very bad cases. This means - // that sometimes, we'll see some bigrams interverted here, but it can't get too bad. - if (unigramProbability == NOT_A_PROBABILITY) { - return NOT_A_PROBABILITY; - } else if (bigramProbability == NOT_A_PROBABILITY) { - return ProbabilityUtils::backoff(unigramProbability); - } else { - return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, - bigramProbability); - } -} - -int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, - const int wordId) const { - if (wordId == NOT_A_WORD_ID) { - return NOT_A_PROBABILITY; - } - const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); - const PtNodeParams ptNodeParams = - mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - if (ptNodeParams.isNotAWord()) { - // If this is not a word, it should behave as having no probability outside of the - // suggestion process (where it should be used for shortcuts). 
- return NOT_A_PROBABILITY; - } - if (!prevWordIds.empty()) { - const int bigramsPosition = getBigramsPositionOfPtNode( - getTerminalPtNodePosFromWordId(prevWordIds[0])); - BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition); - while (bigramsIt.hasNext()) { - bigramsIt.next(); - if (bigramsIt.getBigramPos() == ptNodePos - && bigramsIt.getProbability() != NOT_A_PROBABILITY) { - return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability()); - } - } - return NOT_A_PROBABILITY; - } - return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); -} - -void PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, - NgramListener *const listener) const { - if (prevWordIds.empty()) { - return; - } - const int bigramsPosition = getBigramsPositionOfPtNode( - getTerminalPtNodePosFromWordId(prevWordIds[0])); - BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition); - while (bigramsIt.hasNext()) { - bigramsIt.next(); - listener->onVisitEntry(bigramsIt.getProbability(), - getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos())); - } -} - -BinaryDictionaryShortcutIterator PatriciaTriePolicy::getShortcutIterator(const int wordId) const { - const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId)); - return BinaryDictionaryShortcutIterator(&mShortcutListPolicy, shortcutPos); -} - -int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { - if (ptNodePos == NOT_A_DICT_POS) { - return NOT_A_DICT_POS; - } - return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos(); -} - -int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { - if (ptNodePos == NOT_A_DICT_POS) { - return NOT_A_DICT_POS; - } - return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getBigramsPos(); -} - -int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode, - const int 
ptNodePos, DicNodeVector *childDicNodes) const { - PatriciaTrieReadingUtils::NodeFlags flags; - int mergedNodeCodePointCount = 0; - int mergedNodeCodePoints[MAX_WORD_LENGTH]; - int probability = NOT_A_PROBABILITY; - int childrenPos = NOT_A_DICT_POS; - int shortcutPos = NOT_A_DICT_POS; - int bigramPos = NOT_A_DICT_POS; - int siblingPos = NOT_A_DICT_POS; - const int *const codePointTable = mHeaderPolicy.getCodePointTable(); - PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy, - &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount, - mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos, - &siblingPos); - // Skip PtNodes don't start with Unicode code point because they represent non-word information. - if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) { - const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID; - childDicNodes->pushLeavingChild(dicNode, childrenPos, wordId, - CodePointArrayView(mergedNodeCodePoints, mergedNodeCodePointCount)); - } - return siblingPos; -} - -const WordProperty PatriciaTriePolicy::getWordProperty( - const CodePointArrayView wordCodePoints) const { - const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); - if (wordId == NOT_A_WORD_ID) { - AKLOGE("getWordProperty was called for invalid word."); - return WordProperty(); - } - const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); - const PtNodeParams ptNodeParams = - mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - // Fetch bigram information. - std::vector ngrams; - const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); - int bigramWord1CodePoints[MAX_WORD_LENGTH]; - BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos); - while (bigramsIt.hasNext()) { - // Fetch the next bigram information and forward the iterator. - bigramsIt.next(); - // Skip the entry if the entry has been deleted. 
This never happens for ver2 dicts. - if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) { - int word1Probability = NOT_A_PROBABILITY; - const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( - getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH, - bigramWord1CodePoints, &word1Probability); - const int probability = getProbability(word1Probability, bigramsIt.getProbability()); - ngrams.emplace_back( - NgramContext(wordCodePoints.data(), wordCodePoints.size(), - ptNodeParams.representsBeginningOfSentence()), - CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(), - probability, HistoricalInfo()); - } - } - // Fetch shortcut information. - std::vector shortcuts; - int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); - if (shortcutPos != NOT_A_DICT_POS) { - int shortcutTargetCodePoints[MAX_WORD_LENGTH]; - ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &shortcutPos); - bool hasNext = true; - while (hasNext) { - const ShortcutListReadingUtils::ShortcutFlags shortcutFlags = - ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, &shortcutPos); - hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags); - const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget( - mBuffer, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos); - const int shortcutProbability = - ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags); - shortcuts.emplace_back( - CodePointArrayView(shortcutTargetCodePoints, shortcutTargetLength).toVector(), - shortcutProbability); - } - } - const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), - ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(), - ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts)); - return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); -} - -int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const 
outCodePoints, - int *const outCodePointCount) { - *outCodePointCount = 0; - if (token == 0) { - // Start iterating the dictionary. - mTerminalPtNodePositionsForIteratingWords.clear(); - DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( - &mTerminalPtNodePositionsForIteratingWords); - DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(getRootPosition()); - readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); - } - const int terminalPtNodePositionsVectorSize = - static_cast(mTerminalPtNodePositionsForIteratingWords.size()); - if (token < 0 || token >= terminalPtNodePositionsVectorSize) { - AKLOGE("Given token %d is invalid.", token); - return 0; - } - const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; - *outCodePointCount = getCodePointsAndReturnCodePointCount( - getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints); - const int nextToken = token + 1; - if (nextToken >= terminalPtNodePositionsVectorSize) { - // All words have been iterated. - mTerminalPtNodePositionsForIteratingWords.clear(); - return 0; - } - return nextToken; -} - -int PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const { - return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos; -} - -int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const { - return wordId == NOT_A_WORD_ID ? 
NOT_A_DICT_POS : wordId; -} - -bool PatriciaTriePolicy::isValidPos(const int pos) const { - return pos >= 0 && pos < static_cast(mBuffer.size()); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h deleted file mode 100644 index 8933962ab..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_PATRICIA_TRIE_POLICY_H -#define LATINIME_PATRICIA_TRIE_POLICY_H - -#include -#include - -#include "defines.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" -#include "utils/byte_array_view.h" -#include "utils/int_array_view.h" - -namespace latinime { - -class DicNode; -class DicNodeVector; - -// Word id = Position of a PtNode that represents the word. -// Max supported n-gram is bigram. -class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { - public: - PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer) - : mMmappedBuffer(std::move(mmappedBuffer)), - mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(), - FormatUtils::detectFormatVersion(mMmappedBuffer->getReadOnlyByteArrayView())), - mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())), - mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer), - mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy, - mHeaderPolicy.getCodePointTable()), - mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(), - mIsCorrupted(false) {} - - AK_FORCE_INLINE int getRootPosition() const { - return 0; - } - - void createAndGetAllChildDicNodes(const DicNode *const dicNode, - DicNodeVector *const childDicNodes) const; - - int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, - int *const outCodePoints) const; - - int 
getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; - - const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, - const int wordId, MultiBigramMap *const multiBigramMap) const; - - int getProbability(const int unigramProbability, const int bigramProbability) const; - - int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; - - void iterateNgramEntries(const WordIdArrayView prevWordIds, - NgramListener *const listener) const; - - BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; - - const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { - return &mHeaderPolicy; - } - - bool addUnigramEntry(const CodePointArrayView wordCodePoints, - const UnigramProperty *const unigramProperty) { - // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); - return false; - } - - bool removeUnigramEntry(const CodePointArrayView wordCodePoints) { - // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); - return false; - } - - bool addNgramEntry(const NgramProperty *const ngramProperty) { - // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); - return false; - } - - bool removeNgramEntry(const NgramContext *const ngramContext, - const CodePointArrayView wordCodePoints) { - // This method should not be called for non-updatable dictionary. 
- AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); - return false; - } - - bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, - const CodePointArrayView wordCodePoints, const bool isValidWord, - const HistoricalInfo historicalInfo) { - // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " - "dictionary."); - return false; - } - - bool flush(const char *const filePath) { - // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: flush() is called for non-updatable dictionary."); - return false; - } - - bool flushWithGC(const char *const filePath) { - // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); - return false; - } - - bool needsToRunGC(const bool mindsBlockByGC) const { - // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); - return false; - } - - void getProperty(const char *const query, const int queryLength, char *const outResult, - const int maxResultLength) { - // getProperty is not supported for this class. 
- if (maxResultLength > 0) { - outResult[0] = '\0'; - } - } - - const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; - - int getNextWordAndNextToken(const int token, int *const outCodePoints, - int *const outCodePointCount); - - bool isCorrupted() const { - return mIsCorrupted; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy); - - const MmappedBuffer::MmappedBufferPtr mMmappedBuffer; - const HeaderPolicy mHeaderPolicy; - const ReadOnlyByteArrayView mBuffer; - const BigramListPolicy mBigramListPolicy; - const ShortcutListPolicy mShortcutListPolicy; - const Ver2ParticiaTrieNodeReader mPtNodeReader; - const Ver2PtNodeArrayReader mPtNodeArrayReader; - std::vector mTerminalPtNodePositionsForIteratingWords; - mutable bool mIsCorrupted; - - int getCodePointsAndProbabilityAndReturnCodePointCount(const int wordId, - const int maxCodePointCount, int *const outCodePoints, - int *const outUnigramProbability) const; - int getShortcutPositionOfPtNode(const int ptNodePos) const; - int getBigramsPositionOfPtNode(const int ptNodePos) const; - int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos, - DicNodeVector *const childDicNodes) const; - int getWordIdFromTerminalPtNodePos(const int ptNodePos) const; - int getTerminalPtNodePosFromWordId(const int wordId) const; - const WordAttributes getWordAttributes(const int probability, - const PtNodeParams &ptNodeParams) const; - bool isValidPos(const int pos) const; -}; -} // namespace latinime -#endif // LATINIME_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h deleted file mode 100644 index 5319dd26c..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source 
Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_SHORTCUT_LIST_POLICY_H -#define LATINIME_SHORTCUT_LIST_POLICY_H - -#include - -#include "defines.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { - public: - explicit ShortcutListPolicy(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {} - - ~ShortcutListPolicy() {} - - int getStartPos(const int pos) const { - if (pos == NOT_A_DICT_POS) { - return NOT_A_DICT_POS; - } - int listPos = pos; - ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &listPos); - return listPos; - } - - void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, - int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, - int *const pos) const { - const ShortcutListReadingUtils::ShortcutFlags flags = - ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, pos); - if (outHasNext) { - *outHasNext = ShortcutListReadingUtils::hasNext(flags); - } - if (outIsWhitelist) { - *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(flags); - } - if (outCodePoint) { - *outCodePointCount = ShortcutListReadingUtils::readShortcutTarget( - mBuffer, maxCodePointCount, outCodePoint, pos); - } - 
} - - void skipAllShortcuts(int *const pos) const { - const int shortcutListSize = ShortcutListReadingUtils - ::getShortcutListSizeAndForwardPointer(mBuffer, pos); - *pos += shortcutListSize; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListPolicy); - - const ReadOnlyByteArrayView mBuffer; -}; -} // namespace latinime -#endif // LATINIME_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp deleted file mode 100644 index 90d4687dd..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h" - -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" - -namespace latinime { - -const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNodePos( - const int ptNodePos) const { - if (ptNodePos < 0 || ptNodePos >= static_cast(mBuffer.size())) { - // Reading invalid position because of bug or broken dictionary. 
- AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %zd", - ptNodePos, mBuffer.size()); - ASSERT(false); - return PtNodeParams(); - } - PatriciaTrieReadingUtils::NodeFlags flags; - int mergedNodeCodePointCount = 0; - int mergedNodeCodePoints[MAX_WORD_LENGTH]; - int probability = NOT_A_PROBABILITY; - int childrenPos = NOT_A_DICT_POS; - int shortcutPos = NOT_A_DICT_POS; - int bigramPos = NOT_A_DICT_POS; - int siblingPos = NOT_A_DICT_POS; - PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortcutPolicy, - mBigramPolicy, mCodePointTable, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, - &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); - if (mergedNodeCodePointCount <= 0) { - AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount); - ASSERT(false); - return PtNodeParams(); - } - return PtNodeParams(ptNodePos, flags, mergedNodeCodePointCount, mergedNodeCodePoints, - probability, childrenPos, shortcutPos, bigramPos, siblingPos); -} - -} diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h deleted file mode 100644 index 838d37314..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H -#define LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -class DictionaryBigramsStructurePolicy; -class DictionaryShortcutsStructurePolicy; - -class Ver2ParticiaTrieNodeReader : public PtNodeReader { - public: - Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer, - const DictionaryBigramsStructurePolicy *const bigramPolicy, - const DictionaryShortcutsStructurePolicy *const shortcutPolicy, - const int *const codePointTable) - : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy), - mCodePointTable(codePointTable) {} - - virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const; - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver2ParticiaTrieNodeReader); - - const ReadOnlyByteArrayView mBuffer; - const DictionaryBigramsStructurePolicy *const mBigramPolicy; - const DictionaryShortcutsStructurePolicy *const mShortcutPolicy; - const int *const mCodePointTable; -}; -} // namespace latinime -#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp deleted file mode 100644 index 72ad1eb66..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the 
License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h" - -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" - -namespace latinime { - -bool Ver2PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, - int *const outPtNodeCount, int *const outFirstPtNodePos) const { - if (ptNodeArrayPos < 0 || ptNodeArrayPos >= static_cast(mBuffer.size())) { - // Reading invalid position because of a bug or a broken dictionary. - AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %zd", - ptNodeArrayPos, mBuffer.size()); - ASSERT(false); - return false; - } - int readingPos = ptNodeArrayPos; - const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( - mBuffer.data(), &readingPos); - *outPtNodeCount = ptNodeCountInArray; - *outFirstPtNodePos = readingPos; - return true; -} - -bool Ver2PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos, - int *const outNextPtNodeArrayPos) const { - if (forwordLinkPos < 0 || forwordLinkPos >= static_cast(mBuffer.size())) { - // Reading invalid position because of bug or broken dictionary. - AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %zd", - forwordLinkPos, mBuffer.size()); - ASSERT(false); - return false; - } - // Ver2 dicts don't have forward links. 
- *outNextPtNodeArrayPos = NOT_A_DICT_POS; - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h deleted file mode 100644 index 548f36bf3..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_VER2_PT_NODE_ARRAY_READER_H -#define LATINIME_VER2_PT_NODE_ARRAY_READER_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -class Ver2PtNodeArrayReader : public PtNodeArrayReader { - public: - Ver2PtNodeArrayReader(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {}; - - virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, - int *const outPtNodeCount, int *const outFirstPtNodePos) const; - virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, - int *const outNextPtNodeArrayPos) const; - - private: - DISALLOW_COPY_AND_ASSIGN(Ver2PtNodeArrayReader); - - const ReadOnlyByteArrayView mBuffer; -}; -} // namespace latinime -#endif /* LATINIME_VER2_PT_NODE_ARRAY_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp deleted file mode 100644 index 025ee9932..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h" - -namespace latinime { - -// Used to provide stable probabilities even if the user's input count is small. -const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNTS[] = {8192, 2, 2, 1}; - -// Encoded backoff weights. -// Note that we give positive values for trigrams and quadgrams that means the weight is more than -// 1. -// TODO: Apply backoff for main dictionaries and quit giving a positive backoff weight. -const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHTS[] = {-32, -4, 2, 8}; - -// This value is used to remove too old entries from the dictionary. -const int DynamicLanguageModelProbabilityUtils::DURATION_TO_DISCARD_ENTRY_IN_SECONDS = - 300 * 24 * 60 * 60; // 300 days - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h deleted file mode 100644 index 644ae2ca7..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H -#define LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H - -#include - -#include "defines.h" -#include "suggest/core/dictionary/property/historical_info.h" -#include "utils/ngram_utils.h" -#include "utils/time_keeper.h" - -namespace latinime { - -class DynamicLanguageModelProbabilityUtils { - public: - static float computeRawProbabilityFromCounts(const int count, const int contextCount, - const NgramType ngramType) { - const int minCount = ASSUMED_MIN_COUNTS[static_cast(ngramType)]; - return static_cast(count) / static_cast(std::max(contextCount, minCount)); - } - - static float backoff(const int ngramProbability, const NgramType ngramType) { - const int probability = - ngramProbability + ENCODED_BACKOFF_WEIGHTS[static_cast(ngramType)]; - return std::min(std::max(probability, NOT_A_PROBABILITY), MAX_PROBABILITY); - } - - static int getDecayedProbability(const int probability, const HistoricalInfo historicalInfo) { - const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp(); - if (elapsedTime < 0) { - AKLOGE("The elapsed time is negatime value. Timestamp overflow?"); - return NOT_A_PROBABILITY; - } - // TODO: Improve this logic. - // We don't modify probability depending on the elapsed time. - return probability; - } - - static int shouldRemoveEntryDuringGC(const HistoricalInfo historicalInfo) { - // TODO: Improve this logic. - const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp(); - return elapsedTime > DURATION_TO_DISCARD_ENTRY_IN_SECONDS; - } - - static int getPriorityToPreventFromEviction(const HistoricalInfo historicalInfo) { - // TODO: Improve this logic. - // More recently input entries get higher priority. 
- return historicalInfo.getTimestamp(); - } - -private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicLanguageModelProbabilityUtils); - - static_assert(MAX_PREV_WORD_COUNT_FOR_N_GRAM <= 3, "Max supported Ngram is Quadgram."); - - static const int ASSUMED_MIN_COUNTS[]; - static const int ENCODED_BACKOFF_WEIGHTS[]; - static const int DURATION_TO_DISCARD_ENTRY_IN_SECONDS; -}; - -} // namespace latinime -#endif /* LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp deleted file mode 100644 index 6db7ea444..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp +++ /dev/null @@ -1,478 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h" - -#include -#include - -#include "suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h" -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" -#include "utils/ngram_utils.h" - -namespace latinime { - -const int LanguageModelDictContent::TRIE_MAP_BUFFER_INDEX = 0; -const int LanguageModelDictContent::GLOBAL_COUNTERS_BUFFER_INDEX = 1; - -bool LanguageModelDictContent::save(FILE *const file) const { - return mTrieMap.save(file) && mGlobalCounters.save(file); -} - -bool LanguageModelDictContent::runGC( - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const LanguageModelDictContent *const originalContent) { - return runGCInner(terminalIdMap, originalContent->mTrieMap.getEntriesInRootLevel(), - 0 /* nextLevelBitmapEntryIndex */); -} - -const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds, - const int wordId, const bool mustMatchAllPrevWords, - const HeaderPolicy *const headerPolicy) const { - int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; - bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex(); - int maxPrevWordCount = 0; - for (size_t i = 0; i < prevWordIds.size(); ++i) { - const int nextBitmapEntryIndex = - mTrieMap.get(prevWordIds[i], bitmapEntryIndices[i]).mNextLevelBitmapEntryIndex; - if (nextBitmapEntryIndex == TrieMap::INVALID_INDEX) { - break; - } - maxPrevWordCount = i + 1; - bitmapEntryIndices[i + 1] = nextBitmapEntryIndex; - } - - const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); - if (mHasHistoricalInfo && unigramProbabilityEntry.getHistoricalInfo()->getCount() == 0) { - // The word should be treated as a invalid word. 
- return WordAttributes(); - } - for (int i = maxPrevWordCount; i >= 0; --i) { - if (mustMatchAllPrevWords && prevWordIds.size() > static_cast(i)) { - break; - } - const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndices[i]); - if (!result.mIsValid) { - continue; - } - const ProbabilityEntry probabilityEntry = - ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); - int probability = NOT_A_PROBABILITY; - if (mHasHistoricalInfo) { - const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); - int contextCount = 0; - if (i == 0) { - // unigram - contextCount = mGlobalCounters.getTotalCount(); - } else { - const ProbabilityEntry prevWordProbabilityEntry = getNgramProbabilityEntry( - prevWordIds.skip(1 /* n */).limit(i - 1), prevWordIds[0]); - if (!prevWordProbabilityEntry.isValid()) { - continue; - } - if (prevWordProbabilityEntry.representsBeginningOfSentence() - && historicalInfo->getCount() == 1) { - // BoS ngram requires multiple contextCount. - continue; - } - contextCount = prevWordProbabilityEntry.getHistoricalInfo()->getCount(); - } - const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(i + 1); - const float rawProbability = - DynamicLanguageModelProbabilityUtils::computeRawProbabilityFromCounts( - historicalInfo->getCount(), contextCount, ngramType); - const int encodedRawProbability = - ProbabilityUtils::encodeRawProbability(rawProbability); - const int decayedProbability = - DynamicLanguageModelProbabilityUtils::getDecayedProbability( - encodedRawProbability, *historicalInfo); - probability = DynamicLanguageModelProbabilityUtils::backoff( - decayedProbability, ngramType); - } else { - probability = probabilityEntry.getProbability(); - } - // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in - // probabilityEntry. 
- return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(), - unigramProbabilityEntry.isNotAWord(), - unigramProbabilityEntry.isPossiblyOffensive()); - } - // Cannot find the word. - return WordAttributes(); -} - -ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry( - const WordIdArrayView prevWordIds, const int wordId) const { - const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); - if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { - return ProbabilityEntry(); - } - const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); - if (!result.mIsValid) { - // Not found. - return ProbabilityEntry(); - } - return ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); -} - -bool LanguageModelDictContent::setNgramProbabilityEntry(const WordIdArrayView prevWordIds, - const int wordId, const ProbabilityEntry *const probabilityEntry) { - if (wordId == Ver4DictConstants::NOT_A_TERMINAL_ID) { - return false; - } - const int bitmapEntryIndex = createAndGetBitmapEntryIndex(prevWordIds); - if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { - return false; - } - return mTrieMap.put(wordId, probabilityEntry->encode(mHasHistoricalInfo), bitmapEntryIndex); -} - -bool LanguageModelDictContent::removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, - const int wordId) { - const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); - if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { - // Cannot find bitmap entry for the probability entry. The entry doesn't exist. 
- return false; - } - return mTrieMap.remove(wordId, bitmapEntryIndex); -} - -LanguageModelDictContent::EntryRange LanguageModelDictContent::getProbabilityEntries( - const WordIdArrayView prevWordIds) const { - const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); - return EntryRange(mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex), mHasHistoricalInfo); -} - -std::vector - LanguageModelDictContent::exportAllNgramEntriesRelatedToWord( - const HeaderPolicy *const headerPolicy, const int wordId) const { - const TrieMap::Result result = mTrieMap.getRoot(wordId); - if (!result.mIsValid || result.mNextLevelBitmapEntryIndex == TrieMap::INVALID_INDEX) { - // The word doesn't have any related ngram entries. - return std::vector(); - } - std::vector prevWordIds = { wordId }; - std::vector entries; - exportAllNgramEntriesRelatedToWordInner(headerPolicy, result.mNextLevelBitmapEntryIndex, - &prevWordIds, &entries); - return entries; -} - -void LanguageModelDictContent::exportAllNgramEntriesRelatedToWordInner( - const HeaderPolicy *const headerPolicy, const int bitmapEntryIndex, - std::vector *const prevWordIds, - std::vector *const outBummpedFullEntryInfo) const { - for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { - const int wordId = entry.key(); - const ProbabilityEntry probabilityEntry = - ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); - if (probabilityEntry.isValid()) { - const WordAttributes wordAttributes = getWordAttributes( - WordIdArrayView(*prevWordIds), wordId, true /* mustMatchAllPrevWords */, - headerPolicy); - outBummpedFullEntryInfo->emplace_back(*prevWordIds, wordId, - wordAttributes, probabilityEntry); - } - if (entry.hasNextLevelMap()) { - prevWordIds->push_back(wordId); - exportAllNgramEntriesRelatedToWordInner(headerPolicy, - entry.getNextLevelBitmapEntryIndex(), prevWordIds, outBummpedFullEntryInfo); - prevWordIds->pop_back(); - } - } -} - -bool LanguageModelDictContent::truncateEntries(const 
EntryCounts ¤tEntryCounts, - const EntryCounts &maxEntryCounts, const HeaderPolicy *const headerPolicy, - MutableEntryCounters *const outEntryCounters) { - for (int prevWordCount = 0; prevWordCount <= MAX_PREV_WORD_COUNT_FOR_N_GRAM; ++prevWordCount) { - const int totalWordCount = prevWordCount + 1; - const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(totalWordCount); - if (currentEntryCounts.getNgramCount(ngramType) - <= maxEntryCounts.getNgramCount(ngramType)) { - outEntryCounters->setNgramCount(ngramType, - currentEntryCounts.getNgramCount(ngramType)); - continue; - } - int entryCount = 0; - if (!turncateEntriesInSpecifiedLevel(headerPolicy, - maxEntryCounts.getNgramCount(ngramType), prevWordCount, &entryCount)) { - return false; - } - outEntryCounters->setNgramCount(ngramType, entryCount); - } - return true; -} - -bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, - const int wordId, const bool isValid, const HistoricalInfo historicalInfo, - const HeaderPolicy *const headerPolicy, MutableEntryCounters *const entryCountersToUpdate) { - if (!mHasHistoricalInfo) { - AKLOGE("updateAllEntriesOnInputWord is called for dictionary without historical info."); - return false; - } - const ProbabilityEntry originalUnigramProbabilityEntry = getProbabilityEntry(wordId); - const ProbabilityEntry updatedUnigramProbabilityEntry = createUpdatedEntryFrom( - originalUnigramProbabilityEntry, isValid, historicalInfo, headerPolicy); - if (!setProbabilityEntry(wordId, &updatedUnigramProbabilityEntry)) { - return false; - } - mGlobalCounters.incrementTotalCount(); - mGlobalCounters.updateMaxValueOfCounters( - updatedUnigramProbabilityEntry.getHistoricalInfo()->getCount()); - for (size_t i = 0; i < prevWordIds.size(); ++i) { - if (prevWordIds[i] == NOT_A_WORD_ID) { - break; - } - // TODO: Optimize this code. 
- const WordIdArrayView limitedPrevWordIds = prevWordIds.limit(i + 1); - const ProbabilityEntry originalNgramProbabilityEntry = getNgramProbabilityEntry( - limitedPrevWordIds, wordId); - const ProbabilityEntry updatedNgramProbabilityEntry = createUpdatedEntryFrom( - originalNgramProbabilityEntry, isValid, historicalInfo, headerPolicy); - if (!setNgramProbabilityEntry(limitedPrevWordIds, wordId, &updatedNgramProbabilityEntry)) { - return false; - } - mGlobalCounters.updateMaxValueOfCounters( - updatedNgramProbabilityEntry.getHistoricalInfo()->getCount()); - if (!originalNgramProbabilityEntry.isValid()) { - // (i + 2) words are used in total because the prevWords consists of (i + 1) words when - // looking at its i-th element. - entryCountersToUpdate->incrementNgramCount( - NgramUtils::getNgramTypeFromWordCount(i + 2)); - } - } - return true; -} - -const ProbabilityEntry LanguageModelDictContent::createUpdatedEntryFrom( - const ProbabilityEntry &originalProbabilityEntry, const bool isValid, - const HistoricalInfo historicalInfo, const HeaderPolicy *const headerPolicy) const { - const HistoricalInfo updatedHistoricalInfo = HistoricalInfo(historicalInfo.getTimestamp(), - 0 /* level */, originalProbabilityEntry.getHistoricalInfo()->getCount() - + historicalInfo.getCount()); - if (originalProbabilityEntry.isValid()) { - return ProbabilityEntry(originalProbabilityEntry.getFlags(), &updatedHistoricalInfo); - } else { - return ProbabilityEntry(0 /* flags */, &updatedHistoricalInfo); - } -} - -bool LanguageModelDictContent::runGCInner( - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex) { - for (auto &entry : trieMapRange) { - const auto it = terminalIdMap->find(entry.key()); - if (it == terminalIdMap->end() || it->second == Ver4DictConstants::NOT_A_TERMINAL_ID) { - // The word has been removed. 
- continue; - } - if (!mTrieMap.put(it->second, entry.value(), nextLevelBitmapEntryIndex)) { - return false; - } - if (entry.hasNextLevelMap()) { - if (!runGCInner(terminalIdMap, entry.getEntriesInNextLevel(), - mTrieMap.getNextLevelBitmapEntryIndex(it->second, nextLevelBitmapEntryIndex))) { - return false; - } - } - } - return true; -} - -int LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds) { - int lastBitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex(); - for (const int wordId : prevWordIds) { - const TrieMap::Result result = mTrieMap.get(wordId, lastBitmapEntryIndex); - if (result.mIsValid && result.mNextLevelBitmapEntryIndex != TrieMap::INVALID_INDEX) { - lastBitmapEntryIndex = result.mNextLevelBitmapEntryIndex; - continue; - } - if (!result.mIsValid) { - if (!mTrieMap.put(wordId, ProbabilityEntry().encode(mHasHistoricalInfo), - lastBitmapEntryIndex)) { - AKLOGE("Failed to update trie map. wordId: %d, lastBitmapEntryIndex %d", wordId, - lastBitmapEntryIndex); - return TrieMap::INVALID_INDEX; - } - } - lastBitmapEntryIndex = mTrieMap.getNextLevelBitmapEntryIndex(wordId, - lastBitmapEntryIndex); - } - return lastBitmapEntryIndex; -} - -int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWordIds) const { - int bitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex(); - for (const int wordId : prevWordIds) { - const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); - if (!result.mIsValid) { - return TrieMap::INVALID_INDEX; - } - bitmapEntryIndex = result.mNextLevelBitmapEntryIndex; - } - return bitmapEntryIndex; -} - -bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, - const int prevWordCount, const HeaderPolicy *const headerPolicy, - const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) { - for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { - if (prevWordCount > 
MAX_PREV_WORD_COUNT_FOR_N_GRAM) { - AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.", - prevWordCount, MAX_PREV_WORD_COUNT_FOR_N_GRAM); - return false; - } - const ProbabilityEntry probabilityEntry = - ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); - if (prevWordCount > 0 && probabilityEntry.isValid() - && !mTrieMap.getRoot(entry.key()).mIsValid) { - // The entry is related to a word that has been removed. Remove the entry. - if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { - return false; - } - continue; - } - if (mHasHistoricalInfo && probabilityEntry.isValid()) { - const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo(); - if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC( - *originalHistoricalInfo)) { - // Remove the entry. - if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { - return false; - } - continue; - } - if (needsToHalveCounters) { - const int updatedCount = originalHistoricalInfo->getCount() / 2; - if (updatedCount == 0) { - // Remove the entry. 
- if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { - return false; - } - continue; - } - const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(), - originalHistoricalInfo->getLevel(), updatedCount); - const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(), - &historicalInfoToSave); - if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo), - bitmapEntryIndex)) { - return false; - } - } - } - outEntryCounters->incrementNgramCount( - NgramUtils::getNgramTypeFromWordCount(prevWordCount + 1)); - if (!entry.hasNextLevelMap()) { - continue; - } - if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(), - prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) { - return false; - } - } - return true; -} - -bool LanguageModelDictContent::turncateEntriesInSpecifiedLevel( - const HeaderPolicy *const headerPolicy, const int maxEntryCount, const int targetLevel, - int *const outEntryCount) { - std::vector prevWordIds; - std::vector entryInfoVector; - if (!getEntryInfo(headerPolicy, targetLevel, mTrieMap.getRootBitmapEntryIndex(), - &prevWordIds, &entryInfoVector)) { - return false; - } - if (static_cast(entryInfoVector.size()) <= maxEntryCount) { - *outEntryCount = static_cast(entryInfoVector.size()); - return true; - } - *outEntryCount = maxEntryCount; - const int entryCountToRemove = static_cast(entryInfoVector.size()) - maxEntryCount; - std::partial_sort(entryInfoVector.begin(), entryInfoVector.begin() + entryCountToRemove, - entryInfoVector.end(), - EntryInfoToTurncate::Comparator()); - for (int i = 0; i < entryCountToRemove; ++i) { - const EntryInfoToTurncate &entryInfo = entryInfoVector[i]; - if (!removeNgramProbabilityEntry( - WordIdArrayView(entryInfo.mPrevWordIds, entryInfo.mPrevWordCount), - entryInfo.mKey)) { - return false; - } - } - return true; -} - -bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPolicy, - const int targetLevel, const int 
bitmapEntryIndex, std::vector *const prevWordIds, - std::vector *const outEntryInfo) const { - const int prevWordCount = prevWordIds->size(); - for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { - if (prevWordCount < targetLevel) { - if (!entry.hasNextLevelMap()) { - continue; - } - prevWordIds->push_back(entry.key()); - if (!getEntryInfo(headerPolicy, targetLevel, entry.getNextLevelBitmapEntryIndex(), - prevWordIds, outEntryInfo)) { - return false; - } - prevWordIds->pop_back(); - continue; - } - const ProbabilityEntry probabilityEntry = - ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); - const int priority = mHasHistoricalInfo - ? DynamicLanguageModelProbabilityUtils::getPriorityToPreventFromEviction( - *probabilityEntry.getHistoricalInfo()) - : probabilityEntry.getProbability(); - outEntryInfo->emplace_back(priority, probabilityEntry.getHistoricalInfo()->getCount(), - entry.key(), targetLevel, prevWordIds->data()); - } - return true; -} - -bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()( - const EntryInfoToTurncate &left, const EntryInfoToTurncate &right) const { - if (left.mPriority != right.mPriority) { - return left.mPriority < right.mPriority; - } - if (left.mCount != right.mCount) { - return left.mCount < right.mCount; - } - if (left.mKey != right.mKey) { - return left.mKey < right.mKey; - } - if (left.mPrevWordCount != right.mPrevWordCount) { - return left.mPrevWordCount > right.mPrevWordCount; - } - for (int i = 0; i < left.mPrevWordCount; ++i) { - if (left.mPrevWordIds[i] != right.mPrevWordIds[i]) { - return left.mPrevWordIds[i] < right.mPrevWordIds[i]; - } - } - // left and rigth represent the same entry. 
- return false; -} - -LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int priority, - const int count, const int key, const int prevWordCount, const int *const prevWordIds) - : mPriority(priority), mCount(count), mKey(key), mPrevWordCount(prevWordCount) { - memmove(mPrevWordIds, prevWordIds, mPrevWordCount * sizeof(mPrevWordIds[0])); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h deleted file mode 100644 index 9678c35f9..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H -#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H - -#include -#include - -#include "defines.h" -#include "suggest/core/dictionary/word_attributes.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/entry_counters.h" -#include "suggest/policyimpl/dictionary/utils/trie_map.h" -#include "utils/byte_array_view.h" -#include "utils/int_array_view.h" - -namespace latinime { - -class HeaderPolicy; - -/** - * Class representing language model. - * - * This class provides methods to get and store unigram/n-gram probability information and flags. - */ -class LanguageModelDictContent { - public: - // Pair of word id and probability entry used for iteration. - class WordIdAndProbabilityEntry { - public: - WordIdAndProbabilityEntry(const int wordId, const ProbabilityEntry &probabilityEntry) - : mWordId(wordId), mProbabilityEntry(probabilityEntry) {} - - int getWordId() const { return mWordId; } - const ProbabilityEntry getProbabilityEntry() const { return mProbabilityEntry; } - - private: - DISALLOW_DEFAULT_CONSTRUCTOR(WordIdAndProbabilityEntry); - DISALLOW_ASSIGNMENT_OPERATOR(WordIdAndProbabilityEntry); - - const int mWordId; - const ProbabilityEntry mProbabilityEntry; - }; - - // Iterator. 
- class EntryIterator { - public: - EntryIterator(const TrieMap::TrieMapIterator &trieMapIterator, - const bool hasHistoricalInfo) - : mTrieMapIterator(trieMapIterator), mHasHistoricalInfo(hasHistoricalInfo) {} - - const WordIdAndProbabilityEntry operator*() const { - const TrieMap::TrieMapIterator::IterationResult &result = *mTrieMapIterator; - return WordIdAndProbabilityEntry( - result.key(), ProbabilityEntry::decode(result.value(), mHasHistoricalInfo)); - } - - bool operator!=(const EntryIterator &other) const { - return mTrieMapIterator != other.mTrieMapIterator; - } - - const EntryIterator &operator++() { - ++mTrieMapIterator; - return *this; - } - - private: - DISALLOW_DEFAULT_CONSTRUCTOR(EntryIterator); - DISALLOW_ASSIGNMENT_OPERATOR(EntryIterator); - - TrieMap::TrieMapIterator mTrieMapIterator; - const bool mHasHistoricalInfo; - }; - - // Class represents range to use range base for loops. - class EntryRange { - public: - EntryRange(const TrieMap::TrieMapRange trieMapRange, const bool hasHistoricalInfo) - : mTrieMapRange(trieMapRange), mHasHistoricalInfo(hasHistoricalInfo) {} - - EntryIterator begin() const { - return EntryIterator(mTrieMapRange.begin(), mHasHistoricalInfo); - } - - EntryIterator end() const { - return EntryIterator(mTrieMapRange.end(), mHasHistoricalInfo); - } - - private: - DISALLOW_DEFAULT_CONSTRUCTOR(EntryRange); - DISALLOW_ASSIGNMENT_OPERATOR(EntryRange); - - const TrieMap::TrieMapRange mTrieMapRange; - const bool mHasHistoricalInfo; - }; - - class DumppedFullEntryInfo { - public: - DumppedFullEntryInfo(std::vector &prevWordIds, const int targetWordId, - const WordAttributes &wordAttributes, const ProbabilityEntry &probabilityEntry) - : mPrevWordIds(prevWordIds), mTargetWordId(targetWordId), - mWordAttributes(wordAttributes), mProbabilityEntry(probabilityEntry) {} - - const WordIdArrayView getPrevWordIds() const { return WordIdArrayView(mPrevWordIds); } - int getTargetWordId() const { return mTargetWordId; } - const WordAttributes 
&getWordAttributes() const { return mWordAttributes; } - const ProbabilityEntry &getProbabilityEntry() const { return mProbabilityEntry; } - - private: - DISALLOW_ASSIGNMENT_OPERATOR(DumppedFullEntryInfo); - - const std::vector mPrevWordIds; - const int mTargetWordId; - const WordAttributes mWordAttributes; - const ProbabilityEntry mProbabilityEntry; - }; - - LanguageModelDictContent(const ReadWriteByteArrayView *const buffers, - const bool hasHistoricalInfo) - : mTrieMap(buffers[TRIE_MAP_BUFFER_INDEX]), - mGlobalCounters(buffers[GLOBAL_COUNTERS_BUFFER_INDEX]), - mHasHistoricalInfo(hasHistoricalInfo) {} - - explicit LanguageModelDictContent(const bool hasHistoricalInfo) - : mTrieMap(), mGlobalCounters(), mHasHistoricalInfo(hasHistoricalInfo) {} - - bool isNearSizeLimit() const { - return mTrieMap.isNearSizeLimit() || mGlobalCounters.needsToHalveCounters(); - } - - bool save(FILE *const file) const; - - bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const LanguageModelDictContent *const originalContent); - - const WordAttributes getWordAttributes(const WordIdArrayView prevWordIds, const int wordId, - const bool mustMatchAllPrevWords, const HeaderPolicy *const headerPolicy) const; - - ProbabilityEntry getProbabilityEntry(const int wordId) const { - return getNgramProbabilityEntry(WordIdArrayView(), wordId); - } - - bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) { - mGlobalCounters.addToTotalCount(probabilityEntry->getHistoricalInfo()->getCount()); - return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry); - } - - bool removeProbabilityEntry(const int wordId) { - return removeNgramProbabilityEntry(WordIdArrayView(), wordId); - } - - ProbabilityEntry getNgramProbabilityEntry(const WordIdArrayView prevWordIds, - const int wordId) const; - - bool setNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId, - const ProbabilityEntry *const 
probabilityEntry); - - bool removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId); - - EntryRange getProbabilityEntries(const WordIdArrayView prevWordIds) const; - - std::vector exportAllNgramEntriesRelatedToWord( - const HeaderPolicy *const headerPolicy, const int wordId) const; - - bool updateAllProbabilityEntriesForGC(const HeaderPolicy *const headerPolicy, - MutableEntryCounters *const outEntryCounters) { - if (!updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(), - 0 /* prevWordCount */, headerPolicy, mGlobalCounters.needsToHalveCounters(), - outEntryCounters)) { - return false; - } - if (mGlobalCounters.needsToHalveCounters()) { - mGlobalCounters.halveCounters(); - } - return true; - } - - // entryCounts should be created by updateAllProbabilityEntries. - bool truncateEntries(const EntryCounts ¤tEntryCounts, const EntryCounts &maxEntryCounts, - const HeaderPolicy *const headerPolicy, MutableEntryCounters *const outEntryCounters); - - bool updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, const int wordId, - const bool isValid, const HistoricalInfo historicalInfo, - const HeaderPolicy *const headerPolicy, - MutableEntryCounters *const entryCountersToUpdate); - - private: - DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContent); - - class EntryInfoToTurncate { - public: - class Comparator { - public: - bool operator()(const EntryInfoToTurncate &left, - const EntryInfoToTurncate &right) const; - private: - DISALLOW_ASSIGNMENT_OPERATOR(Comparator); - }; - - EntryInfoToTurncate(const int priority, const int count, const int key, - const int prevWordCount, const int *const prevWordIds); - - int mPriority; - // TODO: Remove. 
- int mCount; - int mKey; - int mPrevWordCount; - int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; - - private: - DISALLOW_DEFAULT_CONSTRUCTOR(EntryInfoToTurncate); - }; - - static const int TRIE_MAP_BUFFER_INDEX; - static const int GLOBAL_COUNTERS_BUFFER_INDEX; - - TrieMap mTrieMap; - LanguageModelDictContentGlobalCounters mGlobalCounters; - const bool mHasHistoricalInfo; - - bool runGCInner(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex); - int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds); - int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const; - bool updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount, - const HeaderPolicy *const headerPolicy, const bool needsToHalveCounters, - MutableEntryCounters *const outEntryCounters); - bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy, - const int maxEntryCount, const int targetLevel, int *const outEntryCount); - bool getEntryInfo(const HeaderPolicy *const headerPolicy, const int targetLevel, - const int bitmapEntryIndex, std::vector *const prevWordIds, - std::vector *const outEntryInfo) const; - const ProbabilityEntry createUpdatedEntryFrom(const ProbabilityEntry &originalProbabilityEntry, - const bool isValid, const HistoricalInfo historicalInfo, - const HeaderPolicy *const headerPolicy) const; - void exportAllNgramEntriesRelatedToWordInner(const HeaderPolicy *const headerPolicy, - const int bitmapEntryIndex, std::vector *const prevWordIds, - std::vector *const outBummpedFullEntryInfo) const; -}; -} // namespace latinime -#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp deleted 
file mode 100644 index d6d91887e..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h" - -#include - -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" - -namespace latinime { - -const int LanguageModelDictContentGlobalCounters::COUNTER_VALUE_NEAR_LIMIT_THRESHOLD = - (1 << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) - 64; -const int LanguageModelDictContentGlobalCounters::TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD = 1 << 30; -const int LanguageModelDictContentGlobalCounters::COUNTER_SIZE_IN_BYTES = 4; -const int LanguageModelDictContentGlobalCounters::TOTAL_COUNT_INDEX = 0; -const int LanguageModelDictContentGlobalCounters::MAX_VALUE_OF_COUNTERS_INDEX = 1; - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h deleted file mode 100644 index 283c2691a..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h +++ /dev/null 
@@ -1,101 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H -#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -class LanguageModelDictContentGlobalCounters { - public: - explicit LanguageModelDictContentGlobalCounters(const ReadWriteByteArrayView buffer) - : mBuffer(buffer, 0 /* maxAdditionalBufferSize */), - mTotalCount(readValue(mBuffer, TOTAL_COUNT_INDEX)), - mMaxValueOfCounters(readValue(mBuffer, MAX_VALUE_OF_COUNTERS_INDEX)) {} - - LanguageModelDictContentGlobalCounters() - : mBuffer(0 /* maxAdditionalBufferSize */), mTotalCount(0), mMaxValueOfCounters(0) {} - - bool needsToHalveCounters() const { - return mMaxValueOfCounters >= COUNTER_VALUE_NEAR_LIMIT_THRESHOLD - || mTotalCount >= TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD; - } - - int getTotalCount() const { - return mTotalCount; - } - - bool save(FILE *const file) const { - BufferWithExtendableBuffer bufferToWrite( - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); - if (!bufferToWrite.writeUint(mTotalCount, COUNTER_SIZE_IN_BYTES, - TOTAL_COUNT_INDEX * COUNTER_SIZE_IN_BYTES)) { - return 
false; - } - if (!bufferToWrite.writeUint(mMaxValueOfCounters, COUNTER_SIZE_IN_BYTES, - MAX_VALUE_OF_COUNTERS_INDEX * COUNTER_SIZE_IN_BYTES)) { - return false; - } - return DictFileWritingUtils::writeBufferToFileTail(file, &bufferToWrite); - } - - void incrementTotalCount() { - mTotalCount += 1; - } - - void addToTotalCount(const int count) { - mTotalCount += count; - } - - void updateMaxValueOfCounters(const int count) { - mMaxValueOfCounters = std::max(count, mMaxValueOfCounters); - } - - void halveCounters() { - mMaxValueOfCounters /= 2; - mTotalCount /= 2; - } - -private: - DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContentGlobalCounters); - - const static int COUNTER_VALUE_NEAR_LIMIT_THRESHOLD; - const static int TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD; - const static int COUNTER_SIZE_IN_BYTES; - const static int TOTAL_COUNT_INDEX; - const static int MAX_VALUE_OF_COUNTERS_INDEX; - - BufferWithExtendableBuffer mBuffer; - int mTotalCount; - int mMaxValueOfCounters; - - static int readValue(const BufferWithExtendableBuffer &buffer, const int index) { - const int pos = COUNTER_SIZE_IN_BYTES * index; - if (pos + COUNTER_SIZE_IN_BYTES > buffer.getTailPosition()) { - return 0; - } - return buffer.readUint(COUNTER_SIZE_IN_BYTES, pos); - } -}; -} // namespace latinime -#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h deleted file mode 100644 index 9c4ab18e4..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_PROBABILITY_ENTRY_H -#define LATINIME_PROBABILITY_ENTRY_H - -#include -#include - -#include "defines.h" -#include "suggest/core/dictionary/property/historical_info.h" -#include "suggest/core/dictionary/property/ngram_property.h" -#include "suggest/core/dictionary/property/unigram_property.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" - -namespace latinime { - -class ProbabilityEntry { - public: - ProbabilityEntry(const ProbabilityEntry &probabilityEntry) - : mFlags(probabilityEntry.mFlags), mProbability(probabilityEntry.mProbability), - mHistoricalInfo(probabilityEntry.mHistoricalInfo) {} - - // Dummy entry - ProbabilityEntry() - : mFlags(Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY), mProbability(NOT_A_PROBABILITY), - mHistoricalInfo() {} - - // Entry without historical information - ProbabilityEntry(const int flags, const int probability) - : mFlags(flags), mProbability(probability), mHistoricalInfo() {} - - // Entry with historical information. - ProbabilityEntry(const int flags, const HistoricalInfo *const historicalInfo) - : mFlags(flags), mProbability(NOT_A_PROBABILITY), mHistoricalInfo(*historicalInfo) {} - - // Create from unigram property. 
- ProbabilityEntry(const UnigramProperty *const unigramProperty) - : mFlags(createFlags(unigramProperty->representsBeginningOfSentence(), - unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), - unigramProperty->isPossiblyOffensive())), - mProbability(unigramProperty->getProbability()), - mHistoricalInfo(unigramProperty->getHistoricalInfo()) {} - - // Create from ngram property. - // TODO: Set flags. - ProbabilityEntry(const NgramProperty *const ngramProperty) - : mFlags(0), mProbability(ngramProperty->getProbability()), - mHistoricalInfo(ngramProperty->getHistoricalInfo()) {} - - bool isValid() const { - return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0; - } - - bool hasHistoricalInfo() const { - return mHistoricalInfo.isValid(); - } - - uint8_t getFlags() const { - return mFlags; - } - - int getProbability() const { - return mProbability; - } - - const HistoricalInfo *getHistoricalInfo() const { - return &mHistoricalInfo; - } - - bool representsBeginningOfSentence() const { - return (mFlags & Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE) != 0; - } - - bool isNotAWord() const { - return (mFlags & Ver4DictConstants::FLAG_NOT_A_WORD) != 0; - } - - bool isBlacklisted() const { - return (mFlags & Ver4DictConstants::FLAG_BLACKLISTED) != 0; - } - - bool isPossiblyOffensive() const { - return (mFlags & Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE) != 0; - } - - uint64_t encode(const bool hasHistoricalInfo) const { - uint64_t encodedEntry = static_cast(mFlags); - if (hasHistoricalInfo) { - encodedEntry = (encodedEntry << (Ver4DictConstants::TIME_STAMP_FIELD_SIZE * CHAR_BIT)) - | static_cast(mHistoricalInfo.getTimestamp()); - encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_LEVEL_FIELD_SIZE * CHAR_BIT)) - | static_cast(mHistoricalInfo.getLevel()); - encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) - | static_cast(mHistoricalInfo.getCount()); - } else { - encodedEntry = (encodedEntry << 
(Ver4DictConstants::PROBABILITY_SIZE * CHAR_BIT)) - | static_cast(mProbability); - } - return encodedEntry; - } - - static ProbabilityEntry decode(const uint64_t encodedEntry, const bool hasHistoricalInfo) { - if (hasHistoricalInfo) { - const int flags = readFromEncodedEntry(encodedEntry, - Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE, - Ver4DictConstants::TIME_STAMP_FIELD_SIZE - + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE - + Ver4DictConstants::WORD_COUNT_FIELD_SIZE); - const int timestamp = readFromEncodedEntry(encodedEntry, - Ver4DictConstants::TIME_STAMP_FIELD_SIZE, - Ver4DictConstants::WORD_LEVEL_FIELD_SIZE - + Ver4DictConstants::WORD_COUNT_FIELD_SIZE); - const int level = readFromEncodedEntry(encodedEntry, - Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, - Ver4DictConstants::WORD_COUNT_FIELD_SIZE); - const int count = readFromEncodedEntry(encodedEntry, - Ver4DictConstants::WORD_COUNT_FIELD_SIZE, 0 /* pos */); - const HistoricalInfo historicalInfo(timestamp, level, count); - return ProbabilityEntry(flags, &historicalInfo); - } else { - const int flags = readFromEncodedEntry(encodedEntry, - Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE, - Ver4DictConstants::PROBABILITY_SIZE); - const int probability = readFromEncodedEntry(encodedEntry, - Ver4DictConstants::PROBABILITY_SIZE, 0 /* pos */); - return ProbabilityEntry(flags, probability); - } - } - - private: - // Copy constructor is public to use this class as a type of return value. 
- DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry); - - const uint8_t mFlags; - const int mProbability; - const HistoricalInfo mHistoricalInfo; - - static int readFromEncodedEntry(const uint64_t encodedEntry, const int size, const int pos) { - return static_cast( - (encodedEntry >> (pos * CHAR_BIT)) & ((1ull << (size * CHAR_BIT)) - 1)); - } - - static uint8_t createFlags(const bool representsBeginningOfSentence, - const bool isNotAWord, const bool isBlacklisted, const bool isPossiblyOffensive) { - uint8_t flags = 0; - if (representsBeginningOfSentence) { - flags |= Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE; - } - if (isNotAWord) { - flags |= Ver4DictConstants::FLAG_NOT_A_WORD; - } - if (isBlacklisted) { - flags |= Ver4DictConstants::FLAG_BLACKLISTED; - } - if (isPossiblyOffensive) { - flags |= Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE; - } - return flags; - } -}; -} // namespace latinime -#endif /* LATINIME_PROBABILITY_ENTRY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp deleted file mode 100644 index 41d9c544c..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h" - -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePointCount, - int *const outCodePoint, int *const outCodePointCount, int *const outProbability, - bool *const outhasNext, int *const shortcutEntryPos) const { - const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer(); - if (*shortcutEntryPos < 0 || *shortcutEntryPos >= shortcutListBuffer->getTailPosition()) { - AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d", - *shortcutEntryPos, shortcutListBuffer->getTailPosition()); - ASSERT(false); - if (outhasNext) { - *outhasNext = false; - } - if (outCodePointCount) { - *outCodePointCount = 0; - } - return; - } - - const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); - if (outProbability) { - *outProbability = shortcutFlags & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK; - } - if (outhasNext) { - *outhasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; - } - if (outCodePoint && outCodePointCount) { - shortcutListBuffer->readCodePointsAndAdvancePosition( - maxCodePointCount, outCodePoint, outCodePointCount, shortcutEntryPos); - } -} - -int ShortcutDictContent::getShortcutListHeadPos(const int terminalId) const { - const SparseTable *const addressLookupTable = getAddressLookupTable(); - if (!addressLookupTable->contains(terminalId)) { - return NOT_A_DICT_POS; - } - return addressLookupTable->get(terminalId); -} - -bool ShortcutDictContent::runGC( - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const ShortcutDictContent *const originalShortcutDictContent) { - for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); - it != 
terminalIdMap->end(); ++it) { - const int originalShortcutListPos = - originalShortcutDictContent->getShortcutListHeadPos(it->first); - if (originalShortcutListPos == NOT_A_DICT_POS) { - continue; - } - const int shortcutListPos = getContentBuffer()->getTailPosition(); - // Copy shortcut list from original content. - if (!copyShortcutListFromDictContent(originalShortcutListPos, originalShortcutDictContent, - shortcutListPos)) { - AKLOGE("Cannot copy shortcut list during GC. original pos: %d, pos: %d", - originalShortcutListPos, shortcutListPos); - return false; - } - // Set shortcut list position to the lookup table. - if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) { - AKLOGE("Cannot set shortcut list position. terminal id: %d, pos: %d", - it->second, shortcutListPos); - return false; - } - } - return true; -} - -bool ShortcutDictContent::createNewShortcutList(const int terminalId) { - const int shortcutListListPos = getContentBuffer()->getTailPosition(); - return getUpdatableAddressLookupTable()->set(terminalId, shortcutListListPos); -} - -bool ShortcutDictContent::copyShortcutList(const int shortcutListPos, const int toPos) { - return copyShortcutListFromDictContent(shortcutListPos, this, toPos); -} - -bool ShortcutDictContent::copyShortcutListFromDictContent(const int shortcutListPos, - const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) { - bool hasNext = true; - int readingPos = shortcutListPos; - int writingPos = toPos; - int codePoints[MAX_WORD_LENGTH]; - while (hasNext) { - int probability = 0; - int codePointCount = 0; - sourceShortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, - codePoints, &codePointCount, &probability, &hasNext, &readingPos); - if (!writeShortcutEntryAndAdvancePosition(codePoints, codePointCount, probability, - hasNext, &writingPos)) { - AKLOGE("Cannot write shortcut entry to copy. 
pos: %d", writingPos); - return false; - } - } - return true; -} - -bool ShortcutDictContent::setProbability(const int probability, const int shortcutEntryPos) { - BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); - const int shortcutFlags = shortcutListBuffer->readUint( - Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); - const bool hasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; - const int shortcutFlagsToWrite = createAndGetShortcutFlags(probability, hasNext); - return shortcutListBuffer->writeUint(shortcutFlagsToWrite, - Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); -} - -bool ShortcutDictContent::writeShortcutEntryAndAdvancePosition(const int *const codePoint, - const int codePointCount, const int probability, const bool hasNext, - int *const shortcutEntryPos) { - BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); - const int shortcutFlags = createAndGetShortcutFlags(probability, hasNext); - if (!shortcutListBuffer->writeUintAndAdvancePosition(shortcutFlags, - Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos)) { - AKLOGE("Cannot write shortcut flags. flags; %x, pos: %d", shortcutFlags, *shortcutEntryPos); - return false; - } - if (!shortcutListBuffer->writeCodePointsAndAdvancePosition(codePoint, codePointCount, - true /* writesTerminator */, shortcutEntryPos)) { - AKLOGE("Cannot write shortcut target code points. pos: %d", *shortcutEntryPos); - return false; - } - return true; -} - -// Find a shortcut entry that has specified target and return its position. 
-int ShortcutDictContent::findShortcutEntryAndGetPos(const int shortcutListPos, - const int *const targetCodePointsToFind, const int codePointCount) const { - bool hasNext = true; - int readingPos = shortcutListPos; - int targetCodePoints[MAX_WORD_LENGTH]; - while (hasNext) { - const int entryPos = readingPos; - int probability = 0; - int targetCodePointCount = 0; - getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, targetCodePoints, &targetCodePointCount, - &probability, &hasNext, &readingPos); - if (targetCodePointCount != codePointCount) { - continue; - } - bool matched = true; - for (int i = 0; i < codePointCount; ++i) { - if (targetCodePointsToFind[i] != targetCodePoints[i]) { - matched = false; - break; - } - } - if (matched) { - return entryPos; - } - } - return NOT_A_DICT_POS; -} - -int ShortcutDictContent::createAndGetShortcutFlags(const int probability, - const bool hasNext) const { - return (probability & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK) - | (hasNext ? Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK : 0); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h deleted file mode 100644 index 85c9ce8d8..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_SHORTCUT_DICT_CONTENT_H -#define LATINIME_SHORTCUT_DICT_CONTENT_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" - -namespace latinime { - -class ReadWriteByteArrayView; - -class ShortcutDictContent : public SparseTableDictContent { - public: - ShortcutDictContent(const ReadWriteByteArrayView *const buffers) - : SparseTableDictContent(buffers, Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, - Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} - - ShortcutDictContent() - : SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, - Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} - - void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint, - int *const outCodePointCount, int *const outProbability, bool *const outhasNext, - const int shortcutEntryPos) { - int readingPos = shortcutEntryPos; - return getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint, - outCodePointCount, outProbability, outhasNext, &readingPos); - } - - void getShortcutEntryAndAdvancePosition(const int maxCodePointCount, - int *const outCodePoint, int *const outCodePointCount, int *const outProbability, - bool *const outhasNext, int *const shortcutEntryPos) const; - - // Returns head position of shortcut list for a PtNode specified by terminalId. 
- int getShortcutListHeadPos(const int terminalId) const; - - bool flushToFile(FILE *const file) const { - return flush(file); - } - - bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const ShortcutDictContent *const originalShortcutDictContent); - - bool createNewShortcutList(const int terminalId); - - bool copyShortcutList(const int shortcutListPos, const int toPos); - - bool setProbability(const int probability, const int shortcutEntryPos); - - bool writeShortcutEntry(const int *const codePoint, const int codePointCount, - const int probability, const bool hasNext, const int shortcutEntryPos) { - int writingPos = shortcutEntryPos; - return writeShortcutEntryAndAdvancePosition(codePoint, codePointCount, probability, - hasNext, &writingPos); - } - - bool writeShortcutEntryAndAdvancePosition(const int *const codePoint, - const int codePointCount, const int probability, const bool hasNext, - int *const shortcutEntryPos); - - int findShortcutEntryAndGetPos(const int shortcutListPos, - const int *const targetCodePointsToFind, const int codePointCount) const; - - private: - DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent); - - bool copyShortcutListFromDictContent(const int shortcutListPos, - const ShortcutDictContent *const sourceShortcutDictContent, const int toPos); - - int createAndGetShortcutFlags(const int probability, const bool hasNext) const; -}; -} // namespace latinime -#endif /* LATINIME_SHORTCUT_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h deleted file mode 100644 index 309c434cf..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use 
this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_SINGLE_DICT_CONTENT_H -#define LATINIME_SINGLE_DICT_CONTENT_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -class SingleDictContent { - public: - SingleDictContent(const ReadWriteByteArrayView buffer) - : mExpandableContentBuffer(buffer, - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {} - - SingleDictContent() - : mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE) {} - - virtual ~SingleDictContent() {} - - bool isNearSizeLimit() const { - return mExpandableContentBuffer.isNearSizeLimit(); - } - - protected: - BufferWithExtendableBuffer *getWritableBuffer() { - return &mExpandableContentBuffer; - } - - const BufferWithExtendableBuffer *getBuffer() const { - return &mExpandableContentBuffer; - } - - bool flush(FILE *const file) const { - return DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableContentBuffer); - } - - private: - DISALLOW_COPY_AND_ASSIGN(SingleDictContent); - - BufferWithExtendableBuffer mExpandableContentBuffer; -}; -} // namespace latinime -#endif /* LATINIME_SINGLE_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp 
b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp deleted file mode 100644 index 896ce6bd2..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h" - -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" - -namespace latinime { - -const int SparseTableDictContent::LOOKUP_TABLE_BUFFER_INDEX = 0; -const int SparseTableDictContent::ADDRESS_TABLE_BUFFER_INDEX = 1; -const int SparseTableDictContent::CONTENT_BUFFER_INDEX = 2; - -bool SparseTableDictContent::flush(FILE *const file) const { - if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableLookupTableBuffer)) { - return false; - } - if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableAddressTableBuffer)) { - return false; - } - if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableContentBuffer)) { - return false; - } - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h deleted file mode 100644 index 0ce2da7bf..000000000 --- 
a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_SPARSE_TABLE_DICT_CONTENT_H -#define LATINIME_SPARSE_TABLE_DICT_CONTENT_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/sparse_table.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -// TODO: Support multiple contents. 
-class SparseTableDictContent { - public: - AK_FORCE_INLINE SparseTableDictContent(const ReadWriteByteArrayView *const buffers, - const int sparseTableBlockSize, const int sparseTableDataSize) - : mExpandableLookupTableBuffer(buffers[LOOKUP_TABLE_BUFFER_INDEX], - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mExpandableAddressTableBuffer(buffers[ADDRESS_TABLE_BUFFER_INDEX], - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mExpandableContentBuffer(buffers[CONTENT_BUFFER_INDEX], - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer, - sparseTableBlockSize, sparseTableDataSize) {} - - SparseTableDictContent(const int sparseTableBlockSize, const int sparseTableDataSize) - : mExpandableLookupTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), - mExpandableAddressTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), - mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), - mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer, - sparseTableBlockSize, sparseTableDataSize) {} - - virtual ~SparseTableDictContent() {} - - bool isNearSizeLimit() const { - return mExpandableLookupTableBuffer.isNearSizeLimit() - || mExpandableAddressTableBuffer.isNearSizeLimit() - || mExpandableContentBuffer.isNearSizeLimit(); - } - - protected: - SparseTable *getUpdatableAddressLookupTable() { - return &mAddressLookupTable; - } - - const SparseTable *getAddressLookupTable() const { - return &mAddressLookupTable; - } - - BufferWithExtendableBuffer *getWritableContentBuffer() { - return &mExpandableContentBuffer; - } - - const BufferWithExtendableBuffer *getContentBuffer() const { - return &mExpandableContentBuffer; - } - - bool flush(FILE *const file) const; - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTableDictContent); - - static const int LOOKUP_TABLE_BUFFER_INDEX; - static const int 
ADDRESS_TABLE_BUFFER_INDEX; - static const int CONTENT_BUFFER_INDEX; - - BufferWithExtendableBuffer mExpandableLookupTableBuffer; - BufferWithExtendableBuffer mExpandableAddressTableBuffer; - BufferWithExtendableBuffer mExpandableContentBuffer; - SparseTable mAddressLookupTable; -}; -} // namespace latinime -#endif /* LATINIME_SPARSE_TABLE_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp deleted file mode 100644 index 7bda3dc95..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" - -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -int TerminalPositionLookupTable::getTerminalPtNodePosition(const int terminalId) const { - if (terminalId < 0 || terminalId >= mSize) { - return NOT_A_DICT_POS; - } - const int terminalPos = getBuffer()->readUint( - Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId)); - return (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) ? 
- NOT_A_DICT_POS : terminalPos; -} - -bool TerminalPositionLookupTable::setTerminalPtNodePosition( - const int terminalId, const int terminalPtNodePos) { - if (terminalId < 0) { - return false; - } - while (terminalId >= mSize) { - // Write new entry. - if (!getWritableBuffer()->writeUint(Ver4DictConstants::NOT_A_TERMINAL_ADDRESS, - Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(mSize))) { - return false; - } - mSize++; - } - const int terminalPos = (terminalPtNodePos != NOT_A_DICT_POS) ? - terminalPtNodePos : Ver4DictConstants::NOT_A_TERMINAL_ADDRESS; - return getWritableBuffer()->writeUint(terminalPos, - Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId)); -} - -bool TerminalPositionLookupTable::flushToFile(FILE *const file) const { - // If the used buffer size is smaller than the actual buffer size, regenerate the lookup - // table and write the new table to the file. - if (getEntryPos(mSize) < getBuffer()->getTailPosition()) { - TerminalPositionLookupTable lookupTableToWrite; - for (int i = 0; i < mSize; ++i) { - const int terminalPtNodePosition = getTerminalPtNodePosition(i); - if (!lookupTableToWrite.setTerminalPtNodePosition(i, terminalPtNodePosition)) { - AKLOGE("Cannot set terminal position to lookupTableToWrite." - " terminalId: %d, position: %d", i, terminalPtNodePosition); - return false; - } - } - return lookupTableToWrite.flush(file); - } else { - // We can simply use this lookup table because the buffer size has not been - // changed. - return flush(file); - } -} - -bool TerminalPositionLookupTable::runGCTerminalIds(TerminalIdMap *const terminalIdMap) { - int removedEntryCount = 0; - int nextNewTerminalId = 0; - for (int i = 0; i < mSize; ++i) { - const int terminalPos = getBuffer()->readUint( - Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(i)); - if (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) { - // This entry is a garbage. 
- removedEntryCount++; - } else { - // Give a new terminal id to the entry. - if (!getWritableBuffer()->writeUint(terminalPos, - Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, - getEntryPos(nextNewTerminalId))) { - return false; - } - // Memorize the mapping to the old terminal id to the new terminal id. - terminalIdMap->insert(TerminalIdMap::value_type(i, nextNewTerminalId)); - nextNewTerminalId++; - } - } - mSize = nextNewTerminalId; - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h deleted file mode 100644 index febcbe5b4..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H -#define LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H - -#include -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -class TerminalPositionLookupTable : public SingleDictContent { - public: - typedef std::unordered_map TerminalIdMap; - - TerminalPositionLookupTable(const ReadWriteByteArrayView buffer) - : SingleDictContent(buffer), - mSize(getBuffer()->getTailPosition() - / Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE) {} - - TerminalPositionLookupTable() : mSize(0) {} - - int getTerminalPtNodePosition(const int terminalId) const; - - bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos); - - int getNextTerminalId() const { - return mSize; - } - - bool flushToFile(FILE *const file) const; - - bool runGCTerminalIds(TerminalIdMap *const terminalIdMap); - - private: - DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable); - - int getEntryPos(const int terminalId) const { - return terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; - } - - int mSize; -}; -} // namespace latinime -#endif // LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h deleted file mode 100644 index 790273541..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_VER4_SHORTCUT_LIST_POLICY_H -#define LATINIME_VER4_SHORTCUT_LIST_POLICY_H - -#include "defines.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" - -namespace latinime { - -class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { - public: - Ver4ShortcutListPolicy(ShortcutDictContent *const shortcutDictContent, - const TerminalPositionLookupTable *const terminalPositionLookupTable) - : mShortcutDictContent(shortcutDictContent) {} - - ~Ver4ShortcutListPolicy() {} - - int getStartPos(const int pos) const { - // The first shortcut entry is located at the head position of the shortcut list. - return pos; - } - - void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, - int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, - int *const pos) const { - int probability = 0; - mShortcutDictContent->getShortcutEntryAndAdvancePosition(maxCodePointCount, - outCodePoint, outCodePointCount, &probability, outHasNext, pos); - if (outIsWhitelist) { - *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(probability); - } - } - - void skipAllShortcuts(int *const pos) const { - // Do nothing because we don't need to skip shortcut lists in ver4 dictionaries. 
- } - - bool addNewShortcut(const int terminalId, const int *const codePoints, const int codePointCount, - const int probability) { - const int shortcutListPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); - if (shortcutListPos == NOT_A_DICT_POS) { - // Create shortcut list. - if (!mShortcutDictContent->createNewShortcutList(terminalId)) { - AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId); - return false; - } - const int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); - return mShortcutDictContent->writeShortcutEntry(codePoints, codePointCount, probability, - false /* hasNext */, writingPos); - } - const int entryPos = mShortcutDictContent->findShortcutEntryAndGetPos(shortcutListPos, - codePoints, codePointCount); - if (entryPos == NOT_A_DICT_POS) { - // Add new entry to the shortcut list. - // Create new shortcut list. - if (!mShortcutDictContent->createNewShortcutList(terminalId)) { - AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId); - return false; - } - int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); - if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints, - codePointCount, probability, true /* hasNext */, &writingPos)) { - AKLOGE("Cannot write shortcut entry. terminal id: %d, pos: %d", terminalId, - writingPos); - return false; - } - return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos); - } - // Overwrite existing entry. - bool hasNext = false; - mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, 0 /* outCodePoint */, - 0 /* outCodePointCount */ , 0 /* probability */, &hasNext, entryPos); - if (!mShortcutDictContent->writeShortcutEntry(codePoints, - codePointCount, probability, hasNext, entryPos)) { - AKLOGE("Cannot overwrite shortcut entry. 
terminal id: %d, pos: %d", terminalId, - entryPos); - return false; - } - return true; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4ShortcutListPolicy); - - ShortcutDictContent *const mShortcutDictContent; -}; -} // namespace latinime -#endif // LATINIME_VER4_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp deleted file mode 100644 index 4d088dcab..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" - -#include -#include -#include -#include -#include -#include - -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "suggest/policyimpl/dictionary/utils/file_utils.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -/* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers( - const char *const dictPath, MmappedBuffer::MmappedBufferPtr &&headerBuffer, - const FormatUtils::FORMAT_VERSION formatVersion) { - if (!headerBuffer) { - ASSERT(false); - AKLOGE("The header buffer must be valid to open ver4 dict buffers."); - return Ver4DictBuffersPtr(nullptr); - } - // TODO: take only dictDirPath, and open both header and trie files in the constructor below - const bool isUpdatable = headerBuffer->isUpdatable(); - MmappedBuffer::MmappedBufferPtr bodyBuffer = MmappedBuffer::openBuffer(dictPath, - Ver4DictConstants::BODY_FILE_EXTENSION, isUpdatable); - if (!bodyBuffer) { - return Ver4DictBuffersPtr(nullptr); - } - std::vector buffers; - const ReadWriteByteArrayView buffer = bodyBuffer->getReadWriteByteArrayView(); - int position = 0; - while (position < static_cast(buffer.size())) { - const int bufferSize = ByteArrayUtils::readUint32AndAdvancePosition( - buffer.data(), &position); - buffers.push_back(buffer.subView(position, bufferSize)); - position += bufferSize; - if (bufferSize < 0 || position < 0 || position > static_cast(buffer.size())) { - AKLOGE("The dict body file is corrupted."); - return Ver4DictBuffersPtr(nullptr); - } - } - if (buffers.size() != Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE) { - AKLOGE("The dict body file is corrupted."); - return Ver4DictBuffersPtr(nullptr); - } - return Ver4DictBuffersPtr(new Ver4DictBuffers(std::move(headerBuffer), std::move(bodyBuffer), - formatVersion, buffers)); -} - -bool 
Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath, - const BufferWithExtendableBuffer *const headerBuffer) const { - // Create temporary directory. - const int tmpDirPathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictDirPath, - DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE); - char tmpDirPath[tmpDirPathBufSize]; - FileUtils::getFilePathWithSuffix(dictDirPath, - DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE, tmpDirPathBufSize, - tmpDirPath); - if (FileUtils::existsDir(tmpDirPath)) { - if (!FileUtils::removeDirAndFiles(tmpDirPath)) { - AKLOGE("Existing directory %s cannot be removed.", tmpDirPath); - ASSERT(false); - return false; - } - } - umask(S_IWGRP | S_IWOTH); - if (mkdir(tmpDirPath, S_IRWXU) == -1) { - AKLOGE("Cannot create directory: %s. errno: %d.", tmpDirPath, errno); - return false; - } - // Get dictionary base path. - const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */; - char dictName[dictNameBufSize]; - FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName); - const int dictPathBufSize = FileUtils::getFilePathBufSize(tmpDirPath, dictName); - char dictPath[dictPathBufSize]; - FileUtils::getFilePath(tmpDirPath, dictName, dictPathBufSize, dictPath); - - // Write header file. - if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, - Ver4DictConstants::HEADER_FILE_EXTENSION, headerBuffer)) { - AKLOGE("Dictionary header file %s%s cannot be written.", tmpDirPath, - Ver4DictConstants::HEADER_FILE_EXTENSION); - return false; - } - - // Write body file. 
- const int bodyFilePathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictPath, - Ver4DictConstants::BODY_FILE_EXTENSION); - char bodyFilePath[bodyFilePathBufSize]; - FileUtils::getFilePathWithSuffix(dictPath, Ver4DictConstants::BODY_FILE_EXTENSION, - bodyFilePathBufSize, bodyFilePath); - - const int fd = open(bodyFilePath, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); - if (fd == -1) { - AKLOGE("File %s cannot be opened. errno: %d", bodyFilePath, errno); - ASSERT(false); - return false; - } - FILE *const file = fdopen(fd, "wb"); - if (!file) { - AKLOGE("fdopen failed for the file %s. errno: %d", bodyFilePath, errno); - ASSERT(false); - return false; - } - - if (!flushDictBuffers(file)) { - fclose(file); - return false; - } - fclose(file); - // Remove existing dictionary. - if (!FileUtils::removeDirAndFiles(dictDirPath)) { - AKLOGE("Existing directory %s cannot be removed.", dictDirPath); - ASSERT(false); - return false; - } - // Rename temporary directory. - if (rename(tmpDirPath, dictDirPath) != 0) { - AKLOGE("%s cannot be renamed to %s", tmpDirPath, dictDirPath); - ASSERT(false); - return false; - } - return true; -} - -bool Ver4DictBuffers::flushDictBuffers(FILE *const file) const { - // Write trie. - if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableTrieBuffer)) { - AKLOGE("Trie cannot be written."); - return false; - } - // Write terminal position lookup table. - if (!mTerminalPositionLookupTable.flushToFile(file)) { - AKLOGE("Terminal position lookup table cannot be written."); - return false; - } - // Write language model content. - if (!mLanguageModelDictContent.save(file)) { - AKLOGE("Language model dict content cannot be written."); - return false; - } - // Write shortcut dict content. 
- if (!mShortcutDictContent.flushToFile(file)) { - AKLOGE("Shortcut dict content cannot be written."); - return false; - } - return true; -} - -Ver4DictBuffers::Ver4DictBuffers(MmappedBuffer::MmappedBufferPtr &&headerBuffer, - MmappedBuffer::MmappedBufferPtr &&bodyBuffer, - const FormatUtils::FORMAT_VERSION formatVersion, - const std::vector &contentBuffers) - : mHeaderBuffer(std::move(headerBuffer)), mDictBuffer(std::move(bodyBuffer)), - mHeaderPolicy(mHeaderBuffer->getReadOnlyByteArrayView().data(), formatVersion), - mExpandableHeaderBuffer(mHeaderBuffer->getReadWriteByteArrayView(), - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mExpandableTrieBuffer(contentBuffers[Ver4DictConstants::TRIE_BUFFER_INDEX], - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mTerminalPositionLookupTable( - contentBuffers[Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX]), - mLanguageModelDictContent(&contentBuffers[Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX], - mHeaderPolicy.hasHistoricalInfoOfWords()), - mShortcutDictContent(&contentBuffers[Ver4DictConstants::SHORTCUT_BUFFERS_INDEX]), - mIsUpdatable(mDictBuffer->isUpdatable()) {} - -Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize) - : mHeaderBuffer(nullptr), mDictBuffer(nullptr), mHeaderPolicy(headerPolicy), - mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), - mExpandableTrieBuffer(maxTrieSize), mTerminalPositionLookupTable(), - mLanguageModelDictContent(headerPolicy->hasHistoricalInfoOfWords()), - mShortcutDictContent(), mIsUpdatable(true) {} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h deleted file mode 100644 index 5407525af..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * 
Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_VER4_DICT_BUFFER_H -#define LATINIME_VER4_DICT_BUFFER_H - -#include -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" - -namespace latinime { - -class Ver4DictBuffers { - public: - typedef std::unique_ptr Ver4DictBuffersPtr; - - static Ver4DictBuffersPtr openVer4DictBuffers(const char *const dictDirPath, - MmappedBuffer::MmappedBufferPtr &&headerBuffer, - const FormatUtils::FORMAT_VERSION formatVersion); - - static AK_FORCE_INLINE Ver4DictBuffersPtr createVer4DictBuffers( - const HeaderPolicy *const headerPolicy, const int maxTrieSize) { - return Ver4DictBuffersPtr(new Ver4DictBuffers(headerPolicy, maxTrieSize)); - } - - AK_FORCE_INLINE bool isValid() const { - return mHeaderBuffer && mDictBuffer && mHeaderPolicy.isValid(); - } - - AK_FORCE_INLINE bool isNearSizeLimit() const { - return 
mExpandableTrieBuffer.isNearSizeLimit() - || mTerminalPositionLookupTable.isNearSizeLimit() - || mLanguageModelDictContent.isNearSizeLimit() - || mShortcutDictContent.isNearSizeLimit(); - } - - AK_FORCE_INLINE const HeaderPolicy *getHeaderPolicy() const { - return &mHeaderPolicy; - } - - AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableHeaderBuffer() { - return &mExpandableHeaderBuffer; - } - - AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableTrieBuffer() { - return &mExpandableTrieBuffer; - } - - AK_FORCE_INLINE const BufferWithExtendableBuffer *getTrieBuffer() const { - return &mExpandableTrieBuffer; - } - - AK_FORCE_INLINE TerminalPositionLookupTable *getMutableTerminalPositionLookupTable() { - return &mTerminalPositionLookupTable; - } - - AK_FORCE_INLINE const TerminalPositionLookupTable *getTerminalPositionLookupTable() const { - return &mTerminalPositionLookupTable; - } - - AK_FORCE_INLINE LanguageModelDictContent *getMutableLanguageModelDictContent() { - return &mLanguageModelDictContent; - } - - AK_FORCE_INLINE const LanguageModelDictContent *getLanguageModelDictContent() const { - return &mLanguageModelDictContent; - } - - AK_FORCE_INLINE ShortcutDictContent *getMutableShortcutDictContent() { - return &mShortcutDictContent; - } - - AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const { - return &mShortcutDictContent; - } - - AK_FORCE_INLINE bool isUpdatable() const { - return mIsUpdatable; - } - - bool flush(const char *const dictDirPath) const { - return flushHeaderAndDictBuffers(dictDirPath, &mExpandableHeaderBuffer); - } - - bool flushHeaderAndDictBuffers(const char *const dictDirPath, - const BufferWithExtendableBuffer *const headerBuffer) const; - - private: - DISALLOW_COPY_AND_ASSIGN(Ver4DictBuffers); - - Ver4DictBuffers(MmappedBuffer::MmappedBufferPtr &&headerBuffer, - MmappedBuffer::MmappedBufferPtr &&bodyBuffer, - const FormatUtils::FORMAT_VERSION formatVersion, - const std::vector &contentBuffers); - - 
Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize); - - bool flushDictBuffers(FILE *const file) const; - - const MmappedBuffer::MmappedBufferPtr mHeaderBuffer; - const MmappedBuffer::MmappedBufferPtr mDictBuffer; - const HeaderPolicy mHeaderPolicy; - BufferWithExtendableBuffer mExpandableHeaderBuffer; - BufferWithExtendableBuffer mExpandableTrieBuffer; - TerminalPositionLookupTable mTerminalPositionLookupTable; - LanguageModelDictContent mLanguageModelDictContent; - ShortcutDictContent mShortcutDictContent; - const int mIsUpdatable; -}; -} // namespace latinime -#endif /* LATINIME_VER4_DICT_BUFFER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp deleted file mode 100644 index bd89b8da7..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" - -namespace latinime { - -const char *const Ver4DictConstants::BODY_FILE_EXTENSION = ".body"; -const char *const Ver4DictConstants::HEADER_FILE_EXTENSION = ".header"; - -// Version 4 dictionary size is implicitly limited to 8MB due to 3-byte offsets. 
-const int Ver4DictConstants::MAX_DICTIONARY_SIZE = 8 * 1024 * 1024; -// Extended region size, which is not GCed region size in dict file + additional buffer size, is -// limited to 1MB to prevent from inefficient traversing. -const int Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE = 1 * 1024 * 1024; - -// NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT for Trie and TerminalAddressLookupTable. -// NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT for language model. -// NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT for shortcut. -const size_t Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE = - NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT * 2 - + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT - + NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT; -const int Ver4DictConstants::TRIE_BUFFER_INDEX = 0; -const int Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX = - TRIE_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT; -const int Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX = - TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT; -const int Ver4DictConstants::SHORTCUT_BUFFERS_INDEX = - LANGUAGE_MODEL_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT; - -const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1; -const int Ver4DictConstants::PROBABILITY_SIZE = 1; -const int Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE = 1; -const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; -const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0; -const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4; -const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4; -const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 0; -const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 2; - -const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1; -const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2; -const uint8_t Ver4DictConstants::FLAG_NOT_A_WORD = 0x4; -const uint8_t Ver4DictConstants::FLAG_BLACKLISTED = 
0x8; -const uint8_t Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE = 0x10; - -const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; -const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4; - -const int Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1; -const int Ver4DictConstants::SHORTCUT_PROBABILITY_MASK = 0x0F; -const int Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK = 0x80; - -const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT = 1; -const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT = 3; -const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT = 2; - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h deleted file mode 100644 index 13d7a5714..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_VER4_DICT_CONSTANTS_H -#define LATINIME_VER4_DICT_CONSTANTS_H - -#include "defines.h" - -#include -#include - -namespace latinime { - -// TODO: Create PtConstants under the pt_common and move some constant values there. -// Note that there are corresponding definitions in FormatSpec.java. 
-class Ver4DictConstants { - public: - static const char *const BODY_FILE_EXTENSION; - static const char *const HEADER_FILE_EXTENSION; - static const int MAX_DICTIONARY_SIZE; - static const int MAX_DICT_EXTENDED_REGION_SIZE; - - static const size_t NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE; - static const int TRIE_BUFFER_INDEX; - static const int TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX; - static const int LANGUAGE_MODEL_BUFFER_INDEX; - static const int BIGRAM_BUFFERS_INDEX; - static const int SHORTCUT_BUFFERS_INDEX; - - static const int NOT_A_TERMINAL_ID; - static const int PROBABILITY_SIZE; - static const int FLAGS_IN_LANGUAGE_MODEL_SIZE; - static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; - static const int NOT_A_TERMINAL_ADDRESS; - static const int TERMINAL_ID_FIELD_SIZE; - static const int TIME_STAMP_FIELD_SIZE; - // TODO: Remove - static const int WORD_LEVEL_FIELD_SIZE; - static const int WORD_COUNT_FIELD_SIZE; - // Flags in probability entry. - static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE; - static const uint8_t FLAG_NOT_A_VALID_ENTRY; - static const uint8_t FLAG_NOT_A_WORD; - static const uint8_t FLAG_BLACKLISTED; - static const uint8_t FLAG_POSSIBLY_OFFENSIVE; - - static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE; - static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE; - - static const int SHORTCUT_FLAGS_FIELD_SIZE; - static const int SHORTCUT_PROBABILITY_MASK; - static const int SHORTCUT_HAS_NEXT_MASK; - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4DictConstants); - - static const size_t NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT; - static const size_t NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT; - static const size_t NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT; -}; -} // namespace latinime -#endif /* LATINIME_VER4_DICT_CONSTANTS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp deleted 
file mode 100644 index 4110d6036..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" - -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" - -namespace latinime { - -const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode( - const int ptNodePos, const int siblingNodePos) const { - if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) { - // Reading invalid position because of bug or broken dictionary. 
- AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d", - ptNodePos, mBuffer->getTailPosition()); - ASSERT(false); - return PtNodeParams(); - } - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos); - const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); - int pos = ptNodePos; - const int headPos = ptNodePos; - if (usesAdditionalBuffer) { - pos -= mBuffer->getOriginalBufferSize(); - } - const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); - const int parentPosOffset = - DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( - dictBuf, &pos); - const int parentPos = - DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); - int codePoints[MAX_WORD_LENGTH]; - // Code point table is not used for ver4 dictionaries. - const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( - dictBuf, flags, MAX_WORD_LENGTH, nullptr /* codePointTable */, codePoints, &pos); - int terminalIdFieldPos = NOT_A_DICT_POS; - int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; - if (PatriciaTrieReadingUtils::isTerminal(flags)) { - terminalIdFieldPos = pos; - if (usesAdditionalBuffer) { - terminalIdFieldPos += mBuffer->getOriginalBufferSize(); - } - terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos); - } - int childrenPosFieldPos = pos; - if (usesAdditionalBuffer) { - childrenPosFieldPos += mBuffer->getOriginalBufferSize(); - } - int childrenPos = DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition( - dictBuf, &pos); - if (usesAdditionalBuffer && childrenPos != NOT_A_DICT_POS) { - childrenPos += mBuffer->getOriginalBufferSize(); - } - if (usesAdditionalBuffer) { - pos += mBuffer->getOriginalBufferSize(); - } - // Sibling position is the tail position of original PtNode. - int newSiblingNodePos = (siblingNodePos == NOT_A_DICT_POS) ? 
pos : siblingNodePos; - // Read destination node if the read node is a moved node. - if (DynamicPtReadingUtils::isMoved(flags)) { - // The destination position is stored at the same place as the parent position. - return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos); - } else { - return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints, - terminalIdFieldPos, terminalId, NOT_A_PROBABILITY, childrenPosFieldPos, childrenPos, - newSiblingNodePos); - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h deleted file mode 100644 index f4df544e2..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H -#define LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" - -namespace latinime { - -class BufferWithExtendableBuffer; -class HeaderPolicy; -class LanguageModelDictContent; - -/* - * This class is used for helping to read nodes of ver4 patricia trie. This class handles moved - * node and reads node attributes. - */ -class Ver4PatriciaTrieNodeReader : public PtNodeReader { - public: - explicit Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer) - : mBuffer(buffer) {} - - ~Ver4PatriciaTrieNodeReader() {} - - virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const { - return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos, - NOT_A_DICT_POS /* siblingNodePos */); - } - - private: - DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader); - - const BufferWithExtendableBuffer *const mBuffer; - - const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos, - const int siblingNodePos) const; -}; -} // namespace latinime -#endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp deleted file mode 100644 index 3488f7d2a..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" - -#include "suggest/core/dictionary/property/unigram_property.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" -#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" - -namespace latinime { - -const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3; - -bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted( - const PtNodeParams *const toBeUpdatedPtNodeParams) { - int pos = toBeUpdatedPtNodeParams->getHeadPos(); - const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); - const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - pos -= mTrieBuffer->getOriginalBufferSize(); - } - // Read original flags - const PatriciaTrieReadingUtils::NodeFlags originalFlags = - 
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); - const PatriciaTrieReadingUtils::NodeFlags updatedFlags = - DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, - true /* isDeleted */, false /* willBecomeNonTerminal */); - int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); - // Update flags. - if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, - &writingPos)) { - return false; - } - if (toBeUpdatedPtNodeParams->isTerminal()) { - // The PtNode is a terminal. Delete entry from the terminal position lookup table. - return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( - toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */); - } else { - return true; - } -} - -// TODO: Quit using bigramLinkedNodePos. -bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved( - const PtNodeParams *const toBeUpdatedPtNodeParams, - const int movedPos, const int bigramLinkedNodePos) { - int pos = toBeUpdatedPtNodeParams->getHeadPos(); - const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); - const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - pos -= mTrieBuffer->getOriginalBufferSize(); - } - // Read original flags - const PatriciaTrieReadingUtils::NodeFlags originalFlags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); - const PatriciaTrieReadingUtils::NodeFlags updatedFlags = - DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */, - false /* isDeleted */, false /* willBecomeNonTerminal */); - int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); - // Update flags. - if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, - &writingPos)) { - return false; - } - // Update moved position, which is stored in the parent offset field. 
- if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( - mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { - return false; - } - if (toBeUpdatedPtNodeParams->hasChildren()) { - // Update children's parent position. - mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos()); - while (!mReadingHelper.isEnd()) { - const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams()); - int parentOffsetFieldPos = childPtNodeParams.getHeadPos() - + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; - if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( - mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(), - &parentOffsetFieldPos)) { - // Parent offset cannot be written because of a bug or a broken dictionary; thus, - // we give up to update dictionary. - return false; - } - mReadingHelper.readNextSiblingNode(childPtNodeParams); - } - } - return true; -} - -bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal( - const PtNodeParams *const toBeUpdatedPtNodeParams) { - int pos = toBeUpdatedPtNodeParams->getHeadPos(); - const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); - const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); - if (usesAdditionalBuffer) { - pos -= mTrieBuffer->getOriginalBufferSize(); - } - // Read original flags - const PatriciaTrieReadingUtils::NodeFlags originalFlags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); - const PatriciaTrieReadingUtils::NodeFlags updatedFlags = - DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, - false /* isDeleted */, true /* willBecomeNonTerminal */); - if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( - toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) { - AKLOGE("Cannot update terminal position lookup table. 
terminal id: %d", - toBeUpdatedPtNodeParams->getTerminalId()); - return false; - } - // Update flags. - int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); - return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, - &writingPos); -} - -bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty( - const PtNodeParams *const toBeUpdatedPtNodeParams, - const UnigramProperty *const unigramProperty) { - // Update probability and historical information. - // TODO: Update other information in the unigram property. - if (!toBeUpdatedPtNodeParams->isTerminal()) { - return false; - } - const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty); - return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry( - toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntryOfUnigramProperty); -} - -bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( - const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) { - if (!toBeUpdatedPtNodeParams->isTerminal()) { - AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode."); - return false; - } - const ProbabilityEntry originalProbabilityEntry = - mBuffers->getLanguageModelDictContent()->getProbabilityEntry( - toBeUpdatedPtNodeParams->getTerminalId()); - if (originalProbabilityEntry.isValid()) { - *outNeedsToKeepPtNode = true; - return true; - } - if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { - AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); - return false; - } - *outNeedsToKeepPtNode = false; - return true; -} - -bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition( - const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) { - int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos(); - return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, - 
newChildrenPosition, &childrenPosFieldPos); -} - -bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int newTerminalId) { - return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, - toBeUpdatedPtNodeParams->getTerminalIdFieldPos()); -} - -bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition( - const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) { - return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */, - ptNodeWritingPos); -} - -bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( - const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty, - int *const ptNodeWritingPos) { - int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; - if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId, - ptNodeWritingPos)) { - return false; - } - // Write probability. - ProbabilityEntry newProbabilityEntry; - const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty); - return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry( - terminalId, &probabilityEntryOfUnigramProperty); -} - -// TODO: Support counting ngram entries. -bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, - const NgramProperty *const ngramProperty, bool *const outAddedNewBigram) { - LanguageModelDictContent *const languageModelDictContent = - mBuffers->getMutableLanguageModelDictContent(); - const ProbabilityEntry probabilityEntry = - languageModelDictContent->getNgramProbabilityEntry(prevWordIds, wordId); - const ProbabilityEntry probabilityEntryOfNgramProperty(ngramProperty); - if (!languageModelDictContent->setNgramProbabilityEntry( - prevWordIds, wordId, &probabilityEntryOfNgramProperty)) { - AKLOGE("Cannot add new ngram entry. 
prevWordId[0]: %d, prevWordId.size(): %zd, wordId: %d", - prevWordIds[0], prevWordIds.size(), wordId); - return false; - } - if (!probabilityEntry.isValid() && outAddedNewBigram) { - *outAddedNewBigram = true; - } - return true; -} - -bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds, - const int wordId) { - LanguageModelDictContent *const languageModelDictContent = - mBuffers->getMutableLanguageModelDictContent(); - return languageModelDictContent->removeNgramProbabilityEntry(prevWordIds, wordId); -} - -// TODO: Remove when we stop supporting v402 format. -bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries( - const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) { - // Do nothing. - return true; -} - -bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields( - const PtNodeParams *const toBeUpdatedPtNodeParams, - const DictPositionRelocationMap *const dictPositionRelocationMap, - int *const outBigramEntryCount) { - int parentPos = toBeUpdatedPtNodeParams->getParentPos(); - if (parentPos != NOT_A_DICT_POS) { - PtNodeWriter::PtNodePositionRelocationMap::const_iterator it = - dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos); - if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) { - parentPos = it->second; - } - } - int writingPos = toBeUpdatedPtNodeParams->getHeadPos() - + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; - // Write updated parent offset. - if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, - parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { - return false; - } - - // Updates children position. 
- int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos(); - if (childrenPos != NOT_A_DICT_POS) { - PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it = - dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos); - if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) { - childrenPos = it->second; - } - } - if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) { - return false; - } - return true; -} - -bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams, - const int *const targetCodePoints, const int targetCodePointCount, - const int shortcutProbability) { - if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(), - targetCodePoints, targetCodePointCount, shortcutProbability)) { - AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId()); - return false; - } - return true; -} - -bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( - const PtNodeParams *const ptNodeParams, int *const outTerminalId, - int *const ptNodeWritingPos) { - const int nodePos = *ptNodeWritingPos; - // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the - // PtNode writing. - if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, - 0 /* nodeFlags */, ptNodeWritingPos)) { - return false; - } - // Calculate a parent offset and write the offset. 
- if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, - ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) { - return false; - } - // Write code points - if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer, - ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) { - return false; - } - int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; - if (!ptNodeParams->willBecomeNonTerminal()) { - if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) { - terminalId = ptNodeParams->getTerminalId(); - } else if (ptNodeParams->isTerminal()) { - // Write terminal information using a new terminal id. - // Get a new unused terminal id. - terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId(); - } - } - const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID; - if (isTerminal) { - // Update the lookup table. - if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( - terminalId, nodePos)) { - return false; - } - // Write terminal Id. - if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId, - Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) { - return false; - } - if (outTerminalId) { - *outTerminalId = terminalId; - } - } - // Write children position - if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, - ptNodeParams->getChildrenPos(), ptNodeWritingPos)) { - return false; - } - return updatePtNodeFlags(nodePos, isTerminal, - ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); -} - -bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, const bool isTerminal, - const bool hasMultipleChars) { - // Create node flags and write them. 
- PatriciaTrieReadingUtils::NodeFlags nodeFlags = - PatriciaTrieReadingUtils::createAndGetFlags(false /* isNotAWord */, - false /* isPossiblyOffensive */, isTerminal, false /* hasShortcutTargets */, - false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE); - if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) { - AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos); - return false; - } - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h deleted file mode 100644 index 4ecf88729..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H -#define LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" - -namespace latinime { - -class BufferWithExtendableBuffer; -class HeaderPolicy; -class Ver4DictBuffers; -class Ver4PatriciaTrieNodeReader; -class Ver4PtNodeArrayReader; -class Ver4ShortcutListPolicy; - -/* - * This class is used for helping to writes nodes of ver4 patricia trie. - */ -class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { - public: - Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer, - Ver4DictBuffers *const buffers, const PtNodeReader *const ptNodeReader, - const PtNodeArrayReader *const ptNodeArrayReader, - Ver4ShortcutListPolicy *const shortcutPolicy) - : mTrieBuffer(trieBuffer), mBuffers(buffers), - mReadingHelper(ptNodeReader, ptNodeArrayReader), mShortcutPolicy(shortcutPolicy) {} - - virtual ~Ver4PatriciaTrieNodeWriter() {} - - virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams); - - virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int movedPos, const int bigramLinkedNodePos); - - virtual bool markPtNodeAsWillBecomeNonTerminal( - const PtNodeParams *const toBeUpdatedPtNodeParams); - - virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams, - const UnigramProperty *const unigramProperty); - - virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( - const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode); - - virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int 
newChildrenPosition); - - bool updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams, - const int newTerminalId); - - virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, - int *const ptNodeWritingPos); - - virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, - const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos); - - virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, - const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); - - virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId); - - virtual bool updateAllBigramEntriesAndDeleteUselessEntries( - const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount); - - virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams, - const DictPositionRelocationMap *const dictPositionRelocationMap, - int *const outBigramEntryCount); - - virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams, - const int *const targetCodePoints, const int targetCodePointCount, - const int shortcutProbability); - - private: - DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter); - - bool writePtNodeAndGetTerminalIdAndAdvancePosition( - const PtNodeParams *const ptNodeParams, int *const outTerminalId, - int *const ptNodeWritingPos); - - bool updatePtNodeFlags(const int ptNodePos, const bool isTerminal, const bool hasMultipleChars); - - static const int CHILDREN_POSITION_FIELD_SIZE; - - BufferWithExtendableBuffer *const mTrieBuffer; - Ver4DictBuffers *const mBuffers; - DynamicPtReadingHelper mReadingHelper; - Ver4ShortcutListPolicy *const mShortcutPolicy; -}; -} // namespace latinime -#endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp 
b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp deleted file mode 100644 index a96719533..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ /dev/null @@ -1,603 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h" - -#include -#include - -#include "suggest/core/dicnode/dic_node.h" -#include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/multi_bigram_map.h" -#include "suggest/core/dictionary/ngram_listener.h" -#include "suggest/core/dictionary/property/ngram_property.h" -#include "suggest/core/dictionary/property/unigram_property.h" -#include "suggest/core/dictionary/property/word_property.h" -#include "suggest/core/session/ngram_context.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" -#include "utils/ngram_utils.h" - -namespace latinime { - -// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and -// BinaryDictionaryDecayingTests. 
-const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; -const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; -const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; -const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; -const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; -const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = - Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; - -void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, - DicNodeVector *const childDicNodes) const { - if (!dicNode->hasChildren()) { - return; - } - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); - while (!readingHelper.isEnd()) { - const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); - if (!ptNodeParams.isValid()) { - break; - } - const bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); - const int wordId = isTerminal ? 
ptNodeParams.getTerminalId() : NOT_A_WORD_ID; - childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(), - wordId, ptNodeParams.getCodePointArrayView()); - readingHelper.readNextSiblingNode(ptNodeParams); - } - if (readingHelper.isError()) { - mIsCorrupted = true; - AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); - } -} - -int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, - const int maxCodePointCount, int *const outCodePoints) const { - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - const int ptNodePos = - mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); - readingHelper.initWithPtNodePos(ptNodePos); - const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount( - maxCodePointCount, outCodePoints); - if (readingHelper.isError()) { - mIsCorrupted = true; - AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); - } - return codePointCount; -} - -int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, - const bool forceLowerCaseSearch) const { - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(getRootPosition()); - const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), - wordCodePoints.size(), forceLowerCaseSearch); - if (readingHelper.isError()) { - mIsCorrupted = true; - AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); - } - if (ptNodePos == NOT_A_DICT_POS) { - return NOT_A_WORD_ID; - } - const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - if (ptNodeParams.isDeleted()) { - return NOT_A_WORD_ID; - } - return ptNodeParams.getTerminalId(); -} - -const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( - const WordIdArrayView prevWordIds, const int wordId, - MultiBigramMap *const 
multiBigramMap) const { - if (wordId == NOT_A_WORD_ID) { - return WordAttributes(); - } - return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, - false /* mustMatchAllPrevWords */, mHeaderPolicy); -} - -int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, - const int wordId) const { - if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) { - return NOT_A_PROBABILITY; - } - const WordAttributes wordAttributes = - mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, - true /* mustMatchAllPrevWords */, mHeaderPolicy); - if (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()) { - return NOT_A_PROBABILITY; - } - return wordAttributes.getProbability(); -} - -BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator( - const int wordId) const { - const int shortcutPos = getShortcutPositionOfWord(wordId); - return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos); -} - -void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, - NgramListener *const listener) const { - if (prevWordIds.empty()) { - return; - } - const auto languageModelDictContent = mBuffers->getLanguageModelDictContent(); - for (size_t i = 1; i <= prevWordIds.size(); ++i) { - for (const auto entry : languageModelDictContent->getProbabilityEntries( - prevWordIds.limit(i))) { - const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry(); - if (!probabilityEntry.isValid()) { - continue; - } - int probability = NOT_A_PROBABILITY; - if (probabilityEntry.hasHistoricalInfo()) { - // TODO: Quit checking count here. - // If count <= 1, the word can be an invaild word. The actual probability should - // be checked using getWordAttributesInContext() in onVisitEntry(). - probability = probabilityEntry.getHistoricalInfo()->getCount() <= 1 ? 
- NOT_A_PROBABILITY : 0; - } else { - probability = probabilityEntry.getProbability(); - } - listener->onVisitEntry(probability, entry.getWordId()); - } - } -} - -int Ver4PatriciaTriePolicy::getShortcutPositionOfWord(const int wordId) const { - if (wordId == NOT_A_WORD_ID) { - return NOT_A_DICT_POS; - } - const int ptNodePos = - mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); - const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); - if (ptNodeParams.isDeleted()) { - return NOT_A_DICT_POS; - } - return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( - ptNodeParams.getTerminalId()); -} - -bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints, - const UnigramProperty *const unigramProperty) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); - return false; - } - if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { - AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", - mDictBuffer->getTailPosition()); - return false; - } - if (wordCodePoints.size() > MAX_WORD_LENGTH) { - AKLOGE("The word is too long to insert to the dictionary, length: %zd", - wordCodePoints.size()); - return false; - } - for (const auto &shortcut : unigramProperty->getShortcuts()) { - if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { - AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", - shortcut.getTargetCodePoints()->size()); - return false; - } - } - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(getRootPosition()); - bool addedNewUnigram = false; - int codePointsToAdd[MAX_WORD_LENGTH]; - int codePointCountToAdd = wordCodePoints.size(); - memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd); - if (unigramProperty->representsBeginningOfSentence()) { - codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, - codePointCountToAdd, MAX_WORD_LENGTH); - } - if (codePointCountToAdd <= 0) { - return false; - } - const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd); - if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, - &addedNewUnigram)) { - if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { - mEntryCounters.incrementNgramCount(NgramType::Unigram); - } - if (unigramProperty->getShortcuts().size() > 0) { - // Add shortcut target. 
- const int wordId = getWordId(codePointArrayView, false /* forceLowerCaseSearch */); - if (wordId == NOT_A_WORD_ID) { - AKLOGE("Cannot find word id to add shortcut target."); - return false; - } - const int wordPos = - mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); - for (const auto &shortcut : unigramProperty->getShortcuts()) { - if (!mUpdatingHelper.addShortcutTarget(wordPos, - CodePointArrayView(*shortcut.getTargetCodePoints()), - shortcut.getProbability())) { - AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " - "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), - shortcut.getProbability()); - return false; - } - } - } - return true; - } else { - return false; - } -} - -bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); - return false; - } - const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); - if (wordId == NOT_A_WORD_ID) { - return false; - } - const int ptNodePos = - mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); - const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) { - AKLOGE("Cannot remove unigram. 
ptNodePos: %d", ptNodePos); - return false; - } - if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry(wordId)) { - return false; - } - if (!ptNodeParams.representsNonWordInfo()) { - mEntryCounters.decrementNgramCount(NgramType::Unigram); - } - return true; -} - -bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); - return false; - } - if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { - AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", - mDictBuffer->getTailPosition()); - return false; - } - const NgramContext *const ngramContext = ngramProperty->getNgramContext(); - if (!ngramContext->isValid()) { - AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary."); - return false; - } - if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { - AKLOGE("The word is too long to insert the ngram to the dictionary. 
" - "length: %zd", ngramProperty->getTargetCodePoints()->size()); - return false; - } - WordIdArray prevWordIdArray; - const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, - false /* tryLowerCaseSearch */); - if (prevWordIds.empty()) { - return false; - } - for (size_t i = 0; i < prevWordIds.size(); ++i) { - if (prevWordIds[i] != NOT_A_WORD_ID) { - continue; - } - if (!ngramContext->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) { - return false; - } - const UnigramProperty beginningOfSentenceUnigramProperty( - true /* representsBeginningOfSentence */, true /* isNotAWord */, - false /* isBlacklisted */, false /* isPossiblyOffensive */, - MAX_PROBABILITY /* probability */, HistoricalInfo()); - if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), - &beginningOfSentenceUnigramProperty)) { - AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); - return false; - } - // Refresh word ids. - ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); - } - const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()), - false /* forceLowerCaseSearch */); - if (wordId == NOT_A_WORD_ID) { - return false; - } - bool addedNewEntry = false; - if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) { - if (addedNewEntry) { - mEntryCounters.incrementNgramCount( - NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); - } - return true; - } else { - return false; - } -} - -bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext, - const CodePointArrayView wordCodePoints) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); - return false; - } - if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { - AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", - mDictBuffer->getTailPosition()); - return false; - } - if (!ngramContext->isValid()) { - AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary."); - return false; - } - if (wordCodePoints.size() > MAX_WORD_LENGTH) { - AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd", - wordCodePoints.size()); - } - WordIdArray prevWordIdArray; - const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, - false /* tryLowerCaseSerch */); - if (prevWordIds.empty() || prevWordIds.contains(NOT_A_WORD_ID)) { - return false; - } - const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); - if (wordId == NOT_A_WORD_ID) { - return false; - } - if (mNodeWriter.removeNgramEntry(prevWordIds, wordId)) { - mEntryCounters.decrementNgramCount( - NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); - return true; - } else { - return false; - } -} - -bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( - const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints, - const bool isValidWord, const HistoricalInfo historicalInfo) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " - "dictionary."); - return false; - } - const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ? - false : isValidWord; - int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); - if (wordId == NOT_A_WORD_ID) { - // The word is not in the dictionary. 
- const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, - false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */, - NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, - 0 /* count */)); - if (!addUnigramEntry(wordCodePoints, &unigramProperty)) { - AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext()."); - return false; - } - if (!isValidWord) { - return true; - } - wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); - } - - WordIdArray prevWordIdArray; - const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, - false /* tryLowerCaseSearch */); - if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { - if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { - const UnigramProperty beginningOfSentenceUnigramProperty( - true /* representsBeginningOfSentence */, - true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, - HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); - if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), - &beginningOfSentenceUnigramProperty)) { - AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext()."); - return false; - } - // Refresh word ids. - ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); - } - // Update entries for beginning of sentence. 
- if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord( - prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isVaild */, historicalInfo, - mHeaderPolicy, &mEntryCounters)) { - return false; - } - } - if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds, - wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) { - return false; - } - return true; -} - -bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); - return false; - } - if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) { - AKLOGE("Cannot flush the dictionary to file."); - mIsCorrupted = true; - return false; - } - return true; -} - -bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); - return false; - } - if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { - AKLOGE("Cannot flush the dictionary to file with GC."); - mIsCorrupted = true; - return false; - } - return true; -} - -bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); - return false; - } - if (mBuffers->isNearSizeLimit()) { - // Additional buffer size is near the limit. - return true; - } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() - > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { - // Total extended region size of the trie exceeds the limit. - return true; - } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS - && mDictBuffer->getUsedAdditionalBufferSize() > 0) { - // Needs to reduce dictionary size. 
- return true; - } else if (mHeaderPolicy->isDecayingDict()) { - return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), - mHeaderPolicy); - } - return false; -} - -void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, - char *const outResult, const int maxResultLength) { - const int compareLength = queryLength + 1 /* terminator */; - if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", - mEntryCounters.getNgramCount(NgramType::Unigram)); - } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram)); - } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", - mHeaderPolicy->isDecayingDict() ? - ForgettingCurveUtils::getEntryCountHardLimit( - mHeaderPolicy->getMaxNgramCounts().getNgramCount( - NgramType::Unigram)) : - static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); - } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", - mHeaderPolicy->isDecayingDict() ? - ForgettingCurveUtils::getEntryCountHardLimit( - mHeaderPolicy->getMaxNgramCounts().getNgramCount( - NgramType::Bigram)) : - static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); - } -} - -const WordProperty Ver4PatriciaTriePolicy::getWordProperty( - const CodePointArrayView wordCodePoints) const { - const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); - if (wordId == NOT_A_WORD_ID) { - AKLOGE("getWordProperty is called for invalid word."); - return WordProperty(); - } - const LanguageModelDictContent *const languageModelDictContent = - mBuffers->getLanguageModelDictContent(); - // Fetch ngram information. 
- std::vector ngrams; - int ngramTargetCodePoints[MAX_WORD_LENGTH]; - int ngramPrevWordsCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; - int ngramPrevWordsCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - bool ngramPrevWordIsBeginningOfSentense[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - for (const auto entry : languageModelDictContent->exportAllNgramEntriesRelatedToWord( - mHeaderPolicy, wordId)) { - const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getTargetWordId(), - MAX_WORD_LENGTH, ngramTargetCodePoints); - const WordIdArrayView prevWordIds = entry.getPrevWordIds(); - for (size_t i = 0; i < prevWordIds.size(); ++i) { - ngramPrevWordsCodePointCount[i] = getCodePointsAndReturnCodePointCount(prevWordIds[i], - MAX_WORD_LENGTH, ngramPrevWordsCodePoints[i]); - ngramPrevWordIsBeginningOfSentense[i] = languageModelDictContent->getProbabilityEntry( - prevWordIds[i]).representsBeginningOfSentence(); - if (ngramPrevWordIsBeginningOfSentense[i]) { - ngramPrevWordsCodePointCount[i] = CharUtils::removeBeginningOfSentenceMarker( - ngramPrevWordsCodePoints[i], ngramPrevWordsCodePointCount[i]); - } - } - const NgramContext ngramContext(ngramPrevWordsCodePoints, ngramPrevWordsCodePointCount, - ngramPrevWordIsBeginningOfSentense, prevWordIds.size()); - const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry(); - const HistoricalInfo *const historicalInfo = ngramProbabilityEntry.getHistoricalInfo(); - // TODO: Output flags in WordAttributes. - ngrams.emplace_back(ngramContext, - CodePointArrayView(ngramTargetCodePoints, codePointCount).toVector(), - entry.getWordAttributes().getProbability(), *historicalInfo); - } - // Fetch shortcut information. 
- std::vector shortcuts; - int shortcutPos = getShortcutPositionOfWord(wordId); - if (shortcutPos != NOT_A_DICT_POS) { - int shortcutTarget[MAX_WORD_LENGTH]; - const ShortcutDictContent *const shortcutDictContent = - mBuffers->getShortcutDictContent(); - bool hasNext = true; - while (hasNext) { - int shortcutTargetLength = 0; - int shortcutProbability = NOT_A_PROBABILITY; - shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, - &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); - shortcuts.emplace_back( - CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(), - shortcutProbability); - } - } - const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes( - WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy); - const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId); - const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); - const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(), - wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(), - wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(), - *historicalInfo, std::move(shortcuts)); - return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); -} - -int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, - int *const outCodePointCount) { - *outCodePointCount = 0; - if (token == 0) { - mTerminalPtNodePositionsForIteratingWords.clear(); - DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( - &mTerminalPtNodePositionsForIteratingWords); - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(getRootPosition()); - readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); - } - const int terminalPtNodePositionsVectorSize = - 
static_cast(mTerminalPtNodePositionsForIteratingWords.size()); - if (token < 0 || token >= terminalPtNodePositionsVectorSize) { - AKLOGE("Given token %d is invalid.", token); - return 0; - } - const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; - const PtNodeParams ptNodeParams = - mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos); - *outCodePointCount = getCodePointsAndReturnCodePointCount(ptNodeParams.getTerminalId(), - MAX_WORD_LENGTH, outCodePoints); - const int nextToken = token + 1; - if (nextToken >= terminalPtNodePositionsVectorSize) { - // All words have been iterated. - mTerminalPtNodePositionsForIteratingWords.clear(); - return 0; - } - return nextToken; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h deleted file mode 100644 index 93faa83a0..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H -#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H - -#include - -#include "defines.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" -#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/entry_counters.h" -#include "utils/int_array_view.h" - -namespace latinime { - -class DicNode; -class DicNodeVector; - -// Word id = Artificial id that is stored in the PtNode looked up by the word. 
-class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { - public: - Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers) - : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()), - mDictBuffer(mBuffers->getWritableTrieBuffer()), - mShortcutPolicy(mBuffers->getMutableShortcutDictContent(), - mBuffers->getTerminalPositionLookupTable()), - mNodeReader(mDictBuffer), mPtNodeArrayReader(mDictBuffer), - mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mPtNodeArrayReader, - &mShortcutPolicy), - mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), - mWritingHelper(mBuffers.get()), - mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()), - mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; - - AK_FORCE_INLINE int getRootPosition() const { - return 0; - } - - void createAndGetAllChildDicNodes(const DicNode *const dicNode, - DicNodeVector *const childDicNodes) const; - - int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, - int *const outCodePoints) const; - - int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; - - const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, - const int wordId, MultiBigramMap *const multiBigramMap) const; - - // TODO: Remove - int getProbability(const int unigramProbability, const int bigramProbability) const { - // Not used. 
- return NOT_A_PROBABILITY; - } - - int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; - - void iterateNgramEntries(const WordIdArrayView prevWordIds, - NgramListener *const listener) const; - - BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; - - const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { - return mHeaderPolicy; - } - - bool addUnigramEntry(const CodePointArrayView wordCodePoints, - const UnigramProperty *const unigramProperty); - - bool removeUnigramEntry(const CodePointArrayView wordCodePoints); - - bool addNgramEntry(const NgramProperty *const ngramProperty); - - bool removeNgramEntry(const NgramContext *const ngramContext, - const CodePointArrayView wordCodePoints); - - bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, - const CodePointArrayView wordCodePoints, const bool isValidWord, - const HistoricalInfo historicalInfo); - - bool flush(const char *const filePath); - - bool flushWithGC(const char *const filePath); - - bool needsToRunGC(const bool mindsBlockByGC) const; - - void getProperty(const char *const query, const int queryLength, char *const outResult, - const int maxResultLength); - - const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; - - int getNextWordAndNextToken(const int token, int *const outCodePoints, - int *const outCodePointCount); - - bool isCorrupted() const { - return mIsCorrupted; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy); - - static const char *const UNIGRAM_COUNT_QUERY; - static const char *const BIGRAM_COUNT_QUERY; - static const char *const MAX_UNIGRAM_COUNT_QUERY; - static const char *const MAX_BIGRAM_COUNT_QUERY; - // When the dictionary size is near the maximum size, we have to refuse dynamic operations to - // prevent the dictionary from overflowing. 
- static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; - static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; - - const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers; - const HeaderPolicy *const mHeaderPolicy; - BufferWithExtendableBuffer *const mDictBuffer; - Ver4ShortcutListPolicy mShortcutPolicy; - Ver4PatriciaTrieNodeReader mNodeReader; - Ver4PtNodeArrayReader mPtNodeArrayReader; - Ver4PatriciaTrieNodeWriter mNodeWriter; - DynamicPtUpdatingHelper mUpdatingHelper; - Ver4PatriciaTrieWritingHelper mWritingHelper; - MutableEntryCounters mEntryCounters; - std::vector mTerminalPtNodePositionsForIteratingWords; - mutable bool mIsCorrupted; - - int getShortcutPositionOfWord(const int wordId) const; -}; -} // namespace latinime -#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp deleted file mode 100644 index 254022db4..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" - -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" - -namespace latinime { - -/* static */ int Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition( - const uint8_t *const buffer, int *pos) { - return ByteArrayUtils::readUint32AndAdvancePosition(buffer, pos); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h deleted file mode 100644 index 466ff55d5..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H -#define LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H - -#include - -#include "defines.h" - -namespace latinime { - -class BufferWithExtendableBuffer; - -class Ver4PatriciaTrieReadingUtils { - public: - static int getTerminalIdAndAdvancePosition(const uint8_t *const buffer, - int *const pos); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieReadingUtils); -}; -} // namespace latinime -#endif /* LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp deleted file mode 100644 index 34af76c5d..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" - -#include -#include - -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/file_utils.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" -#include "utils/ngram_utils.h" - -namespace latinime { - -bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, - const EntryCounts &entryCounts) const { - const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); - BufferWithExtendableBuffer headerBuffer( - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); - const int extendedRegionSize = headerPolicy->getExtendedRegionSize() - + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); - if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, - entryCounts, extendedRegionSize, &headerBuffer)) { - AKLOGE("Cannot write header structure to buffer. 
" - "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, trigramCount: %d," - "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram), - entryCounts.getNgramCount(NgramType::Bigram), - entryCounts.getNgramCount(NgramType::Trigram), - extendedRegionSize); - return false; - } - return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); -} - -bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, - const char *const dictDirPath) { - const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); - Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( - Ver4DictBuffers::createVer4DictBuffers(headerPolicy, - Ver4DictConstants::MAX_DICTIONARY_SIZE)); - MutableEntryCounters entryCounters; - if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &entryCounters)) { - return false; - } - BufferWithExtendableBuffer headerBuffer( - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); - if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, - entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) { - return false; - } - return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); -} - -bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, - const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, - MutableEntryCounters *const outEntryCounters) { - Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer()); - Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); - Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), - mBuffers->getTerminalPositionLookupTable()); - Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), - mBuffers, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy); - - if (!mBuffers->getMutableLanguageModelDictContent()->updateAllProbabilityEntriesForGC( - headerPolicy, outEntryCounters)) { - 
AKLOGE("Failed to update probabilities in language model dict content."); - return false; - } - if (headerPolicy->isDecayingDict()) { - const EntryCounts &maxEntryCounts = headerPolicy->getMaxNgramCounts(); - if (!mBuffers->getMutableLanguageModelDictContent()->truncateEntries( - outEntryCounters->getEntryCounts(), maxEntryCounts, headerPolicy, - outEntryCounters)) { - AKLOGE("Failed to truncate entries in language model dict content."); - return false; - } - } - - DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - DynamicPtGcEventListeners - ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( - &ptNodeWriter); - if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( - &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { - return false; - } - - // Mapping from positions in mBuffer to positions in bufferToWrite. - PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; - readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), - buffersToWrite, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy); - DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer - traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, - buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); - if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( - &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { - return false; - } - - // Create policy instances for the GCed dictionary. 
- Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer()); - Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); - Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), - buffersToWrite->getTerminalPositionLookupTable()); - Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), - buffersToWrite, &newPtNodeReader, &newPtNodeArrayreader, - &newShortcutPolicy); - // Re-assign terminal IDs for valid terminal PtNodes. - TerminalPositionLookupTable::TerminalIdMap terminalIdMap; - if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds( - &terminalIdMap)) { - return false; - } - // Run GC for language model dict content. - if (!buffersToWrite->getMutableLanguageModelDictContent()->runGC(&terminalIdMap, - mBuffers->getLanguageModelDictContent())) { - return false; - } - // Run GC for shortcut dict content. - if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap, - mBuffers->getShortcutDictContent())) { - return false; - } - DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader); - newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields - traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap); - if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( - &traversePolicyToUpdateAllPositionFields)) { - return false; - } - newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds - traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap); - if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner( - &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) { - return false; - } - return true; -} - -bool 
Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds - ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { - if (!ptNodeParams->isTerminal()) { - return true; - } - TerminalPositionLookupTable::TerminalIdMap::const_iterator it = - mTerminalIdMap->find(ptNodeParams->getTerminalId()); - if (it == mTerminalIdMap->end()) { - AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", - ptNodeParams->getTerminalId(), mTerminalIdMap->size()); - return false; - } - if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { - AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second); - return false; - } - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h deleted file mode 100644 index c56cea5cf..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H -#define LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/utils/entry_counters.h" - -namespace latinime { - -class HeaderPolicy; -class Ver4DictBuffers; -class Ver4PatriciaTrieNodeReader; -class Ver4PatriciaTrieNodeWriter; - -class Ver4PatriciaTrieWritingHelper { - public: - Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers) - : mBuffers(buffers) {} - - bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const; - - // This method cannot be const because the original dictionary buffer will be updated to detect - // useless PtNodes during GC. - bool writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const dictDirPath); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper); - - class TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds - : public DynamicPtReadingHelper::TraversingEventListener { - public: - TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds( - Ver4PatriciaTrieNodeWriter *const ptNodeWriter, - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap) - : mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {} - - bool onAscend() { return true; } - - bool onDescend(const int ptNodeArrayPos) { return true; } - - bool onReadingPtNodeArrayTail() { return true; } - - bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds); - - Ver4PatriciaTrieNodeWriter *const mPtNodeWriter; - const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap; - }; - - bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, - Ver4DictBuffers 
*const buffersToWrite, MutableEntryCounters *const outEntryCounters); - - Ver4DictBuffers *const mBuffers; -}; -} // namespace latinime - -#endif /* LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp deleted file mode 100644 index b014c523d..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h" - -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -bool Ver4PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, - int *const outPtNodeCount, int *const outFirstPtNodePos) const { - if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mBuffer->getTailPosition()) { - // Reading invalid position because of a bug or a broken dictionary. 
- AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d", - ptNodeArrayPos, mBuffer->getTailPosition()); - ASSERT(false); - return false; - } - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodeArrayPos); - const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); - int readingPos = ptNodeArrayPos; - if (usesAdditionalBuffer) { - readingPos -= mBuffer->getOriginalBufferSize(); - } - const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( - dictBuf, &readingPos); - if (usesAdditionalBuffer) { - readingPos += mBuffer->getOriginalBufferSize(); - } - if (ptNodeCountInArray < 0) { - AKLOGE("Invalid PtNode count in an array: %d.", ptNodeCountInArray); - return false; - } - *outPtNodeCount = ptNodeCountInArray; - *outFirstPtNodePos = readingPos; - return true; -} - -bool Ver4PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos, - int *const outNextPtNodeArrayPos) const { - if (forwordLinkPos < 0 || forwordLinkPos >= mBuffer->getTailPosition()) { - // Reading invalid position because of bug or broken dictionary. 
- AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d", - forwordLinkPos, mBuffer->getTailPosition()); - ASSERT(false); - return false; - } - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(forwordLinkPos); - const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); - int readingPos = forwordLinkPos; - if (usesAdditionalBuffer) { - readingPos -= mBuffer->getOriginalBufferSize(); - } - const int nextPtNodeArrayOffset = - DynamicPtReadingUtils::getForwardLinkPosition(dictBuf, readingPos); - if (DynamicPtReadingUtils::isValidForwardLinkPosition(nextPtNodeArrayOffset)) { - *outNextPtNodeArrayPos = forwordLinkPos + nextPtNodeArrayOffset; - } else { - *outNextPtNodeArrayPos = NOT_A_DICT_POS; - } - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h deleted file mode 100644 index d81808efc..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_VER4_PT_NODE_ARRAY_READER_H -#define LATINIME_VER4_PT_NODE_ARRAY_READER_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h" - -namespace latinime { - -class BufferWithExtendableBuffer; - -class Ver4PtNodeArrayReader : public PtNodeArrayReader { - public: - Ver4PtNodeArrayReader(const BufferWithExtendableBuffer *const buffer) : mBuffer(buffer) {}; - - virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, - int *const outPtNodeCount, int *const outFirstPtNodePos) const; - virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, - int *const outNextPtNodeArrayPos) const; - - private: - DISALLOW_COPY_AND_ASSIGN(Ver4PtNodeArrayReader); - - const BufferWithExtendableBuffer *const mBuffer; -}; -} // namespace latinime -#endif /* LATINIME_VER4_PT_NODE_ARRAY_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp deleted file mode 100644 index da2c30cd6..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -const size_t BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE = 1024 * 1024; -const int BufferWithExtendableBuffer::NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE = 90; -// TODO: Needs to allocate larger memory corresponding to the current vector size. -const size_t BufferWithExtendableBuffer::EXTEND_ADDITIONAL_BUFFER_SIZE_STEP = 128 * 1024; - -uint32_t BufferWithExtendableBuffer::readUint(const int size, const int pos) const { - const bool readingPosIsInAdditionalBuffer = isInAdditionalBuffer(pos); - const int posInBuffer = readingPosIsInAdditionalBuffer ? pos - mOriginalBuffer.size() : pos; - return ByteArrayUtils::readUint(getBuffer(readingPosIsInAdditionalBuffer), size, posInBuffer); -} - -uint32_t BufferWithExtendableBuffer::readUintAndAdvancePosition(const int size, - int *const pos) const { - const uint32_t value = readUint(size, *pos); - *pos += size; - return value; -} - -void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxCodePointCount, - int *const outCodePoints, int *outCodePointCount, int *const pos) const { - const bool readingPosIsInAdditionalBuffer = isInAdditionalBuffer(*pos); - if (readingPosIsInAdditionalBuffer) { - *pos -= mOriginalBuffer.size(); - } - // Code point table is not used for dynamic format. 
- *outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition( - getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, - nullptr /* codePointTable */, outCodePoints, pos); - if (readingPosIsInAdditionalBuffer) { - *pos += mOriginalBuffer.size(); - } -} - -bool BufferWithExtendableBuffer::extend(const int size) { - return checkAndPrepareWriting(getTailPosition(), size); -} - -bool BufferWithExtendableBuffer::writeUint(const uint32_t data, const int size, const int pos) { - int writingPos = pos; - return writeUintAndAdvancePosition(data, size, &writingPos); -} - -bool BufferWithExtendableBuffer::writeUintAndAdvancePosition(const uint32_t data, const int size, - int *const pos) { - if (!(size >= 1 && size <= 4)) { - AKLOGI("writeUintAndAdvancePosition() is called with invalid size: %d", size); - ASSERT(false); - return false; - } - if (!checkAndPrepareWriting(*pos, size)) { - return false; - } - const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos); - uint8_t *const buffer = - usesAdditionalBuffer ? mAdditionalBuffer.data() : mOriginalBuffer.data(); - if (usesAdditionalBuffer) { - *pos -= mOriginalBuffer.size(); - } - ByteArrayUtils::writeUintAndAdvancePosition(buffer, data, size, pos); - if (usesAdditionalBuffer) { - *pos += mOriginalBuffer.size(); - } - return true; -} - -bool BufferWithExtendableBuffer::writeCodePointsAndAdvancePosition(const int *const codePoints, - const int codePointCount, const bool writesTerminator, int *const pos) { - const size_t size = ByteArrayUtils::calculateRequiredByteCountToStoreCodePoints( - codePoints, codePointCount, writesTerminator); - if (!checkAndPrepareWriting(*pos, size)) { - return false; - } - const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos); - uint8_t *const buffer = - usesAdditionalBuffer ? 
mAdditionalBuffer.data() : mOriginalBuffer.data(); - if (usesAdditionalBuffer) { - *pos -= mOriginalBuffer.size(); - } - ByteArrayUtils::writeCodePointsAndAdvancePosition(buffer, codePoints, codePointCount, - writesTerminator, pos); - if (usesAdditionalBuffer) { - *pos += mOriginalBuffer.size(); - } - return true; -} - -bool BufferWithExtendableBuffer::extendBuffer(const size_t size) { - const size_t extendSize = std::max(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP, size); - const size_t sizeAfterExtending = - std::min(mAdditionalBuffer.size() + extendSize, mMaxAdditionalBufferSize); - if (sizeAfterExtending < mAdditionalBuffer.size() + size) { - return false; - } - mAdditionalBuffer.resize(sizeAfterExtending); - return true; -} - -bool BufferWithExtendableBuffer::checkAndPrepareWriting(const int pos, const int size) { - if (pos < 0 || size < 0) { - // Invalid position or size. - return false; - } - const size_t totalRequiredSize = static_cast(pos + size); - if (!isInAdditionalBuffer(pos)) { - // Here don't need to care about the additional buffer. - if (mOriginalBuffer.size() < totalRequiredSize) { - // Violate the boundary. - return false; - } - // The buffer has sufficient capacity. - return true; - } - // Hereafter, pos is in the additional buffer. - const size_t tailPosition = static_cast(getTailPosition()); - if (totalRequiredSize <= tailPosition) { - // The buffer has sufficient capacity. - return true; - } - if (static_cast(pos) != tailPosition) { - // The additional buffer must be extended from the tail position. - return false; - } - const size_t extendSize = totalRequiredSize - - std::min(mAdditionalBuffer.size() + mOriginalBuffer.size(), totalRequiredSize); - if (extendSize > 0 && !extendBuffer(extendSize)) { - // Failed to extend the buffer. 
- return false; - } - mUsedAdditionalBufferSize += size; - return true; -} - -bool BufferWithExtendableBuffer::copy(const BufferWithExtendableBuffer *const sourceBuffer) { - int copyingPos = 0; - const int tailPos = sourceBuffer->getTailPosition(); - const int maxDataChunkSize = sizeof(uint32_t); - while (copyingPos < tailPos) { - const int remainingSize = tailPos - copyingPos; - const int copyingSize = (remainingSize >= maxDataChunkSize) ? - maxDataChunkSize : remainingSize; - const uint32_t data = sourceBuffer->readUint(copyingSize, copyingPos); - if (!writeUint(data, copyingSize, copyingPos)) { - return false; - } - copyingPos += copyingSize; - } - return true; -} - -} diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h deleted file mode 100644 index fad83aa25..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H -#define LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H - -#include -#include -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -// This is used as a buffer that can be extended for updatable dictionaries. -// To optimize performance, raw pointer is directly used for reading buffer. The position has to be -// adjusted to access additional buffer. On the other hand, this class does not provide writable -// raw pointer but provides several methods that handle boundary checking for writing data. -class BufferWithExtendableBuffer { - public: - static const size_t DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE; - - BufferWithExtendableBuffer(const ReadWriteByteArrayView originalBuffer, - const int maxAdditionalBufferSize) - : mOriginalBuffer(originalBuffer), mAdditionalBuffer(), mUsedAdditionalBufferSize(0), - mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} - - // Without original buffer. - BufferWithExtendableBuffer(const int maxAdditionalBufferSize) - : mOriginalBuffer(), mAdditionalBuffer(), mUsedAdditionalBufferSize(0), - mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} - - AK_FORCE_INLINE int getTailPosition() const { - return mOriginalBuffer.size() + mUsedAdditionalBufferSize; - } - - AK_FORCE_INLINE int getUsedAdditionalBufferSize() const { - return mUsedAdditionalBufferSize; - } - - /** - * For reading. - */ - AK_FORCE_INLINE bool isInAdditionalBuffer(const int position) const { - return position >= static_cast(mOriginalBuffer.size()); - } - - // TODO: Resolve the issue that the address can be changed when the vector is resized. 
- // CAVEAT!: Be careful about array out of bound access with buffers - AK_FORCE_INLINE const uint8_t *getBuffer(const bool usesAdditionalBuffer) const { - if (usesAdditionalBuffer) { - return mAdditionalBuffer.data(); - } else { - return mOriginalBuffer.data(); - } - } - - uint32_t readUint(const int size, const int pos) const; - - uint32_t readUintAndAdvancePosition(const int size, int *const pos) const; - - void readCodePointsAndAdvancePosition(const int maxCodePointCount, - int *const outCodePoints, int *outCodePointCount, int *const pos) const; - - AK_FORCE_INLINE int getOriginalBufferSize() const { - return mOriginalBuffer.size(); - } - - AK_FORCE_INLINE bool isNearSizeLimit() const { - return mAdditionalBuffer.size() >= ((mMaxAdditionalBufferSize - * NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE) / 100); - } - - bool extend(const int size); - - /** - * For writing. - * - * Writing is allowed for original buffer, already written region of additional buffer and the - * tail of additional buffer. - */ - bool writeUint(const uint32_t data, const int size, const int pos); - - bool writeUintAndAdvancePosition(const uint32_t data, const int size, int *const pos); - - bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount, - const bool writesTerminator, int *const pos); - - bool copy(const BufferWithExtendableBuffer *const sourceBuffer); - - private: - DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer); - - static const int NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE; - static const size_t EXTEND_ADDITIONAL_BUFFER_SIZE_STEP; - - const ReadWriteByteArrayView mOriginalBuffer; - std::vector mAdditionalBuffer; - int mUsedAdditionalBufferSize; - const size_t mMaxAdditionalBufferSize; - - // Return if the buffer is successfully extended or not. - bool extendBuffer(const size_t size); - - // Returns if it is possible to write size-bytes from pos. When pos is at the tail position of - // the additional buffer, try extending the buffer. 
- bool checkAndPrepareWriting(const int pos, const int size); -}; -} -#endif /* LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp deleted file mode 100644 index 1833e8832..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" - -namespace latinime { - -const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20; -const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF; -const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F; - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h deleted file mode 100644 index abb979050..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_BYTE_ARRAY_UTILS_H -#define LATINIME_BYTE_ARRAY_UTILS_H - -#include - -#include "defines.h" - -namespace latinime { - -/** - * Utility methods for reading byte arrays. - */ -class ByteArrayUtils { - public: - /** - * Integer writing - * - * Each method write a corresponding size integer in a big endian manner. - */ - static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer, - const uint32_t data, const int size, int *const pos) { - // size must be in 1 to 4. - ASSERT(size >= 1 && size <= 4); - switch (size) { - case 1: - ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos); - return; - case 2: - ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos); - return; - case 3: - ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos); - return; - case 4: - ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos); - return; - default: - break; - } - } - - /** - * Integer reading - * - * Each method read a corresponding size integer in a big endian manner. 
- */ - static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) { - return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16) - ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3]; - } - - static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) { - return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2]; - } - - static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) { - return (buffer[pos] << 8) ^ buffer[pos + 1]; - } - - static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) { - return buffer[pos]; - } - - static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition( - const uint8_t *const buffer, int *const pos) { - const uint32_t value = readUint32(buffer, *pos); - *pos += 4; - return value; - } - - static AK_FORCE_INLINE int readSint24AndAdvancePosition( - const uint8_t *const buffer, int *const pos) { - const uint8_t value = readUint8(buffer, *pos); - if (value < 0x80) { - return readUint24AndAdvancePosition(buffer, pos); - } else { - (*pos)++; - return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos)); - } - } - - static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition( - const uint8_t *const buffer, int *const pos) { - const uint32_t value = readUint24(buffer, *pos); - *pos += 3; - return value; - } - - static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition( - const uint8_t *const buffer, int *const pos) { - const uint16_t value = readUint16(buffer, *pos); - *pos += 2; - return value; - } - - static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition( - const uint8_t *const buffer, int *const pos) { - return buffer[(*pos)++]; - } - - static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer, - const int size, const int pos) { - // size must be in 1 to 4. 
- ASSERT(size >= 1 && size <= 4); - switch (size) { - case 1: - return ByteArrayUtils::readUint8(buffer, pos); - case 2: - return ByteArrayUtils::readUint16(buffer, pos); - case 3: - return ByteArrayUtils::readUint24(buffer, pos); - case 4: - return ByteArrayUtils::readUint32(buffer, pos); - default: - return 0; - } - } - - /** - * Code Point Reading - * - * 1 byte = bbbbbbbb match - * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte - * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because - * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with - * 00011111 would be outside unicode. - * else: iso-latin-1 code - * This allows for the whole unicode range to be encoded, including chars outside of - * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control - * characters which should never happen anyway (and still work, but take 3 bytes). - */ - static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) { - int p = pos; - return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p); - } - - static AK_FORCE_INLINE int readCodePointAndAdvancePosition( - const uint8_t *const buffer, const int *const codePointTable, int *const pos) { - /* - * codePointTable is an array to convert the most frequent characters in this dictionary to - * 1 byte code points. It is only made of the original code points of the most frequent - * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters. - * The original code points are restored by picking the code points at the indices of the - * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte. 
- */ - const uint8_t firstByte = readUint8(buffer, *pos); - if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { - if (firstByte == CHARACTER_ARRAY_TERMINATOR) { - *pos += 1; - return NOT_A_CODE_POINT; - } else { - return readUint24AndAdvancePosition(buffer, pos); - } - } else { - *pos += 1; - if (codePointTable) { - return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE]; - } - return firstByte; - } - } - - /** - * String (array of code points) Reading - * - * Reads code points until the terminator is found. - */ - // Returns the length of the string. - static int readStringAndAdvancePosition(const uint8_t *const buffer, - const int maxLength, const int *const codePointTable, int *const outBuffer, - int *const pos) { - int length = 0; - int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); - while (NOT_A_CODE_POINT != codePoint && length < maxLength) { - outBuffer[length++] = codePoint; - codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); - } - return length; - } - - // Advances the position and returns the length of the string. 
- static int advancePositionToBehindString( - const uint8_t *const buffer, const int maxLength, int *const pos) { - int length = 0; - int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); - while (NOT_A_CODE_POINT != codePoint && length < maxLength) { - codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); - length++; - } - return length; - } - - /** - * String (array of code points) Writing - */ - static void writeCodePointsAndAdvancePosition(uint8_t *const buffer, - const int *const codePoints, const int codePointCount, const bool writesTerminator, - int *const pos) { - for (int i = 0; i < codePointCount; ++i) { - const int codePoint = codePoints[i]; - if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { - break; - } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE - || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { - // three bytes character. - writeUint24AndAdvancePosition(buffer, codePoint, pos); - } else { - // one byte character. - writeUint8AndAdvancePosition(buffer, codePoint, pos); - } - } - if (writesTerminator) { - writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos); - } - } - - static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints, - const int codePointCount, const bool writesTerminator) { - int byteCount = 0; - for (int i = 0; i < codePointCount; ++i) { - const int codePoint = codePoints[i]; - if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { - break; - } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE - || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { - // three bytes character. - byteCount += 3; - } else { - // one byte character. - byteCount += 1; - } - } - if (writesTerminator) { - // The terminator is one byte. 
- byteCount += 1; - } - return byteCount; - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils); - - static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE; - static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE; - static const uint8_t CHARACTER_ARRAY_TERMINATOR; - - static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, - const uint32_t data, int *const pos) { - buffer[(*pos)++] = (data >> 24) & 0xFF; - buffer[(*pos)++] = (data >> 16) & 0xFF; - buffer[(*pos)++] = (data >> 8) & 0xFF; - buffer[(*pos)++] = data & 0xFF; - } - - static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer, - const uint32_t data, int *const pos) { - buffer[(*pos)++] = (data >> 16) & 0xFF; - buffer[(*pos)++] = (data >> 8) & 0xFF; - buffer[(*pos)++] = data & 0xFF; - } - - static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer, - const uint16_t data, int *const pos) { - buffer[(*pos)++] = (data >> 8) & 0xFF; - buffer[(*pos)++] = data & 0xFF; - } - - static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer, - const uint8_t data, int *const pos) { - buffer[(*pos)++] = data & 0xFF; - } -}; -} // namespace latinime -#endif /* LATINIME_BYTE_ARRAY_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp deleted file mode 100644 index edcb43678..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" - -#include -#include -#include -#include -#include - -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/entry_counters.h" -#include "suggest/policyimpl/dictionary/utils/file_utils.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" -#include "utils/time_keeper.h" - -namespace latinime { - -const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = ".tmp"; -// Enough size to describe buffer size. 
-const int DictFileWritingUtils::SIZE_OF_BUFFER_SIZE_FIELD = 4; - -/* static */ bool DictFileWritingUtils::createEmptyDictFile(const char *const filePath, - const int dictVersion, const std::vector localeAsCodePointVector, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { - TimeKeeper::setCurrentTime(); - const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion); - switch (formatVersion) { - case FormatUtils::VERSION_402: - return createEmptyV4DictFile( - filePath, localeAsCodePointVector, attributeMap, formatVersion); - case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_403: - return createEmptyV4DictFile( - filePath, localeAsCodePointVector, attributeMap, formatVersion); - default: - AKLOGE("Cannot create dictionary %s because format version %d is not supported.", - filePath, dictVersion); - return false; - } -} - -template -/* static */ bool DictFileWritingUtils::createEmptyV4DictFile(const char *const dirPath, - const std::vector localeAsCodePointVector, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap, - const FormatUtils::FORMAT_VERSION formatVersion) { - HeaderPolicy headerPolicy(formatVersion, localeAsCodePointVector, attributeMap); - DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy, - DictConstants::MAX_DICT_EXTENDED_REGION_SIZE); - headerPolicy.fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, - EntryCounts(), 0 /* extendedRegionSize */, dictBuffers->getWritableHeaderBuffer()); - if (!DynamicPtWritingUtils::writeEmptyDictionary( - dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) { - AKLOGE("Empty ver4 dictionary structure cannot be created on memory."); - return false; - } - return dictBuffers->flush(dirPath); -} - -/* static */ bool DictFileWritingUtils::flushBufferToFileWithSuffix(const char *const basePath, - const char *const suffix, const BufferWithExtendableBuffer *const buffer) { - const int 
filePathBufSize = FileUtils::getFilePathWithSuffixBufSize(basePath, suffix); - char filePath[filePathBufSize]; - FileUtils::getFilePathWithSuffix(basePath, suffix, filePathBufSize, filePath); - return flushBufferToFile(filePath, buffer); -} - -/* static */ bool DictFileWritingUtils::writeBufferToFileTail(FILE *const file, - const BufferWithExtendableBuffer *const buffer) { - uint8_t bufferSize[SIZE_OF_BUFFER_SIZE_FIELD]; - int writingPos = 0; - ByteArrayUtils::writeUintAndAdvancePosition(bufferSize, buffer->getTailPosition(), - SIZE_OF_BUFFER_SIZE_FIELD, &writingPos); - if (fwrite(bufferSize, SIZE_OF_BUFFER_SIZE_FIELD, 1 /* count */, file) < 1) { - return false; - } - return writeBufferToFile(file, buffer); -} - -/* static */ bool DictFileWritingUtils::flushBufferToFile(const char *const filePath, - const BufferWithExtendableBuffer *const buffer) { - const int fd = open(filePath, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); - if (fd == -1) { - AKLOGE("File %s cannot be opened. errno: %d", filePath, errno); - ASSERT(false); - return false; - } - FILE *const file = fdopen(fd, "wb"); - if (!file) { - AKLOGE("fdopen failed for the file %s. errno: %d", filePath, errno); - ASSERT(false); - return false; - } - if (!writeBufferToFile(file, buffer)) { - fclose(file); - remove(filePath); - AKLOGE("Buffer cannot be written to the file %s. size: %d", filePath, - buffer->getTailPosition()); - ASSERT(false); - return false; - } - fclose(file); - return true; -} - -// Returns whether the writing was succeeded or not. 
-/* static */ bool DictFileWritingUtils::writeBufferToFile(FILE *const file, - const BufferWithExtendableBuffer *const buffer) { - const int originalBufSize = buffer->getOriginalBufferSize(); - if (originalBufSize > 0 && fwrite(buffer->getBuffer(false /* usesAdditionalBuffer */), - originalBufSize, 1, file) < 1) { - return false; - } - const int additionalBufSize = buffer->getUsedAdditionalBufferSize(); - if (additionalBufSize > 0 && fwrite(buffer->getBuffer(true /* usesAdditionalBuffer */), - additionalBufSize, 1, file) < 1) { - return false; - } - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h deleted file mode 100644 index 4843b3b32..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_DICT_FILE_WRITING_UTILS_H -#define LATINIME_DICT_FILE_WRITING_UTILS_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" - -namespace latinime { - -class BufferWithExtendableBuffer; - -class DictFileWritingUtils { - public: - static const char *const TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE; - - static bool createEmptyDictFile(const char *const filePath, const int dictVersion, - const std::vector localeAsCodePointVector, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); - - static bool flushBufferToFileWithSuffix(const char *const basePath, const char *const suffix, - const BufferWithExtendableBuffer *const buffer); - - static bool writeBufferToFileTail(FILE *const file, - const BufferWithExtendableBuffer *const buffer); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DictFileWritingUtils); - - static const int SIZE_OF_BUFFER_SIZE_FIELD; - - static bool createEmptyV401DictFile(const char *const filePath, - const std::vector localeAsCodePointVector, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap, - const FormatUtils::FORMAT_VERSION formatVersion); - - template - static bool createEmptyV4DictFile(const char *const filePath, - const std::vector localeAsCodePointVector, - const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap, - const FormatUtils::FORMAT_VERSION formatVersion); - - static bool flushBufferToFile(const char *const filePath, - const BufferWithExtendableBuffer *const buffer); - - static bool writeBufferToFile(FILE *const file, - const BufferWithExtendableBuffer *const buffer); -}; -} // namespace latinime -#endif /* LATINIME_DICT_FILE_WRITING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/entry_counters.h b/native/jni/src/suggest/policyimpl/dictionary/utils/entry_counters.h deleted file mode 100644 index 
5e443026e..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/entry_counters.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_ENTRY_COUNTERS_H -#define LATINIME_ENTRY_COUNTERS_H - -#include - -#include "defines.h" -#include "utils/ngram_utils.h" - -namespace latinime { - -// Copyable but immutable -class EntryCounts final { - public: - EntryCounts() : mEntryCounts({{0, 0, 0, 0}}) {} - - explicit EntryCounts(const std::array &counters) - : mEntryCounts(counters) {} - - int getNgramCount(const NgramType ngramType) const { - return mEntryCounts[static_cast(ngramType)]; - } - - const std::array &getCountArray() const { - return mEntryCounts; - } - - private: - DISALLOW_ASSIGNMENT_OPERATOR(EntryCounts); - - // Counts from Unigram (0-th element) to (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram - // (MAX_PREV_WORD_COUNT_FOR_N_GRAM-th element) - const std::array mEntryCounts; -}; - -class MutableEntryCounters final { - public: - MutableEntryCounters() { - mEntryCounters.fill(0); - } - - explicit MutableEntryCounters( - const std::array &counters) - : mEntryCounters(counters) {} - - const EntryCounts getEntryCounts() const { - return EntryCounts(mEntryCounters); - } - - void incrementNgramCount(const NgramType ngramType) { - ++mEntryCounters[static_cast(ngramType)]; - } - - void decrementNgramCount(const NgramType ngramType) { - 
--mEntryCounters[static_cast(ngramType)]; - } - - int getNgramCount(const NgramType ngramType) const { - return mEntryCounters[static_cast(ngramType)]; - } - - void setNgramCount(const NgramType ngramType, const int count) { - mEntryCounters[static_cast(ngramType)] = count; - } - - private: - DISALLOW_COPY_AND_ASSIGN(MutableEntryCounters); - - // Counters from Unigram (0-th element) to (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram - // (MAX_PREV_WORD_COUNT_FOR_N_GRAM-th element) - std::array mEntryCounters; -}; -} // namespace latinime -#endif /* LATINIME_ENTRY_COUNTERS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp deleted file mode 100644 index fb80f38c5..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/utils/file_utils.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace latinime { - -// Returns -1 on error. 
-/* static */ int FileUtils::getFileSize(const char *const filePath) { - const int fd = open(filePath, O_RDONLY); - if (fd == -1) { - return -1; - } - struct stat statBuf; - if (fstat(fd, &statBuf) != 0) { - close(fd); - return -1; - } - close(fd); - return static_cast(statBuf.st_size); -} - -/* static */ bool FileUtils::existsDir(const char *const dirPath) { - DIR *const dir = opendir(dirPath); - if (dir == NULL) { - return false; - } - closedir(dir); - return true; -} - -// Remove a directory and all files in the directory. -/* static */ bool FileUtils::removeDirAndFiles(const char *const dirPath) { - return removeDirAndFiles(dirPath, 5 /* maxTries */); -} - -// Remove a directory and all files in the directory, trying up to maxTimes. -/* static */ bool FileUtils::removeDirAndFiles(const char *const dirPath, const int maxTries) { - DIR *const dir = opendir(dirPath); - if (dir == NULL) { - AKLOGE("Cannot open dir %s.", dirPath); - return true; - } - struct dirent *dirent; - while ((dirent = readdir(dir)) != NULL) { - if (dirent->d_type == DT_DIR) { - continue; - } - if (strcmp(dirent->d_name, ".") == 0 || strcmp(dirent->d_name, "..") == 0) { - continue; - } - const int filePathBufSize = getFilePathBufSize(dirPath, dirent->d_name); - char filePath[filePathBufSize]; - getFilePath(dirPath, dirent->d_name, filePathBufSize, filePath); - if (remove(filePath) != 0) { - AKLOGE("Cannot remove file %s.", filePath); - closedir(dir); - return false; - } - } - closedir(dir); - if (remove(dirPath) != 0) { - if (maxTries > 0) { - // On NFS, deleting files sometimes creates new files. I'm not sure what the - // correct way of dealing with this is, but for the time being, this seems to work. 
- removeDirAndFiles(dirPath, maxTries - 1); - } else { - AKLOGE("Cannot remove directory %s.", dirPath); - return false; - } - } - return true; -} - -/* static */ int FileUtils::getFilePathWithSuffixBufSize(const char *const filePath, - const char *const suffix) { - return strlen(filePath) + strlen(suffix) + 1 /* terminator */; -} - -/* static */ void FileUtils::getFilePathWithSuffix(const char *const filePath, - const char *const suffix, const int filePathBufSize, char *const outFilePath) { - snprintf(outFilePath, filePathBufSize, "%s%s", filePath, suffix); -} - -/* static */ int FileUtils::getFilePathBufSize(const char *const dirPath, - const char *const fileName) { - return strlen(dirPath) + 1 /* '/' */ + strlen(fileName) + 1 /* terminator */; -} - -/* static */ void FileUtils::getFilePath(const char *const dirPath, const char *const fileName, - const int filePathBufSize, char *const outFilePath) { - snprintf(outFilePath, filePathBufSize, "%s/%s", dirPath, fileName); -} - -/* static */ bool FileUtils::getFilePathWithoutSuffix(const char *const filePath, - const char *const suffix, const int outDirPathBufSize, char *const outDirPath) { - const int filePathLength = strlen(filePath); - const int suffixLength = strlen(suffix); - if (filePathLength <= suffixLength) { - AKLOGE("File path length (%s:%d) is shorter that suffix length (%s:%d).", - filePath, filePathLength, suffix, suffixLength); - return false; - } - const int resultFilePathLength = filePathLength - suffixLength; - if (outDirPathBufSize <= resultFilePathLength) { - AKLOGE("outDirPathBufSize is too small. 
filePath: %s, suffix: %s, outDirPathBufSize: %d", - filePath, suffix, outDirPathBufSize); - return false; - } - if (strncmp(filePath + resultFilePathLength, suffix, suffixLength) != 0) { - AKLOGE("File Path %s does not have %s as a suffix", filePath, suffix); - return false; - } - snprintf(outDirPath, resultFilePathLength + 1 /* terminator */, "%s", filePath); - return true; -} - -/* static */ void FileUtils::getDirPath(const char *const filePath, const int outDirPathBufSize, - char *const outDirPath) { - for (int i = strlen(filePath) - 1; i >= 0; --i) { - if (filePath[i] == '/') { - if (i >= outDirPathBufSize) { - AKLOGE("outDirPathBufSize is too small. filePath: %s, outDirPathBufSize: %d", - filePath, outDirPathBufSize); - ASSERT(false); - return; - } - snprintf(outDirPath, i + 1 /* terminator */, "%s", filePath); - return; - } - } -} - -/* static */ void FileUtils::getBasename(const char *const filePath, - const int outNameBufSize, char *const outName) { - const int filePathBufSize = strlen(filePath) + 1 /* terminator */; - char filePathBuf[filePathBufSize]; - snprintf(filePathBuf, filePathBufSize, "%s", filePath); - const char *const baseName = basename(filePathBuf); - const int baseNameLength = strlen(baseName); - if (baseNameLength >= outNameBufSize) { - AKLOGE("outNameBufSize is too small. filePath: %s, outNameBufSize: %d", - filePath, outNameBufSize); - return; - } - snprintf(outName, baseNameLength + 1 /* terminator */, "%s", baseName); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h deleted file mode 100644 index 4f1b93a6a..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_FILE_UTILS_H -#define LATINIME_FILE_UTILS_H - -#include "defines.h" - -namespace latinime { - -class FileUtils { - public: - // Returns -1 on error. - static int getFileSize(const char *const filePath); - - static bool existsDir(const char *const dirPath); - - // Remove a directory and all files in the directory. - static bool removeDirAndFiles(const char *const dirPath); - - static int getFilePathWithSuffixBufSize(const char *const filePath, const char *const suffix); - - static void getFilePathWithSuffix(const char *const filePath, const char *const suffix, - const int filePathBufSize, char *const outFilePath); - - static int getFilePathBufSize(const char *const dirPath, const char *const fileName); - - static void getFilePath(const char *const dirPath, const char *const fileName, - const int filePathBufSize, char *const outFilePath); - - // Returns whether the filePath have the suffix. 
- static bool getFilePathWithoutSuffix(const char *const filePath, const char *const suffix, - const int dirPathBufSize, char *const outDirPath); - - static void getDirPath(const char *const filePath, const int dirPathBufSize, - char *const outDirPath); - - static void getBasename(const char *const filePath, const int outNameBufSize, - char *const outName); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(FileUtils); - - static bool removeDirAndFiles(const char *const dirPath, const int maxTries); -}; -} // namespace latinime -#endif /* LATINIME_FILE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp deleted file mode 100644 index f05c6149e..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" - -#include -#include -#include - -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" -#include "utils/time_keeper.h" - -namespace latinime { - -const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8; -const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; - -const int ForgettingCurveUtils::MAX_LEVEL = 15; -const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 2; -const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 31; -const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 30; -const int ForgettingCurveUtils::OCCURRENCES_TO_RAISE_THE_LEVEL = 1; -// TODO: Evaluate whether this should be 7.5 days. -// 15 days -const int ForgettingCurveUtils::DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS = 15 * 24 * 60 * 60; - -const float ForgettingCurveUtils::ENTRY_COUNT_HARD_LIMIT_WEIGHT = 1.2; - -const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; - -// TODO: Revise the logic to decide the initial probability depending on the given probability. -/* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo( - const HistoricalInfo *const originalHistoricalInfo, const int newProbability, - const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy) { - const int timestamp = newHistoricalInfo->getTimestamp(); - if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) { - // Add entry as a valid word. 
- const int level = clampToVisibleEntryLevelRange(newHistoricalInfo->getLevel()); - const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy); - return HistoricalInfo(timestamp, level, count); - } else if (!originalHistoricalInfo->isValid() - || originalHistoricalInfo->getLevel() < newHistoricalInfo->getLevel() - || (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel() - && originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) { - // Initial information. - int count = newHistoricalInfo->getCount(); - if (count >= OCCURRENCES_TO_RAISE_THE_LEVEL) { - const int level = clampToValidLevelRange(newHistoricalInfo->getLevel() + 1); - return HistoricalInfo(timestamp, level, 0 /* count */); - } - const int level = clampToValidLevelRange(newHistoricalInfo->getLevel()); - return HistoricalInfo(timestamp, level, clampToValidCountRange(count, headerPolicy)); - } else { - const int updatedCount = originalHistoricalInfo->getCount() + 1; - if (updatedCount >= OCCURRENCES_TO_RAISE_THE_LEVEL) { - // The count exceeds the max value the level can be incremented. - if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) { - // The level is already max. - return HistoricalInfo(timestamp, - originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount()); - } else { - // Raise the level. 
- return HistoricalInfo(timestamp, - originalHistoricalInfo->getLevel() + 1, 0 /* count */); - } - } else { - return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), updatedCount); - } - } -} - -/* static */ int ForgettingCurveUtils::decodeProbability( - const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) { - const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimestamp(), - DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS); - return sProbabilityTable.getProbability( - headerPolicy->getForgettingCurveProbabilityValuesTableId(), - clampToValidLevelRange(historicalInfo->getLevel()), - clampToValidTimeStepCountRange(elapsedTimeStepCount)); -} - -/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo, - const HeaderPolicy *const headerPolicy) { - return historicalInfo->getLevel() > 0 - || getElapsedTimeStepCount(historicalInfo->getTimestamp(), - DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS) - < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; -} - -/* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave( - const HistoricalInfo *const originalHistoricalInfo, - const HeaderPolicy *const headerPolicy) { - if (originalHistoricalInfo->getTimestamp() == NOT_A_TIMESTAMP) { - return HistoricalInfo(); - } - const int durationToLevelDownInSeconds = DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS; - const int elapsedTimeStep = getElapsedTimeStepCount( - originalHistoricalInfo->getTimestamp(), durationToLevelDownInSeconds); - if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) { - // No need to update historical info. - return *originalHistoricalInfo; - } - // Lower the level. - const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1); - const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ? 
- originalHistoricalInfo->getLevel() : maxLevelDownAmonut; - const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimestamp() + - levelDownAmount * durationToLevelDownInSeconds; - return HistoricalInfo(adjustedTimestampInSeconds, - originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */); -} - -/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, - const EntryCounts &entryCounts, const HeaderPolicy *const headerPolicy) { - const EntryCounts &maxNgramCounts = headerPolicy->getMaxNgramCounts(); - for (const auto ngramType : AllNgramTypes::ASCENDING) { - if (entryCounts.getNgramCount(ngramType) - >= getEntryCountHardLimit(maxNgramCounts.getNgramCount(ngramType))) { - // Unigram count exceeds the limit. - return true; - } - } - if (mindsBlockByDecay) { - return false; - } - if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS - < TimeKeeper::peekCurrentTime()) { - // Time to decay. - return true; - } - return false; -} - -// See comments in ProbabilityUtils::backoff(). -/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) { - // See TODO comments in ForgettingCurveUtils::getProbability(). 
- return unigramProbability; -} - -/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp, - const int durationToLevelDownInSeconds) { - const int elapsedTimeInSeconds = TimeKeeper::peekCurrentTime() - timestamp; - const int timeStepDurationInSeconds = - durationToLevelDownInSeconds / (MAX_ELAPSED_TIME_STEP_COUNT + 1); - return elapsedTimeInSeconds / timeStepDurationInSeconds; -} - -/* static */ int ForgettingCurveUtils::clampToVisibleEntryLevelRange(const int level) { - return std::min(std::max(level, MIN_VISIBLE_LEVEL), MAX_LEVEL); -} - -/* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count, - const HeaderPolicy *const headerPolicy) { - return std::min(std::max(count, 0), OCCURRENCES_TO_RAISE_THE_LEVEL - 1); -} - -/* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) { - return std::min(std::max(level, 0), MAX_LEVEL); -} - -/* static */ int ForgettingCurveUtils::clampToValidTimeStepCountRange(const int timeStepCount) { - return std::min(std::max(timeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT); -} - -const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4; -const int ForgettingCurveUtils::ProbabilityTable::WEAK_PROBABILITY_TABLE_ID = 0; -const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = 1; -const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2; -const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3; -const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127; -const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 8; -const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 9; -const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 10; - - -ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() { - mTables.resize(PROBABILITY_TABLE_COUNT); - for (int tableId = 0; tableId < 
PROBABILITY_TABLE_COUNT; ++tableId) { - mTables[tableId].resize(MAX_LEVEL + 1); - for (int level = 0; level <= MAX_LEVEL; ++level) { - mTables[tableId][level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1); - const float initialProbability = getBaseProbabilityForLevel(tableId, level); - const float endProbability = getBaseProbabilityForLevel(tableId, level - 1); - for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; - ++timeStepCount) { - if (level < MIN_VISIBLE_LEVEL) { - mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY; - continue; - } - const float probability = initialProbability - * powf(initialProbability / endProbability, - -1.0f * static_cast(timeStepCount) - / static_cast(MAX_ELAPSED_TIME_STEP_COUNT + 1)); - mTables[tableId][level][timeStepCount] = - std::min(std::max(static_cast(probability), 1), MAX_PROBABILITY); - } - } - } -} - -/* static */ int ForgettingCurveUtils::ProbabilityTable::getBaseProbabilityForLevel( - const int tableId, const int level) { - if (tableId == WEAK_PROBABILITY_TABLE_ID) { - // Max probability is 127. - return static_cast(WEAK_MAX_PROBABILITY / (1 << (MAX_LEVEL - level))); - } else if (tableId == MODEST_PROBABILITY_TABLE_ID) { - // Max probability is 128. - return static_cast(MODEST_BASE_PROBABILITY * (level + 1)); - } else if (tableId == STRONG_PROBABILITY_TABLE_ID) { - // Max probability is 140. - return static_cast(STRONG_BASE_PROBABILITY * (level + 1)); - } else if (tableId == AGGRESSIVE_PROBABILITY_TABLE_ID) { - // Max probability is 160. 
- return static_cast(AGGRESSIVE_BASE_PROBABILITY * (level + 1)); - } else { - return NOT_A_PROBABILITY; - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h deleted file mode 100644 index 06dcae8a1..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_FORGETTING_CURVE_UTILS_H -#define LATINIME_FORGETTING_CURVE_UTILS_H - -#include - -#include "defines.h" -#include "suggest/core/dictionary/property/historical_info.h" -#include "suggest/policyimpl/dictionary/utils/entry_counters.h" - -namespace latinime { - -class HeaderPolicy; - -class ForgettingCurveUtils { - public: - static const HistoricalInfo createUpdatedHistoricalInfo( - const HistoricalInfo *const originalHistoricalInfo, const int newProbability, - const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy); - - static const HistoricalInfo createHistoricalInfoToSave( - const HistoricalInfo *const originalHistoricalInfo, - const HeaderPolicy *const headerPolicy); - - static int decodeProbability(const HistoricalInfo *const historicalInfo, - const HeaderPolicy *const headerPolicy); - - static bool needsToKeep(const HistoricalInfo *const historicalInfo, - const HeaderPolicy *const headerPolicy); - - static bool needsToDecay(const bool mindsBlockByDecay, const EntryCounts &entryCounters, - const HeaderPolicy *const headerPolicy); - - // TODO: Improve probability computation method and remove this. 
- static int getProbabilityBiasForNgram(const int n) { - return (n - 1) * MULTIPLIER_TWO_IN_PROBABILITY_SCALE; - } - - AK_FORCE_INLINE static int getEntryCountHardLimit(const int maxEntryCount) { - return static_cast(static_cast(maxEntryCount) - * ENTRY_COUNT_HARD_LIMIT_WEIGHT); - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils); - - class ProbabilityTable { - public: - ProbabilityTable(); - - int getProbability(const int tableId, const int level, - const int elapsedTimeStepCount) const { - return mTables[tableId][level][elapsedTimeStepCount]; - } - - private: - DISALLOW_COPY_AND_ASSIGN(ProbabilityTable); - - static const int PROBABILITY_TABLE_COUNT; - static const int WEAK_PROBABILITY_TABLE_ID; - static const int MODEST_PROBABILITY_TABLE_ID; - static const int STRONG_PROBABILITY_TABLE_ID; - static const int AGGRESSIVE_PROBABILITY_TABLE_ID; - - static const int WEAK_MAX_PROBABILITY; - static const int MODEST_BASE_PROBABILITY; - static const int STRONG_BASE_PROBABILITY; - static const int AGGRESSIVE_BASE_PROBABILITY; - - std::vector>> mTables; - - static int getBaseProbabilityForLevel(const int tableId, const int level); - }; - - static const int MULTIPLIER_TWO_IN_PROBABILITY_SCALE; - static const int DECAY_INTERVAL_SECONDS; - - static const int MAX_LEVEL; - static const int MIN_VISIBLE_LEVEL; - static const int MAX_ELAPSED_TIME_STEP_COUNT; - static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; - static const int OCCURRENCES_TO_RAISE_THE_LEVEL; - static const int DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS; - - static const float ENTRY_COUNT_HARD_LIMIT_WEIGHT; - - static const ProbabilityTable sProbabilityTable; - - static int backoff(const int unigramProbability); - static int getElapsedTimeStepCount(const int timestamp, const int durationToLevelDown); - static int clampToVisibleEntryLevelRange(const int level); - static int clampToValidLevelRange(const int level); - static int clampToValidCountRange(const int count, const 
HeaderPolicy *const headerPolicy); - static int clampToValidTimeStepCountRange(const int timeStepCount); -}; -} // namespace latinime -#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp deleted file mode 100644 index e225c235e..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/utils/format_utils.h" - -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" - -namespace latinime { - -const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE; - -// Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12 -const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; - -/* static */ FormatUtils::FORMAT_VERSION FormatUtils::getFormatVersion(const int formatVersion) { - switch (formatVersion) { - case VERSION_2: - case VERSION_201: - AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); - return UNKNOWN_VERSION; - case VERSION_202: - return VERSION_202; - case VERSION_4_ONLY_FOR_TESTING: - return VERSION_4_ONLY_FOR_TESTING; - case VERSION_402: - return VERSION_402; - case VERSION_403: - return VERSION_403; - default: - return UNKNOWN_VERSION; - } -} -/* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion( - const ReadOnlyByteArrayView dictBuffer) { - // The magic number is stored big-endian. - // If the dictionary is less than 4 bytes, we can't even read the magic number, so we don't - // understand this format. - if (dictBuffer.size() < DICTIONARY_MINIMUM_SIZE) { - return UNKNOWN_VERSION; - } - const uint32_t magicNumber = ByteArrayUtils::readUint32(dictBuffer.data(), 0); - switch (magicNumber) { - case MAGIC_NUMBER: - // The layout of the header is as follows: - // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE - // Dictionary format version number (2 bytes) - // Options (2 bytes) - // Header size (4 bytes) : integer, big endian - // Conceptually this converts the hardcoded value of the bytes in the file into - // the symbolic value we use in the code. But we want the constants to be the - // same so we use them for both here. 
- return getFormatVersion(ByteArrayUtils::readUint16(dictBuffer.data(), 4)); - default: - return UNKNOWN_VERSION; - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h deleted file mode 100644 index 1616efcce..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_FORMAT_UTILS_H -#define LATINIME_FORMAT_UTILS_H - -#include - -#include "defines.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -/** - * Methods to handle binary dictionary format version. - */ -class FormatUtils { - public: - enum FORMAT_VERSION { - // These MUST have the same values as the relevant constants in FormatSpec.java. - // TODO: Remove VERSION_2 and VERSION_201 when we: - // * Confirm that old versions of LatinIME download old-format dictionaries - // * We no longer need the corresponding constants on the Java side for dicttool - VERSION_2 = 2, - VERSION_201 = 201, - VERSION_202 = 202, - VERSION_4_ONLY_FOR_TESTING = 399, - VERSION_402 = 402, - VERSION_403 = 403, - UNKNOWN_VERSION = -1 - }; - - // 32 bit magic number is stored at the beginning of the dictionary header to reject - // unsupported or obsolete dictionary formats. 
- static const uint32_t MAGIC_NUMBER; - - static FORMAT_VERSION getFormatVersion(const int formatVersion); - static FORMAT_VERSION detectFormatVersion(const ReadOnlyByteArrayView dictBuffer); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils); - - static const size_t DICTIONARY_MINIMUM_SIZE; -}; -} // namespace latinime -#endif /* LATINIME_FORMAT_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp deleted file mode 100644 index 4a126ff85..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" - -#include -#include -#include -#include -#include -#include - -#include "suggest/policyimpl/dictionary/utils/file_utils.h" - -namespace latinime { - -/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( - const char *const path, const int bufferOffset, const int bufferSize, - const bool isUpdatable) { - const int mmapFd = open(path, O_RDONLY); - if (mmapFd < 0) { - AKLOGE("DICT: Can't open the source. 
path=%s errno=%d", path, errno); - return nullptr; - } - const int pagesize = sysconf(_SC_PAGESIZE); - const int offset = bufferOffset % pagesize; - int alignedOffset = bufferOffset - offset; - int alignedSize = bufferSize + offset; - const int protMode = isUpdatable ? PROT_READ | PROT_WRITE : PROT_READ; - void *const mmappedBuffer = mmap(0, alignedSize, protMode, MAP_PRIVATE, mmapFd, - alignedOffset); - if (mmappedBuffer == MAP_FAILED) { - AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno); - close(mmapFd); - return nullptr; - } - uint8_t *const buffer = static_cast(mmappedBuffer) + offset; - if (!buffer) { - AKLOGE("DICT: buffer is null"); - close(mmapFd); - return nullptr; - } - return MmappedBufferPtr(new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize, - mmapFd, isUpdatable)); -} - -/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( - const char *const path, const bool isUpdatable) { - const int fileSize = FileUtils::getFileSize(path); - if (fileSize == -1) { - return nullptr; - } else if (fileSize == 0) { - return MmappedBufferPtr(new MmappedBuffer(isUpdatable)); - } else { - return openBuffer(path, 0 /* bufferOffset */, fileSize, isUpdatable); - } -} - -/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( - const char *const dirPath, const char *const fileName, const bool isUpdatable) { - const int filePathBufferSize = PATH_MAX + 1 /* terminator */; - char filePath[filePathBufferSize]; - const int filePathLength = snprintf(filePath, filePathBufferSize, "%s%s", dirPath, - fileName); - if (filePathLength >= filePathBufferSize) { - return nullptr; - } - return openBuffer(filePath, isUpdatable); -} - -MmappedBuffer::~MmappedBuffer() { - if (mAlignedSize == 0) { - return; - } - int ret = munmap(mMmappedBuffer, mAlignedSize); - if (ret != 0) { - AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); - } - ret = close(mMmapFd); - if (ret != 0) { - AKLOGE("DICT: Failure in close. 
ret=%d errno=%d", ret, errno); - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h deleted file mode 100644 index e25310373..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_MMAPPED_BUFFER_H -#define LATINIME_MMAPPED_BUFFER_H - -#include -#include - -#include "defines.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -class MmappedBuffer { - public: - typedef std::unique_ptr MmappedBufferPtr; - - static MmappedBufferPtr openBuffer(const char *const path, - const int bufferOffset, const int bufferSize, const bool isUpdatable); - - // Mmap entire file. 
- static MmappedBufferPtr openBuffer(const char *const path, const bool isUpdatable); - - static MmappedBufferPtr openBuffer(const char *const dirPath, const char *const fileName, - const bool isUpdatable); - - ~MmappedBuffer(); - - ReadWriteByteArrayView getReadWriteByteArrayView() const { - return mByteArrayView; - } - - ReadOnlyByteArrayView getReadOnlyByteArrayView() const { - return mByteArrayView.getReadOnlyView(); - } - - AK_FORCE_INLINE bool isUpdatable() const { - return mIsUpdatable; - } - - private: - AK_FORCE_INLINE MmappedBuffer(uint8_t *const buffer, const int bufferSize, - void *const mmappedBuffer, const int alignedSize, const int mmapFd, - const bool isUpdatable) - : mByteArrayView(buffer, bufferSize), mMmappedBuffer(mmappedBuffer), - mAlignedSize(alignedSize), mMmapFd(mmapFd), mIsUpdatable(isUpdatable) {} - - // Empty file. We have to handle an empty file as a valid part of a dictionary. - AK_FORCE_INLINE MmappedBuffer(const bool isUpdatable) - : mByteArrayView(), mMmappedBuffer(nullptr), mAlignedSize(0), - mMmapFd(0), mIsUpdatable(isUpdatable) {} - - DISALLOW_IMPLICIT_CONSTRUCTORS(MmappedBuffer); - - const ReadWriteByteArrayView mByteArrayView; - void *const mMmappedBuffer; - const int mAlignedSize; - const int mMmapFd; - const bool mIsUpdatable; -}; -} -#endif /* LATINIME_MMAPPED_BUFFER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.cpp deleted file mode 100644 index e8fa06942..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" - -namespace latinime { - -const float ProbabilityUtils::PROBABILITY_ENCODING_SCALER = 8.58923700372f; - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h deleted file mode 100644 index 2050af1e9..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_PROBABILITY_UTILS_H -#define LATINIME_PROBABILITY_UTILS_H - -#include -#include - -#include "defines.h" - -namespace latinime { - -// TODO: Quit using bigram probability to indicate the delta. 
-class ProbabilityUtils { - public: - static AK_FORCE_INLINE int backoff(const int unigramProbability) { - return unigramProbability; - // For some reason, applying the backoff weight gives bad results in tests. To apply the - // backoff weight, we divide the probability by 2, which in our storing format means - // decreasing the score by 8. - // TODO: figure out what's wrong with this. - // return unigramProbability > 8 ? - // unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); - } - - static AK_FORCE_INLINE int computeProbabilityForBigram( - const int unigramProbability, const int bigramProbability) { - // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want - // the unigram probability to be the median value of the 17th step from the top. A value of - // 0 for the bigram probability represents the middle of the 16th step from the top, - // while a value of 15 represents the middle of the top step. - // See makedict.BinaryDictEncoder#makeBigramFlags for details. - const float stepSize = static_cast(MAX_PROBABILITY - unigramProbability) - / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); - return unigramProbability - + static_cast(static_cast(bigramProbability + 1) * stepSize); - } - - // Encode probability using the same way as we are doing for main dictionaries. 
- static AK_FORCE_INLINE int encodeRawProbability(const float rawProbability) { - const float probability = static_cast(MAX_PROBABILITY) - + log2f(rawProbability) * PROBABILITY_ENCODING_SCALER; - if (probability < 0.0f) { - return 0; - } - return std::min(static_cast(probability + 0.5f), MAX_PROBABILITY); - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils); - - static const float PROBABILITY_ENCODING_SCALER; -}; -} -#endif /* LATINIME_PROBABILITY_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp deleted file mode 100644 index d336306b9..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/utils/sparse_table.h" - -namespace latinime { - -const int SparseTable::NOT_EXIST = -1; -const int SparseTable::INDEX_SIZE = 4; - -bool SparseTable::contains(const int id) const { - const int readingPos = getPosInIndexTable(id); - if (id < 0 || mIndexTableBuffer->getTailPosition() <= readingPos) { - return false; - } - const int index = mIndexTableBuffer->readUint(INDEX_SIZE, readingPos); - return index != NOT_EXIST; -} - -uint32_t SparseTable::get(const int id) const { - const int indexTableReadingPos = getPosInIndexTable(id); - const int index = mIndexTableBuffer->readUint(INDEX_SIZE, indexTableReadingPos); - const int contentTableReadingPos = getPosInContentTable(id, index); - if (contentTableReadingPos < 0 - || contentTableReadingPos >= mContentTableBuffer->getTailPosition()) { - AKLOGE("contentTableReadingPos(%d) is invalid. id: %d, index: %d", - contentTableReadingPos, id, index); - return NOT_A_DICT_POS; - } - const int contentValue = mContentTableBuffer->readUint(mDataSize, contentTableReadingPos); - return contentValue == NOT_EXIST ? NOT_A_DICT_POS : contentValue; -} - -bool SparseTable::set(const int id, const uint32_t value) { - const int posInIndexTable = getPosInIndexTable(id); - // Extends the index table if needed. - int tailPos = mIndexTableBuffer->getTailPosition(); - while (tailPos <= posInIndexTable) { - if (!mIndexTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, INDEX_SIZE, &tailPos)) { - AKLOGE("cannot extend index table. tailPos: %d to: %d", tailPos, posInIndexTable); - return false; - } - } - if (contains(id)) { - // The entry is already in the content table. - const int index = mIndexTableBuffer->readUint(INDEX_SIZE, posInIndexTable); - if (!mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index))) { - AKLOGE("cannot update value %d. 
pos: %d, tailPos: %d, mDataSize: %d", value, - getPosInContentTable(id, index), mContentTableBuffer->getTailPosition(), - mDataSize); - return false; - } - return true; - } - // The entry is not in the content table. - // Create new entry in the content table. - const int index = getIndexFromContentTablePos(mContentTableBuffer->getTailPosition()); - if (!mIndexTableBuffer->writeUint(index, INDEX_SIZE, posInIndexTable)) { - AKLOGE("cannot write index %d. pos %d", index, posInIndexTable); - return false; - } - // Write a new block that containing the entry to be set. - int writingPos = getPosInContentTable(0 /* id */, index); - for (int i = 0; i < mBlockSize; ++i) { - if (!mContentTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, mDataSize, - &writingPos)) { - AKLOGE("cannot write content table to extend. writingPos: %d, tailPos: %d, " - "mDataSize: %d", writingPos, mContentTableBuffer->getTailPosition(), mDataSize); - return false; - } - } - return mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index)); -} - -int SparseTable::getIndexFromContentTablePos(const int contentTablePos) const { - return contentTablePos / mDataSize / mBlockSize; -} - -int SparseTable::getPosInIndexTable(const int id) const { - return (id / mBlockSize) * INDEX_SIZE; -} - -int SparseTable::getPosInContentTable(const int id, const int index) const { - const int offset = id % mBlockSize; - return (index * mBlockSize + offset) * mDataSize; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h b/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h deleted file mode 100644 index e1a96c6f7..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_SPARSE_TABLE_H -#define LATINIME_SPARSE_TABLE_H - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -// TODO: Support multiple content buffers. -class SparseTable { - public: - SparseTable(BufferWithExtendableBuffer *const indexTableBuffer, - BufferWithExtendableBuffer *const contentTableBuffer, const int blockSize, - const int dataSize) - : mIndexTableBuffer(indexTableBuffer), mContentTableBuffer(contentTableBuffer), - mBlockSize(blockSize), mDataSize(dataSize) {} - - bool contains(const int id) const; - - uint32_t get(const int id) const; - - bool set(const int id, const uint32_t value); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTable); - - int getIndexFromContentTablePos(const int contentTablePos) const; - - int getPosInIndexTable(const int id) const; - - int getPosInContentTable(const int id, const int index) const; - - static const int NOT_EXIST; - static const int INDEX_SIZE; - - BufferWithExtendableBuffer *const mIndexTableBuffer; - BufferWithExtendableBuffer *const mContentTableBuffer; - const int mBlockSize; - const int mDataSize; -}; -} // namespace latinime -#endif /* LATINIME_SPARSE_TABLE_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp deleted file mode 100644 index b7ef2b9bd..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp +++ /dev/null @@ -1,460 +0,0 @@ -/* - * Copyright (C) 2014, 
The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/utils/trie_map.h" - -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" - -namespace latinime { - -const int TrieMap::INVALID_INDEX = -1; -const int TrieMap::FIELD0_SIZE = 4; -const int TrieMap::FIELD1_SIZE = 3; -const int TrieMap::ENTRY_SIZE = FIELD0_SIZE + FIELD1_SIZE; -const uint32_t TrieMap::VALUE_FLAG = 0x400000; -const uint32_t TrieMap::VALUE_MASK = 0x3FFFFF; -const uint32_t TrieMap::INVALID_VALUE_IN_KEY_VALUE_ENTRY = VALUE_MASK; -const uint32_t TrieMap::TERMINAL_LINK_FLAG = 0x800000; -const uint32_t TrieMap::TERMINAL_LINK_MASK = 0x7FFFFF; -const int TrieMap::NUM_OF_BITS_USED_FOR_ONE_LEVEL = 5; -const uint32_t TrieMap::LABEL_MASK = 0x1F; -const int TrieMap::MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL = 1 << NUM_OF_BITS_USED_FOR_ONE_LEVEL; -const int TrieMap::ROOT_BITMAP_ENTRY_INDEX = 0; -const int TrieMap::ROOT_BITMAP_ENTRY_POS = MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL * FIELD0_SIZE; -const TrieMap::Entry TrieMap::EMPTY_BITMAP_ENTRY = TrieMap::Entry(0, 0); -const int TrieMap::TERMINAL_LINKED_ENTRY_COUNT = 2; // Value entry and bitmap entry. 
-const uint64_t TrieMap::MAX_VALUE = - (static_cast(1) << ((FIELD0_SIZE + FIELD1_SIZE) * CHAR_BIT)) - 1; -const int TrieMap::MAX_BUFFER_SIZE = TERMINAL_LINK_MASK * ENTRY_SIZE; - -TrieMap::TrieMap() : mBuffer(MAX_BUFFER_SIZE) { - mBuffer.extend(ROOT_BITMAP_ENTRY_POS); - writeEntry(EMPTY_BITMAP_ENTRY, ROOT_BITMAP_ENTRY_INDEX); -} - -TrieMap::TrieMap(const ReadWriteByteArrayView buffer) - : mBuffer(buffer, BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {} - -void TrieMap::dump(const int from, const int to) const { - AKLOGI("BufSize: %d", mBuffer.getTailPosition()); - for (int i = from; i < to; ++i) { - AKLOGI("Entry[%d]: %x, %x", i, readField0(i), readField1(i)); - } - int unusedRegionSize = 0; - for (int i = 1; i <= MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL; ++i) { - int index = readEmptyTableLink(i); - while (index != ROOT_BITMAP_ENTRY_INDEX) { - index = readField0(index); - unusedRegionSize += i; - } - } - AKLOGI("Unused Size: %d", unusedRegionSize); -} - -int TrieMap::getNextLevelBitmapEntryIndex(const int key, const int bitmapEntryIndex) { - const Entry bitmapEntry = readEntry(bitmapEntryIndex); - const uint32_t unsignedKey = static_cast(key); - const int terminalEntryIndex = getTerminalEntryIndex( - unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntry, 0 /* level */); - if (terminalEntryIndex == INVALID_INDEX) { - // Not found. - return INVALID_INDEX; - } - const Entry terminalEntry = readEntry(terminalEntryIndex); - if (terminalEntry.hasTerminalLink()) { - return terminalEntry.getValueEntryIndex() + 1; - } - // Create a value entry and a bitmap entry. 
- const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT); - if (valueEntryIndex == INVALID_INDEX) { - return INVALID_INDEX; - } - if (!writeEntry(Entry(0, terminalEntry.getValue()), valueEntryIndex)) { - return INVALID_INDEX; - } - if (!writeEntry(EMPTY_BITMAP_ENTRY, valueEntryIndex + 1)) { - return INVALID_INDEX; - } - if (!writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, terminalEntryIndex)) { - return INVALID_INDEX; - } - return valueEntryIndex + 1; -} - -const TrieMap::Result TrieMap::get(const int key, const int bitmapEntryIndex) const { - const uint32_t unsignedKey = static_cast(key); - return getInternal(unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntryIndex, - 0 /* level */); -} - -bool TrieMap::put(const int key, const uint64_t value, const int bitmapEntryIndex) { - if (value > MAX_VALUE) { - return false; - } - const uint32_t unsignedKey = static_cast(key); - return putInternal(unsignedKey, value, getBitShuffledKey(unsignedKey), bitmapEntryIndex, - readEntry(bitmapEntryIndex), 0 /* level */); -} - -bool TrieMap::save(FILE *const file) const { - return DictFileWritingUtils::writeBufferToFileTail(file, &mBuffer); -} - -bool TrieMap::remove(const int key, const int bitmapEntryIndex) { - const Entry bitmapEntry = readEntry(bitmapEntryIndex); - const uint32_t unsignedKey = static_cast(key); - const int terminalEntryIndex = getTerminalEntryIndex( - unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntry, 0 /* level */); - if (terminalEntryIndex == INVALID_INDEX) { - // Not found. 
- return false; - } - const Entry terminalEntry = readEntry(terminalEntryIndex); - if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , terminalEntryIndex)) { - return false; - } - if (terminalEntry.hasTerminalLink()) { - const Entry nextLevelBitmapEntry = readEntry(terminalEntry.getValueEntryIndex() + 1); - if (!freeTable(terminalEntry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) { - return false; - } - if (!removeInner(nextLevelBitmapEntry)){ - return false; - } - } - return true; -} - -/** - * Iterate next entry in a certain level. - * - * @param iterationState the iteration state that will be read and updated in this method. - * @param outKey the output key - * @return Result instance. mIsValid is false when all entries are iterated. - */ -const TrieMap::Result TrieMap::iterateNext(std::vector *const iterationState, - int *const outKey) const { - while (!iterationState->empty()) { - TableIterationState &state = iterationState->back(); - if (state.mTableSize <= state.mCurrentIndex) { - // Move to parent. - iterationState->pop_back(); - } else { - const int entryIndex = state.mTableIndex + state.mCurrentIndex; - state.mCurrentIndex += 1; - const Entry entry = readEntry(entryIndex); - if (entry.isBitmapEntry()) { - // Move to child. - iterationState->emplace_back(popCount(entry.getBitmap()), entry.getTableIndex()); - } else if (entry.isValidTerminalEntry()) { - if (outKey) { - *outKey = entry.getKey(); - } - if (!entry.hasTerminalLink()) { - return Result(entry.getValue(), true, INVALID_INDEX); - } - const int valueEntryIndex = entry.getValueEntryIndex(); - const Entry valueEntry = readEntry(valueEntryIndex); - return Result(valueEntry.getValueOfValueEntry(), true, valueEntryIndex + 1); - } - } - } - // Visited all entries. - return Result(0, false, INVALID_INDEX); -} - -/** - * Shuffle bits of the key in the fixed order. - * - * This method is used as a hash function. This returns different values for different inputs. 
- */ -uint32_t TrieMap::getBitShuffledKey(const uint32_t key) const { - uint32_t shuffledKey = 0; - for (int i = 0; i < 4; ++i) { - const uint32_t keyPiece = (key >> (i * 8)) & 0xFF; - shuffledKey ^= ((keyPiece ^ (keyPiece << 7) ^ (keyPiece << 14) ^ (keyPiece << 21)) - & 0x11111111) << i; - } - return shuffledKey; -} - -bool TrieMap::writeValue(const uint64_t value, const int terminalEntryIndex) { - if (value < VALUE_MASK) { - // Write value into the terminal entry. - return writeField1(value | VALUE_FLAG, terminalEntryIndex); - } - // Create value entry and write value. - const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT); - if (valueEntryIndex == INVALID_INDEX) { - return false; - } - if (!writeEntry(Entry(value >> (FIELD1_SIZE * CHAR_BIT), value), valueEntryIndex)) { - return false; - } - if (!writeEntry(EMPTY_BITMAP_ENTRY, valueEntryIndex + 1)) { - return false; - } - return writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, terminalEntryIndex); -} - -bool TrieMap::updateValue(const Entry &terminalEntry, const uint64_t value, - const int terminalEntryIndex) { - if (!terminalEntry.hasTerminalLink()) { - return writeValue(value, terminalEntryIndex); - } - const int valueEntryIndex = terminalEntry.getValueEntryIndex(); - return writeEntry(Entry(value >> (FIELD1_SIZE * CHAR_BIT), value), valueEntryIndex); -} - -bool TrieMap::freeTable(const int tableIndex, const int entryCount) { - if (!writeField0(readEmptyTableLink(entryCount), tableIndex)) { - return false; - } - return writeEmptyTableLink(tableIndex, entryCount); -} - -/** - * Allocate table with entryCount-entries. Reuse freed table if possible. - */ -int TrieMap::allocateTable(const int entryCount) { - if (entryCount > 0 && entryCount <= MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL) { - const int tableIndex = readEmptyTableLink(entryCount); - if (tableIndex > 0) { - if (!writeEmptyTableLink(readField0(tableIndex), entryCount)) { - return INVALID_INDEX; - } - // Reuse the table. 
- return tableIndex; - } - } - // Allocate memory space at tail position of the buffer. - const int mapIndex = getTailEntryIndex(); - if (!mBuffer.extend(entryCount * ENTRY_SIZE)) { - return INVALID_INDEX; - } - return mapIndex; -} - -int TrieMap::getTerminalEntryIndex(const uint32_t key, const uint32_t hashedKey, - const Entry &bitmapEntry, const int level) const { - const int label = getLabel(hashedKey, level); - if (!exists(bitmapEntry.getBitmap(), label)) { - return INVALID_INDEX; - } - const int entryIndex = bitmapEntry.getTableIndex() + popCount(bitmapEntry.getBitmap(), label); - const Entry entry = readEntry(entryIndex); - if (entry.isBitmapEntry()) { - // Move to the next level. - return getTerminalEntryIndex(key, hashedKey, entry, level + 1); - } - if (!entry.isValidTerminalEntry()) { - return INVALID_INDEX; - } - if (entry.getKey() == key) { - // Terminal entry is found. - return entryIndex; - } - return INVALID_INDEX; -} - -/** - * Get Result corresponding to the key. - * - * @param key the key. - * @param hashedKey the hashed key. - * @param bitmapEntryIndex the index of bitmap entry - * @param level current level - * @return Result instance corresponding to the key. mIsValid indicates whether the key is in the - * map. - */ -const TrieMap::Result TrieMap::getInternal(const uint32_t key, const uint32_t hashedKey, - const int bitmapEntryIndex, const int level) const { - const int terminalEntryIndex = getTerminalEntryIndex(key, hashedKey, - readEntry(bitmapEntryIndex), level); - if (terminalEntryIndex == INVALID_INDEX) { - // Not found. 
- return Result(0, false, INVALID_INDEX); - } - const Entry terminalEntry = readEntry(terminalEntryIndex); - if (!terminalEntry.hasTerminalLink()) { - return Result(terminalEntry.getValue(), true, INVALID_INDEX); - } - const int valueEntryIndex = terminalEntry.getValueEntryIndex(); - const Entry valueEntry = readEntry(valueEntryIndex); - return Result(valueEntry.getValueOfValueEntry(), true, valueEntryIndex + 1); -} - -/** - * Put key to value mapping to the map. - * - * @param key the key. - * @param value the value - * @param hashedKey the hashed key. - * @param bitmapEntryIndex the index of bitmap entry - * @param bitmapEntry the bitmap entry - * @param level current level - * @return whether the key-value has been correctly inserted to the map or not. - */ -bool TrieMap::putInternal(const uint32_t key, const uint64_t value, const uint32_t hashedKey, - const int bitmapEntryIndex, const Entry &bitmapEntry, const int level) { - const int label = getLabel(hashedKey, level); - const uint32_t bitmap = bitmapEntry.getBitmap(); - const int mapIndex = bitmapEntry.getTableIndex(); - if (!exists(bitmap, label)) { - // Current map doesn't contain the label. - return addNewEntryByExpandingTable(key, value, mapIndex, bitmap, bitmapEntryIndex, label); - } - const int entryIndex = mapIndex + popCount(bitmap, label); - const Entry entry = readEntry(entryIndex); - if (entry.isBitmapEntry()) { - // Bitmap entry is found. Go to the next level. - return putInternal(key, value, hashedKey, entryIndex, entry, level + 1); - } - if (!entry.isValidTerminalEntry()) { - // Overwrite invalid terminal entry. - return writeTerminalEntry(key, value, entryIndex); - } - if (entry.getKey() == key) { - // Terminal entry for the key is found. Update the value. - return updateValue(entry, value, entryIndex); - } - // Conflict with the existing key. 
- return addNewEntryByResolvingConflict(key, value, hashedKey, entry, entryIndex, level); -} - -/** - * Resolve a conflict in the current level and add new entry. - * - * @param key the key - * @param value the value - * @param hashedKey the hashed key - * @param conflictedEntry the existing conflicted entry - * @param conflictedEntryIndex the index of existing conflicted entry - * @param level current level - * @return whether the key-value has been correctly inserted to the map or not. - */ -bool TrieMap::addNewEntryByResolvingConflict(const uint32_t key, const uint64_t value, - const uint32_t hashedKey, const Entry &conflictedEntry, const int conflictedEntryIndex, - const int level) { - const int conflictedKeyNextLabel = - getLabel(getBitShuffledKey(conflictedEntry.getKey()), level + 1); - const int nextLabel = getLabel(hashedKey, level + 1); - if (conflictedKeyNextLabel == nextLabel) { - // Conflicted again in the next level. - const int newTableIndex = allocateTable(1 /* entryCount */); - if (newTableIndex == INVALID_INDEX) { - return false; - } - if (!writeEntry(conflictedEntry, newTableIndex)) { - return false; - } - const Entry newBitmapEntry(setExist(0 /* bitmap */, nextLabel), newTableIndex); - if (!writeEntry(newBitmapEntry, conflictedEntryIndex)) { - return false; - } - return putInternal(key, value, hashedKey, conflictedEntryIndex, newBitmapEntry, level + 1); - } - // The conflict has been resolved. Create a table that contains 2 entries. 
- const int newTableIndex = allocateTable(2 /* entryCount */); - if (newTableIndex == INVALID_INDEX) { - return false; - } - if (nextLabel < conflictedKeyNextLabel) { - if (!writeTerminalEntry(key, value, newTableIndex)) { - return false; - } - if (!writeEntry(conflictedEntry, newTableIndex + 1)) { - return false; - } - } else { // nextLabel > conflictedKeyNextLabel - if (!writeEntry(conflictedEntry, newTableIndex)) { - return false; - } - if (!writeTerminalEntry(key, value, newTableIndex + 1)) { - return false; - } - } - const uint32_t updatedBitmap = - setExist(setExist(0 /* bitmap */, nextLabel), conflictedKeyNextLabel); - return writeEntry(Entry(updatedBitmap, newTableIndex), conflictedEntryIndex); -} - -/** - * Add new entry to the existing table. - */ -bool TrieMap::addNewEntryByExpandingTable(const uint32_t key, const uint64_t value, - const int tableIndex, const uint32_t bitmap, const int bitmapEntryIndex, const int label) { - // Current map doesn't contain the label. - const int entryCount = popCount(bitmap); - const int newTableIndex = allocateTable(entryCount + 1); - if (newTableIndex == INVALID_INDEX) { - return false; - } - const int newEntryIndexInTable = popCount(bitmap, label); - // Copy from existing table to the new table. - for (int i = 0; i < entryCount; ++i) { - if (!copyEntry(tableIndex + i, newTableIndex + i + (i >= newEntryIndexInTable ? 1 : 0))) { - return false; - } - } - // Write new terminal entry. - if (!writeTerminalEntry(key, value, newTableIndex + newEntryIndexInTable)) { - return false; - } - // Update bitmap. - if (!writeEntry(Entry(setExist(bitmap, label), newTableIndex), bitmapEntryIndex)) { - return false; - } - if (entryCount > 0) { - return freeTable(tableIndex, entryCount); - } - return true; -} - -bool TrieMap::removeInner(const Entry &bitmapEntry) { - const int tableSize = popCount(bitmapEntry.getBitmap()); - if (tableSize <= 0) { - // The table is empty. No need to remove any entries. 
- return true; - } - for (int i = 0; i < tableSize; ++i) { - const int entryIndex = bitmapEntry.getTableIndex() + i; - const Entry entry = readEntry(entryIndex); - if (entry.isBitmapEntry()) { - // Delete next bitmap entry recursively. - if (!removeInner(entry)) { - return false; - } - } else { - // Invalidate terminal entry just in case. - if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , entryIndex)) { - return false; - } - if (entry.hasTerminalLink()) { - const Entry nextLevelBitmapEntry = readEntry(entry.getValueEntryIndex() + 1); - if (!freeTable(entry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) { - return false; - } - if (!removeInner(nextLevelBitmapEntry)) { - return false; - } - } - } - } - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.h b/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.h deleted file mode 100644 index 00765888b..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.h +++ /dev/null @@ -1,399 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_TRIE_MAP_H -#define LATINIME_TRIE_MAP_H - -#include -#include -#include -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "utils/byte_array_view.h" - -namespace latinime { - -/** - * Trie map derived from Phil Bagwell's Hash Array Mapped Trie. - * key is int and value is uint64_t. - * This supports multiple level map. Terminal entries can have a bitmap for the next level map. - * This doesn't support root map resizing. - */ -class TrieMap { - public: - struct Result { - const uint64_t mValue; - const bool mIsValid; - const int mNextLevelBitmapEntryIndex; - - Result(const uint64_t value, const bool isValid, const int nextLevelBitmapEntryIndex) - : mValue(value), mIsValid(isValid), - mNextLevelBitmapEntryIndex(nextLevelBitmapEntryIndex) {} - }; - - /** - * Struct to record iteration state in a table. - */ - struct TableIterationState { - int mTableSize; - int mTableIndex; - int mCurrentIndex; - - TableIterationState(const int tableSize, const int tableIndex) - : mTableSize(tableSize), mTableIndex(tableIndex), mCurrentIndex(0) {} - }; - - class TrieMapRange; - class TrieMapIterator { - public: - class IterationResult { - public: - IterationResult(const TrieMap *const trieMap, const int key, const uint64_t value, - const int nextLeveBitmapEntryIndex) - : mTrieMap(trieMap), mKey(key), mValue(value), - mNextLevelBitmapEntryIndex(nextLeveBitmapEntryIndex) {} - - const TrieMapRange getEntriesInNextLevel() const { - return TrieMapRange(mTrieMap, mNextLevelBitmapEntryIndex); - } - - bool hasNextLevelMap() const { - return mNextLevelBitmapEntryIndex != INVALID_INDEX; - } - - AK_FORCE_INLINE int key() const { - return mKey; - } - - AK_FORCE_INLINE uint64_t value() const { - return mValue; - } - - AK_FORCE_INLINE int getNextLevelBitmapEntryIndex() const { - return mNextLevelBitmapEntryIndex; - } - - private: - const TrieMap *const mTrieMap; - const int mKey; - const uint64_t 
mValue; - const int mNextLevelBitmapEntryIndex; - }; - - TrieMapIterator(const TrieMap *const trieMap, const int bitmapEntryIndex) - : mTrieMap(trieMap), mStateStack(), mBaseBitmapEntryIndex(bitmapEntryIndex), - mKey(0), mValue(0), mIsValid(false), mNextLevelBitmapEntryIndex(INVALID_INDEX) { - if (!trieMap || mBaseBitmapEntryIndex == INVALID_INDEX) { - return; - } - const Entry bitmapEntry = mTrieMap->readEntry(mBaseBitmapEntryIndex); - mStateStack.emplace_back( - mTrieMap->popCount(bitmapEntry.getBitmap()), bitmapEntry.getTableIndex()); - this->operator++(); - } - - const IterationResult operator*() const { - return IterationResult(mTrieMap, mKey, mValue, mNextLevelBitmapEntryIndex); - } - - bool operator!=(const TrieMapIterator &other) const { - // Caveat: This works only for for loops. - return mIsValid || other.mIsValid; - } - - const TrieMapIterator &operator++() { - const Result result = mTrieMap->iterateNext(&mStateStack, &mKey); - mValue = result.mValue; - mIsValid = result.mIsValid; - mNextLevelBitmapEntryIndex = result.mNextLevelBitmapEntryIndex; - return *this; - } - - private: - DISALLOW_DEFAULT_CONSTRUCTOR(TrieMapIterator); - DISALLOW_ASSIGNMENT_OPERATOR(TrieMapIterator); - - const TrieMap *const mTrieMap; - std::vector mStateStack; - const int mBaseBitmapEntryIndex; - int mKey; - uint64_t mValue; - bool mIsValid; - int mNextLevelBitmapEntryIndex; - }; - - /** - * Class to support iterating entries in TrieMap by range base for loops. 
- */ - class TrieMapRange { - public: - TrieMapRange(const TrieMap *const trieMap, const int bitmapEntryIndex) - : mTrieMap(trieMap), mBaseBitmapEntryIndex(bitmapEntryIndex) {}; - - TrieMapIterator begin() const { - return TrieMapIterator(mTrieMap, mBaseBitmapEntryIndex); - } - - const TrieMapIterator end() const { - return TrieMapIterator(nullptr, INVALID_INDEX); - } - - private: - DISALLOW_DEFAULT_CONSTRUCTOR(TrieMapRange); - DISALLOW_ASSIGNMENT_OPERATOR(TrieMapRange); - - const TrieMap *const mTrieMap; - const int mBaseBitmapEntryIndex; - }; - - static const int INVALID_INDEX; - static const uint64_t MAX_VALUE; - - TrieMap(); - // Construct TrieMap using existing data in the memory region written by save(). - TrieMap(const ReadWriteByteArrayView buffer); - void dump(const int from = 0, const int to = 0) const; - - bool isNearSizeLimit() const { - return mBuffer.isNearSizeLimit(); - } - - int getRootBitmapEntryIndex() const { - return ROOT_BITMAP_ENTRY_INDEX; - } - - // Returns bitmapEntryIndex. Create the next level map if it doesn't exist. 
- int getNextLevelBitmapEntryIndex(const int key) { - return getNextLevelBitmapEntryIndex(key, ROOT_BITMAP_ENTRY_INDEX); - } - - int getNextLevelBitmapEntryIndex(const int key, const int bitmapEntryIndex); - - const Result getRoot(const int key) const { - return get(key, ROOT_BITMAP_ENTRY_INDEX); - } - - const Result get(const int key, const int bitmapEntryIndex) const; - - bool putRoot(const int key, const uint64_t value) { - return put(key, value, ROOT_BITMAP_ENTRY_INDEX); - } - - bool put(const int key, const uint64_t value, const int bitmapEntryIndex); - - const TrieMapRange getEntriesInRootLevel() const { - return getEntriesInSpecifiedLevel(ROOT_BITMAP_ENTRY_INDEX); - } - - const TrieMapRange getEntriesInSpecifiedLevel(const int bitmapEntryIndex) const { - return TrieMapRange(this, bitmapEntryIndex); - } - - bool save(FILE *const file) const; - - bool remove(const int key, const int bitmapEntryIndex); - - private: - DISALLOW_COPY_AND_ASSIGN(TrieMap); - - /** - * Struct represents an entry. - * - * Entry is one of these entry types. All entries are fixed size and have 2 fields FIELD_0 and - * FIELD_1. - * 1. bitmap entry. bitmap entry contains bitmap and the link to hash table. - * FIELD_0(bitmap) FIELD_1(LINK_TO_HASH_TABLE) - * 2. terminal entry. terminal entry contains hashed key and value or terminal link. terminal - * entry have terminal link when the value is not fit to FIELD_1 or there is a next level map - * for the key. - * FIELD_0(hashed key) (FIELD_1(VALUE_FLAG VALUE) | FIELD_1(TERMINAL_LINK_FLAG TERMINAL_LINK)) - * 3. value entry. value entry represents a value. Upper order bytes are stored in FIELD_0 and - * lower order bytes are stored in FIELD_1. 
- * FIELD_0(value (upper order bytes)) FIELD_1(value (lower order bytes)) - */ - struct Entry { - const uint32_t mData0; - const uint32_t mData1; - - Entry(const uint32_t data0, const uint32_t data1) : mData0(data0), mData1(data1) {} - - AK_FORCE_INLINE bool isBitmapEntry() const { - return (mData1 & VALUE_FLAG) == 0 && (mData1 & TERMINAL_LINK_FLAG) == 0; - } - - AK_FORCE_INLINE bool hasTerminalLink() const { - return (mData1 & TERMINAL_LINK_FLAG) != 0; - } - - // For terminal entry. - AK_FORCE_INLINE uint32_t getKey() const { - return mData0; - } - - // For terminal entry. - AK_FORCE_INLINE uint32_t getValue() const { - return mData1 & VALUE_MASK; - } - - // For terminal entry. - AK_FORCE_INLINE bool isValidTerminalEntry() const { - return hasTerminalLink() || ((mData1 & VALUE_MASK) != INVALID_VALUE_IN_KEY_VALUE_ENTRY); - } - - // For terminal entry. - AK_FORCE_INLINE uint32_t getValueEntryIndex() const { - return mData1 & TERMINAL_LINK_MASK; - } - - // For bitmap entry. - AK_FORCE_INLINE uint32_t getBitmap() const { - return mData0; - } - - // For bitmap entry. - AK_FORCE_INLINE int getTableIndex() const { - return static_cast(mData1); - } - - // For value entry. 
- AK_FORCE_INLINE uint64_t getValueOfValueEntry() const { - return ((static_cast(mData0) << (FIELD1_SIZE * CHAR_BIT)) ^ mData1); - } - }; - - BufferWithExtendableBuffer mBuffer; - - static const int FIELD0_SIZE; - static const int FIELD1_SIZE; - static const int ENTRY_SIZE; - static const uint32_t VALUE_FLAG; - static const uint32_t VALUE_MASK; - static const uint32_t INVALID_VALUE_IN_KEY_VALUE_ENTRY; - static const uint32_t TERMINAL_LINK_FLAG; - static const uint32_t TERMINAL_LINK_MASK; - static const int NUM_OF_BITS_USED_FOR_ONE_LEVEL; - static const uint32_t LABEL_MASK; - static const int MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL; - static const int ROOT_BITMAP_ENTRY_INDEX; - static const int ROOT_BITMAP_ENTRY_POS; - static const Entry EMPTY_BITMAP_ENTRY; - static const int TERMINAL_LINKED_ENTRY_COUNT; - static const int MAX_BUFFER_SIZE; - - uint32_t getBitShuffledKey(const uint32_t key) const; - bool writeValue(const uint64_t value, const int terminalEntryIndex); - bool updateValue(const Entry &terminalEntry, const uint64_t value, - const int terminalEntryIndex); - bool freeTable(const int tableIndex, const int entryCount); - int allocateTable(const int entryCount); - int getTerminalEntryIndex(const uint32_t key, const uint32_t hashedKey, - const Entry &bitmapEntry, const int level) const; - const Result getInternal(const uint32_t key, const uint32_t hashedKey, - const int bitmapEntryIndex, const int level) const; - bool putInternal(const uint32_t key, const uint64_t value, const uint32_t hashedKey, - const int bitmapEntryIndex, const Entry &bitmapEntry, const int level); - bool addNewEntryByResolvingConflict(const uint32_t key, const uint64_t value, - const uint32_t hashedKey, const Entry &conflictedEntry, const int conflictedEntryIndex, - const int level); - bool addNewEntryByExpandingTable(const uint32_t key, const uint64_t value, - const int tableIndex, const uint32_t bitmap, const int bitmapEntryIndex, - const int label); - const Result iterateNext(std::vector 
*const iterationState, - int *const outKey) const; - - AK_FORCE_INLINE const Entry readEntry(const int entryIndex) const { - return Entry(readField0(entryIndex), readField1(entryIndex)); - } - - // Returns whether an entry for the index is existing by testing if the index-th bit in the - // bitmap is set or not. - AK_FORCE_INLINE bool exists(const uint32_t bitmap, const int index) const { - return (bitmap & (1 << index)) != 0; - } - - // Set index-th bit in the bitmap. - AK_FORCE_INLINE uint32_t setExist(const uint32_t bitmap, const int index) const { - return bitmap | (1 << index); - } - - // Count set bits before index in the bitmap. - AK_FORCE_INLINE int popCount(const uint32_t bitmap, const int index) const { - return popCount(bitmap & ((1 << index) - 1)); - } - - // Count set bits in the bitmap. - AK_FORCE_INLINE int popCount(const uint32_t bitmap) const { - return __builtin_popcount(bitmap); - // int v = bitmap - ((bitmap >> 1) & 0x55555555); - // v = (v & 0x33333333) + ((v >> 2) & 0x33333333); - // return (((v + (v >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; - } - - AK_FORCE_INLINE int getLabel(const uint32_t hashedKey, const int level) const { - return (hashedKey >> (level * NUM_OF_BITS_USED_FOR_ONE_LEVEL)) & LABEL_MASK; - } - - AK_FORCE_INLINE uint32_t readField0(const int entryIndex) const { - return mBuffer.readUint(FIELD0_SIZE, ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE); - } - - AK_FORCE_INLINE uint32_t readField1(const int entryIndex) const { - return mBuffer.readUint(FIELD1_SIZE, - ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE + FIELD0_SIZE); - } - - AK_FORCE_INLINE int readEmptyTableLink(const int entryCount) const { - return mBuffer.readUint(FIELD1_SIZE, (entryCount - 1) * FIELD1_SIZE); - } - - AK_FORCE_INLINE bool writeEmptyTableLink(const int tableIndex, const int entryCount) { - return mBuffer.writeUint(tableIndex, FIELD1_SIZE, (entryCount - 1) * FIELD1_SIZE); - } - - AK_FORCE_INLINE bool writeField0(const uint32_t data, const int 
entryIndex) { - return mBuffer.writeUint(data, FIELD0_SIZE, - ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE); - } - - AK_FORCE_INLINE bool writeField1(const uint32_t data, const int entryIndex) { - return mBuffer.writeUint(data, FIELD1_SIZE, - ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE + FIELD0_SIZE); - } - - AK_FORCE_INLINE bool writeEntry(const Entry &entry, const int entryIndex) { - return writeField0(entry.mData0, entryIndex) && writeField1(entry.mData1, entryIndex); - } - - AK_FORCE_INLINE bool writeTerminalEntry(const uint32_t key, const uint64_t value, - const int entryIndex) { - return writeField0(key, entryIndex) && writeValue(value, entryIndex); - } - - AK_FORCE_INLINE bool copyEntry(const int originalEntryIndex, const int newEntryIndex) { - return writeEntry(readEntry(originalEntryIndex), newEntryIndex); - } - - AK_FORCE_INLINE int getTailEntryIndex() const { - return (mBuffer.getTailPosition() - ROOT_BITMAP_ENTRY_POS) / ENTRY_SIZE; - } - - bool removeInner(const Entry &bitmapEntry); -}; - -} // namespace latinime -#endif /* LATINIME_TRIE_MAP_H */ diff --git a/native/jni/src/utils/jni_data_utils.h b/native/jni/src/utils/jni_data_utils.h index 01295859c..8024e34c4 100644 --- a/native/jni/src/utils/jni_data_utils.h +++ b/native/jni/src/utils/jni_data_utils.h @@ -20,11 +20,11 @@ #include #include "defines.h" +#include "dictionary/header/header_read_write_utils.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/property/ngram_context.h" +#include "dictionary/property/word_property.h" #include "jni.h" -#include "suggest/core/dictionary/property/word_property.h" -#include "suggest/core/session/ngram_context.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" #include "utils/char_utils.h" namespace latinime { diff --git a/native/jni/tests/dictionary/header/header_read_write_utils_test.cpp 
b/native/jni/tests/dictionary/header/header_read_write_utils_test.cpp new file mode 100644 index 000000000..eab5d6575 --- /dev/null +++ b/native/jni/tests/dictionary/header/header_read_write_utils_test.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/header/header_read_write_utils.h" + +#include + +#include +#include + +#include "dictionary/interface/dictionary_header_structure_policy.h" + +namespace latinime { +namespace { + +TEST(HeaderReadWriteUtilsTest, TestInsertCharactersIntoVector) { + DictionaryHeaderStructurePolicy::AttributeMap::key_type vector; + + HeaderReadWriteUtils::insertCharactersIntoVector("", &vector); + EXPECT_TRUE(vector.empty()); + + static const char *str = "abc-xyz!?"; + HeaderReadWriteUtils::insertCharactersIntoVector(str, &vector); + EXPECT_EQ(strlen(str) , vector.size()); + for (size_t i = 0; i < vector.size(); ++i) { + EXPECT_EQ(str[i], vector[i]); + } +} + +TEST(HeaderReadWriteUtilsTest, TestAttributeMapForInt) { + DictionaryHeaderStructurePolicy::AttributeMap attributeMap; + + // Returns default value if not exists. 
+ EXPECT_EQ(-1, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "", -1)); + EXPECT_EQ(100, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abc", 100)); + + HeaderReadWriteUtils::setIntAttribute(&attributeMap, "abc", 10); + EXPECT_EQ(10, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abc", 100)); + HeaderReadWriteUtils::setIntAttribute(&attributeMap, "abc", 20); + EXPECT_EQ(20, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abc", 100)); + HeaderReadWriteUtils::setIntAttribute(&attributeMap, "abcd", 30); + EXPECT_EQ(30, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abcd", 100)); + EXPECT_EQ(20, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abc", 100)); +} + +TEST(HeaderReadWriteUtilsTest, TestAttributeMapCodeForPoints) { + DictionaryHeaderStructurePolicy::AttributeMap attributeMap; + + // Returns empty vector if not exists. + EXPECT_TRUE(HeaderReadWriteUtils::readCodePointVectorAttributeValue(&attributeMap, "").empty()); + EXPECT_TRUE(HeaderReadWriteUtils::readCodePointVectorAttributeValue( + &attributeMap, "abc").empty()); + + HeaderReadWriteUtils::setCodePointVectorAttribute(&attributeMap, "abc", {}); + EXPECT_TRUE(HeaderReadWriteUtils::readCodePointVectorAttributeValue( + &attributeMap, "abc").empty()); + + const std::vector codePoints = { 0x0, 0x20, 0x1F, 0x100000 }; + HeaderReadWriteUtils::setCodePointVectorAttribute(&attributeMap, "abc", codePoints); + EXPECT_EQ(codePoints, HeaderReadWriteUtils::readCodePointVectorAttributeValue( + &attributeMap, "abc")); +} + +} // namespace +} // namespace latinime diff --git a/native/jni/tests/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp b/native/jni/tests/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp new file mode 100644 index 000000000..2e3047eda --- /dev/null +++ b/native/jni/tests/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp @@ 
-0,0 +1,60 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/content/language_model_dict_content_global_counters.h" + +#include + +#include "dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { +namespace { + +TEST(LanguageModelDictContentGlobalCountersTest, TestUpdateMaxValueOfCounters) { + LanguageModelDictContentGlobalCounters globalCounters; + + EXPECT_FALSE(globalCounters.needsToHalveCounters()); + globalCounters.updateMaxValueOfCounters(10); + EXPECT_FALSE(globalCounters.needsToHalveCounters()); + const int count = (1 << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) - 1; + globalCounters.updateMaxValueOfCounters(count); + EXPECT_TRUE(globalCounters.needsToHalveCounters()); + globalCounters.halveCounters(); + EXPECT_FALSE(globalCounters.needsToHalveCounters()); +} + +TEST(LanguageModelDictContentGlobalCountersTest, TestIncrementTotalCount) { + LanguageModelDictContentGlobalCounters globalCounters; + + EXPECT_EQ(0, globalCounters.getTotalCount()); + globalCounters.incrementTotalCount(); + EXPECT_EQ(1, globalCounters.getTotalCount()); + for (int i = 1; i < 50; ++i) { + globalCounters.incrementTotalCount(); + } + EXPECT_EQ(50, globalCounters.getTotalCount()); + globalCounters.halveCounters(); + EXPECT_EQ(25, globalCounters.getTotalCount()); + globalCounters.halveCounters(); + EXPECT_EQ(12, globalCounters.getTotalCount()); + for 
(int i = 0; i < 4; ++i) { + globalCounters.halveCounters(); + } + EXPECT_EQ(0, globalCounters.getTotalCount()); +} + +} // namespace +} // namespace latinime diff --git a/native/jni/tests/dictionary/structure/v4/content/language_model_dict_content_test.cpp b/native/jni/tests/dictionary/structure/v4/content/language_model_dict_content_test.cpp new file mode 100644 index 000000000..ca8626e2f --- /dev/null +++ b/native/jni/tests/dictionary/structure/v4/content/language_model_dict_content_test.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/content/language_model_dict_content.h" + +#include + +#include +#include + +#include "utils/int_array_view.h" + +namespace latinime { +namespace { + +TEST(LanguageModelDictContentTest, TestUnigramProbability) { + LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */); + + const int flag = 0xF0; + const int probability = 10; + const int wordId = 100; + const ProbabilityEntry probabilityEntry(flag, probability); + languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry); + const ProbabilityEntry entry = + languageModelDictContent.getProbabilityEntry(wordId); + EXPECT_EQ(flag, entry.getFlags()); + EXPECT_EQ(probability, entry.getProbability()); + + // Remove + EXPECT_TRUE(languageModelDictContent.removeProbabilityEntry(wordId)); + EXPECT_FALSE(languageModelDictContent.getProbabilityEntry(wordId).isValid()); + EXPECT_FALSE(languageModelDictContent.removeProbabilityEntry(wordId)); + EXPECT_TRUE(languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry)); + EXPECT_TRUE(languageModelDictContent.getProbabilityEntry(wordId).isValid()); +} + +TEST(LanguageModelDictContentTest, TestUnigramProbabilityWithHistoricalInfo) { + LanguageModelDictContent languageModelDictContent(true /* useHistoricalInfo */); + + const int flag = 0xF0; + const int timestamp = 0x3FFFFFFF; + const int count = 10; + const int wordId = 100; + const HistoricalInfo historicalInfo(timestamp, 0 /* level */, count); + const ProbabilityEntry probabilityEntry(flag, &historicalInfo); + languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry); + const ProbabilityEntry entry = languageModelDictContent.getProbabilityEntry(wordId); + EXPECT_EQ(flag, entry.getFlags()); + EXPECT_EQ(timestamp, entry.getHistoricalInfo()->getTimestamp()); + EXPECT_EQ(count, entry.getHistoricalInfo()->getCount()); + + // Remove + EXPECT_TRUE(languageModelDictContent.removeProbabilityEntry(wordId)); + 
EXPECT_FALSE(languageModelDictContent.getProbabilityEntry(wordId).isValid()); + EXPECT_FALSE(languageModelDictContent.removeProbabilityEntry(wordId)); + EXPECT_TRUE(languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry)); + EXPECT_TRUE(languageModelDictContent.removeProbabilityEntry(wordId)); +} + +TEST(LanguageModelDictContentTest, TestIterateProbabilityEntry) { + LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */); + + const ProbabilityEntry originalEntry(0xFC, 100); + + const int wordIds[] = { 1, 2, 3, 4, 5 }; + for (const int wordId : wordIds) { + languageModelDictContent.setProbabilityEntry(wordId, &originalEntry); + } + std::unordered_set wordIdSet(std::begin(wordIds), std::end(wordIds)); + for (const auto entry : languageModelDictContent.getProbabilityEntries(WordIdArrayView())) { + EXPECT_EQ(originalEntry.getFlags(), entry.getProbabilityEntry().getFlags()); + EXPECT_EQ(originalEntry.getProbability(), entry.getProbabilityEntry().getProbability()); + wordIdSet.erase(entry.getWordId()); + } + EXPECT_TRUE(wordIdSet.empty()); +} + +TEST(LanguageModelDictContentTest, TestGetWordProbability) { + LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */); + + const int flag = 0xFF; + const int probability = 10; + const int bigramProbability = 20; + const int trigramProbability = 30; + const int wordId = 100; + const std::array prevWordIdArray = {{ 1, 2 }}; + const WordIdArrayView prevWordIds = WordIdArrayView::fromArray(prevWordIdArray); + + const ProbabilityEntry probabilityEntry(flag, probability); + languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry); + const ProbabilityEntry bigramProbabilityEntry(flag, bigramProbability); + languageModelDictContent.setProbabilityEntry(prevWordIds[0], &probabilityEntry); + languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(1), wordId, + &bigramProbabilityEntry); + EXPECT_EQ(bigramProbability, 
languageModelDictContent.getWordAttributes(prevWordIds, wordId, + false /* mustMatchAllPrevWords */, nullptr /* headerPolicy */).getProbability()); + const ProbabilityEntry trigramProbabilityEntry(flag, trigramProbability); + languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(1), + prevWordIds[1], &probabilityEntry); + languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(2), wordId, + &trigramProbabilityEntry); + EXPECT_EQ(trigramProbability, languageModelDictContent.getWordAttributes(prevWordIds, wordId, + false /* mustMatchAllPrevWords */, nullptr /* headerPolicy */).getProbability()); +} + +} // namespace +} // namespace latinime diff --git a/native/jni/tests/dictionary/structure/v4/content/probability_entry_test.cpp b/native/jni/tests/dictionary/structure/v4/content/probability_entry_test.cpp new file mode 100644 index 000000000..ba81671b5 --- /dev/null +++ b/native/jni/tests/dictionary/structure/v4/content/probability_entry_test.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/content/probability_entry.h" + +#include + +#include "defines.h" + +namespace latinime { +namespace { + +TEST(ProbabilityEntryTest, TestEncodeDecode) { + const int flag = 0xFF; + const int probability = 10; + + const ProbabilityEntry entry(flag, probability); + const uint64_t encodedEntry = entry.encode(false /* hasHistoricalInfo */); + const ProbabilityEntry decodedEntry = + ProbabilityEntry::decode(encodedEntry, false /* hasHistoricalInfo */); + EXPECT_EQ(0xFF0Aull, encodedEntry); + EXPECT_EQ(flag, decodedEntry.getFlags()); + EXPECT_EQ(probability, decodedEntry.getProbability()); +} + +TEST(ProbabilityEntryTest, TestEncodeDecodeWithHistoricalInfo) { + const int flag = 0xF0; + const int timestamp = 0x3FFFFFFF; + const int count = 0xABCD; + + const HistoricalInfo historicalInfo(timestamp, 0 /* level */, count); + const ProbabilityEntry entry(flag, &historicalInfo); + + const uint64_t encodedEntry = entry.encode(true /* hasHistoricalInfo */); + EXPECT_EQ(0xF03FFFFFFFABCDull, encodedEntry); + const ProbabilityEntry decodedEntry = + ProbabilityEntry::decode(encodedEntry, true /* hasHistoricalInfo */); + + EXPECT_EQ(flag, decodedEntry.getFlags()); + EXPECT_EQ(timestamp, decodedEntry.getHistoricalInfo()->getTimestamp()); + EXPECT_EQ(count, decodedEntry.getHistoricalInfo()->getCount()); +} + +} // namespace +} // namespace latinime diff --git a/native/jni/tests/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp b/native/jni/tests/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp new file mode 100644 index 000000000..4f23889ca --- /dev/null +++ b/native/jni/tests/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" + +#include + +#include + +#include "defines.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { +namespace { + +TEST(TerminalPositionLookupTableTest, TestGetFromEmptyTable) { + TerminalPositionLookupTable lookupTable; + + EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition(0)); + EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition(-1)); + EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition( + Ver4DictConstants::NOT_A_TERMINAL_ID)); +} + +TEST(TerminalPositionLookupTableTest, TestSetAndGet) { + TerminalPositionLookupTable lookupTable; + + EXPECT_TRUE(lookupTable.setTerminalPtNodePosition(10, 100)); + EXPECT_EQ(100, lookupTable.getTerminalPtNodePosition(10)); + EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition(9)); + EXPECT_TRUE(lookupTable.setTerminalPtNodePosition(9, 200)); + EXPECT_EQ(200, lookupTable.getTerminalPtNodePosition(9)); + EXPECT_TRUE(lookupTable.setTerminalPtNodePosition(10, 300)); + EXPECT_EQ(300, lookupTable.getTerminalPtNodePosition(10)); + EXPECT_FALSE(lookupTable.setTerminalPtNodePosition(-1, 400)); + EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition(-1)); + EXPECT_FALSE(lookupTable.setTerminalPtNodePosition(Ver4DictConstants::NOT_A_TERMINAL_ID, 500)); + EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition( + Ver4DictConstants::NOT_A_TERMINAL_ID)); +} + +TEST(TerminalPositionLookupTableTest, TestGC) { + TerminalPositionLookupTable 
lookupTable; + + const std::vector terminalIds = { 10, 20, 30 }; + const std::vector terminalPositions = { 100, 200, 300 }; + + for (size_t i = 0; i < terminalIds.size(); ++i) { + EXPECT_TRUE(lookupTable.setTerminalPtNodePosition(terminalIds[i], terminalPositions[i])); + } + + TerminalPositionLookupTable::TerminalIdMap terminalIdMap; + EXPECT_TRUE(lookupTable.runGCTerminalIds(&terminalIdMap)); + + for (size_t i = 0; i < terminalIds.size(); ++i) { + EXPECT_EQ(static_cast(i), terminalIdMap[terminalIds[i]]) + << "Terminal id (" << terminalIds[i] << ") should be changed to " << i; + EXPECT_EQ(terminalPositions[i], lookupTable.getTerminalPtNodePosition(i)); + } +} + +} // namespace +} // namespace latinime diff --git a/native/jni/tests/dictionary/utils/bloom_filter_test.cpp b/native/jni/tests/dictionary/utils/bloom_filter_test.cpp new file mode 100644 index 000000000..bcc88438c --- /dev/null +++ b/native/jni/tests/dictionary/utils/bloom_filter_test.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/bloom_filter.h" + +#include + +#include +#include +#include +#include +#include +#include + +namespace latinime { +namespace { + +TEST(BloomFilterTest, TestFilter) { + static const int TEST_RANDOM_DATA_MAX = 65536; + static const int ELEMENT_COUNT = 1000; + std::vector elements; + + // Initialize data set with random integers. 
+ { + // Use the uniform integer distribution [0, TEST_RANDOM_DATA_MAX]. + std::uniform_int_distribution distribution(0, TEST_RANDOM_DATA_MAX); + auto randomNumberGenerator = std::bind(distribution, std::mt19937()); + for (int i = 0; i < ELEMENT_COUNT; ++i) { + elements.push_back(randomNumberGenerator()); + } + } + + // Make sure BloomFilter contains nothing by default. + BloomFilter bloomFilter; + for (const int elem : elements) { + ASSERT_FALSE(bloomFilter.isInFilter(elem)); + } + + // Copy some of the test vector into bloom filter. + std::unordered_set elementsThatHaveBeenSetInFilter; + { + // Use the uniform integer distribution [0, 1]. + std::uniform_int_distribution distribution(0, 1); + auto randomBitGenerator = std::bind(distribution, std::mt19937()); + for (const int elem : elements) { + if (randomBitGenerator() == 0) { + bloomFilter.setInFilter(elem); + elementsThatHaveBeenSetInFilter.insert(elem); + } + } + } + + for (const int elem : elements) { + const bool existsInFilter = bloomFilter.isInFilter(elem); + const bool hasBeenSetInFilter = + elementsThatHaveBeenSetInFilter.find(elem) != elementsThatHaveBeenSetInFilter.end(); + if (hasBeenSetInFilter) { + EXPECT_TRUE(existsInFilter) << "elem: " << elem; + } + if (!existsInFilter) { + EXPECT_FALSE(hasBeenSetInFilter) << "elem: " << elem; + } + } +} + +} // namespace +} // namespace latinime diff --git a/native/jni/tests/dictionary/utils/buffer_with_extendable_buffer_test.cpp b/native/jni/tests/dictionary/utils/buffer_with_extendable_buffer_test.cpp new file mode 100644 index 000000000..25878910b --- /dev/null +++ b/native/jni/tests/dictionary/utils/buffer_with_extendable_buffer_test.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +#include + +namespace latinime { +namespace { + +const int DEFAULT_MAX_BUFFER_SIZE = 1024; + +TEST(BufferWithExtendablebufferTest, TestWriteAndRead) { + BufferWithExtendableBuffer buffer(DEFAULT_MAX_BUFFER_SIZE); + int pos = 0; + // 1 byte + const uint32_t data_1 = 0xFF; + EXPECT_TRUE(buffer.writeUint(data_1, 1 /* size */, pos)); + EXPECT_EQ(data_1, buffer.readUint(1, pos)); + pos += 1; + // 2 byte + const uint32_t data_2 = 0xFFFF; + EXPECT_TRUE(buffer.writeUint(data_2, 2 /* size */, pos)); + EXPECT_EQ(data_2, buffer.readUint(2, pos)); + pos += 2; + // 3 byte + const uint32_t data_3 = 0xFFFFFF; + EXPECT_TRUE(buffer.writeUint(data_3, 3 /* size */, pos)); + EXPECT_EQ(data_3, buffer.readUint(3, pos)); + pos += 3; + // 4 byte + const uint32_t data_4 = 0xFFFFFFFF; + EXPECT_TRUE(buffer.writeUint(data_4, 4 /* size */, pos)); + EXPECT_EQ(data_4, buffer.readUint(4, pos)); +} + +TEST(BufferWithExtendablebufferTest, TestExtend) { + BufferWithExtendableBuffer buffer(DEFAULT_MAX_BUFFER_SIZE); + EXPECT_EQ(0, buffer.getTailPosition()); + EXPECT_TRUE(buffer.writeUint(0xFF /* data */, 4 /* size */, 0 /* pos */)); + EXPECT_EQ(4, buffer.getTailPosition()); + EXPECT_TRUE(buffer.extend(8 /* size */)); + EXPECT_EQ(12, buffer.getTailPosition()); + EXPECT_TRUE(buffer.writeUint(0xFFFF /* data */, 4 /* size */, 8 /* pos */)); + EXPECT_TRUE(buffer.writeUint(0xFF /* data */, 4 /* size */, 0 /* pos */)); +} + +TEST(BufferWithExtendablebufferTest, TestCopy) { + BufferWithExtendableBuffer 
buffer(DEFAULT_MAX_BUFFER_SIZE); + EXPECT_TRUE(buffer.writeUint(0xFF /* data */, 4 /* size */, 0 /* pos */)); + EXPECT_TRUE(buffer.writeUint(0xFFFF /* data */, 4 /* size */, 4 /* pos */)); + BufferWithExtendableBuffer targetBuffer(DEFAULT_MAX_BUFFER_SIZE); + EXPECT_TRUE(targetBuffer.copy(&buffer)); + EXPECT_EQ(0xFFu, targetBuffer.readUint(4 /* size */, 0 /* pos */)); + EXPECT_EQ(0xFFFFu, targetBuffer.readUint(4 /* size */, 4 /* pos */)); +} + +TEST(BufferWithExtendablebufferTest, TestSizeLimit) { + BufferWithExtendableBuffer emptyBuffer(0 /* maxAdditionalBufferSize */); + EXPECT_FALSE(emptyBuffer.writeUint(0 /* data */, 1 /* size */, 0 /* pos */)); + EXPECT_FALSE(emptyBuffer.extend(1 /* size */)); + + BufferWithExtendableBuffer smallBuffer(4 /* maxAdditionalBufferSize */); + EXPECT_TRUE(smallBuffer.writeUint(0 /* data */, 4 /* size */, 0 /* pos */)); + EXPECT_FALSE(smallBuffer.writeUint(0 /* data */, 1 /* size */, 4 /* pos */)); + + EXPECT_TRUE(smallBuffer.copy(&emptyBuffer)); + EXPECT_FALSE(emptyBuffer.copy(&smallBuffer)); + + BufferWithExtendableBuffer buffer(DEFAULT_MAX_BUFFER_SIZE); + EXPECT_FALSE(buffer.isNearSizeLimit()); + int pos = 0; + while (!buffer.isNearSizeLimit()) { + EXPECT_TRUE(buffer.writeUintAndAdvancePosition(0 /* data */, 4 /* size */, &pos)); + } + EXPECT_GT(pos, 0); + EXPECT_LE(pos, DEFAULT_MAX_BUFFER_SIZE); +} + +} // namespace +} // namespace latinime diff --git a/native/jni/tests/dictionary/utils/byte_array_utils_test.cpp b/native/jni/tests/dictionary/utils/byte_array_utils_test.cpp new file mode 100644 index 000000000..07257530b --- /dev/null +++ b/native/jni/tests/dictionary/utils/byte_array_utils_test.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/byte_array_utils.h" + +#include + +#include + +namespace latinime { +namespace { + +TEST(ByteArrayUtilsTest, TestReadCodePointTable) { + const int codePointTable[] = { 0x6f, 0x6b }; + const uint8_t buffer[] = { 0x20u, 0x21u, 0x00u, 0x01u, 0x00u }; + int pos = 0; + // Expect the first entry of codePointTable + EXPECT_EQ(0x6f, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos)); + // Expect the second entry of codePointTable + EXPECT_EQ(0x6b, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos)); + // Expect the original code point from buffer[2] to buffer[4], 0x100 + // It isn't picked from the codePointTable, since it exceeds the range of the codePointTable. 
+ EXPECT_EQ(0x100, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos)); +} + +TEST(ByteArrayUtilsTest, TestReadInt) { + const uint8_t buffer[] = { 0x1u, 0x8Au, 0x0u, 0xAAu }; + + EXPECT_EQ(0x01u, ByteArrayUtils::readUint8(buffer, 0)); + EXPECT_EQ(0x8Au, ByteArrayUtils::readUint8(buffer, 1)); + EXPECT_EQ(0x0u, ByteArrayUtils::readUint8(buffer, 2)); + EXPECT_EQ(0xAAu, ByteArrayUtils::readUint8(buffer, 3)); + + EXPECT_EQ(0x018Au, ByteArrayUtils::readUint16(buffer, 0)); + EXPECT_EQ(0x8A00u, ByteArrayUtils::readUint16(buffer, 1)); + EXPECT_EQ(0xAAu, ByteArrayUtils::readUint16(buffer, 2)); + + EXPECT_EQ(0x18A00AAu, ByteArrayUtils::readUint32(buffer, 0)); + + int pos = 0; + EXPECT_EQ(0x18A00, ByteArrayUtils::readSint24AndAdvancePosition(buffer, &pos)); + pos = 1; + EXPECT_EQ(-0xA00AA, ByteArrayUtils::readSint24AndAdvancePosition(buffer, &pos)); +} + +TEST(ByteArrayUtilsTest, TestWriteAndReadInt) { + uint8_t buffer[4]; + + int pos = 0; + const uint8_t data_1B = 0xC8; + ByteArrayUtils::writeUintAndAdvancePosition(buffer, data_1B, 1, &pos); + EXPECT_EQ(data_1B, ByteArrayUtils::readUint(buffer, 1, 0)); + + pos = 0; + const uint32_t data_4B = 0xABCD1234; + ByteArrayUtils::writeUintAndAdvancePosition(buffer, data_4B, 4, &pos); + EXPECT_EQ(data_4B, ByteArrayUtils::readUint(buffer, 4, 0)); +} + +TEST(ByteArrayUtilsTest, TestReadCodePoint) { + const uint8_t buffer[] = { 0x10, 0xFF, 0x00u, 0x20u, 0x41u, 0x1Fu, 0x60 }; + + EXPECT_EQ(0x10FF00, ByteArrayUtils::readCodePoint(buffer, 0)); + EXPECT_EQ(0x20, ByteArrayUtils::readCodePoint(buffer, 3)); + EXPECT_EQ(0x41, ByteArrayUtils::readCodePoint(buffer, 4)); + EXPECT_EQ(NOT_A_CODE_POINT, ByteArrayUtils::readCodePoint(buffer, 5)); + + int pos = 0; + int codePointArray[3]; + EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH, nullptr, + codePointArray, &pos)); + EXPECT_EQ(0x10FF00, codePointArray[0]); + EXPECT_EQ(0x20, codePointArray[1]); + EXPECT_EQ(0x41, codePointArray[2]); + 
EXPECT_EQ(0x60, ByteArrayUtils::readCodePoint(buffer, pos)); +} + +TEST(ByteArrayUtilsTest, TestWriteAndReadCodePoint) { + uint8_t buffer[10]; + + const int codePointArray[] = { 0x10FF00, 0x20, 0x41 }; + int pos = 0; + ByteArrayUtils::writeCodePointsAndAdvancePosition(buffer, codePointArray, 3, + true /* writesTerminator */, &pos); + EXPECT_EQ(0x10FF00, ByteArrayUtils::readCodePoint(buffer, 0)); + EXPECT_EQ(0x20, ByteArrayUtils::readCodePoint(buffer, 3)); + EXPECT_EQ(0x41, ByteArrayUtils::readCodePoint(buffer, 4)); + EXPECT_EQ(NOT_A_CODE_POINT, ByteArrayUtils::readCodePoint(buffer, 5)); +} + +} // namespace +} // namespace latinime diff --git a/native/jni/tests/dictionary/utils/format_utils_test.cpp b/native/jni/tests/dictionary/utils/format_utils_test.cpp new file mode 100644 index 000000000..3561bda30 --- /dev/null +++ b/native/jni/tests/dictionary/utils/format_utils_test.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/utils/format_utils.h" + +#include + +#include + +#include "utils/byte_array_view.h" + +namespace latinime { +namespace { + +TEST(FormatUtilsTest, TestMagicNumber) { + EXPECT_EQ(0x9BC13AFE, FormatUtils::MAGIC_NUMBER) << "Magic number must not be changed."; +} + +const std::vector getBuffer(const int magicNumber, const int version, const uint16_t flags, + const size_t headerSize) { + std::vector buffer; + buffer.push_back(magicNumber >> 24); + buffer.push_back(magicNumber >> 16); + buffer.push_back(magicNumber >> 8); + buffer.push_back(magicNumber); + + buffer.push_back(version >> 8); + buffer.push_back(version); + + buffer.push_back(flags >> 8); + buffer.push_back(flags); + + buffer.push_back(headerSize >> 24); + buffer.push_back(headerSize >> 16); + buffer.push_back(headerSize >> 8); + buffer.push_back(headerSize); + return buffer; +} + +TEST(FormatUtilsTest, TestDetectFormatVersion) { + EXPECT_EQ(FormatUtils::UNKNOWN_VERSION, + FormatUtils::detectFormatVersion(ReadOnlyByteArrayView())); + + { + const std::vector buffer = + getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_2, 0, 0); + EXPECT_EQ(FormatUtils::VERSION_2, FormatUtils::detectFormatVersion( + ReadOnlyByteArrayView(buffer.data(), buffer.size()))); + } + { + const std::vector buffer = + getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_402, 0, 0); + EXPECT_EQ(FormatUtils::VERSION_402, FormatUtils::detectFormatVersion( + ReadOnlyByteArrayView(buffer.data(), buffer.size()))); + } + { + const std::vector buffer = + getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_403, 0, 0); + EXPECT_EQ(FormatUtils::VERSION_403, FormatUtils::detectFormatVersion( + ReadOnlyByteArrayView(buffer.data(), buffer.size()))); + } + + { + const std::vector buffer = + getBuffer(FormatUtils::MAGIC_NUMBER - 1, FormatUtils::VERSION_2, 0, 0); + EXPECT_EQ(FormatUtils::UNKNOWN_VERSION, FormatUtils::detectFormatVersion( + ReadOnlyByteArrayView(buffer.data(), buffer.size()))); + } + { + 
const std::vector buffer = + getBuffer(FormatUtils::MAGIC_NUMBER, 100, 0, 0); + EXPECT_EQ(FormatUtils::UNKNOWN_VERSION, FormatUtils::detectFormatVersion( + ReadOnlyByteArrayView(buffer.data(), buffer.size()))); + } + { + const std::vector buffer = + getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_2, 0, 0); + EXPECT_EQ(FormatUtils::UNKNOWN_VERSION, FormatUtils::detectFormatVersion( + ReadOnlyByteArrayView(buffer.data(), buffer.size() - 1))); + } +} + +} // namespace +} // namespace latinime diff --git a/native/jni/tests/dictionary/utils/probability_utils_test.cpp b/native/jni/tests/dictionary/utils/probability_utils_test.cpp new file mode 100644 index 000000000..4020ea441 --- /dev/null +++ b/native/jni/tests/dictionary/utils/probability_utils_test.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/utils/probability_utils.h" + +#include + +#include "defines.h" + +namespace latinime { +namespace { + +TEST(ProbabilityUtilsTest, TestEncodeRawProbability) { + EXPECT_EQ(MAX_PROBABILITY, ProbabilityUtils::encodeRawProbability(1.0f)); + EXPECT_EQ(MAX_PROBABILITY - 9, ProbabilityUtils::encodeRawProbability(0.5f)); + EXPECT_EQ(0, ProbabilityUtils::encodeRawProbability(0.0f)); +} + +} // namespace +} // namespace latinime diff --git a/native/jni/tests/dictionary/utils/sparse_table_test.cpp b/native/jni/tests/dictionary/utils/sparse_table_test.cpp new file mode 100644 index 000000000..237c9631c --- /dev/null +++ b/native/jni/tests/dictionary/utils/sparse_table_test.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/utils/sparse_table.h" + +#include + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace { + +TEST(SparseTableTest, TestSetAndGet) { + static const int BLOCK_SIZE = 64; + static const int DATA_SIZE = 4; + BufferWithExtendableBuffer indexTableBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + BufferWithExtendableBuffer contentTableBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + SparseTable sparseTable(&indexTableBuffer, &contentTableBuffer, BLOCK_SIZE, DATA_SIZE); + + EXPECT_FALSE(sparseTable.contains(10)); + EXPECT_TRUE(sparseTable.set(10, 100u)); + EXPECT_EQ(100u, sparseTable.get(10)); + EXPECT_TRUE(sparseTable.contains(10)); + EXPECT_TRUE(sparseTable.contains(BLOCK_SIZE - 1)); + EXPECT_FALSE(sparseTable.contains(BLOCK_SIZE)); + EXPECT_TRUE(sparseTable.set(11, 101u)); + EXPECT_EQ(100u, sparseTable.get(10)); + EXPECT_EQ(101u, sparseTable.get(11)); +} + +} // namespace +} // namespace latinime diff --git a/native/jni/tests/dictionary/utils/trie_map_test.cpp b/native/jni/tests/dictionary/utils/trie_map_test.cpp new file mode 100644 index 000000000..745d39897 --- /dev/null +++ b/native/jni/tests/dictionary/utils/trie_map_test.cpp @@ -0,0 +1,252 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/utils/trie_map.h" + +#include + +#include +#include +#include +#include +#include +#include + +namespace latinime { +namespace { + +TEST(TrieMapTest, TestSetAndGet) { + TrieMap trieMap; + trieMap.putRoot(10, 10); + EXPECT_EQ(10ull, trieMap.getRoot(10).mValue); + trieMap.putRoot(0x10A, 10); + EXPECT_EQ(10ull, trieMap.getRoot(10).mValue); + EXPECT_EQ(10ull, trieMap.getRoot(0x10A).mValue); + trieMap.putRoot(10, 1000); + EXPECT_EQ(1000ull, trieMap.getRoot(10).mValue); + trieMap.putRoot(11, 1000); + EXPECT_EQ(1000ull, trieMap.getRoot(11).mValue); + const int next = trieMap.getNextLevelBitmapEntryIndex(10); + EXPECT_EQ(1000ull, trieMap.getRoot(10).mValue); + trieMap.put(9, 9, next); + EXPECT_EQ(9ull, trieMap.get(9, next).mValue); + EXPECT_FALSE(trieMap.get(11, next).mIsValid); + trieMap.putRoot(0, 0xFFFFFFFFFull); + EXPECT_EQ(0xFFFFFFFFFull, trieMap.getRoot(0).mValue); +} + +TEST(TrieMapTest, TestRemove) { + TrieMap trieMap; + trieMap.putRoot(10, 10); + EXPECT_EQ(10ull, trieMap.getRoot(10).mValue); + EXPECT_TRUE(trieMap.remove(10, trieMap.getRootBitmapEntryIndex())); + EXPECT_FALSE(trieMap.getRoot(10).mIsValid); + for (const auto &element : trieMap.getEntriesInRootLevel()) { + EXPECT_TRUE(false); + } + EXPECT_TRUE(trieMap.putRoot(10, 0x3FFFFF)); + EXPECT_FALSE(trieMap.remove(11, trieMap.getRootBitmapEntryIndex())) + << "Should fail if the key does not exist."; + EXPECT_EQ(0x3FFFFFull, trieMap.getRoot(10).mValue); + trieMap.putRoot(12, 11); + const int nextLevel = trieMap.getNextLevelBitmapEntryIndex(10); + trieMap.put(10, 10, nextLevel); + EXPECT_EQ(0x3FFFFFull, trieMap.getRoot(10).mValue); + EXPECT_EQ(10ull, trieMap.get(10, nextLevel).mValue); + EXPECT_TRUE(trieMap.remove(10, trieMap.getRootBitmapEntryIndex())); + const TrieMap::Result result = trieMap.getRoot(10); + EXPECT_FALSE(result.mIsValid); + EXPECT_EQ(TrieMap::INVALID_INDEX, result.mNextLevelBitmapEntryIndex); + EXPECT_EQ(11ull, trieMap.getRoot(12).mValue); + 
EXPECT_TRUE(trieMap.putRoot(S_INT_MAX, 0xFFFFFFFFFull)); + EXPECT_TRUE(trieMap.remove(S_INT_MAX, trieMap.getRootBitmapEntryIndex())); +} + +TEST(TrieMapTest, TestSetAndGetLarge) { + static const int ELEMENT_COUNT = 200000; + TrieMap trieMap; + for (int i = 0; i < ELEMENT_COUNT; ++i) { + EXPECT_TRUE(trieMap.putRoot(i, i)); + } + for (int i = 0; i < ELEMENT_COUNT; ++i) { + EXPECT_EQ(static_cast(i), trieMap.getRoot(i).mValue); + } +} + +TEST(TrieMapTest, TestRandSetAndGetLarge) { + static const int ELEMENT_COUNT = 100000; + TrieMap trieMap; + std::unordered_map testKeyValuePairs; + + // Use the uniform integer distribution [S_INT_MIN, S_INT_MAX]. + std::uniform_int_distribution keyDistribution(S_INT_MIN, S_INT_MAX); + auto keyRandomNumberGenerator = std::bind(keyDistribution, std::mt19937()); + + // Use the uniform distribution [0, TrieMap::MAX_VALUE]. + std::uniform_int_distribution valueDistribution(0, TrieMap::MAX_VALUE); + auto valueRandomNumberGenerator = std::bind(valueDistribution, std::mt19937()); + + for (int i = 0; i < ELEMENT_COUNT; ++i) { + const int key = keyRandomNumberGenerator(); + const uint64_t value = valueRandomNumberGenerator(); + EXPECT_TRUE(trieMap.putRoot(key, value)) << key << " " << value; + testKeyValuePairs[key] = value; + } + for (const auto &v : testKeyValuePairs) { + EXPECT_EQ(v.second, trieMap.getRoot(v.first).mValue); + } +} + +TEST(TrieMapTest, TestMultiLevel) { + static const int FIRST_LEVEL_ENTRY_COUNT = 10000; + static const int SECOND_LEVEL_ENTRY_COUNT = 20000; + static const int THIRD_LEVEL_ENTRY_COUNT = 40000; + + TrieMap trieMap; + std::vector firstLevelKeys; + std::map firstLevelEntries; + std::vector> secondLevelKeys; + std::map> twoLevelMap; + std::map>> threeLevelMap; + + // Use the uniform integer distribution [0, S_INT_MAX]. 
+ std::uniform_int_distribution distribution(0, S_INT_MAX); + auto keyRandomNumberGenerator = std::bind(distribution, std::mt19937()); + auto randomNumberGeneratorForKeySelection = std::bind(distribution, std::mt19937()); + + // Use the uniform distribution [0, TrieMap::MAX_VALUE]. + std::uniform_int_distribution valueDistribution(0, TrieMap::MAX_VALUE); + auto valueRandomNumberGenerator = std::bind(valueDistribution, std::mt19937()); + + for (int i = 0; i < FIRST_LEVEL_ENTRY_COUNT; ++i) { + const int key = keyRandomNumberGenerator(); + const uint64_t value = valueRandomNumberGenerator(); + EXPECT_TRUE(trieMap.putRoot(key, value)); + firstLevelKeys.push_back(key); + firstLevelEntries[key] = value; + } + + for (int i = 0; i < SECOND_LEVEL_ENTRY_COUNT; ++i) { + const int key = keyRandomNumberGenerator(); + const uint64_t value = valueRandomNumberGenerator(); + const int firstLevelKey = + firstLevelKeys[randomNumberGeneratorForKeySelection() % FIRST_LEVEL_ENTRY_COUNT]; + const int nextLevelBitmapEntryIndex = trieMap.getNextLevelBitmapEntryIndex(firstLevelKey); + EXPECT_NE(TrieMap::INVALID_INDEX, nextLevelBitmapEntryIndex); + EXPECT_TRUE(trieMap.put(key, value, nextLevelBitmapEntryIndex)); + secondLevelKeys.push_back(std::make_pair(firstLevelKey, key)); + twoLevelMap[firstLevelKey][key] = value; + } + + for (int i = 0; i < THIRD_LEVEL_ENTRY_COUNT; ++i) { + const int key = keyRandomNumberGenerator(); + const uint64_t value = valueRandomNumberGenerator(); + const std::pair secondLevelKey = + secondLevelKeys[randomNumberGeneratorForKeySelection() % SECOND_LEVEL_ENTRY_COUNT]; + const int secondLevel = trieMap.getNextLevelBitmapEntryIndex(secondLevelKey.first); + EXPECT_NE(TrieMap::INVALID_INDEX, secondLevel); + const int thirdLevel = trieMap.getNextLevelBitmapEntryIndex( + secondLevelKey.second, secondLevel); + EXPECT_NE(TrieMap::INVALID_INDEX, thirdLevel); + EXPECT_TRUE(trieMap.put(key, value, thirdLevel)); + threeLevelMap[secondLevelKey.first][secondLevelKey.second][key] 
= value; + } + + for (const auto &firstLevelEntry : firstLevelEntries) { + EXPECT_EQ(firstLevelEntry.second, trieMap.getRoot(firstLevelEntry.first).mValue); + } + + for (const auto &firstLevelEntry : twoLevelMap) { + const int secondLevel = trieMap.getNextLevelBitmapEntryIndex(firstLevelEntry.first); + EXPECT_NE(TrieMap::INVALID_INDEX, secondLevel); + for (const auto &secondLevelEntry : firstLevelEntry.second) { + EXPECT_EQ(secondLevelEntry.second, + trieMap.get(secondLevelEntry.first, secondLevel).mValue); + } + } + + for (const auto &firstLevelEntry : threeLevelMap) { + const int secondLevel = trieMap.getNextLevelBitmapEntryIndex(firstLevelEntry.first); + EXPECT_NE(TrieMap::INVALID_INDEX, secondLevel); + for (const auto &secondLevelEntry : firstLevelEntry.second) { + const int thirdLevel = + trieMap.getNextLevelBitmapEntryIndex(secondLevelEntry.first, secondLevel); + EXPECT_NE(TrieMap::INVALID_INDEX, thirdLevel); + for (const auto &thirdLevelEntry : secondLevelEntry.second) { + EXPECT_EQ(thirdLevelEntry.second, + trieMap.get(thirdLevelEntry.first, thirdLevel).mValue); + } + } + } + + // Iteration + for (const auto &firstLevelEntry : trieMap.getEntriesInRootLevel()) { + EXPECT_EQ(trieMap.getRoot(firstLevelEntry.key()).mValue, firstLevelEntry.value()); + EXPECT_EQ(firstLevelEntries[firstLevelEntry.key()], firstLevelEntry.value()); + firstLevelEntries.erase(firstLevelEntry.key()); + for (const auto &secondLevelEntry : firstLevelEntry.getEntriesInNextLevel()) { + EXPECT_EQ(twoLevelMap[firstLevelEntry.key()][secondLevelEntry.key()], + secondLevelEntry.value()); + twoLevelMap[firstLevelEntry.key()].erase(secondLevelEntry.key()); + for (const auto &thirdLevelEntry : secondLevelEntry.getEntriesInNextLevel()) { + EXPECT_EQ(threeLevelMap[firstLevelEntry.key()][secondLevelEntry.key()] + [thirdLevelEntry.key()], thirdLevelEntry.value()); + threeLevelMap[firstLevelEntry.key()][secondLevelEntry.key()].erase( + thirdLevelEntry.key()); + } + } + } + + // Ensure all entries have 
been traversed. + EXPECT_TRUE(firstLevelEntries.empty()); + for (const auto &secondLevelEntry : twoLevelMap) { + EXPECT_TRUE(secondLevelEntry.second.empty()); + } + for (const auto &secondLevelEntry : threeLevelMap) { + for (const auto &thirdLevelEntry : secondLevelEntry.second) { + EXPECT_TRUE(thirdLevelEntry.second.empty()); + } + } +} + +TEST(TrieMapTest, TestIteration) { + static const int ELEMENT_COUNT = 200000; + TrieMap trieMap; + std::unordered_map testKeyValuePairs; + + // Use the uniform integer distribution [S_INT_MIN, S_INT_MAX]. + std::uniform_int_distribution keyDistribution(S_INT_MIN, S_INT_MAX); + auto keyRandomNumberGenerator = std::bind(keyDistribution, std::mt19937()); + + // Use the uniform distribution [0, TrieMap::MAX_VALUE]. + std::uniform_int_distribution valueDistribution(0, TrieMap::MAX_VALUE); + auto valueRandomNumberGenerator = std::bind(valueDistribution, std::mt19937()); + for (int i = 0; i < ELEMENT_COUNT; ++i) { + const int key = keyRandomNumberGenerator(); + const uint64_t value = valueRandomNumberGenerator(); + EXPECT_TRUE(trieMap.putRoot(key, value)); + testKeyValuePairs[key] = value; + } + for (const auto &entry : trieMap.getEntriesInRootLevel()) { + EXPECT_EQ(trieMap.getRoot(entry.key()).mValue, entry.value()); + EXPECT_EQ(testKeyValuePairs[entry.key()], entry.value()); + testKeyValuePairs.erase(entry.key()); + } + EXPECT_TRUE(testKeyValuePairs.empty()); +} + +} // namespace +} // namespace latinime diff --git a/native/jni/tests/suggest/core/dictionary/bloom_filter_test.cpp b/native/jni/tests/suggest/core/dictionary/bloom_filter_test.cpp deleted file mode 100644 index b62021784..000000000 --- a/native/jni/tests/suggest/core/dictionary/bloom_filter_test.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/core/dictionary/bloom_filter.h" - -#include - -#include -#include -#include -#include -#include -#include - -namespace latinime { -namespace { - -TEST(BloomFilterTest, TestFilter) { - static const int TEST_RANDOM_DATA_MAX = 65536; - static const int ELEMENT_COUNT = 1000; - std::vector elements; - - // Initialize data set with random integers. - { - // Use the uniform integer distribution [0, TEST_RANDOM_DATA_MAX]. - std::uniform_int_distribution distribution(0, TEST_RANDOM_DATA_MAX); - auto randomNumberGenerator = std::bind(distribution, std::mt19937()); - for (int i = 0; i < ELEMENT_COUNT; ++i) { - elements.push_back(randomNumberGenerator()); - } - } - - // Make sure BloomFilter contains nothing by default. - BloomFilter bloomFilter; - for (const int elem : elements) { - ASSERT_FALSE(bloomFilter.isInFilter(elem)); - } - - // Copy some of the test vector into bloom filter. - std::unordered_set elementsThatHaveBeenSetInFilter; - { - // Use the uniform integer distribution [0, 1]. 
- std::uniform_int_distribution distribution(0, 1); - auto randomBitGenerator = std::bind(distribution, std::mt19937()); - for (const int elem : elements) { - if (randomBitGenerator() == 0) { - bloomFilter.setInFilter(elem); - elementsThatHaveBeenSetInFilter.insert(elem); - } - } - } - - for (const int elem : elements) { - const bool existsInFilter = bloomFilter.isInFilter(elem); - const bool hasBeenSetInFilter = - elementsThatHaveBeenSetInFilter.find(elem) != elementsThatHaveBeenSetInFilter.end(); - if (hasBeenSetInFilter) { - EXPECT_TRUE(existsInFilter) << "elem: " << elem; - } - if (!existsInFilter) { - EXPECT_FALSE(hasBeenSetInFilter) << "elem: " << elem; - } - } -} - -} // namespace -} // namespace latinime diff --git a/native/jni/tests/suggest/policyimpl/dictionary/header/header_read_write_utils_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/header/header_read_write_utils_test.cpp deleted file mode 100644 index da6a2af27..000000000 --- a/native/jni/tests/suggest/policyimpl/dictionary/header/header_read_write_utils_test.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" - -#include - -#include -#include - -#include "suggest/core/policy/dictionary_header_structure_policy.h" - -namespace latinime { -namespace { - -TEST(HeaderReadWriteUtilsTest, TestInsertCharactersIntoVector) { - DictionaryHeaderStructurePolicy::AttributeMap::key_type vector; - - HeaderReadWriteUtils::insertCharactersIntoVector("", &vector); - EXPECT_TRUE(vector.empty()); - - static const char *str = "abc-xyz!?"; - HeaderReadWriteUtils::insertCharactersIntoVector(str, &vector); - EXPECT_EQ(strlen(str) , vector.size()); - for (size_t i = 0; i < vector.size(); ++i) { - EXPECT_EQ(str[i], vector[i]); - } -} - -TEST(HeaderReadWriteUtilsTest, TestAttributeMapForInt) { - DictionaryHeaderStructurePolicy::AttributeMap attributeMap; - - // Returns default value if not exists. - EXPECT_EQ(-1, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "", -1)); - EXPECT_EQ(100, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abc", 100)); - - HeaderReadWriteUtils::setIntAttribute(&attributeMap, "abc", 10); - EXPECT_EQ(10, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abc", 100)); - HeaderReadWriteUtils::setIntAttribute(&attributeMap, "abc", 20); - EXPECT_EQ(20, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abc", 100)); - HeaderReadWriteUtils::setIntAttribute(&attributeMap, "abcd", 30); - EXPECT_EQ(30, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abcd", 100)); - EXPECT_EQ(20, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abc", 100)); -} - -TEST(HeaderReadWriteUtilsTest, TestAttributeMapCodeForPoints) { - DictionaryHeaderStructurePolicy::AttributeMap attributeMap; - - // Returns empty vector if not exists. 
- EXPECT_TRUE(HeaderReadWriteUtils::readCodePointVectorAttributeValue(&attributeMap, "").empty()); - EXPECT_TRUE(HeaderReadWriteUtils::readCodePointVectorAttributeValue( - &attributeMap, "abc").empty()); - - HeaderReadWriteUtils::setCodePointVectorAttribute(&attributeMap, "abc", {}); - EXPECT_TRUE(HeaderReadWriteUtils::readCodePointVectorAttributeValue( - &attributeMap, "abc").empty()); - - const std::vector codePoints = { 0x0, 0x20, 0x1F, 0x100000 }; - HeaderReadWriteUtils::setCodePointVectorAttribute(&attributeMap, "abc", codePoints); - EXPECT_EQ(codePoints, HeaderReadWriteUtils::readCodePointVectorAttributeValue( - &attributeMap, "abc")); -} - -} // namespace -} // namespace latinime diff --git a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp deleted file mode 100644 index 44b5a8aaa..000000000 --- a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_global_counters.h" - -#include - -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" - -namespace latinime { -namespace { - -TEST(LanguageModelDictContentGlobalCountersTest, TestUpdateMaxValueOfCounters) { - LanguageModelDictContentGlobalCounters globalCounters; - - EXPECT_FALSE(globalCounters.needsToHalveCounters()); - globalCounters.updateMaxValueOfCounters(10); - EXPECT_FALSE(globalCounters.needsToHalveCounters()); - const int count = (1 << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) - 1; - globalCounters.updateMaxValueOfCounters(count); - EXPECT_TRUE(globalCounters.needsToHalveCounters()); - globalCounters.halveCounters(); - EXPECT_FALSE(globalCounters.needsToHalveCounters()); -} - -TEST(LanguageModelDictContentGlobalCountersTest, TestIncrementTotalCount) { - LanguageModelDictContentGlobalCounters globalCounters; - - EXPECT_EQ(0, globalCounters.getTotalCount()); - globalCounters.incrementTotalCount(); - EXPECT_EQ(1, globalCounters.getTotalCount()); - for (int i = 1; i < 50; ++i) { - globalCounters.incrementTotalCount(); - } - EXPECT_EQ(50, globalCounters.getTotalCount()); - globalCounters.halveCounters(); - EXPECT_EQ(25, globalCounters.getTotalCount()); - globalCounters.halveCounters(); - EXPECT_EQ(12, globalCounters.getTotalCount()); - for (int i = 0; i < 4; ++i) { - globalCounters.halveCounters(); - } - EXPECT_EQ(0, globalCounters.getTotalCount()); -} - -} // namespace -} // namespace latinime diff --git a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp deleted file mode 100644 index 313a9af10..000000000 --- a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content_test.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (C) 
2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h" - -#include - -#include -#include - -#include "utils/int_array_view.h" - -namespace latinime { -namespace { - -TEST(LanguageModelDictContentTest, TestUnigramProbability) { - LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */); - - const int flag = 0xF0; - const int probability = 10; - const int wordId = 100; - const ProbabilityEntry probabilityEntry(flag, probability); - languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry); - const ProbabilityEntry entry = - languageModelDictContent.getProbabilityEntry(wordId); - EXPECT_EQ(flag, entry.getFlags()); - EXPECT_EQ(probability, entry.getProbability()); - - // Remove - EXPECT_TRUE(languageModelDictContent.removeProbabilityEntry(wordId)); - EXPECT_FALSE(languageModelDictContent.getProbabilityEntry(wordId).isValid()); - EXPECT_FALSE(languageModelDictContent.removeProbabilityEntry(wordId)); - EXPECT_TRUE(languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry)); - EXPECT_TRUE(languageModelDictContent.getProbabilityEntry(wordId).isValid()); -} - -TEST(LanguageModelDictContentTest, TestUnigramProbabilityWithHistoricalInfo) { - LanguageModelDictContent languageModelDictContent(true /* useHistoricalInfo */); - - const int flag = 0xF0; - const int timestamp = 0x3FFFFFFF; - const 
int count = 10; - const int wordId = 100; - const HistoricalInfo historicalInfo(timestamp, 0 /* level */, count); - const ProbabilityEntry probabilityEntry(flag, &historicalInfo); - languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry); - const ProbabilityEntry entry = languageModelDictContent.getProbabilityEntry(wordId); - EXPECT_EQ(flag, entry.getFlags()); - EXPECT_EQ(timestamp, entry.getHistoricalInfo()->getTimestamp()); - EXPECT_EQ(count, entry.getHistoricalInfo()->getCount()); - - // Remove - EXPECT_TRUE(languageModelDictContent.removeProbabilityEntry(wordId)); - EXPECT_FALSE(languageModelDictContent.getProbabilityEntry(wordId).isValid()); - EXPECT_FALSE(languageModelDictContent.removeProbabilityEntry(wordId)); - EXPECT_TRUE(languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry)); - EXPECT_TRUE(languageModelDictContent.removeProbabilityEntry(wordId)); -} - -TEST(LanguageModelDictContentTest, TestIterateProbabilityEntry) { - LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */); - - const ProbabilityEntry originalEntry(0xFC, 100); - - const int wordIds[] = { 1, 2, 3, 4, 5 }; - for (const int wordId : wordIds) { - languageModelDictContent.setProbabilityEntry(wordId, &originalEntry); - } - std::unordered_set wordIdSet(std::begin(wordIds), std::end(wordIds)); - for (const auto entry : languageModelDictContent.getProbabilityEntries(WordIdArrayView())) { - EXPECT_EQ(originalEntry.getFlags(), entry.getProbabilityEntry().getFlags()); - EXPECT_EQ(originalEntry.getProbability(), entry.getProbabilityEntry().getProbability()); - wordIdSet.erase(entry.getWordId()); - } - EXPECT_TRUE(wordIdSet.empty()); -} - -TEST(LanguageModelDictContentTest, TestGetWordProbability) { - LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */); - - const int flag = 0xFF; - const int probability = 10; - const int bigramProbability = 20; - const int trigramProbability = 30; - const int wordId = 100; - const 
std::array prevWordIdArray = {{ 1, 2 }}; - const WordIdArrayView prevWordIds = WordIdArrayView::fromArray(prevWordIdArray); - - const ProbabilityEntry probabilityEntry(flag, probability); - languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry); - const ProbabilityEntry bigramProbabilityEntry(flag, bigramProbability); - languageModelDictContent.setProbabilityEntry(prevWordIds[0], &probabilityEntry); - languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(1), wordId, - &bigramProbabilityEntry); - EXPECT_EQ(bigramProbability, languageModelDictContent.getWordAttributes(prevWordIds, wordId, - false /* mustMatchAllPrevWords */, nullptr /* headerPolicy */).getProbability()); - const ProbabilityEntry trigramProbabilityEntry(flag, trigramProbability); - languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(1), - prevWordIds[1], &probabilityEntry); - languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(2), wordId, - &trigramProbabilityEntry); - EXPECT_EQ(trigramProbability, languageModelDictContent.getWordAttributes(prevWordIds, wordId, - false /* mustMatchAllPrevWords */, nullptr /* headerPolicy */).getProbability()); -} - -} // namespace -} // namespace latinime diff --git a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp deleted file mode 100644 index eb78034ba..000000000 --- a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/probability_entry_test.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" - -#include - -#include "defines.h" - -namespace latinime { -namespace { - -TEST(ProbabilityEntryTest, TestEncodeDecode) { - const int flag = 0xFF; - const int probability = 10; - - const ProbabilityEntry entry(flag, probability); - const uint64_t encodedEntry = entry.encode(false /* hasHistoricalInfo */); - const ProbabilityEntry decodedEntry = - ProbabilityEntry::decode(encodedEntry, false /* hasHistoricalInfo */); - EXPECT_EQ(0xFF0Aull, encodedEntry); - EXPECT_EQ(flag, decodedEntry.getFlags()); - EXPECT_EQ(probability, decodedEntry.getProbability()); -} - -TEST(ProbabilityEntryTest, TestEncodeDecodeWithHistoricalInfo) { - const int flag = 0xF0; - const int timestamp = 0x3FFFFFFF; - const int count = 0xABCD; - - const HistoricalInfo historicalInfo(timestamp, 0 /* level */, count); - const ProbabilityEntry entry(flag, &historicalInfo); - - const uint64_t encodedEntry = entry.encode(true /* hasHistoricalInfo */); - EXPECT_EQ(0xF03FFFFFFFABCDull, encodedEntry); - const ProbabilityEntry decodedEntry = - ProbabilityEntry::decode(encodedEntry, true /* hasHistoricalInfo */); - - EXPECT_EQ(flag, decodedEntry.getFlags()); - EXPECT_EQ(timestamp, decodedEntry.getHistoricalInfo()->getTimestamp()); - EXPECT_EQ(count, decodedEntry.getHistoricalInfo()->getCount()); -} - -} // namespace -} // namespace latinime diff --git a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp 
b/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp deleted file mode 100644 index 23b9c55f7..000000000 --- a/native/jni/tests/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" - -#include - -#include - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" - -namespace latinime { -namespace { - -TEST(TerminalPositionLookupTableTest, TestGetFromEmptyTable) { - TerminalPositionLookupTable lookupTable; - - EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition(0)); - EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition(-1)); - EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition( - Ver4DictConstants::NOT_A_TERMINAL_ID)); -} - -TEST(TerminalPositionLookupTableTest, TestSetAndGet) { - TerminalPositionLookupTable lookupTable; - - EXPECT_TRUE(lookupTable.setTerminalPtNodePosition(10, 100)); - EXPECT_EQ(100, lookupTable.getTerminalPtNodePosition(10)); - EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition(9)); - EXPECT_TRUE(lookupTable.setTerminalPtNodePosition(9, 200)); - EXPECT_EQ(200, lookupTable.getTerminalPtNodePosition(9)); - 
EXPECT_TRUE(lookupTable.setTerminalPtNodePosition(10, 300)); - EXPECT_EQ(300, lookupTable.getTerminalPtNodePosition(10)); - EXPECT_FALSE(lookupTable.setTerminalPtNodePosition(-1, 400)); - EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition(-1)); - EXPECT_FALSE(lookupTable.setTerminalPtNodePosition(Ver4DictConstants::NOT_A_TERMINAL_ID, 500)); - EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition( - Ver4DictConstants::NOT_A_TERMINAL_ID)); -} - -TEST(TerminalPositionLookupTableTest, TestGC) { - TerminalPositionLookupTable lookupTable; - - const std::vector terminalIds = { 10, 20, 30 }; - const std::vector terminalPositions = { 100, 200, 300 }; - - for (size_t i = 0; i < terminalIds.size(); ++i) { - EXPECT_TRUE(lookupTable.setTerminalPtNodePosition(terminalIds[i], terminalPositions[i])); - } - - TerminalPositionLookupTable::TerminalIdMap terminalIdMap; - EXPECT_TRUE(lookupTable.runGCTerminalIds(&terminalIdMap)); - - for (size_t i = 0; i < terminalIds.size(); ++i) { - EXPECT_EQ(static_cast(i), terminalIdMap[terminalIds[i]]) - << "Terminal id (" << terminalIds[i] << ") should be changed to " << i; - EXPECT_EQ(terminalPositions[i], lookupTable.getTerminalPtNodePosition(i)); - } -} - -} // namespace -} // namespace latinime diff --git a/native/jni/tests/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer_test.cpp deleted file mode 100644 index fa6c6d71e..000000000 --- a/native/jni/tests/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer_test.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -#include - -namespace latinime { -namespace { - -const int DEFAULT_MAX_BUFFER_SIZE = 1024; - -TEST(BufferWithExtendablebufferTest, TestWriteAndRead) { - BufferWithExtendableBuffer buffer(DEFAULT_MAX_BUFFER_SIZE); - int pos = 0; - // 1 byte - const uint32_t data_1 = 0xFF; - EXPECT_TRUE(buffer.writeUint(data_1, 1 /* size */, pos)); - EXPECT_EQ(data_1, buffer.readUint(1, pos)); - pos += 1; - // 2 byte - const uint32_t data_2 = 0xFFFF; - EXPECT_TRUE(buffer.writeUint(data_2, 2 /* size */, pos)); - EXPECT_EQ(data_2, buffer.readUint(2, pos)); - pos += 2; - // 3 byte - const uint32_t data_3 = 0xFFFFFF; - EXPECT_TRUE(buffer.writeUint(data_3, 3 /* size */, pos)); - EXPECT_EQ(data_3, buffer.readUint(3, pos)); - pos += 3; - // 4 byte - const uint32_t data_4 = 0xFFFFFFFF; - EXPECT_TRUE(buffer.writeUint(data_4, 4 /* size */, pos)); - EXPECT_EQ(data_4, buffer.readUint(4, pos)); -} - -TEST(BufferWithExtendablebufferTest, TestExtend) { - BufferWithExtendableBuffer buffer(DEFAULT_MAX_BUFFER_SIZE); - EXPECT_EQ(0, buffer.getTailPosition()); - EXPECT_TRUE(buffer.writeUint(0xFF /* data */, 4 /* size */, 0 /* pos */)); - EXPECT_EQ(4, buffer.getTailPosition()); - EXPECT_TRUE(buffer.extend(8 /* size */)); - EXPECT_EQ(12, buffer.getTailPosition()); - EXPECT_TRUE(buffer.writeUint(0xFFFF /* data */, 4 /* size */, 8 /* pos */)); - EXPECT_TRUE(buffer.writeUint(0xFF /* data */, 4 /* size */, 0 /* pos */)); -} - -TEST(BufferWithExtendablebufferTest, TestCopy) { - BufferWithExtendableBuffer 
buffer(DEFAULT_MAX_BUFFER_SIZE); - EXPECT_TRUE(buffer.writeUint(0xFF /* data */, 4 /* size */, 0 /* pos */)); - EXPECT_TRUE(buffer.writeUint(0xFFFF /* data */, 4 /* size */, 4 /* pos */)); - BufferWithExtendableBuffer targetBuffer(DEFAULT_MAX_BUFFER_SIZE); - EXPECT_TRUE(targetBuffer.copy(&buffer)); - EXPECT_EQ(0xFFu, targetBuffer.readUint(4 /* size */, 0 /* pos */)); - EXPECT_EQ(0xFFFFu, targetBuffer.readUint(4 /* size */, 4 /* pos */)); -} - -TEST(BufferWithExtendablebufferTest, TestSizeLimit) { - BufferWithExtendableBuffer emptyBuffer(0 /* maxAdditionalBufferSize */); - EXPECT_FALSE(emptyBuffer.writeUint(0 /* data */, 1 /* size */, 0 /* pos */)); - EXPECT_FALSE(emptyBuffer.extend(1 /* size */)); - - BufferWithExtendableBuffer smallBuffer(4 /* maxAdditionalBufferSize */); - EXPECT_TRUE(smallBuffer.writeUint(0 /* data */, 4 /* size */, 0 /* pos */)); - EXPECT_FALSE(smallBuffer.writeUint(0 /* data */, 1 /* size */, 4 /* pos */)); - - EXPECT_TRUE(smallBuffer.copy(&emptyBuffer)); - EXPECT_FALSE(emptyBuffer.copy(&smallBuffer)); - - BufferWithExtendableBuffer buffer(DEFAULT_MAX_BUFFER_SIZE); - EXPECT_FALSE(buffer.isNearSizeLimit()); - int pos = 0; - while (!buffer.isNearSizeLimit()) { - EXPECT_TRUE(buffer.writeUintAndAdvancePosition(0 /* data */, 4 /* size */, &pos)); - } - EXPECT_GT(pos, 0); - EXPECT_LE(pos, DEFAULT_MAX_BUFFER_SIZE); -} - -} // namespace -} // namespace latinime diff --git a/native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp deleted file mode 100644 index c201e0d00..000000000 --- a/native/jni/tests/suggest/policyimpl/dictionary/utils/byte_array_utils_test.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" - -#include - -#include - -namespace latinime { -namespace { - -TEST(ByteArrayUtilsTest, TestReadCodePointTable) { - const int codePointTable[] = { 0x6f, 0x6b }; - const uint8_t buffer[] = { 0x20u, 0x21u, 0x00u, 0x01u, 0x00u }; - int pos = 0; - // Expect the first entry of codePointTable - EXPECT_EQ(0x6f, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos)); - // Expect the second entry of codePointTable - EXPECT_EQ(0x6b, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos)); - // Expect the original code point from buffer[2] to buffer[4], 0x100 - // It isn't picked from the codePointTable, since it exceeds the range of the codePointTable. 
- EXPECT_EQ(0x100, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos)); -} - -TEST(ByteArrayUtilsTest, TestReadInt) { - const uint8_t buffer[] = { 0x1u, 0x8Au, 0x0u, 0xAAu }; - - EXPECT_EQ(0x01u, ByteArrayUtils::readUint8(buffer, 0)); - EXPECT_EQ(0x8Au, ByteArrayUtils::readUint8(buffer, 1)); - EXPECT_EQ(0x0u, ByteArrayUtils::readUint8(buffer, 2)); - EXPECT_EQ(0xAAu, ByteArrayUtils::readUint8(buffer, 3)); - - EXPECT_EQ(0x018Au, ByteArrayUtils::readUint16(buffer, 0)); - EXPECT_EQ(0x8A00u, ByteArrayUtils::readUint16(buffer, 1)); - EXPECT_EQ(0xAAu, ByteArrayUtils::readUint16(buffer, 2)); - - EXPECT_EQ(0x18A00AAu, ByteArrayUtils::readUint32(buffer, 0)); - - int pos = 0; - EXPECT_EQ(0x18A00, ByteArrayUtils::readSint24AndAdvancePosition(buffer, &pos)); - pos = 1; - EXPECT_EQ(-0xA00AA, ByteArrayUtils::readSint24AndAdvancePosition(buffer, &pos)); -} - -TEST(ByteArrayUtilsTest, TestWriteAndReadInt) { - uint8_t buffer[4]; - - int pos = 0; - const uint8_t data_1B = 0xC8; - ByteArrayUtils::writeUintAndAdvancePosition(buffer, data_1B, 1, &pos); - EXPECT_EQ(data_1B, ByteArrayUtils::readUint(buffer, 1, 0)); - - pos = 0; - const uint32_t data_4B = 0xABCD1234; - ByteArrayUtils::writeUintAndAdvancePosition(buffer, data_4B, 4, &pos); - EXPECT_EQ(data_4B, ByteArrayUtils::readUint(buffer, 4, 0)); -} - -TEST(ByteArrayUtilsTest, TestReadCodePoint) { - const uint8_t buffer[] = { 0x10, 0xFF, 0x00u, 0x20u, 0x41u, 0x1Fu, 0x60 }; - - EXPECT_EQ(0x10FF00, ByteArrayUtils::readCodePoint(buffer, 0)); - EXPECT_EQ(0x20, ByteArrayUtils::readCodePoint(buffer, 3)); - EXPECT_EQ(0x41, ByteArrayUtils::readCodePoint(buffer, 4)); - EXPECT_EQ(NOT_A_CODE_POINT, ByteArrayUtils::readCodePoint(buffer, 5)); - - int pos = 0; - int codePointArray[3]; - EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH, nullptr, - codePointArray, &pos)); - EXPECT_EQ(0x10FF00, codePointArray[0]); - EXPECT_EQ(0x20, codePointArray[1]); - EXPECT_EQ(0x41, codePointArray[2]); - 
EXPECT_EQ(0x60, ByteArrayUtils::readCodePoint(buffer, pos)); -} - -TEST(ByteArrayUtilsTest, TestWriteAndReadCodePoint) { - uint8_t buffer[10]; - - const int codePointArray[] = { 0x10FF00, 0x20, 0x41 }; - int pos = 0; - ByteArrayUtils::writeCodePointsAndAdvancePosition(buffer, codePointArray, 3, - true /* writesTerminator */, &pos); - EXPECT_EQ(0x10FF00, ByteArrayUtils::readCodePoint(buffer, 0)); - EXPECT_EQ(0x20, ByteArrayUtils::readCodePoint(buffer, 3)); - EXPECT_EQ(0x41, ByteArrayUtils::readCodePoint(buffer, 4)); - EXPECT_EQ(NOT_A_CODE_POINT, ByteArrayUtils::readCodePoint(buffer, 5)); -} - -} // namespace -} // namespace latinime diff --git a/native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp deleted file mode 100644 index 494200568..000000000 --- a/native/jni/tests/suggest/policyimpl/dictionary/utils/format_utils_test.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/utils/format_utils.h" - -#include - -#include - -#include "utils/byte_array_view.h" - -namespace latinime { -namespace { - -TEST(FormatUtilsTest, TestMagicNumber) { - EXPECT_EQ(0x9BC13AFE, FormatUtils::MAGIC_NUMBER) << "Magic number must not be changed."; -} - -const std::vector getBuffer(const int magicNumber, const int version, const uint16_t flags, - const size_t headerSize) { - std::vector buffer; - buffer.push_back(magicNumber >> 24); - buffer.push_back(magicNumber >> 16); - buffer.push_back(magicNumber >> 8); - buffer.push_back(magicNumber); - - buffer.push_back(version >> 8); - buffer.push_back(version); - - buffer.push_back(flags >> 8); - buffer.push_back(flags); - - buffer.push_back(headerSize >> 24); - buffer.push_back(headerSize >> 16); - buffer.push_back(headerSize >> 8); - buffer.push_back(headerSize); - return buffer; -} - -TEST(FormatUtilsTest, TestDetectFormatVersion) { - EXPECT_EQ(FormatUtils::UNKNOWN_VERSION, - FormatUtils::detectFormatVersion(ReadOnlyByteArrayView())); - - { - const std::vector buffer = - getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_2, 0, 0); - EXPECT_EQ(FormatUtils::VERSION_2, FormatUtils::detectFormatVersion( - ReadOnlyByteArrayView(buffer.data(), buffer.size()))); - } - { - const std::vector buffer = - getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_402, 0, 0); - EXPECT_EQ(FormatUtils::VERSION_402, FormatUtils::detectFormatVersion( - ReadOnlyByteArrayView(buffer.data(), buffer.size()))); - } - { - const std::vector buffer = - getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_403, 0, 0); - EXPECT_EQ(FormatUtils::VERSION_403, FormatUtils::detectFormatVersion( - ReadOnlyByteArrayView(buffer.data(), buffer.size()))); - } - - { - const std::vector buffer = - getBuffer(FormatUtils::MAGIC_NUMBER - 1, FormatUtils::VERSION_2, 0, 0); - EXPECT_EQ(FormatUtils::UNKNOWN_VERSION, FormatUtils::detectFormatVersion( - ReadOnlyByteArrayView(buffer.data(), 
buffer.size()))); - } - { - const std::vector buffer = - getBuffer(FormatUtils::MAGIC_NUMBER, 100, 0, 0); - EXPECT_EQ(FormatUtils::UNKNOWN_VERSION, FormatUtils::detectFormatVersion( - ReadOnlyByteArrayView(buffer.data(), buffer.size()))); - } - { - const std::vector buffer = - getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_2, 0, 0); - EXPECT_EQ(FormatUtils::UNKNOWN_VERSION, FormatUtils::detectFormatVersion( - ReadOnlyByteArrayView(buffer.data(), buffer.size() - 1))); - } -} - -} // namespace -} // namespace latinime diff --git a/native/jni/tests/suggest/policyimpl/dictionary/utils/probability_utils_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/utils/probability_utils_test.cpp deleted file mode 100644 index be1f278c6..000000000 --- a/native/jni/tests/suggest/policyimpl/dictionary/utils/probability_utils_test.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" - -#include - -#include "defines.h" - -namespace latinime { -namespace { - -TEST(ProbabilityUtilsTest, TestEncodeRawProbability) { - EXPECT_EQ(MAX_PROBABILITY, ProbabilityUtils::encodeRawProbability(1.0f)); - EXPECT_EQ(MAX_PROBABILITY - 9, ProbabilityUtils::encodeRawProbability(0.5f)); - EXPECT_EQ(0, ProbabilityUtils::encodeRawProbability(0.0f)); -} - -} // namespace -} // namespace latinime diff --git a/native/jni/tests/suggest/policyimpl/dictionary/utils/sparse_table_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/utils/sparse_table_test.cpp deleted file mode 100644 index 0b57156a0..000000000 --- a/native/jni/tests/suggest/policyimpl/dictionary/utils/sparse_table_test.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/utils/sparse_table.h" - -#include - -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { -namespace { - -TEST(SparseTableTest, TestSetAndGet) { - static const int BLOCK_SIZE = 64; - static const int DATA_SIZE = 4; - BufferWithExtendableBuffer indexTableBuffer( - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); - BufferWithExtendableBuffer contentTableBuffer( - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); - SparseTable sparseTable(&indexTableBuffer, &contentTableBuffer, BLOCK_SIZE, DATA_SIZE); - - EXPECT_FALSE(sparseTable.contains(10)); - EXPECT_TRUE(sparseTable.set(10, 100u)); - EXPECT_EQ(100u, sparseTable.get(10)); - EXPECT_TRUE(sparseTable.contains(10)); - EXPECT_TRUE(sparseTable.contains(BLOCK_SIZE - 1)); - EXPECT_FALSE(sparseTable.contains(BLOCK_SIZE)); - EXPECT_TRUE(sparseTable.set(11, 101u)); - EXPECT_EQ(100u, sparseTable.get(10)); - EXPECT_EQ(101u, sparseTable.get(11)); -} - -} // namespace -} // namespace latinime diff --git a/native/jni/tests/suggest/policyimpl/dictionary/utils/trie_map_test.cpp b/native/jni/tests/suggest/policyimpl/dictionary/utils/trie_map_test.cpp deleted file mode 100644 index 56b5aa985..000000000 --- a/native/jni/tests/suggest/policyimpl/dictionary/utils/trie_map_test.cpp +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/utils/trie_map.h" - -#include - -#include -#include -#include -#include -#include -#include - -namespace latinime { -namespace { - -TEST(TrieMapTest, TestSetAndGet) { - TrieMap trieMap; - trieMap.putRoot(10, 10); - EXPECT_EQ(10ull, trieMap.getRoot(10).mValue); - trieMap.putRoot(0x10A, 10); - EXPECT_EQ(10ull, trieMap.getRoot(10).mValue); - EXPECT_EQ(10ull, trieMap.getRoot(0x10A).mValue); - trieMap.putRoot(10, 1000); - EXPECT_EQ(1000ull, trieMap.getRoot(10).mValue); - trieMap.putRoot(11, 1000); - EXPECT_EQ(1000ull, trieMap.getRoot(11).mValue); - const int next = trieMap.getNextLevelBitmapEntryIndex(10); - EXPECT_EQ(1000ull, trieMap.getRoot(10).mValue); - trieMap.put(9, 9, next); - EXPECT_EQ(9ull, trieMap.get(9, next).mValue); - EXPECT_FALSE(trieMap.get(11, next).mIsValid); - trieMap.putRoot(0, 0xFFFFFFFFFull); - EXPECT_EQ(0xFFFFFFFFFull, trieMap.getRoot(0).mValue); -} - -TEST(TrieMapTest, TestRemove) { - TrieMap trieMap; - trieMap.putRoot(10, 10); - EXPECT_EQ(10ull, trieMap.getRoot(10).mValue); - EXPECT_TRUE(trieMap.remove(10, trieMap.getRootBitmapEntryIndex())); - EXPECT_FALSE(trieMap.getRoot(10).mIsValid); - for (const auto &element : trieMap.getEntriesInRootLevel()) { - EXPECT_TRUE(false); - } - EXPECT_TRUE(trieMap.putRoot(10, 0x3FFFFF)); - EXPECT_FALSE(trieMap.remove(11, trieMap.getRootBitmapEntryIndex())) - << "Should fail if the key does not exist."; - EXPECT_EQ(0x3FFFFFull, trieMap.getRoot(10).mValue); - trieMap.putRoot(12, 11); - const int nextLevel = trieMap.getNextLevelBitmapEntryIndex(10); - trieMap.put(10, 10, nextLevel); - EXPECT_EQ(0x3FFFFFull, trieMap.getRoot(10).mValue); - EXPECT_EQ(10ull, trieMap.get(10, nextLevel).mValue); - EXPECT_TRUE(trieMap.remove(10, trieMap.getRootBitmapEntryIndex())); - const TrieMap::Result result = trieMap.getRoot(10); - EXPECT_FALSE(result.mIsValid); - 
EXPECT_EQ(TrieMap::INVALID_INDEX, result.mNextLevelBitmapEntryIndex); - EXPECT_EQ(11ull, trieMap.getRoot(12).mValue); - EXPECT_TRUE(trieMap.putRoot(S_INT_MAX, 0xFFFFFFFFFull)); - EXPECT_TRUE(trieMap.remove(S_INT_MAX, trieMap.getRootBitmapEntryIndex())); -} - -TEST(TrieMapTest, TestSetAndGetLarge) { - static const int ELEMENT_COUNT = 200000; - TrieMap trieMap; - for (int i = 0; i < ELEMENT_COUNT; ++i) { - EXPECT_TRUE(trieMap.putRoot(i, i)); - } - for (int i = 0; i < ELEMENT_COUNT; ++i) { - EXPECT_EQ(static_cast(i), trieMap.getRoot(i).mValue); - } -} - -TEST(TrieMapTest, TestRandSetAndGetLarge) { - static const int ELEMENT_COUNT = 100000; - TrieMap trieMap; - std::unordered_map testKeyValuePairs; - - // Use the uniform integer distribution [S_INT_MIN, S_INT_MAX]. - std::uniform_int_distribution keyDistribution(S_INT_MIN, S_INT_MAX); - auto keyRandomNumberGenerator = std::bind(keyDistribution, std::mt19937()); - - // Use the uniform distribution [0, TrieMap::MAX_VALUE]. - std::uniform_int_distribution valueDistribution(0, TrieMap::MAX_VALUE); - auto valueRandomNumberGenerator = std::bind(valueDistribution, std::mt19937()); - - for (int i = 0; i < ELEMENT_COUNT; ++i) { - const int key = keyRandomNumberGenerator(); - const uint64_t value = valueRandomNumberGenerator(); - EXPECT_TRUE(trieMap.putRoot(key, value)) << key << " " << value; - testKeyValuePairs[key] = value; - } - for (const auto &v : testKeyValuePairs) { - EXPECT_EQ(v.second, trieMap.getRoot(v.first).mValue); - } -} - -TEST(TrieMapTest, TestMultiLevel) { - static const int FIRST_LEVEL_ENTRY_COUNT = 10000; - static const int SECOND_LEVEL_ENTRY_COUNT = 20000; - static const int THIRD_LEVEL_ENTRY_COUNT = 40000; - - TrieMap trieMap; - std::vector firstLevelKeys; - std::map firstLevelEntries; - std::vector> secondLevelKeys; - std::map> twoLevelMap; - std::map>> threeLevelMap; - - // Use the uniform integer distribution [0, S_INT_MAX]. 
- std::uniform_int_distribution distribution(0, S_INT_MAX); - auto keyRandomNumberGenerator = std::bind(distribution, std::mt19937()); - auto randomNumberGeneratorForKeySelection = std::bind(distribution, std::mt19937()); - - // Use the uniform distribution [0, TrieMap::MAX_VALUE]. - std::uniform_int_distribution valueDistribution(0, TrieMap::MAX_VALUE); - auto valueRandomNumberGenerator = std::bind(valueDistribution, std::mt19937()); - - for (int i = 0; i < FIRST_LEVEL_ENTRY_COUNT; ++i) { - const int key = keyRandomNumberGenerator(); - const uint64_t value = valueRandomNumberGenerator(); - EXPECT_TRUE(trieMap.putRoot(key, value)); - firstLevelKeys.push_back(key); - firstLevelEntries[key] = value; - } - - for (int i = 0; i < SECOND_LEVEL_ENTRY_COUNT; ++i) { - const int key = keyRandomNumberGenerator(); - const uint64_t value = valueRandomNumberGenerator(); - const int firstLevelKey = - firstLevelKeys[randomNumberGeneratorForKeySelection() % FIRST_LEVEL_ENTRY_COUNT]; - const int nextLevelBitmapEntryIndex = trieMap.getNextLevelBitmapEntryIndex(firstLevelKey); - EXPECT_NE(TrieMap::INVALID_INDEX, nextLevelBitmapEntryIndex); - EXPECT_TRUE(trieMap.put(key, value, nextLevelBitmapEntryIndex)); - secondLevelKeys.push_back(std::make_pair(firstLevelKey, key)); - twoLevelMap[firstLevelKey][key] = value; - } - - for (int i = 0; i < THIRD_LEVEL_ENTRY_COUNT; ++i) { - const int key = keyRandomNumberGenerator(); - const uint64_t value = valueRandomNumberGenerator(); - const std::pair secondLevelKey = - secondLevelKeys[randomNumberGeneratorForKeySelection() % SECOND_LEVEL_ENTRY_COUNT]; - const int secondLevel = trieMap.getNextLevelBitmapEntryIndex(secondLevelKey.first); - EXPECT_NE(TrieMap::INVALID_INDEX, secondLevel); - const int thirdLevel = trieMap.getNextLevelBitmapEntryIndex( - secondLevelKey.second, secondLevel); - EXPECT_NE(TrieMap::INVALID_INDEX, thirdLevel); - EXPECT_TRUE(trieMap.put(key, value, thirdLevel)); - threeLevelMap[secondLevelKey.first][secondLevelKey.second][key] 
= value; - } - - for (const auto &firstLevelEntry : firstLevelEntries) { - EXPECT_EQ(firstLevelEntry.second, trieMap.getRoot(firstLevelEntry.first).mValue); - } - - for (const auto &firstLevelEntry : twoLevelMap) { - const int secondLevel = trieMap.getNextLevelBitmapEntryIndex(firstLevelEntry.first); - EXPECT_NE(TrieMap::INVALID_INDEX, secondLevel); - for (const auto &secondLevelEntry : firstLevelEntry.second) { - EXPECT_EQ(secondLevelEntry.second, - trieMap.get(secondLevelEntry.first, secondLevel).mValue); - } - } - - for (const auto &firstLevelEntry : threeLevelMap) { - const int secondLevel = trieMap.getNextLevelBitmapEntryIndex(firstLevelEntry.first); - EXPECT_NE(TrieMap::INVALID_INDEX, secondLevel); - for (const auto &secondLevelEntry : firstLevelEntry.second) { - const int thirdLevel = - trieMap.getNextLevelBitmapEntryIndex(secondLevelEntry.first, secondLevel); - EXPECT_NE(TrieMap::INVALID_INDEX, thirdLevel); - for (const auto &thirdLevelEntry : secondLevelEntry.second) { - EXPECT_EQ(thirdLevelEntry.second, - trieMap.get(thirdLevelEntry.first, thirdLevel).mValue); - } - } - } - - // Iteration - for (const auto &firstLevelEntry : trieMap.getEntriesInRootLevel()) { - EXPECT_EQ(trieMap.getRoot(firstLevelEntry.key()).mValue, firstLevelEntry.value()); - EXPECT_EQ(firstLevelEntries[firstLevelEntry.key()], firstLevelEntry.value()); - firstLevelEntries.erase(firstLevelEntry.key()); - for (const auto &secondLevelEntry : firstLevelEntry.getEntriesInNextLevel()) { - EXPECT_EQ(twoLevelMap[firstLevelEntry.key()][secondLevelEntry.key()], - secondLevelEntry.value()); - twoLevelMap[firstLevelEntry.key()].erase(secondLevelEntry.key()); - for (const auto &thirdLevelEntry : secondLevelEntry.getEntriesInNextLevel()) { - EXPECT_EQ(threeLevelMap[firstLevelEntry.key()][secondLevelEntry.key()] - [thirdLevelEntry.key()], thirdLevelEntry.value()); - threeLevelMap[firstLevelEntry.key()][secondLevelEntry.key()].erase( - thirdLevelEntry.key()); - } - } - } - - // Ensure all entries have 
been traversed. - EXPECT_TRUE(firstLevelEntries.empty()); - for (const auto &secondLevelEntry : twoLevelMap) { - EXPECT_TRUE(secondLevelEntry.second.empty()); - } - for (const auto &secondLevelEntry : threeLevelMap) { - for (const auto &thirdLevelEntry : secondLevelEntry.second) { - EXPECT_TRUE(thirdLevelEntry.second.empty()); - } - } -} - -TEST(TrieMapTest, TestIteration) { - static const int ELEMENT_COUNT = 200000; - TrieMap trieMap; - std::unordered_map testKeyValuePairs; - - // Use the uniform integer distribution [S_INT_MIN, S_INT_MAX]. - std::uniform_int_distribution keyDistribution(S_INT_MIN, S_INT_MAX); - auto keyRandomNumberGenerator = std::bind(keyDistribution, std::mt19937()); - - // Use the uniform distribution [0, TrieMap::MAX_VALUE]. - std::uniform_int_distribution valueDistribution(0, TrieMap::MAX_VALUE); - auto valueRandomNumberGenerator = std::bind(valueDistribution, std::mt19937()); - for (int i = 0; i < ELEMENT_COUNT; ++i) { - const int key = keyRandomNumberGenerator(); - const uint64_t value = valueRandomNumberGenerator(); - EXPECT_TRUE(trieMap.putRoot(key, value)); - testKeyValuePairs[key] = value; - } - for (const auto &entry : trieMap.getEntriesInRootLevel()) { - EXPECT_EQ(trieMap.getRoot(entry.key()).mValue, entry.value()); - EXPECT_EQ(testKeyValuePairs[entry.key()], entry.value()); - testKeyValuePairs.erase(entry.key()); - } - EXPECT_TRUE(testKeyValuePairs.empty()); -} - -} // namespace -} // namespace latinime -- cgit v1.2.3-83-g751a